From 0ea7d6d59fc00fb9b51e9a5986526e33cc1e1695 Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Mon, 18 Aug 2025 11:08:38 -0500 Subject: [PATCH 001/128] added pattern mode to the config --- .../extraction_pipelines/ep_file_annotation.config.yaml | 1 + .../fn_file_annotation_launch/services/ConfigService.py | 1 + 2 files changed, 2 insertions(+) diff --git a/modules/contextualization/cdf_file_annotation/extraction_pipelines/ep_file_annotation.config.yaml b/modules/contextualization/cdf_file_annotation/extraction_pipelines/ep_file_annotation.config.yaml index cf709dd6..b7457503 100644 --- a/modules/contextualization/cdf_file_annotation/extraction_pipelines/ep_file_annotation.config.yaml +++ b/modules/contextualization/cdf_file_annotation/extraction_pipelines/ep_file_annotation.config.yaml @@ -47,6 +47,7 @@ config: targetProperty: tags limit: 10000 launchFunction: + patternMode: True # NOTE: Set to false when pattern mode is not needed batchSize: 50 fileSearchProperty: aliases targetEntitiesSearchProperty: aliases diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/ConfigService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/ConfigService.py index 8c126a18..bf78bdcb 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/ConfigService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/ConfigService.py @@ -183,6 +183,7 @@ class PrepareFunction(BaseModel, alias_generator=to_camel): class LaunchFunction(BaseModel, alias_generator=to_camel): + pattern_mode: bool batch_size: int = Field(gt=0, le=50) primary_scope_property: str secondary_scope_property: Optional[str] = None From ab349c4268e8ac51225f025c6cd55eb6675afbc1 Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Mon, 18 Aug 2025 11:17:08 -0500 Subject: [PATCH 002/128] added pattern mode properties to the container and view --- 
.../data_models/hdm.container.yaml | 14 +++++++++++ .../data_models/hdm.view.yaml | 25 +++++++++++++++++-- 2 files changed, 37 insertions(+), 2 deletions(-) diff --git a/modules/contextualization/cdf_file_annotation/data_models/hdm.container.yaml b/modules/contextualization/cdf_file_annotation/data_models/hdm.container.yaml index 3aa8a3aa..55ebfd40 100644 --- a/modules/contextualization/cdf_file_annotation/data_models/hdm.container.yaml +++ b/modules/contextualization/cdf_file_annotation/data_models/hdm.container.yaml @@ -27,6 +27,20 @@ collation: ucs_basic list: false type: text + patternModeJobId: + nullable: true + type: + type: int64 + patternModeMessage: + nullable: true + type: + type: text + collation: ucs_basic + patternModeStatus: + nullable: true + type: + type: text + collation: ucs_basic attemptCount: autoIncrement: false immutable: false diff --git a/modules/contextualization/cdf_file_annotation/data_models/hdm.view.yaml b/modules/contextualization/cdf_file_annotation/data_models/hdm.view.yaml index 18d059be..37e8bb9e 100644 --- a/modules/contextualization/cdf_file_annotation/data_models/hdm.view.yaml +++ b/modules/contextualization/cdf_file_annotation/data_models/hdm.view.yaml @@ -25,7 +25,7 @@ space: {{ annotationStateSchemaSpace }} type: container containerPropertyIdentifier: annotationMessage - description: Annotation message + description: Contains annotations applied or error message name: Annotation message annotationStatus: container: @@ -33,7 +33,7 @@ space: {{ annotationStateSchemaSpace }} type: container containerPropertyIdentifier: annotationStatus - description: Annotation status + description: Holds the status of the files diagram detect job name: Annotation status attemptCount: container: @@ -51,6 +51,27 @@ containerPropertyIdentifier: diagramDetectJobId description: Diagram detect job ID name: Diagram detect job ID + patternModeJobId: + nullable: true + type: + type: int64 + containerPropertyIdentifier: patternModeJobId + description: 
Diagram detect job ID with pattern mode + name: Pattern mode job ID + patternModeMessage: + nullable: true + type: + type: text + collation: ucs_basic + description: Contains entities found from pattern mode or error message + name: Pattern mode message + patternModeStatus: + nullable: true + type: + type: text + collation: ucs_basic + description: Holds the status of the files pattern mode job + name: Pattern mode status linkedFile: container: externalId: {{ annotationStateExternalId }} From db1ee7c15273ea45045056d531cb65ff47be8de7 Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Mon, 18 Aug 2025 11:40:02 -0500 Subject: [PATCH 003/128] pattern mode job creation --- .../services/AnnotationService.py | 149 +++++++++++++++++- 1 file changed, 148 insertions(+), 1 deletion(-) diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/AnnotationService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/AnnotationService.py index 9437851a..e1363ffc 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/AnnotationService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/AnnotationService.py @@ -1,5 +1,6 @@ import abc -from typing import Any +import re +from typing import Any, Iterator from cognite.client import CogniteClient from services.ConfigService import Config @@ -21,6 +22,10 @@ class IAnnotationService(abc.ABC): def run_diagram_detect(self, files: list[FileReference], entities: list[dict[str, Any]]) -> int: pass + @abc.abstractmethod + def run_pattern_mode_detect(self, files: list[FileReference], samples: list[dict[str, Any]]) -> int: + pass + # maybe a different class for debug mode and run mode? 
class GeneralAnnotationService(IAnnotationService): @@ -51,3 +56,145 @@ def run_diagram_detect(self, files: list[FileReference], entities: list[dict[str return detect_job.job_id else: raise Exception(f"404 ---- No job Id was created") + + def run_pattern_mode_detect(self, files: list, samples: list[dict[str, Any]]) -> int: + """Generates patterns and runs the diagram detection job in pattern mode.""" + pattern_samples = self._generate_tag_samples_from_entities(samples) + self.logger.info(f"Generated {len(pattern_samples)} pattern samples for detection.") + + detect_job: DiagramDetectResults = self.client.diagrams.detect( + file_references=files, + entities=pattern_samples, # Use the generated patterns + partial_match=self.annotation_config.partial_match, + min_tokens=self.annotation_config.min_tokens, + search_field="sample", # The key in your generated samples + configuration=self.diagram_detect_config, + pattern_mode=True, # The crucial flag + ) + if detect_job.job_id: + return detect_job.job_id + else: + raise Exception("API call to diagram/detect in pattern mode did not return a job ID.") + + def _generate_tag_samples_from_entities(self, entities: list[dict]) -> list[dict]: + """ + Generates pattern samples from entity aliases by converting them into generalized templates. + This version analyzes the internal structure of each segment: + - Numbers are generalized to '0'. + - Letters are grouped into bracketed alternatives, even when mixed with numbers. + - Example: '629P' and '629X' will merge to create a pattern piece '000[P|X]'. + """ + # Structure: { resource_type: { full_template_key: list_of_collected_variable_parts } } + # where list_of_collected_variable_parts is [ [{'L1_alt1', 'L1_alt2'}], [{'L2_alt1'}], ... ] + pattern_builders: dict[str, dict[str, list[list[set[str]]]]] = {} + + def _parse_alias(alias: str, resource_type_key: str) -> tuple[str, list[list[str]]]: + """ + Parses an alias into a structural template key and its variable letter components. 
+ A segment '629P' yields a template '000A' and a variable part ['P']. + """ + alias_parts = re.split(r"([ -])", alias) + full_template_key_parts: list[str] = [] + all_variable_parts: list[list[str]] = [] + + for i, part in enumerate(alias_parts): + if not part: + continue + # Handle delimiters + if part in [" ", "-"]: + full_template_key_parts.append(part) + continue + + # Handle fixed constants (override everything else) + left_ok = (i == 0) or (alias_parts[i - 1] in [" ", "-"]) + right_ok = (i == len(alias_parts) - 1) or (alias_parts[i + 1] in [" ", "-"]) + if left_ok and right_ok and part == resource_type_key: + full_template_key_parts.append(f"[{part}]") + continue + + # --- Dissect the segment to create its template and find variable letters --- + # 1. Create the structural template for the segment (e.g., '629P' -> '000A') + segment_template = re.sub(r"\d", "0", part) + segment_template = re.sub(r"[A-Za-z]", "A", segment_template) + full_template_key_parts.append(segment_template) + + # 2. Extract all groups of letters from the segment + variable_letters = re.findall(r"[A-Za-z]+", part) + if variable_letters: + all_variable_parts.append(variable_letters) + return "".join(full_template_key_parts), all_variable_parts + + for entity in entities: + key = entity.get("resourceType") or entity.get("external_id") or "tag" + if key not in pattern_builders: + pattern_builders[key] = {} + + aliases = entity.get("aliases", []) + for alias in aliases: + if not alias: + continue + # NOTE: THESE are TEMP fixes. 
Please do not include in the way it is now as a final soln + if "_" in alias or "," in alias: + continue + if alias[0] == ".": + continue + if alias.isdigit(): + continue + if alias.isalpha(): + continue + if len(alias) <= 2: # accounts for 'T' or 'SP' + continue + if alias.count("-") == 1 and key == "Asset Annotation": + # accounts for 605-JT | 114-JT + temp = alias.split("-") + if temp[0].isdigit(): + continue + + template_key, variable_parts_from_alias = _parse_alias(alias, key) + + if template_key in pattern_builders[key]: + # Merge with existing variable parts + existing_variable_sets = pattern_builders[key][template_key] + for i, part_group in enumerate(variable_parts_from_alias): + for j, letter_group in enumerate(part_group): + existing_variable_sets[i][j].add(letter_group) + else: + # Create a new entry with the correct structure (list of lists of sets) + new_variable_sets = [] + for part_group in variable_parts_from_alias: + new_variable_sets.append([set([lg]) for lg in part_group]) + pattern_builders[key][template_key] = new_variable_sets + + # --- Build the final result from the processed patterns --- + result = [] + for resource_type, templates in pattern_builders.items(): + final_samples = [] + for template_key, collected_vars in templates.items(): + # Create an iterator for the collected letter groups + var_iter: Iterator[list[set[str]]] = iter(collected_vars) + + def build_segment(segment_template: str) -> str: + # This function rebuilds one segment, substituting 'A's with bracketed alternatives + if "A" not in segment_template: + return segment_template + try: + letter_groups_for_segment = next(var_iter) + letter_group_iter: Iterator[set[str]] = iter(letter_groups_for_segment) + + def replace_A(match): + alternatives = sorted(list(next(letter_group_iter))) + return f"[{'|'.join(alternatives)}]" + + return re.sub(r"A+", replace_A, segment_template) + except StopIteration: + return segment_template # Should not happen in normal flow + + # Split the 
full template by delimiters, process each part, then rejoin + final_pattern_parts = [ + build_segment(p) if p not in " -" else p for p in re.split(r"([ -])", template_key) + ] + final_samples.append("".join(final_pattern_parts)) + + if final_samples: + result.append({"sample": sorted(final_samples), "resourceType": resource_type}) + return result From 8c9e0cfb227fa6c6e2abc0e20a0c8912f7b57f1a Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Mon, 18 Aug 2025 11:53:55 -0500 Subject: [PATCH 004/128] function parameter change --- .../services/AnnotationService.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/AnnotationService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/AnnotationService.py index e1363ffc..e3096173 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/AnnotationService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/AnnotationService.py @@ -23,7 +23,7 @@ def run_diagram_detect(self, files: list[FileReference], entities: list[dict[str pass @abc.abstractmethod - def run_pattern_mode_detect(self, files: list[FileReference], samples: list[dict[str, Any]]) -> int: + def run_pattern_mode_detect(self, files: list[FileReference], pattern_samples: list[dict[str, Any]]) -> int: pass @@ -55,11 +55,10 @@ def run_diagram_detect(self, files: list[FileReference], entities: list[dict[str if detect_job.job_id: return detect_job.job_id else: - raise Exception(f"404 ---- No job Id was created") + raise Exception(f"API call to diagram/detect in pattern mode did not return a job ID") - def run_pattern_mode_detect(self, files: list, samples: list[dict[str, Any]]) -> int: + def run_pattern_mode_detect(self, files: list, pattern_samples: list[dict[str, Any]]) -> int: """Generates patterns and runs the diagram 
def run_pattern_mode_detect(self, files: list, pattern_samples: list[dict[str, Any]]) -> int:
    """Run a diagram-detect job in pattern mode using pre-generated pattern samples.

    Pattern generation now happens in the caller (patch 004 moved it out of this
    method), so this method only submits the job.

    Args:
        files: File references to run detection on. NOTE(review): the ABC declares
            ``list[FileReference]`` — align this annotation once confirmed.
        pattern_samples: Already-generated samples, each exposing its patterns under
            the ``sample`` key.

    Returns:
        The job id of the created diagram-detect job.

    Raises:
        Exception: If the API response carries no job id.
    """
    # The samples arrive pre-generated; log how many we are submitting.
    self.logger.info(f"Running pattern mode detection with {len(pattern_samples)} pattern samples.")

    detect_job: DiagramDetectResults = self.client.diagrams.detect(
        file_references=files,
        entities=pattern_samples,
        partial_match=self.annotation_config.partial_match,
        min_tokens=self.annotation_config.min_tokens,
        search_field="sample",  # The key holding the pattern strings in each sample
        configuration=self.diagram_detect_config,
        pattern_mode=True,  # The crucial flag enabling pattern-based detection
    )
    if detect_job.job_id:
        return detect_job.job_id
    raise Exception("API call to diagram/detect in pattern mode did not return a job ID")
self.logger.info( + f"Running diagram detect on {batch.size()} files with {len(self.in_memory_cache)} entities" + ) job_id: int = self.annotation_service.run_diagram_detect( files=batch.file_references, entities=self.in_memory_cache ) @@ -374,11 +377,23 @@ def _process_batch(self, batch: BatchOfPairedNodes): "sourceUpdatedTime": datetime.now(timezone.utc).replace(microsecond=0).isoformat(), "diagramDetectJobId": job_id, } + + # Run diagram detect on pattern mode + if self.config.launch_function.pattern_mode: + self.logger.info( + f"Running pattern mode diagram detect on {batch.size()} files with {len(self.in_memory_pattern_cache)} entities" + ) + pattern_job_id = self.annotation_service.run_pattern_mode_detect( + files=batch.file_references, pattern_samples=self.in_memory_pattern_cache + ) + update_properties["patternModeStatus"] = AnnotationStatus.PROCESSING + update_properties["patternModeJobId"] = pattern_job_id + batch.batch_states.update_node_properties( new_properties=update_properties, view_id=self.annotation_state_view.as_view_id(), ) - update_results = self.data_model_service.update_annotation_state(batch.batch_states.apply) + self.data_model_service.update_annotation_state(batch.batch_states.apply) self.logger.info( message=f" Updated the annotation state instances:\n- annotation status set to 'Processing'\n- job id set to {job_id}", section="END", @@ -402,9 +417,11 @@ def _process_batch(self, batch: BatchOfPairedNodes): if batch.is_empty(): return - self.logger.info(f"Running diagram detect on {batch.size()} files with {len(self.in_memory_cache)} entities") - try: + # Run regular diagram detect + self.logger.info( + f"Running diagram detect on {batch.size()} files with {len(self.in_memory_cache)} entities" + ) job_id: int = self.annotation_service.run_diagram_detect( files=batch.file_references, entities=self.in_memory_cache ) @@ -413,11 +430,23 @@ def _process_batch(self, batch: BatchOfPairedNodes): "sourceUpdatedTime": 
datetime.now(timezone.utc).replace(microsecond=0).isoformat(), "diagramDetectJobId": job_id, } + + # Run diagram detect on pattern mode + if self.config.launch_function.pattern_mode: + self.logger.info( + f"Running pattern mode diagram detect on {batch.size()} files with {len(self.in_memory_pattern_cache)} entities" + ) + pattern_job_id = self.annotation_service.run_pattern_mode_detect( + files=batch.file_references, pattern_samples=self.in_memory_pattern_cache + ) + update_properties["patternModeStatus"] = AnnotationStatus.PROCESSING + update_properties["patternModeJobId"] = pattern_job_id + batch.batch_states.update_node_properties( new_properties=update_properties, view_id=self.annotation_state_view.as_view_id(), ) - update_results = self.data_model_service.update_annotation_state(batch.batch_states.apply) + self.data_model_service.update_annotation_state(batch.batch_states.apply) self.logger.info( message=f" Updated the annotation state instances:\n- annotation status set to 'Processing'\n- job id set to {job_id}", section="END", From 373e9befde286f7abacc8f8261466005dd110ac1 Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Mon, 18 Aug 2025 13:21:11 -0500 Subject: [PATCH 006/128] separated pattern samples to files and assets --- .../services/AnnotationService.py | 126 +---------- .../services/CacheService.py | 202 ++++++++++++++---- .../services/LaunchService.py | 15 +- .../utils/DataStructures.py | 5 +- 4 files changed, 168 insertions(+), 180 deletions(-) diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/AnnotationService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/AnnotationService.py index e3096173..204daf9f 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/AnnotationService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/AnnotationService.py @@ -1,6 
+1,5 @@ import abc -import re -from typing import Any, Iterator +from typing import Any from cognite.client import CogniteClient from services.ConfigService import Config @@ -74,126 +73,3 @@ def run_pattern_mode_detect(self, files: list, pattern_samples: list[dict[str, A return detect_job.job_id else: raise Exception("API call to diagram/detect in pattern mode did not return a job ID") - - def _generate_tag_samples_from_entities(self, entities: list[dict]) -> list[dict]: - """ - Generates pattern samples from entity aliases by converting them into generalized templates. - This version analyzes the internal structure of each segment: - - Numbers are generalized to '0'. - - Letters are grouped into bracketed alternatives, even when mixed with numbers. - - Example: '629P' and '629X' will merge to create a pattern piece '000[P|X]'. - """ - # Structure: { resource_type: { full_template_key: list_of_collected_variable_parts } } - # where list_of_collected_variable_parts is [ [{'L1_alt1', 'L1_alt2'}], [{'L2_alt1'}], ... ] - pattern_builders: dict[str, dict[str, list[list[set[str]]]]] = {} - - def _parse_alias(alias: str, resource_type_key: str) -> tuple[str, list[list[str]]]: - """ - Parses an alias into a structural template key and its variable letter components. - A segment '629P' yields a template '000A' and a variable part ['P']. 
- """ - alias_parts = re.split(r"([ -])", alias) - full_template_key_parts: list[str] = [] - all_variable_parts: list[list[str]] = [] - - for i, part in enumerate(alias_parts): - if not part: - continue - # Handle delimiters - if part in [" ", "-"]: - full_template_key_parts.append(part) - continue - - # Handle fixed constants (override everything else) - left_ok = (i == 0) or (alias_parts[i - 1] in [" ", "-"]) - right_ok = (i == len(alias_parts) - 1) or (alias_parts[i + 1] in [" ", "-"]) - if left_ok and right_ok and part == resource_type_key: - full_template_key_parts.append(f"[{part}]") - continue - - # --- Dissect the segment to create its template and find variable letters --- - # 1. Create the structural template for the segment (e.g., '629P' -> '000A') - segment_template = re.sub(r"\d", "0", part) - segment_template = re.sub(r"[A-Za-z]", "A", segment_template) - full_template_key_parts.append(segment_template) - - # 2. Extract all groups of letters from the segment - variable_letters = re.findall(r"[A-Za-z]+", part) - if variable_letters: - all_variable_parts.append(variable_letters) - return "".join(full_template_key_parts), all_variable_parts - - for entity in entities: - key = entity.get("resourceType") or entity.get("external_id") or "tag" - if key not in pattern_builders: - pattern_builders[key] = {} - - aliases = entity.get("aliases", []) - for alias in aliases: - if not alias: - continue - # NOTE: THESE are TEMP fixes. 
Please do not include in the way it is now as a final soln - if "_" in alias or "," in alias: - continue - if alias[0] == ".": - continue - if alias.isdigit(): - continue - if alias.isalpha(): - continue - if len(alias) <= 2: # accounts for 'T' or 'SP' - continue - if alias.count("-") == 1 and key == "Asset Annotation": - # accounts for 605-JT | 114-JT - temp = alias.split("-") - if temp[0].isdigit(): - continue - - template_key, variable_parts_from_alias = _parse_alias(alias, key) - - if template_key in pattern_builders[key]: - # Merge with existing variable parts - existing_variable_sets = pattern_builders[key][template_key] - for i, part_group in enumerate(variable_parts_from_alias): - for j, letter_group in enumerate(part_group): - existing_variable_sets[i][j].add(letter_group) - else: - # Create a new entry with the correct structure (list of lists of sets) - new_variable_sets = [] - for part_group in variable_parts_from_alias: - new_variable_sets.append([set([lg]) for lg in part_group]) - pattern_builders[key][template_key] = new_variable_sets - - # --- Build the final result from the processed patterns --- - result = [] - for resource_type, templates in pattern_builders.items(): - final_samples = [] - for template_key, collected_vars in templates.items(): - # Create an iterator for the collected letter groups - var_iter: Iterator[list[set[str]]] = iter(collected_vars) - - def build_segment(segment_template: str) -> str: - # This function rebuilds one segment, substituting 'A's with bracketed alternatives - if "A" not in segment_template: - return segment_template - try: - letter_groups_for_segment = next(var_iter) - letter_group_iter: Iterator[set[str]] = iter(letter_groups_for_segment) - - def replace_A(match): - alternatives = sorted(list(next(letter_group_iter))) - return f"[{'|'.join(alternatives)}]" - - return re.sub(r"A+", replace_A, segment_template) - except StopIteration: - return segment_template # Should not happen in normal flow - - # Split the 
full template by delimiters, process each part, then rejoin - final_pattern_parts = [ - build_segment(p) if p not in " -" else p for p in re.split(r"([ -])", template_key) - ] - final_samples.append("".join(final_pattern_parts)) - - if final_samples: - result.append({"sample": sorted(final_samples), "resourceType": resource_type}) - return result diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/CacheService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/CacheService.py index 742e14c3..81bbe2ea 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/CacheService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/CacheService.py @@ -1,4 +1,6 @@ import abc +import re +from typing import Iterator from datetime import datetime, timezone, timedelta from cognite.client import CogniteClient from cognite.client.data_classes import RowWrite, Row @@ -25,15 +27,19 @@ def get_entities( data_model_service: IDataModelService, primary_scope_value: str, secondary_scope_value: str | None, - ) -> list[dict]: + ) -> tuple[list[dict], list[dict]]: + pass + + @abc.abstractmethod + def update_cache(self, raw_db: str, raw_tbl: str, row_to_write: RowWrite) -> None: pass @abc.abstractmethod - def _update_cache(self) -> list[dict]: + def _validate_cache(self, last_update_datetime_str: str) -> bool: pass @abc.abstractmethod - def _validate_cache(self) -> bool: + def _generate_tag_samples_from_entities(self, entities: list[dict]) -> list[dict]: pass @@ -61,10 +67,11 @@ def get_entities( data_model_service: IDataModelService, primary_scope_value: str, secondary_scope_value: str | None, - ) -> list[dict]: + ) -> tuple[list[dict], list[dict]]: """ - Returns file and asset entities for use in diagram detect job - Ensures that the cache is up to date and valid + Returns file and asset entities for use 
in diagram detect job. + Ensures that the cache is up to date and valid. This method orchestrates + the fetching of data and the updating of the cache. """ entities: list[dict] = [] if secondary_scope_value: @@ -72,62 +79,64 @@ def get_entities( else: key = f"{primary_scope_value}" - cdf_raw = self.client.raw.rows - row: Row | None = cdf_raw.retrieve(db_name=self.db_name, table_name=self.tbl_name, key=key) + try: + row: Row | None = self.client.raw.rows.retrieve(db_name=self.db_name, table_name=self.tbl_name, key=key) + except: + row = None - if row and row.columns: - last_update_time_str = row.columns["LastUpdateTimeUtcIso"] - if self._validate_cache(last_update_time_str) == False: - self.logger.debug("Refreshing RAW entities cache") - entities = self._update_cache(data_model_service, key, primary_scope_value, secondary_scope_value) - else: - asset_entity: list[dict] = row.columns["AssetEntities"] - file_entity: list[dict] = row.columns["FileEntities"] - entities = asset_entity + file_entity - else: - entities = self._update_cache(data_model_service, key, primary_scope_value, secondary_scope_value) + # Attempt to retrieve from the cache + if row and row.columns and self._validate_cache(row.columns["LastUpdateTimeUtcIso"]): + self.logger.debug(f"Cache valid for key: {key}. 
Retrieving entities and patterns.") + asset_entities: list[dict] = row.columns.get("AssetEntities", []) + file_entities: list[dict] = row.columns.get("FileEntities", []) + asset_pattern_samples: list[dict] = row.columns.get("AssetPatternSamples", []) # Get patterns from cache + file_pattern_samples: list[dict] = row.columns.get("FilePatternSamples", []) # Get patterns from cache - return entities + return (asset_entities + file_entities), (asset_pattern_samples + file_pattern_samples) - def _update_cache( - self, - data_model_service: IDataModelService, - key: str, - primary_scope_value: str, - secondary_scope_value: str | None, - ) -> list[dict]: - """ - Creates (or overwrites) the cache for a given group. It fetches all relevant - contextualization entities for the files in the group from the data model - and stores them in the cache table. - """ - asset_instances: NodeList - file_instances: NodeList + self.logger.info(f"Refreshing RAW entities cache and patterns cache for key: {key}") + + # Fetch data asset_instances, file_instances = data_model_service.get_instances_entities( primary_scope_value, secondary_scope_value ) - asset_entities: list[dict] = [] - file_entities: list[dict] = [] + # Convert to entities for diagram detect job asset_entities, file_entities = self._convert_instances_to_entities(asset_instances, file_instances) + entities = asset_entities + file_entities + + # Generate pattern samples from the same entities + asset_pattern_samples = self._generate_tag_samples_from_entities(asset_entities) + file_pattern_samples = self._generate_tag_samples_from_entities(file_entities) + pattern_samples = asset_pattern_samples + file_pattern_samples - current_time_seconds = datetime.now(timezone.utc).isoformat() + # Update cache new_row = RowWrite( key=key, columns={ "AssetEntities": asset_entities, "FileEntities": file_entities, - "LastUpdateTimeUtcIso": current_time_seconds, + "AssetPatternSamples": asset_pattern_samples, + "FilePatternSamples": 
def _update_cache(self, row_to_write: RowWrite) -> None:
    """Write a single, fully-formed RowWrite object to the RAW cache table.

    This method's only responsibility is the database insertion; the row is
    assembled by the caller. ``ensure_parent=True`` creates the database/table
    if they do not exist yet.

    NOTE(review): the ``ICacheService`` ABC declares an abstract
    ``update_cache(self, raw_db, raw_tbl, row_to_write)`` that this class never
    implements — instantiation would fail; confirm the interface and align.

    Args:
        row_to_write: The row (key + columns) to insert into the cache table.
    """
    self.client.raw.rows.insert(
        db_name=self.db_name,
        table_name=self.tbl_name,
        row=row_to_write,
        ensure_parent=True,
    )
    # Plain string: there is nothing to interpolate here.
    self.logger.info("Successfully updated RAW cache")
+ """ + self.logger.info(f"Generating pattern samples from {len(entities)} entities.") + + alias_parts = re.split(r"([ -])", alias) + full_template_key_parts: list[str] = [] + all_variable_parts: list[list[str]] = [] + + for i, part in enumerate(alias_parts): + if not part: + continue + # Handle delimiters + if part in [" ", "-"]: + full_template_key_parts.append(part) + continue + + # Handle fixed constants (override everything else) + left_ok = (i == 0) or (alias_parts[i - 1] in [" ", "-"]) + right_ok = (i == len(alias_parts) - 1) or (alias_parts[i + 1] in [" ", "-"]) + if left_ok and right_ok and part == resource_type_key: + full_template_key_parts.append(f"[{part}]") + continue + + # --- Dissect the segment to create its template and find variable letters --- + # 1. Create the structural template for the segment (e.g., '629P' -> '000A') + segment_template = re.sub(r"\d", "0", part) + segment_template = re.sub(r"[A-Za-z]", "A", segment_template) + full_template_key_parts.append(segment_template) + + # 2. 
Extract all groups of letters from the segment + variable_letters = re.findall(r"[A-Za-z]+", part) + if variable_letters: + all_variable_parts.append(variable_letters) + return "".join(full_template_key_parts), all_variable_parts + + for entity in entities: + key = entity.get("resourceType") or entity.get("external_id") or "tag" + if key not in pattern_builders: + pattern_builders[key] = {} + + aliases = entity.get("search_property", []) + for alias in aliases: + if not alias: + continue + + template_key, variable_parts_from_alias = _parse_alias(alias, key) + + if template_key in pattern_builders[key]: + # Merge with existing variable parts + existing_variable_sets = pattern_builders[key][template_key] + for i, part_group in enumerate(variable_parts_from_alias): + for j, letter_group in enumerate(part_group): + existing_variable_sets[i][j].add(letter_group) + else: + # Create a new entry with the correct structure (list of lists of sets) + new_variable_sets = [] + for part_group in variable_parts_from_alias: + new_variable_sets.append([set([lg]) for lg in part_group]) + pattern_builders[key][template_key] = new_variable_sets + + # --- Build the final result from the processed patterns --- + result = [] + for resource_type, templates in pattern_builders.items(): + final_samples = [] + for template_key, collected_vars in templates.items(): + # Create an iterator for the collected letter groups + var_iter: Iterator[list[set[str]]] = iter(collected_vars) + + def build_segment(segment_template: str) -> str: + # This function rebuilds one segment, substituting 'A's with bracketed alternatives + if "A" not in segment_template: + return segment_template + try: + letter_groups_for_segment = next(var_iter) + letter_group_iter: Iterator[set[str]] = iter(letter_groups_for_segment) + + def replace_A(match): + alternatives = sorted(list(next(letter_group_iter))) + return f"[{'|'.join(alternatives)}]" + + return re.sub(r"A+", replace_A, segment_template) + except StopIteration: + 
return segment_template # Should not happen in normal flow + + # Split the full template by delimiters, process each part, then rejoin + final_pattern_parts = [ + build_segment(p) if p not in " -" else p for p in re.split(r"([ -])", template_key) + ] + final_samples.append("".join(final_pattern_parts)) + + if final_samples: + result.append({"sample": sorted(final_samples), "resourceType": resource_type}) + return result diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/LaunchService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/LaunchService.py index a030d389..7a75c0d5 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/LaunchService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/LaunchService.py @@ -52,9 +52,6 @@ def __init__( @abc.abstractmethod def prepare(self) -> str | None: - """ - Peronally think it's cleaner having this operate as a separate cognite function -> but due to mpc function constraints it wouldn't make sense for our project to go down this route (Jack) - """ pass @abc.abstractmethod @@ -94,7 +91,7 @@ def __init__( self.file_view: ViewPropertyConfig = config.data_model_views.file_view self.in_memory_cache: list[dict] = [] - self.in_memory_pattern_cache: list[dict] = [] + self.in_memory_pattern: list[dict] = [] self._cached_primary_scope: str | None = None self._cached_secondary_scope: str | None = None @@ -338,7 +335,7 @@ def _ensure_cache_for_batch(self, primary_scope_value: str, secondary_scope_valu ): self.logger.info(f"Refreshing in memory cache") try: - self.in_memory_cache = self.cache_service.get_entities( + self.in_memory_cache, self.in_memory_patterns = self.cache_service.get_entities( self.data_model_service, primary_scope_value, secondary_scope_value ) self._cached_primary_scope = primary_scope_value @@ -381,10 +378,10 @@ def 
_process_batch(self, batch: BatchOfPairedNodes): # Run diagram detect on pattern mode if self.config.launch_function.pattern_mode: self.logger.info( - f"Running pattern mode diagram detect on {batch.size()} files with {len(self.in_memory_pattern_cache)} entities" + f"Running pattern mode diagram detect on {batch.size()} files with {len(self.in_memory_pattern)} entities" ) pattern_job_id = self.annotation_service.run_pattern_mode_detect( - files=batch.file_references, pattern_samples=self.in_memory_pattern_cache + files=batch.file_references, pattern_samples=self.in_memory_pattern ) update_properties["patternModeStatus"] = AnnotationStatus.PROCESSING update_properties["patternModeJobId"] = pattern_job_id @@ -434,10 +431,10 @@ def _process_batch(self, batch: BatchOfPairedNodes): # Run diagram detect on pattern mode if self.config.launch_function.pattern_mode: self.logger.info( - f"Running pattern mode diagram detect on {batch.size()} files with {len(self.in_memory_pattern_cache)} entities" + f"Running pattern mode diagram detect on {batch.size()} files with {len(self.in_memory_pattern)} entities" ) pattern_job_id = self.annotation_service.run_pattern_mode_detect( - files=batch.file_references, pattern_samples=self.in_memory_pattern_cache + files=batch.file_references, pattern_samples=self.in_memory_pattern ) update_properties["patternModeStatus"] = AnnotationStatus.PROCESSING update_properties["patternModeJobId"] = pattern_job_id diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/utils/DataStructures.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/utils/DataStructures.py index 0f7bc3f2..6314cce6 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/utils/DataStructures.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/utils/DataStructures.py @@ -125,12 +125,9 @@ class entity: "external_id": 
file.external_id, "name": file.properties[job_config.file_view.as_view_id()]["name"], "space": file.space, - search_property: file.properties[job_config.file_view.as_view_id()][ - search_property - ], "annotation_type_external_id": job_config.file_view.type, + "search_property": file.properties[job_config.file_view.as_view_id()][{search_property}], } - Note: kind of prefer a generic variable name here as opposed to specific ones that changes based off config -> i.e.) for marathon the variable here would be aliases instead of search_property """ external_id: str From 6bd4598aded4654ef2e55853ffd43eda43d22576 Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Mon, 18 Aug 2025 13:50:27 -0500 Subject: [PATCH 007/128] added a file category property to the entities to support pattern mode --- .../ep_file_annotation.config.yaml | 5 ++++- .../services/CacheService.py | 20 +++++++++++++++++-- .../services/ConfigService.py | 4 +++- .../utils/DataStructures.py | 4 +++- 4 files changed, 28 insertions(+), 5 deletions(-) diff --git a/modules/contextualization/cdf_file_annotation/extraction_pipelines/ep_file_annotation.config.yaml b/modules/contextualization/cdf_file_annotation/extraction_pipelines/ep_file_annotation.config.yaml index b7457503..b8819007 100644 --- a/modules/contextualization/cdf_file_annotation/extraction_pipelines/ep_file_annotation.config.yaml +++ b/modules/contextualization/cdf_file_annotation/extraction_pipelines/ep_file_annotation.config.yaml @@ -47,12 +47,15 @@ config: targetProperty: tags limit: 10000 launchFunction: - patternMode: True # NOTE: Set to false when pattern mode is not needed batchSize: 50 fileSearchProperty: aliases targetEntitiesSearchProperty: aliases primaryScopeProperty: None secondaryScopeProperty: + # NOTE: below configurations are used by pattern mode + patternMode: True + fileCategoryProperty: + targetEntitiesCategoryProperty: dataModelService: getFilesToProcessQuery: targetView: diff --git 
a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/CacheService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/CacheService.py index 81bbe2ea..c21496ef 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/CacheService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/CacheService.py @@ -162,17 +162,24 @@ def _convert_instances_to_entities( """ Convert the asset and file nodes into an entity """ + target_entities_category_property: str | None = self.config.launch_function.target_entities_category_property target_entities_search_property: str = self.config.launch_function.target_entities_search_property target_entities: list[dict] = [] + for instance in asset_instances: instance_properties = instance.properties.get(self.target_entities_view.as_view_id()) + if target_entities_category_property: + category: str = instance_properties[target_entities_category_property] + else: + category: str = self.target_entities_view.external_id if target_entities_search_property in instance_properties: asset_entity = entity( external_id=instance.external_id, name=instance_properties.get("name"), space=instance.space, - search_property=instance_properties.get(target_entities_search_property), annotation_type_external_id=self.target_entities_view.annotation_type, + category_property=category, + search_property=instance_properties.get(target_entities_search_property), ) target_entities.append(asset_entity.to_dict()) else: @@ -181,19 +188,27 @@ def _convert_instances_to_entities( name=instance_properties.get("name"), space=instance.space, search_property=instance_properties.get("name"), + category_property=category, annotation_type_external_id=self.target_entities_view.annotation_type, ) target_entities.append(asset_entity.to_dict()) + file_category_property: str | None = 
self.config.launch_function.file_category_property
        file_search_property: str = self.config.launch_function.file_search_property
        file_entities: list[dict] = []
+
        for instance in file_instances:
            instance_properties = instance.properties.get(self.file_view.as_view_id())
+            if file_category_property:
+                category: str = instance_properties[file_category_property]
+            else:
+                category: str = self.file_view.external_id
            file_entity = entity(
                external_id=instance.external_id,
                name=instance_properties.get("name"),
                space=instance.space,
                search_property=instance_properties.get(file_search_property),
+                category_property=category,
                annotation_type_external_id=self.file_view.annotation_type,
            )
            file_entities.append(file_entity.to_dict())
@@ -251,7 +266,8 @@ def _parse_alias(alias: str, resource_type_key: str) -> tuple[str, list[list[str
            return "".join(full_template_key_parts), all_variable_parts

        for entity in entities:
-            key = entity.get("resourceType") or entity.get("external_id") or "tag"
+            # NOTE: every entity now carries category_property, so group pattern templates by it directly
+            key = entity["category_property"]
            if key not in pattern_builders:
                pattern_builders[key] = {}

diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/ConfigService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/ConfigService.py
index bf78bdcb..ac9186da 100644
--- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/ConfigService.py
+++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/ConfigService.py
@@ -183,12 +183,14 @@ class PrepareFunction(BaseModel, alias_generator=to_camel):

 class LaunchFunction(BaseModel, alias_generator=to_camel):
-    pattern_mode: bool
     batch_size: int = Field(gt=0, le=50)
     primary_scope_property: str
     secondary_scope_property: Optional[str] = None
     file_search_property: str = "aliases"
     target_entities_search_property: str = "aliases"
+    pattern_mode: bool
+    file_category_property: 
Optional[str] = None
+    target_entities_category_property: Optional[str] = None
     data_model_service: DataModelServiceConfig
     cache_service: CacheServiceConfig
     annotation_service: AnnotationServiceConfig
diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/utils/DataStructures.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/utils/DataStructures.py
index 6314cce6..b3b9b7a5 100644
--- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/utils/DataStructures.py
+++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/utils/DataStructures.py
@@ -126,7 +126,8 @@ class entity:
         "name": file.properties[job_config.file_view.as_view_id()]["name"],
         "space": file.space,
         "annotation_type_external_id": job_config.file_view.type,
-        "search_property": file.properties[job_config.file_view.as_view_id()][{search_property}],
+        "search_property": file.properties[job_config.file_view.as_view_id()][{search_property}],
+        "category_property": file.properties[job_config.file_view.as_view_id()][{category_property}],
     }
     """
@@ -134,6 +135,7 @@ class entity:
     name: str
     space: str
     annotation_type_external_id: Literal["diagrams.FileLink", "diagrams.AssetLink"] | None
+    category_property: str
     search_property: list[str] = field(default_factory=list)

     def to_dict(self):

From e02c57522fe62d8a60efaecfd62302aabbae9d6d Mon Sep 17 00:00:00 2001
From: Jack Zhao
Date: Mon, 18 Aug 2025 13:51:28 -0500
Subject: [PATCH 008/128] adjustment in comments

---
 .../fn_file_annotation_launch/services/CacheService.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/CacheService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/CacheService.py
index c21496ef..6816fc30 100644
--- 
a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/CacheService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/CacheService.py @@ -217,7 +217,7 @@ def _convert_instances_to_entities( def _generate_tag_samples_from_entities(self, entities: list[dict]) -> list[dict]: """ - Generates pattern samples from entity aliases by converting them into generalized templates. + Generates pattern samples from entity search property by converting them into generalized templates. This version analyzes the internal structure of each segment: - Numbers are generalized to '0'. - Letters are grouped into bracketed alternatives, even when mixed with numbers. From a61ec9eba24919b0539e879fd5d31ee4b85bd999 Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Mon, 18 Aug 2025 14:04:13 -0500 Subject: [PATCH 009/128] Corrected ICacheService implementation --- .../fn_file_annotation_launch/services/CacheService.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/CacheService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/CacheService.py index 6816fc30..f1d6ac4b 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/CacheService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/CacheService.py @@ -31,7 +31,7 @@ def get_entities( pass @abc.abstractmethod - def update_cache(self, raw_db: str, raw_tbl: str, row_to_write: RowWrite) -> None: + def _update_cache(self, row_to_write: RowWrite) -> None: pass @abc.abstractmethod From 579941cdb925dca286360afffa4609b5b5e22086 Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Mon, 18 Aug 2025 14:06:52 -0500 Subject: [PATCH 010/128] matched changes from launch to finalize --- 
.../fn_file_annotation_finalize/services/ConfigService.py | 3 +++ .../fn_file_annotation_finalize/utils/DataStructures.py | 7 +++---- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ConfigService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ConfigService.py index 8c126a18..ac9186da 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ConfigService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ConfigService.py @@ -188,6 +188,9 @@ class LaunchFunction(BaseModel, alias_generator=to_camel): secondary_scope_property: Optional[str] = None file_search_property: str = "aliases" target_entities_search_property: str = "aliases" + pattern_mode: bool + file_category_property: Optional[str] = None + target_entities_category_property: Optional[str] = None data_model_service: DataModelServiceConfig cache_service: CacheServiceConfig annotation_service: AnnotationServiceConfig diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/utils/DataStructures.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/utils/DataStructures.py index 0f7bc3f2..b3b9b7a5 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/utils/DataStructures.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/utils/DataStructures.py @@ -125,18 +125,17 @@ class entity: "external_id": file.external_id, "name": file.properties[job_config.file_view.as_view_id()]["name"], "space": file.space, - search_property: file.properties[job_config.file_view.as_view_id()][ - search_property - ], "annotation_type_external_id": job_config.file_view.type, + "search_property": 
file.properties[job_config.file_view.as_view_id()][{category_property}], + "category_property": file.properties[job_config.file_view.as_view_id()][{search_property}], } - Note: kind of prefer a generic variable name here as opposed to specific ones that changes based off config -> i.e.) for marathon the variable here would be aliases instead of search_property """ external_id: str name: str space: str annotation_type_external_id: Literal["diagrams.FileLink", "diagrams.AssetLink"] | None + category_property: str search_property: list[str] = field(default_factory=list) def to_dict(self): From 5b3d46e00b3131dc9e1087ebf7663a71a130d978 Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Mon, 18 Aug 2025 16:51:31 -0500 Subject: [PATCH 011/128] removed pattern mode status from data model and incorporated bits of post processing into the finalize function --- .../data_models/hdm.container.yaml | 16 +- .../data_models/hdm.view.yaml | 7 - .../services/ApplyService.py | 67 ++++++- .../services/FinalizeService.py | 176 ++++++++++-------- .../services/RetrieveService.py | 11 +- .../services/CacheService.py | 4 +- 6 files changed, 171 insertions(+), 110 deletions(-) diff --git a/modules/contextualization/cdf_file_annotation/data_models/hdm.container.yaml b/modules/contextualization/cdf_file_annotation/data_models/hdm.container.yaml index 55ebfd40..fa448add 100644 --- a/modules/contextualization/cdf_file_annotation/data_models/hdm.container.yaml +++ b/modules/contextualization/cdf_file_annotation/data_models/hdm.container.yaml @@ -27,20 +27,11 @@ collation: ucs_basic list: false type: text - patternModeJobId: - nullable: true - type: - type: int64 patternModeMessage: nullable: true type: type: text collation: ucs_basic - patternModeStatus: - nullable: true - type: - type: text - collation: ucs_basic attemptCount: autoIncrement: false immutable: false @@ -55,6 +46,13 @@ type: list: false type: int64 + patternModeJobId: + autoIncrement: false + immutable: false + nullable: true + type: 
+ list: false + type: int64 linkedFile: autoIncrement: false immutable: false diff --git a/modules/contextualization/cdf_file_annotation/data_models/hdm.view.yaml b/modules/contextualization/cdf_file_annotation/data_models/hdm.view.yaml index 37e8bb9e..89f593b7 100644 --- a/modules/contextualization/cdf_file_annotation/data_models/hdm.view.yaml +++ b/modules/contextualization/cdf_file_annotation/data_models/hdm.view.yaml @@ -65,13 +65,6 @@ collation: ucs_basic description: Contains entities found from pattern mode or error message name: Pattern mode message - patternModeStatus: - nullable: true - type: - type: text - collation: ucs_basic - description: Holds the status of the files pattern mode job - name: Pattern mode status linkedFile: container: externalId: {{ annotationStateExternalId }} diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ApplyService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ApplyService.py index 316ac61e..52f8648b 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ApplyService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ApplyService.py @@ -39,6 +39,10 @@ class IApplyService(abc.ABC): def apply_annotations(self, result_item: dict, file_id: NodeId) -> tuple[list, list]: pass + @abc.abstractmethod + def process_pattern_results(self, result_item: dict, file_id: NodeId) -> list[RowWrite]: + pass + @abc.abstractmethod def update_nodes(self, list_node_apply: list[NodeApply]) -> NodeApplyResultList: pass @@ -54,7 +58,8 @@ class GeneralApplyService(IApplyService): """ EXTERNAL_ID_LIMIT = 256 - FUNCTION_ID = "fn_dm_context_annotation_finalize" + + FUNCTION_ID = "fn_file_annotation_finalize" def __init__(self, client: CogniteClient, config: Config, logger: CogniteFunctionLogger): self.client: CogniteClient = client @@ -149,11 +154,11 
@@ def _detect_annotation_to_edge_applies( ) doc_log = { - "external_id": external_id, - "start_source_id": source_id, - "start_node": file_instance_id.external_id, - "end_node": entity["external_id"], - "end_node_space": entity["space"], + "externalId": external_id, + "startSourceId": source_id, + "startNode": file_instance_id.external_id, + "endNode": entity["external_id"], + "endNodeSpace": entity["space"], "view_id": self.core_annotation_view_id.external_id, "view_space": self.core_annotation_view_id.space, "view_version": self.core_annotation_view_id.version, @@ -204,9 +209,9 @@ def _detect_annotation_to_edge_applies( diagram_annotations[edge_apply_key] = edge_apply_instance if entity["annotation_type_external_id"] == self.file_annotation_type: - doc_doc.append(RowWrite(key=doc_log["external_id"], columns=doc_log)) + doc_doc.append(RowWrite(key=doc_log["externalId"], columns=doc_log)) else: - doc_tag.append(RowWrite(key=doc_log["external_id"], columns=doc_log)) + doc_tag.append(RowWrite(key=doc_log["externalId"], columns=doc_log)) return diagram_annotations @@ -259,6 +264,52 @@ def delete_annotations_for_file( return doc_annotations_delete, tag_annotations_delete + def process_pattern_results(self, result_item: dict, file_id: NodeId) -> list[RowWrite]: + + if not result_item.get("annotations"): + return [] + + file_node: Node | None = self.client.data_modeling.instances.retrieve_nodes( + nodes=file_id, sources=self.file_view_id + ) + if not file_node: + return [] + + doc_patterns: list[RowWrite] = [] + source_id: str | None = cast(str, file_node.properties[self.file_view_id].get("sourceId")) + for detect_annotation in result_item["annotations"]: + for entity in detect_annotation["entities"]: + if detect_annotation["confidence"] >= self.approve_threshold: + annotation_status = DiagramAnnotationStatus.APPROVED.value + elif detect_annotation["confidence"] >= self.suggest_threshold: + annotation_status = DiagramAnnotationStatus.SUGGESTED.value + else: + continue + 
+                # TODO: need to change row columns to create reference catalog -> which will live as a DM not RAW tbl-> from there create the table needed for baseline report
+                baseline_properties = {
+                    "startSourceId": source_id,
+                    "startNode": file_id.external_id,
+                    "text": detect_annotation["text"],
+                    "category": entity["category_property"],
+                    "confidence": detect_annotation["confidence"],
+                    "status": annotation_status,
+                    "startNodePageNumber": detect_annotation["region"]["page"],
+                    "startNodeXMin": min(v["x"] for v in detect_annotation["region"]["vertices"]),
+                    "startNodeYMin": min(v["y"] for v in detect_annotation["region"]["vertices"]),
+                    "startNodeXMax": max(v["x"] for v in detect_annotation["region"]["vertices"]),
+                    "startNodeYMax": max(v["y"] for v in detect_annotation["region"]["vertices"]),
+                    "sourceCreatedUser": self.FUNCTION_ID,
+                    "sourceUpdatedUser": self.FUNCTION_ID,
+                }
+
+                row: RowWrite = RowWrite(
+                    key=f"{baseline_properties['startSourceId']}_{baseline_properties['text']}_{baseline_properties['startSourceId']}_{detect_annotation['region']['page']}_{min(v['x'] for v in detect_annotation['region']['vertices'])}_{min(v['y'] for v in detect_annotation['region']['vertices'])}",
+                    columns=baseline_properties,
+                )
+                doc_patterns.append(row)
+        return doc_patterns
+
     def _list_annotations_for_file(
         self,
         node: NodeId,
diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/FinalizeService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/FinalizeService.py
index 9b42001c..bd403633 100644
--- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/FinalizeService.py
+++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/FinalizeService.py
@@ -4,6 +4,7 @@
 from datetime import datetime, timezone
 from cognite.client import CogniteClient
 from cognite.client.exceptions import CogniteAPIError
+from 
cognite.client.data_classes import RowWrite
 from cognite.client.data_classes.data_modeling import (
     Node,
     NodeId,
@@ -114,7 +115,7 @@ def run(self) -> Literal["Done"] | None:
                 section="START",
             )
             try:
-                job_id, file_to_state_map = self.retrieve_service.get_job_id()
+                job_id, pattern_mode_job_id, file_to_state_map = self.retrieve_service.get_job_id()
                 if not job_id or not file_to_state_map:
                     self.logger.info(message="No diagram detect jobs found", section="END")
                     return "Done"
@@ -141,8 +142,12 @@
             else:
                 raise e

+        job_results: dict | None = None
+        pattern_mode_job_results: dict | None = None
         try:
-            job_results: dict | None = self.retrieve_service.get_diagram_detect_job_result(job_id)
+            job_results = self.retrieve_service.get_diagram_detect_job_result(job_id)
+            if pattern_mode_job_id:
+                pattern_mode_job_results = self.retrieve_service.get_diagram_detect_job_result(pattern_mode_job_id)
         except Exception as e:
             self.logger.info(
                 message=f"Unfinalizing {len(file_to_state_map.keys())} files - job id ({job_id}) is a bad gateway",
@@ -154,9 +159,9 @@
                 failed=True,
             )

-        if job_results is None:
+        if not job_results or (pattern_mode_job_id and not pattern_mode_job_results):
             self.logger.info(
-                message=f"Unfinalizing {len(file_to_state_map.keys())} files - job id ({job_id}) is not complete yet",
+                message=f"Unfinalizing {len(file_to_state_map.keys())} files - job id ({job_id}) and/or pattern job id ({pattern_mode_job_id}) not complete",
                 section="END",
             )
             self._update_batch_state(
@@ -168,16 +173,31 @@
             return

         self.logger.info(
-            message=f"Applying annotations to {len(job_results['items'])} files",
+            message=f"Both jobs ({job_id}, {pattern_mode_job_id}) complete. Applying all annotations.",
             section="END",
         )
+
+        # NOTE: Merge the results by file ID for easier processing
+        # Ensures that for each job, both the regular annotations and its pattern results will be updated within the same transaction.
+ # This prevents a scenario where the regular annotation is successfully processed but an error occurs before the pattern results are successfully processed. + # That would leave the file in a partially completed state. + merged_results = {item["fileInstanceId"]: {"regular": item} for item in job_results["items"]} + if pattern_mode_job_results: + for item in pattern_mode_job_results["items"]: + if item["fileInstanceId"] in merged_results: + merged_results[item["fileInstanceId"]]["pattern"] = item + else: + merged_results[item["fileInstanceId"]] = {"pattern": item} + count_retry = 0 count_failed = 0 + count_success = 0 annotation_state_node_applies: list[NodeApply] = [] failed_file_ids: list[NodeId] = [] - for diagram_detect_item in job_results["items"]: - file_id: NodeId = NodeId.load(diagram_detect_item["fileInstanceId"]) + # Loop through the merged results, processing one file at a time + for file_id_str, results in merged_results.items(): + file_id: NodeId = NodeId.load(file_id_str) annotation_state_node: Node = file_to_state_map[file_id] current_attempt_count: int = cast( @@ -186,85 +206,78 @@ def run(self) -> Literal["Done"] | None: ) next_attempt_count = current_attempt_count + 1 job_node_to_update: NodeApply | None = None - if diagram_detect_item.get("annotations") and len(diagram_detect_item["annotations"]) > 0: - try: - self.logger.info(f"Applying annotations to file NodeId - {str(file_id)}") + + try: + # Process Regular Annotations + regular_item = results.get("regular") + if regular_item and regular_item.get("annotations"): + self.logger.info(f"Applying annotations to file {str(file_id)}") if self.clean_old_annotations: - self.logger.info("Deleting old annotations") - doc_annotations_delete, tag_annotations_delete = self.apply_service.delete_annotations_for_file( - file_node=file_id - ) - self.logger.info( - f"\t- deleted {len(doc_annotations_delete)} document annotations\n- deleted {len(tag_annotations_delete)} tag annoations" - ) - 
self.report_service.delete_annotations(doc_annotations_delete, tag_annotations_delete) - - doc_annotations, tag_annotations = self.apply_service.apply_annotations( - diagram_detect_item, file_id + # This should only run once, so we tie it to the regular annotation processing + doc_delete, tag_delete = self.apply_service.delete_annotations_for_file(file_id) + self.report_service.delete_annotations(doc_delete, tag_delete) + + doc_add, tag_add = self.apply_service.apply_annotations(regular_item, file_id) + self.report_service.add_annotations(doc_rows=doc_add, tag_rows=tag_add) + annotation_msg: str = f"Applied {len(doc_add)} doc and {len(tag_add)} tag annotations." + self.logger.info(f"\t- {annotation_msg}") + else: + annotation_msg: str = "Found no annotations to apply" + + # Process Pattern Mode Annotations + pattern_item = results.get("pattern") + if pattern_item and pattern_item.get("annotations"): + self.logger.info(f"Processing pattern mode results for file {str(file_id)}") + # responsible for converting pattern results into RAW rows and adding them to its internal batch for later upload. + pattern_add: list[RowWrite] = self.apply_service.process_pattern_results(pattern_item, file_id) + self.report_service.add_pattern_tags(pattern_rows=pattern_add) + pattern_msg: str = f"Processed {len(pattern_item['annotations'])} pattern annotations." 
+ self.logger.info(f"\t- {pattern_msg}") + else: + pattern_msg: str = "Found no tags from pattern samples" + + # Determine Final State + page_count: int = regular_item["pageCount"] + annotated_page_count: int = self._check_all_pages_annotated(annotation_state_node, page_count) + + if annotated_page_count == page_count: + job_node_to_update = self._process_annotation_state( + node=annotation_state_node, + status=AnnotationStatus.ANNOTATED, + attempt_count=next_attempt_count, + annotated_page_count=annotated_page_count, + page_count=page_count, + annotation_message=annotation_msg, + pattern_mode_message=pattern_msg, ) - doc_msg = f"added/updated {len(doc_annotations)} document annotations" - tag_msg = f"added/updated {len(tag_annotations)} tag annotations" - - page_count: int = diagram_detect_item["pageCount"] - annotated_page_count: int = self._check_all_pages_annotated(annotation_state_node, page_count) - if annotated_page_count == page_count: - job_node_to_update = self._process_annotation_state( - node=annotation_state_node, - status=AnnotationStatus.ANNOTATED, - attempt_count=next_attempt_count, - annotated_page_count=annotated_page_count, - page_count=page_count, - annotation_message=f"{doc_msg} and {tag_msg}", - ) - else: - job_node_to_update = self._process_annotation_state( - node=annotation_state_node, - status=AnnotationStatus.NEW, - attempt_count=current_attempt_count, # NOTE: using current_attempt_count since don't want to increment this if not fully annotated - annotated_page_count=annotated_page_count, - page_count=page_count, - annotation_message=f"{doc_msg} and {tag_msg}", - ) - - self.report_service.add_annotations(doc_rows=doc_annotations, tag_rows=tag_annotations) - self.logger.info(f"\t- {doc_msg}\n- {tag_msg}") - - except Exception as e: - msg = str(e) - if next_attempt_count >= self.max_retries: - job_node_to_update = self._process_annotation_state( - node=annotation_state_node, - status=AnnotationStatus.FAILED, - 
attempt_count=next_attempt_count, - annotation_message=msg, - ) - count_failed += 1 - self.logger.info( - f"\t- set the annotation status to {AnnotationStatus.FAILED}\n- ran into the following error: {msg}" - ) - failed_file_ids.append(file_id) - else: - job_node_to_update = self._process_annotation_state( - node=annotation_state_node, - status=AnnotationStatus.RETRY, - attempt_count=next_attempt_count, - annotation_message=msg, - ) - count_retry += 1 - self.logger.info( - f"\t- set the annotation status to 'Retry'\n- ran into the following error: {msg}" - ) - else: - msg = f"found 0 annotations in diagram_detect_item for file {str(file_id)}" + count_success += 1 + else: + # File has more pages to process + job_node_to_update = self._process_annotation_state( + node=annotation_state_node, + status=AnnotationStatus.NEW, + attempt_count=current_attempt_count, # Do not increment attempt count + annotated_page_count=annotated_page_count, + page_count=page_count, + annotation_message="Processed page batch, more pages remaining", + pattern_mode_message=pattern_msg, + ) + # This is still a success for the current batch + count_success += 1 + + except Exception as e: + # If anything fails for this file, mark it for retry or failure + msg = f"Failed to process annotations for file {str(file_id)}: {str(e)}" + self.logger.error(msg) if next_attempt_count >= self.max_retries: job_node_to_update = self._process_annotation_state( node=annotation_state_node, status=AnnotationStatus.FAILED, attempt_count=next_attempt_count, annotation_message=msg, + pattern_mode_message=msg, ) count_failed += 1 - self.logger.info(f"\t- set the annotation status to 'Failed'\n- {msg}") failed_file_ids.append(file_id) else: job_node_to_update = self._process_annotation_state( @@ -272,9 +285,10 @@ def run(self) -> Literal["Done"] | None: status=AnnotationStatus.RETRY, attempt_count=next_attempt_count, annotation_message=msg, + pattern_mode_message=msg, ) count_retry += 1 - self.logger.info(f"\t- set 
the annotation status to 'Retry'\n- {msg}") + if job_node_to_update: annotation_state_node_applies.append(job_node_to_update) @@ -305,7 +319,6 @@ def run(self) -> Literal["Done"] | None: if annotation_state_node_applies: node_count = len(annotation_state_node_applies) - count_annotated = node_count - count_retry - count_failed self.logger.info( message=f"Updating {node_count} annotation state instances", section="START", @@ -313,7 +326,7 @@ def run(self) -> Literal["Done"] | None: try: self.apply_service.update_nodes(list_node_apply=annotation_state_node_applies) self.logger.info( - f"\t- {count_annotated} set to Annotated\n- {count_retry} set to retry\n- {count_failed} set to failed" + f"\t- {count_success} set to Annotated\n- {count_retry} set to retry\n- {count_failed} set to failed" ) except Exception as e: self.logger.error( @@ -321,7 +334,7 @@ def run(self) -> Literal["Done"] | None: section="END", ) - self.tracker.add_files(success=count_annotated, failed=(count_failed + count_retry)) + self.tracker.add_files(success=count_success, failed=(count_failed + count_retry)) def _process_annotation_state( self, @@ -331,6 +344,7 @@ def _process_annotation_state( annotated_page_count: int | None = None, page_count: int | None = None, annotation_message: str | None = None, + pattern_mode_message: str | None = None, ) -> NodeApply: """ Create a node apply from the node passed into the function. 
@@ -354,6 +368,7 @@ def _process_annotation_state( "annotationStatus": status, "sourceUpdatedTime": datetime.now(timezone.utc).replace(microsecond=0).isoformat(), "annotationMessage": annotation_message, + "patternModeMessage": pattern_mode_message, "attemptCount": attempt_count, "diagramDetectJobId": None, # clear the job id } @@ -362,6 +377,7 @@ def _process_annotation_state( "annotationStatus": status, "sourceUpdatedTime": datetime.now(timezone.utc).replace(microsecond=0).isoformat(), "annotationMessage": annotation_message, + "patternModeMessage": pattern_mode_message, "attemptCount": attempt_count, "diagramDetectJobId": None, # clear the job id "annotatedPageCount": annotated_page_count, diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/RetrieveService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/RetrieveService.py index 97bb7cba..c98f9dc1 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/RetrieveService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/RetrieveService.py @@ -34,7 +34,7 @@ def get_diagram_detect_job_result(self, job_id: int) -> dict | None: pass @abc.abstractmethod - def get_job_id(self) -> tuple[int, dict[NodeId, Node]] | tuple[None, None]: + def get_job_id(self) -> tuple[int, int | None, dict[NodeId, Node]] | tuple[None, None, None]: pass @@ -69,7 +69,7 @@ def get_diagram_detect_job_result(self, job_id: int) -> dict | None: self.logger.debug(f"{job_id} - Request to get job result failed with {response.status_code} code") return - def get_job_id(self) -> tuple[int, dict[NodeId, Node]] | tuple[None, None]: + def get_job_id(self) -> tuple[int, int | None, dict[NodeId, Node]] | tuple[None, None, None]: """ To ensure threads are protected, we do the following... 1. 
Query for an available job id @@ -100,13 +100,16 @@ def get_job_id(self) -> tuple[int, dict[NodeId, Node]] | tuple[None, None]: ) if len(annotation_state_instance) == 0: - return None, None + return None, None, None job_node: Node = annotation_state_instance.pop(-1) job_id: int = cast( int, job_node.properties[self.annotation_state_view.as_view_id()]["diagramDetectJobId"], ) + pattern_mode_job_id: int | None = job_node.properties[self.annotation_state_view.as_view_id()].get( + "patternModeJobId" + ) filter_job_id = Equals( property=self.annotation_state_view.as_property_ref("diagramDetectJobId"), @@ -132,7 +135,7 @@ def get_job_id(self) -> tuple[int, dict[NodeId, Node]] | tuple[None, None]: file_node_id = NodeId(space=file_reference["space"], external_id=file_reference["externalId"]) file_to_state_map[file_node_id] = node - return job_id, file_to_state_map + return job_id, pattern_mode_job_id, file_to_state_map def _attempt_to_claim(self, list_job_nodes_to_claim: NodeApplyList) -> None: """ diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/CacheService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/CacheService.py index f1d6ac4b..763d1952 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/CacheService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/CacheService.py @@ -293,7 +293,7 @@ def _parse_alias(alias: str, resource_type_key: str) -> tuple[str, list[list[str # --- Build the final result from the processed patterns --- result = [] - for resource_type, templates in pattern_builders.items(): + for category, templates in pattern_builders.items(): final_samples = [] for template_key, collected_vars in templates.items(): # Create an iterator for the collected letter groups @@ -322,5 +322,5 @@ def replace_A(match): 
final_samples.append("".join(final_pattern_parts)) if final_samples: - result.append({"sample": sorted(final_samples), "resourceType": resource_type}) + result.append({"sample": sorted(final_samples), "category_property": category}) return result From c0f061b2d9be42e540fde36dc5f68478e4bb0bf2 Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Mon, 18 Aug 2025 16:52:27 -0500 Subject: [PATCH 012/128] no reason to not keep job_id --- .../fn_file_annotation_finalize/services/FinalizeService.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/FinalizeService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/FinalizeService.py index bd403633..cf3b2fa4 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/FinalizeService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/FinalizeService.py @@ -370,7 +370,6 @@ def _process_annotation_state( "annotationMessage": annotation_message, "patternModeMessage": pattern_mode_message, "attemptCount": attempt_count, - "diagramDetectJobId": None, # clear the job id } else: update_properties = { @@ -379,7 +378,6 @@ def _process_annotation_state( "annotationMessage": annotation_message, "patternModeMessage": pattern_mode_message, "attemptCount": attempt_count, - "diagramDetectJobId": None, # clear the job id "annotatedPageCount": annotated_page_count, "pageCount": page_count, } From 1ea2bcc8c00a2f433c7031981a711eadc57820fe Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Mon, 18 Aug 2025 17:17:30 -0500 Subject: [PATCH 013/128] added pattern table to ep config and code to update pattern table --- .../cdf_file_annotation/default.config.yaml | 1 + .../ep_file_annotation.config.yaml | 5 ++- .../services/ApplyService.py | 2 +- .../services/ConfigService.py | 1 + .../services/ReportService.py | 38 
++++++++++++++----- .../services/ConfigService.py | 1 + 6 files changed, 35 insertions(+), 13 deletions(-) diff --git a/modules/contextualization/cdf_file_annotation/default.config.yaml b/modules/contextualization/cdf_file_annotation/default.config.yaml index ec62175d..aa4bb008 100644 --- a/modules/contextualization/cdf_file_annotation/default.config.yaml +++ b/modules/contextualization/cdf_file_annotation/default.config.yaml @@ -14,6 +14,7 @@ fileVersion: rawDb: db_file_annotation rawTableDocTag: annotation_documents_tags rawTableDocDoc: annotation_documents_docs +rawTableDocPattern: annotation_documents_patterns rawTableCache: annotation_entities_cache # used in /extraction_pipelines diff --git a/modules/contextualization/cdf_file_annotation/extraction_pipelines/ep_file_annotation.config.yaml b/modules/contextualization/cdf_file_annotation/extraction_pipelines/ep_file_annotation.config.yaml index b8819007..09bb956d 100644 --- a/modules/contextualization/cdf_file_annotation/extraction_pipelines/ep_file_annotation.config.yaml +++ b/modules/contextualization/cdf_file_annotation/extraction_pipelines/ep_file_annotation.config.yaml @@ -54,8 +54,8 @@ config: secondaryScopeProperty: # NOTE: below configurations are used by pattern mode patternMode: True - fileCategoryProperty: - targetEntitiesCategoryProperty: + fileCategoryProperty: # optional + targetEntitiesCategoryProperty: # optional dataModelService: getFilesToProcessQuery: targetView: @@ -131,4 +131,5 @@ config: rawDb: {{ rawDb }} rawTableDocTag: {{ rawTableDocTag }} rawTableDocDoc: {{ rawTableDocDoc }} + rawTableDocPattern: {{ rawTableDocPattern }} rawBatchSize: 10000 diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ApplyService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ApplyService.py index 52f8648b..86ffe9d9 100644 --- 
a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ApplyService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ApplyService.py @@ -277,6 +277,7 @@ def process_pattern_results(self, result_item: dict, file_id: NodeId) -> list[Ro doc_patterns: list[RowWrite] = [] source_id: str | None = cast(str, file_node.properties[self.file_view_id].get("sourceId")) + # TODO: Lots of potential here to create annotation edges from the results pattern mode for detect_annotation in result_item["annotations"]: for entity in detect_annotation["entities"]: if detect_annotation["confidence"] >= self.approve_threshold: @@ -302,7 +303,6 @@ def process_pattern_results(self, result_item: dict, file_id: NodeId) -> list[Ro "sourceCreatedUser": self.FUNCTION_ID, "sourceUpdatedUser": self.FUNCTION_ID, } - row: RowWrite = RowWrite( key=f"{baseline_properties["startSourceId"]}_{baseline_properties["text"]}_{baseline_properties["startSourceId"]}_{detect_annotation["region"]["page"]}_{min(v["x"] for v in detect_annotation["region"]["vertices"])}_{min(v["y"] for v in detect_annotation["region"]["vertices"])}", columns=baseline_properties, diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ConfigService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ConfigService.py index ac9186da..453666aa 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ConfigService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ConfigService.py @@ -210,6 +210,7 @@ class ReportServiceConfig(BaseModel, alias_generator=to_camel): raw_db: str raw_table_doc_tag: str raw_table_doc_doc: str + raw_table_doc_pattern: str raw_batch_size: int diff --git 
a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ReportService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ReportService.py index c991b405..1f3243b9 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ReportService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ReportService.py @@ -18,11 +18,11 @@ def add_annotations(self, doc_rows: list[RowWrite], tag_rows: list[RowWrite]) -> pass @abc.abstractmethod - def delete_annotations( - self, - doc_row_keys: list[str], - tag_row_keys: list[str], - ) -> None: + def add_pattern_tags(self, pattern_rows: list[RowWrite]) -> None: + pass + + @abc.abstractmethod + def delete_annotations(self, doc_row_keys: list[str], tag_row_keys: list[str]) -> None: pass @abc.abstractmethod @@ -44,13 +44,18 @@ def __init__(self, client: CogniteClient, config: Config, logger: CogniteFunctio self.db: str = config.finalize_function.report_service.raw_db self.doc_table: tuple[str, list[RowWrite], list[str]] = ( config.finalize_function.report_service.raw_table_doc_doc, - [], - [], + [], # NOTE: rows to upload + [], # NOTE: rows to delete -> holds list of keys ) self.tag_table: tuple[str, list[RowWrite], list[str]] = ( config.finalize_function.report_service.raw_table_doc_tag, - [], - [], + [], # NOTE: rows to upload + [], # NOTE: rows to delete -> holds list of keys + ) + self.pattern_table: tuple[str, list[RowWrite], list[str]] = ( + config.finalize_function.report_service.raw_table_doc_pattern, + [], # NOTE: rows to upload + [], # TODO: figure out best way of implementing this. Hard to generate deterministic key without affecting performance. No edges to retrieve and delete like in clean old annotations function. 
) self.batch_size: int = config.finalize_function.report_service.raw_batch_size self.delete: bool = self.config.finalize_function.clean_old_annotations @@ -100,7 +105,7 @@ def update_report(self) -> str: update_msg = "No annotations to upload" if len(self.doc_table[1]) > 0 or len(self.tag_table[1]) > 0: - update_msg = f"Uploaded annotations to db: {self.db}\n- added {len(self.doc_table[1])} rows to tbl: {self.doc_table[0]}\n- added {len(self.tag_table[1])} rows to tbl: {self.tag_table[0]}" + update_msg = f"Uploaded annotations to db: {self.db}\n- added {len(self.doc_table[1])} rows to tbl: {self.doc_table[0]}\n- added {len(self.tag_table[1])} rows to tbl: {self.tag_table[0]}\n- added {len(self.pattern_table[1])} rows to tbl: {self.pattern_table[0]}" self.client.raw.rows.insert( db_name=self.db, table_name=self.doc_table[0], @@ -113,15 +118,28 @@ def update_report(self) -> str: row=self.tag_table[1], ensure_parent=True, ) + if self.pattern_table[1]: + self.client.raw.rows.insert( + db_name=self.db, + table_name=self.pattern_table[0], + row=self.pattern_table[1], + ensure_parent=True, + ) self._clear_tables() if delete_msg: return f" {delete_msg}\n{update_msg}" return f" {update_msg}" + def add_pattern_tags(self, pattern_rows: list[RowWrite]): + self.pattern_table[1].extend(pattern_rows) + return + def _clear_tables(self) -> None: self.doc_table[1].clear() self.tag_table[1].clear() + self.pattern_table[1].clear() if self.delete: self.doc_table[2].clear() self.tag_table[2].clear() + # self.pattern_table[2].clear() # TODO: figure out best approach diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/ConfigService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/ConfigService.py index ac9186da..453666aa 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/ConfigService.py +++ 
b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/ConfigService.py @@ -210,6 +210,7 @@ class ReportServiceConfig(BaseModel, alias_generator=to_camel): raw_db: str raw_table_doc_tag: str raw_table_doc_doc: str + raw_table_doc_pattern: str raw_batch_size: int From f50a16bb10830619a79d012bfbf3190f76434799 Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Mon, 18 Aug 2025 17:27:13 -0500 Subject: [PATCH 014/128] removed patternModeJobId whenever annotation state fails --- .../extraction_pipelines/ep_file_annotation.config.yaml | 2 +- .../fn_file_annotation_finalize/services/FinalizeService.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/modules/contextualization/cdf_file_annotation/extraction_pipelines/ep_file_annotation.config.yaml b/modules/contextualization/cdf_file_annotation/extraction_pipelines/ep_file_annotation.config.yaml index 09bb956d..51b1458f 100644 --- a/modules/contextualization/cdf_file_annotation/extraction_pipelines/ep_file_annotation.config.yaml +++ b/modules/contextualization/cdf_file_annotation/extraction_pipelines/ep_file_annotation.config.yaml @@ -132,4 +132,4 @@ config: rawTableDocTag: {{ rawTableDocTag }} rawTableDocDoc: {{ rawTableDocDoc }} rawTableDocPattern: {{ rawTableDocPattern }} - rawBatchSize: 10000 + rawBatchSize: 1000 diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/FinalizeService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/FinalizeService.py index cf3b2fa4..df8543d1 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/FinalizeService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/FinalizeService.py @@ -458,6 +458,7 @@ def _update_batch_state( "annotationStatus": status, "sourceUpdatedTime": datetime.now(timezone.utc).replace(microsecond=0).isoformat(), 
"diagramDetectJobId": None, + "patternModeJobId": None, } batch.update_node_properties( new_properties=update_properties, From c6b2e722148409d0022c8cbca01ac24ee915660d Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Mon, 18 Aug 2025 17:35:46 -0500 Subject: [PATCH 015/128] efficiency from one time retrieval of file node --- .../services/ApplyService.py | 27 ++++++------------- .../services/FinalizeService.py | 10 +++++-- 2 files changed, 16 insertions(+), 21 deletions(-) diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ApplyService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ApplyService.py index 86ffe9d9..4ed85acc 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ApplyService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ApplyService.py @@ -36,11 +36,11 @@ class IApplyService(abc.ABC): """ @abc.abstractmethod - def apply_annotations(self, result_item: dict, file_id: NodeId) -> tuple[list, list]: + def apply_annotations(self, result_item: dict, file_node: Node) -> tuple[list, list]: pass @abc.abstractmethod - def process_pattern_results(self, result_item: dict, file_id: NodeId) -> list[RowWrite]: + def process_pattern_results(self, result_item: dict, file_node: Node) -> list[RowWrite]: pass @abc.abstractmethod @@ -74,17 +74,10 @@ def __init__(self, client: CogniteClient, config: Config, logger: CogniteFunctio self.suggest_threshold = self.config.finalize_function.apply_service.auto_suggest_threshold # NOTE: could implement annotation edges to be updated in batches for performance gains but leaning towards no. Since it will overcomplicate error handling. 
- def apply_annotations(self, result_item: dict, file_id: NodeId) -> tuple[list[RowWrite], list[RowWrite]]: + def apply_annotations(self, result_item: dict, file_node: Node) -> tuple[list[RowWrite], list[RowWrite]]: """ Push the annotations to the file and set the "AnnotationInProcess" tag to "Annotated" """ - - file_node: Node | None = self.client.data_modeling.instances.retrieve_nodes( - nodes=file_id, sources=self.file_view_id - ) - if not file_node: - raise ValueError("No file node found.") - node_apply: NodeApply = file_node.as_write() node_apply.existing_version = None @@ -103,7 +96,7 @@ def apply_annotations(self, result_item: dict, file_id: NodeId) -> tuple[list[Ro edge_applies: list[EdgeApply] = [] for detect_annotation in result_item["annotations"]: edge_apply_dict: dict[tuple, EdgeApply] = self._detect_annotation_to_edge_applies( - file_id, + file_node.as_id(), source_id, doc_doc, doc_tag, @@ -236,7 +229,7 @@ def _create_annotation_id( def delete_annotations_for_file( self, - file_node: NodeId, + file_node: Node, ) -> tuple[list[str], list[str]]: """ Delete all annotation edges for a file node. 
@@ -264,18 +257,14 @@ def delete_annotations_for_file( return doc_annotations_delete, tag_annotations_delete - def process_pattern_results(self, result_item: dict, file_id: NodeId) -> list[RowWrite]: - + def process_pattern_results(self, result_item: dict, file_node: Node) -> list[RowWrite]: if not result_item.get("annotations"): return [] - - file_node: Node | None = self.client.data_modeling.instances.retrieve_nodes( - nodes=file_id, sources=self.file_view_id - ) if not file_node: return [] doc_patterns: list[RowWrite] = [] + file_id: NodeId = file_node.as_id() source_id: str | None = cast(str, file_node.properties[self.file_view_id].get("sourceId")) # TODO: Lots of potential here to create annotation edges from the results pattern mode for detect_annotation in result_item["annotations"]: @@ -312,7 +301,7 @@ def process_pattern_results(self, result_item: dict, file_id: NodeId) -> list[Ro def _list_annotations_for_file( self, - node: NodeId, + node: Node, ): """ List all annotation edges for a file node. 
diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/FinalizeService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/FinalizeService.py index df8543d1..afb33662 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/FinalizeService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/FinalizeService.py @@ -198,6 +198,12 @@ def run(self) -> Literal["Done"] | None: # Loop through the merged results, processing one file at a time for file_id_str, results in merged_results.items(): file_id: NodeId = NodeId.load(file_id_str) + file_node: Node | None = self.client.data_modeling.instances.retrieve_nodes( + nodes=file_id, sources=self.file_view.as_view_id() + ) + if not file_node: + self.logger.debug(f"No file node found for file id {str(file_id)}") + continue annotation_state_node: Node = file_to_state_map[file_id] current_attempt_count: int = cast( @@ -214,10 +220,10 @@ def run(self) -> Literal["Done"] | None: self.logger.info(f"Applying annotations to file {str(file_id)}") if self.clean_old_annotations: # This should only run once, so we tie it to the regular annotation processing - doc_delete, tag_delete = self.apply_service.delete_annotations_for_file(file_id) + doc_delete, tag_delete = self.apply_service.delete_annotations_for_file(file_node) self.report_service.delete_annotations(doc_delete, tag_delete) - doc_add, tag_add = self.apply_service.apply_annotations(regular_item, file_id) + doc_add, tag_add = self.apply_service.apply_annotations(regular_item, file_node) self.report_service.add_annotations(doc_rows=doc_add, tag_rows=tag_add) annotation_msg: str = f"Applied {len(doc_add)} doc and {len(tag_add)} tag annotations." 
self.logger.info(f"\t- {annotation_msg}") From 83444b0b83c7a04d568e823f84564ec4af9f4056 Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Mon, 18 Aug 2025 17:47:13 -0500 Subject: [PATCH 016/128] made function signature same for implementation and interface --- .../fn_file_annotation_finalize/services/ApplyService.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ApplyService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ApplyService.py index 4ed85acc..cc602408 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ApplyService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ApplyService.py @@ -48,7 +48,7 @@ def update_nodes(self, list_node_apply: list[NodeApply]) -> NodeApplyResultList: pass @abc.abstractmethod - def delete_annotations_for_file(self, file_node: NodeId) -> tuple[list[str], list[str]]: + def delete_annotations_for_file(self, file_node: Node) -> tuple[list[str], list[str]]: pass From 20546d076f827a116626da6e8fbe7a5c4a1fedaf Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Mon, 18 Aug 2025 21:23:17 -0500 Subject: [PATCH 017/128] moved generate sample log to make more sense --- .../fn_file_annotation_launch/services/CacheService.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/CacheService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/CacheService.py index 763d1952..6d9ea3d7 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/CacheService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/CacheService.py @@ -226,13 +226,13 @@ def 
_generate_tag_samples_from_entities(self, entities: list[dict]) -> list[dict # Structure: { resource_type: { full_template_key: list_of_collected_variable_parts } } # where list_of_collected_variable_parts is [ [{'L1_alt1', 'L1_alt2'}], [{'L2_alt1'}], ... ] pattern_builders: dict[str, dict[str, list[list[set[str]]]]] = {} + self.logger.info(f"Generating pattern samples from {len(entities)} entities.") def _parse_alias(alias: str, resource_type_key: str) -> tuple[str, list[list[str]]]: """ Parses an alias into a structural template key and its variable letter components. A segment '629P' yields a template '000A' and a variable part ['P']. """ - self.logger.info(f"Generating pattern samples from {len(entities)} entities.") alias_parts = re.split(r"([ -])", alias) full_template_key_parts: list[str] = [] From 0ea20b92c85a709608cccf0b67a21d40bbe8b0cf Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Tue, 19 Aug 2025 09:03:35 -0500 Subject: [PATCH 018/128] adjusted container and view --- .../data_models/hdm.container.yaml | 29 +++++++++++++++++++ .../data_models/hdm.view.yaml | 16 +++++----- 2 files changed, 38 insertions(+), 7 deletions(-) diff --git a/modules/contextualization/cdf_file_annotation/data_models/hdm.container.yaml b/modules/contextualization/cdf_file_annotation/data_models/hdm.container.yaml index fa448add..04ba7351 100644 --- a/modules/contextualization/cdf_file_annotation/data_models/hdm.container.yaml +++ b/modules/contextualization/cdf_file_annotation/data_models/hdm.container.yaml @@ -12,6 +12,9 @@ list: false type: int64 annotationMessage: + autoIncrement: false + immutable: false + nullable: true autoIncrement: false immutable: false nullable: true @@ -28,10 +31,13 @@ list: false type: text patternModeMessage: + autoIncrement: false + immutable: false nullable: true type: type: text collation: ucs_basic + list: false attemptCount: autoIncrement: false immutable: false @@ -69,3 +75,26 @@ type: int64 space: {{ annotationStateSchemaSpace }} usedFor: node 
+ indexes: + annotationStatus: + indexType: btree + properties: + - annotationStatus + cursorable: true + diagramDetectJobId: + indexType: btree + properties: + - diagramDetectJobId + cursorable: true + launch: + indexType: btree + properties: + - launchFunctionId + - launchFunctionCallId + cursorable: true + finalize: + indexType: btree + properties: + - finalizeFunctionId + - finalizeFunctionCallId + cursorable: true \ No newline at end of file diff --git a/modules/contextualization/cdf_file_annotation/data_models/hdm.view.yaml b/modules/contextualization/cdf_file_annotation/data_models/hdm.view.yaml index 89f593b7..d0b8f326 100644 --- a/modules/contextualization/cdf_file_annotation/data_models/hdm.view.yaml +++ b/modules/contextualization/cdf_file_annotation/data_models/hdm.view.yaml @@ -52,17 +52,19 @@ description: Diagram detect job ID name: Diagram detect job ID patternModeJobId: - nullable: true - type: - type: int64 + container: + externalId: {{ annotationStateExternalId }} + space: {{ annotationStateSchemaSpace }} + type: container containerPropertyIdentifier: patternModeJobId description: Diagram detect job ID with pattern mode name: Pattern mode job ID patternModeMessage: - nullable: true - type: - type: text - collation: ucs_basic + container: + externalId: {{ annotationStateExternalId }} + space: {{ annotationStateSchemaSpace }} + type: container + containerPropertyIdentifier: patternModeMessage description: Contains entities found from pattern mode or error message name: Pattern mode message linkedFile: From 3558e9c627b3d9425c87e1c93b1bcf13c88030d0 Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Tue, 19 Aug 2025 09:15:42 -0500 Subject: [PATCH 019/128] added indexes --- .../data_models/hdm.container.yaml | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/modules/contextualization/cdf_file_annotation/data_models/hdm.container.yaml b/modules/contextualization/cdf_file_annotation/data_models/hdm.container.yaml index 04ba7351..08cf585a 100644 
--- a/modules/contextualization/cdf_file_annotation/data_models/hdm.container.yaml +++ b/modules/contextualization/cdf_file_annotation/data_models/hdm.container.yaml @@ -85,16 +85,4 @@ indexType: btree properties: - diagramDetectJobId - cursorable: true - launch: - indexType: btree - properties: - - launchFunctionId - - launchFunctionCallId - cursorable: true - finalize: - indexType: btree - properties: - - finalizeFunctionId - - finalizeFunctionCallId cursorable: true \ No newline at end of file From 2e4528a0f5c2826e99d5983b3697b417994cc88c Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Tue, 19 Aug 2025 09:35:52 -0500 Subject: [PATCH 020/128] adjusted local_setup --- .../cdf_file_annotation/local_setup/quickstart_setup.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/contextualization/cdf_file_annotation/local_setup/quickstart_setup.ipynb b/modules/contextualization/cdf_file_annotation/local_setup/quickstart_setup.ipynb index 195933a6..e06317ae 100644 --- a/modules/contextualization/cdf_file_annotation/local_setup/quickstart_setup.ipynb +++ b/modules/contextualization/cdf_file_annotation/local_setup/quickstart_setup.ipynb @@ -178,7 +178,7 @@ "outputs": [], "source": [ "# retrieve instances of txEquipment\n", - "equipments: NodeList[Node] = cdf_client.data_modeling.instances.list(instance_type=\"node\", sources=txEquipment_view.as_view_id(), limit=-1)\n", + "equipments: NodeList[Node] = cdf_client.data_modeling.instances.list(instance_type=\"node\", sources=equipment_view.as_view_id(), limit=-1)\n", "print(equipments[0])" ] }, From d201cf86620321753b31cba4a42ce69991492219 Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Tue, 19 Aug 2025 10:46:39 -0500 Subject: [PATCH 021/128] fixed var name and made search_property always a list --- .../fn_file_annotation_launch/services/CacheService.py | 3 ++- .../fn_file_annotation_launch/services/LaunchService.py | 3 +-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git 
a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/CacheService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/CacheService.py index 6d9ea3d7..02072d2f 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/CacheService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/CacheService.py @@ -183,11 +183,12 @@ def _convert_instances_to_entities( ) target_entities.append(asset_entity.to_dict()) else: + search_value: list = [instance_properties.get("name")] asset_entity = entity( external_id=instance.external_id, name=instance_properties.get("name"), space=instance.space, - search_property=instance_properties.get("name"), + search_property=search_value, category_property=category, annotation_type_external_id=self.target_entities_view.annotation_type, ) diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/LaunchService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/LaunchService.py index 7a75c0d5..8dfc5cc6 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/LaunchService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/LaunchService.py @@ -91,7 +91,7 @@ def __init__( self.file_view: ViewPropertyConfig = config.data_model_views.file_view self.in_memory_cache: list[dict] = [] - self.in_memory_pattern: list[dict] = [] + self.in_memory_patterns: list[dict] = [] self._cached_primary_scope: str | None = None self._cached_secondary_scope: str | None = None @@ -436,7 +436,6 @@ def _process_batch(self, batch: BatchOfPairedNodes): pattern_job_id = self.annotation_service.run_pattern_mode_detect( files=batch.file_references, pattern_samples=self.in_memory_pattern ) - 
update_properties["patternModeStatus"] = AnnotationStatus.PROCESSING update_properties["patternModeJobId"] = pattern_job_id batch.batch_states.update_node_properties( From b71965a514c62394d6dffcbc8cd50831121ef316 Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Tue, 19 Aug 2025 10:47:32 -0500 Subject: [PATCH 022/128] removed the post processing status --- .../fn_file_annotation_launch/services/LaunchService.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/LaunchService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/LaunchService.py index 8dfc5cc6..08bc3d37 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/LaunchService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/LaunchService.py @@ -378,12 +378,11 @@ def _process_batch(self, batch: BatchOfPairedNodes): # Run diagram detect on pattern mode if self.config.launch_function.pattern_mode: self.logger.info( - f"Running pattern mode diagram detect on {batch.size()} files with {len(self.in_memory_pattern)} entities" + f"Running pattern mode diagram detect on {batch.size()} files with {len(self.in_memory_patterns)} entities" ) pattern_job_id = self.annotation_service.run_pattern_mode_detect( - files=batch.file_references, pattern_samples=self.in_memory_pattern + files=batch.file_references, pattern_samples=self.in_memory_patterns ) - update_properties["patternModeStatus"] = AnnotationStatus.PROCESSING update_properties["patternModeJobId"] = pattern_job_id batch.batch_states.update_node_properties( @@ -431,10 +430,10 @@ def _process_batch(self, batch: BatchOfPairedNodes): # Run diagram detect on pattern mode if self.config.launch_function.pattern_mode: self.logger.info( - f"Running pattern mode diagram detect on {batch.size()} files with 
{len(self.in_memory_pattern)} entities" + f"Running pattern mode diagram detect on {batch.size()} files with {len(self.in_memory_patterns)} entities" ) pattern_job_id = self.annotation_service.run_pattern_mode_detect( - files=batch.file_references, pattern_samples=self.in_memory_pattern + files=batch.file_references, pattern_samples=self.in_memory_patterns ) update_properties["patternModeJobId"] = pattern_job_id From 0cd29085543fbffb8ebeab9eb37d9bd2f8e3d389 Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Tue, 19 Aug 2025 11:32:17 -0500 Subject: [PATCH 023/128] working finalize function --- .../services/ApplyService.py | 14 ++++---- .../services/FinalizeService.py | 34 +++++++++++++------ 2 files changed, 31 insertions(+), 17 deletions(-) diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ApplyService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ApplyService.py index cc602408..ce4a0acc 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ApplyService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ApplyService.py @@ -48,7 +48,7 @@ def update_nodes(self, list_node_apply: list[NodeApply]) -> NodeApplyResultList: pass @abc.abstractmethod - def delete_annotations_for_file(self, file_node: Node) -> tuple[list[str], list[str]]: + def delete_annotations_for_file(self, file_id: NodeId) -> tuple[list[str], list[str]]: pass @@ -229,7 +229,7 @@ def _create_annotation_id( def delete_annotations_for_file( self, - file_node: Node, + file_id: NodeId, ) -> tuple[list[str], list[str]]: """ Delete all annotation edges for a file node. @@ -239,7 +239,7 @@ def delete_annotations_for_file( annotation_view_id (ViewId): The ViewId of the annotation view. node (NodeId): The NodeId of the file node. 
""" - annotations = self._list_annotations_for_file(file_node) + annotations = self._list_annotations_for_file(file_id) if not annotations: return [], [] @@ -248,7 +248,7 @@ def delete_annotations_for_file( tag_annotations_delete: list[str] = [] edge_ids = [] for edge in annotations: - edge_ids.append(EdgeId(space=file_node.space, external_id=edge.external_id)) + edge_ids.append(EdgeId(space=file_id.space, external_id=edge.external_id)) if edge.type.external_id == self.file_annotation_type: doc_annotations_delete.append(edge.external_id) else: @@ -301,7 +301,7 @@ def process_pattern_results(self, result_item: dict, file_node: Node) -> list[Ro def _list_annotations_for_file( self, - node: Node, + node_id: NodeId, ): """ List all annotation edges for a file node. @@ -317,8 +317,8 @@ def _list_annotations_for_file( annotations = self.client.data_modeling.instances.list( instance_type="edge", sources=[self.core_annotation_view_id], - space=node.space, - filter=Or(In(["edge", "startNode"], [node])), + space=node_id.space, + filter=Or(In(["edge", "startNode"], [node_id])), limit=-1, ) diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/FinalizeService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/FinalizeService.py index afb33662..964090ae 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/FinalizeService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/FinalizeService.py @@ -181,13 +181,19 @@ def run(self) -> Literal["Done"] | None: # Ensures that for each job, both the regular annotations and its pattern results will be updated within the same transaction. # This prevents a scenario where the regular annotation is successfully processed but an error occurs before the pattern results are successfully processed. 
# That would leave the file in a partially completed state. - merged_results = {item["fileInstanceId"]: {"regular": item} for item in job_results["items"]} + merged_results = { + (item["fileInstanceId"]["space"], item["fileInstanceId"]["externalId"]): {"regular": item} + for item in job_results["items"] + } + if pattern_mode_job_results: for item in pattern_mode_job_results["items"]: - if item["fileInstanceId"] in merged_results: - merged_results[item["fileInstanceId"]]["pattern"] = item + # FIX: Use the same tuple format for the key when adding pattern results. + key = (item["fileInstanceId"]["space"], item["fileInstanceId"]["externalId"]) + if key in merged_results: + merged_results[key]["pattern"] = item else: - merged_results[item["fileInstanceId"]] = {"pattern": item} + merged_results[key] = {"pattern": item} count_retry = 0 count_failed = 0 @@ -196,8 +202,8 @@ def run(self) -> Literal["Done"] | None: failed_file_ids: list[NodeId] = [] # Loop through the merged results, processing one file at a time - for file_id_str, results in merged_results.items(): - file_id: NodeId = NodeId.load(file_id_str) + for (space, external_id), results in merged_results.items(): + file_id: NodeId = NodeId(space, external_id) file_node: Node | None = self.client.data_modeling.instances.retrieve_nodes( nodes=file_id, sources=self.file_view.as_view_id() ) @@ -220,13 +226,15 @@ def run(self) -> Literal["Done"] | None: self.logger.info(f"Applying annotations to file {str(file_id)}") if self.clean_old_annotations: # This should only run once, so we tie it to the regular annotation processing - doc_delete, tag_delete = self.apply_service.delete_annotations_for_file(file_node) + doc_delete, tag_delete = self.apply_service.delete_annotations_for_file(file_id) self.report_service.delete_annotations(doc_delete, tag_delete) doc_add, tag_add = self.apply_service.apply_annotations(regular_item, file_node) self.report_service.add_annotations(doc_rows=doc_add, tag_rows=tag_add) annotation_msg: 
str = f"Applied {len(doc_add)} doc and {len(tag_add)} tag annotations." self.logger.info(f"\t- {annotation_msg}") + elif regular_item and regular_item.get("errorMessage"): + annotation_msg = regular_item.get("errorMessage") else: annotation_msg: str = "Found no annotations to apply" @@ -235,16 +243,22 @@ def run(self) -> Literal["Done"] | None: if pattern_item and pattern_item.get("annotations"): self.logger.info(f"Processing pattern mode results for file {str(file_id)}") # responsible for converting pattern results into RAW rows and adding them to its internal batch for later upload. - pattern_add: list[RowWrite] = self.apply_service.process_pattern_results(pattern_item, file_id) + pattern_add: list[RowWrite] = self.apply_service.process_pattern_results(pattern_item, file_node) self.report_service.add_pattern_tags(pattern_rows=pattern_add) pattern_msg: str = f"Processed {len(pattern_item['annotations'])} pattern annotations." self.logger.info(f"\t- {pattern_msg}") + elif pattern_item and pattern_item.get("errorMessage"): + pattern_msg = pattern_item.get("errorMessage") else: pattern_msg: str = "Found no tags from pattern samples" # Determine Final State - page_count: int = regular_item["pageCount"] - annotated_page_count: int = self._check_all_pages_annotated(annotation_state_node, page_count) + if regular_item and regular_item.get("pageCount"): + page_count: int = regular_item["pageCount"] + annotated_page_count: int = self._check_all_pages_annotated(annotation_state_node, page_count) + else: + page_count = 1 + annotated_page_count = page_count if annotated_page_count == page_count: job_node_to_update = self._process_annotation_state( From 8949f5c1e9cdfb78733902aaca0d3a17a2f3800f Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Tue, 19 Aug 2025 11:33:45 -0500 Subject: [PATCH 024/128] improved wording --- .../fn_file_annotation_finalize/services/FinalizeService.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/FinalizeService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/FinalizeService.py index 964090ae..6815deee 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/FinalizeService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/FinalizeService.py @@ -245,7 +245,7 @@ def run(self) -> Literal["Done"] | None: # responsible for converting pattern results into RAW rows and adding them to its internal batch for later upload. pattern_add: list[RowWrite] = self.apply_service.process_pattern_results(pattern_item, file_node) self.report_service.add_pattern_tags(pattern_rows=pattern_add) - pattern_msg: str = f"Processed {len(pattern_item['annotations'])} pattern annotations." + pattern_msg: str = f"Found {len(pattern_item['annotations'])} pattern annotations." 
self.logger.info(f"\t- {pattern_msg}") elif pattern_item and pattern_item.get("errorMessage"): pattern_msg = pattern_item.get("errorMessage") From 18a004b63bd9fc47e93c986ac65225c1244885bf Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Tue, 19 Aug 2025 12:01:21 -0500 Subject: [PATCH 025/128] improved logging for launch --- .../services/AnnotationService.py | 8 +++----- .../fn_file_annotation_launch/services/LaunchService.py | 4 ++-- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/AnnotationService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/AnnotationService.py index 204daf9f..c1480e05 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/AnnotationService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/AnnotationService.py @@ -58,16 +58,14 @@ def run_diagram_detect(self, files: list[FileReference], entities: list[dict[str def run_pattern_mode_detect(self, files: list, pattern_samples: list[dict[str, Any]]) -> int: """Generates patterns and runs the diagram detection job in pattern mode.""" - self.logger.info(f"Generated {len(pattern_samples)} pattern samples for detection.") - detect_job: DiagramDetectResults = self.client.diagrams.detect( file_references=files, - entities=pattern_samples, # Use the generated patterns + entities=pattern_samples, partial_match=self.annotation_config.partial_match, min_tokens=self.annotation_config.min_tokens, - search_field="sample", # The key in your generated samples + search_field="sample", configuration=self.diagram_detect_config, - pattern_mode=True, # The crucial flag + pattern_mode=True, ) if detect_job.job_id: return detect_job.job_id diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/LaunchService.py 
b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/LaunchService.py index 08bc3d37..942305d4 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/LaunchService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/LaunchService.py @@ -378,7 +378,7 @@ def _process_batch(self, batch: BatchOfPairedNodes): # Run diagram detect on pattern mode if self.config.launch_function.pattern_mode: self.logger.info( - f"Running pattern mode diagram detect on {batch.size()} files with {len(self.in_memory_patterns)} entities" + f"Running pattern mode diagram detect on {batch.size()} files with {len(self.in_memory_patterns[0]["sample"]) + len(self.in_memory_patterns[1]["sample"])} sample patterns" ) pattern_job_id = self.annotation_service.run_pattern_mode_detect( files=batch.file_references, pattern_samples=self.in_memory_patterns @@ -430,7 +430,7 @@ def _process_batch(self, batch: BatchOfPairedNodes): # Run diagram detect on pattern mode if self.config.launch_function.pattern_mode: self.logger.info( - f"Running pattern mode diagram detect on {batch.size()} files with {len(self.in_memory_patterns)} entities" + f"Running pattern mode diagram detect on {batch.size()} files with {len(self.in_memory_patterns[0]["sample"]) + len(self.in_memory_patterns[1]["sample"])} sample patterns" ) pattern_job_id = self.annotation_service.run_pattern_mode_detect( files=batch.file_references, pattern_samples=self.in_memory_patterns From 619eccc2599f8a6eba896a022d381118e68b28ba Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Tue, 19 Aug 2025 12:21:33 -0500 Subject: [PATCH 026/128] changed log var access --- .../fn_file_annotation_launch/services/LaunchService.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/LaunchService.py 
b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/LaunchService.py index 942305d4..e2cc68e5 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/LaunchService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/LaunchService.py @@ -378,7 +378,7 @@ def _process_batch(self, batch: BatchOfPairedNodes): # Run diagram detect on pattern mode if self.config.launch_function.pattern_mode: self.logger.info( - f"Running pattern mode diagram detect on {batch.size()} files with {len(self.in_memory_patterns[0]["sample"]) + len(self.in_memory_patterns[1]["sample"])} sample patterns" + f"Running pattern mode diagram detect on {batch.size()} files with {len(self.in_memory_patterns[0]['sample']) + len(self.in_memory_patterns[1]['sample'])} sample patterns" ) pattern_job_id = self.annotation_service.run_pattern_mode_detect( files=batch.file_references, pattern_samples=self.in_memory_patterns @@ -430,7 +430,7 @@ def _process_batch(self, batch: BatchOfPairedNodes): # Run diagram detect on pattern mode if self.config.launch_function.pattern_mode: self.logger.info( - f"Running pattern mode diagram detect on {batch.size()} files with {len(self.in_memory_patterns[0]["sample"]) + len(self.in_memory_patterns[1]["sample"])} sample patterns" + f"Running pattern mode diagram detect on {batch.size()} files with {len(self.in_memory_patterns[0]['sample']) + len(self.in_memory_patterns[1]['sample'])} sample patterns" ) pattern_job_id = self.annotation_service.run_pattern_mode_detect( files=batch.file_references, pattern_samples=self.in_memory_patterns From b046a99351b7b11c39c53cda54edf92fdbcfdd9e Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Tue, 19 Aug 2025 12:25:42 -0500 Subject: [PATCH 027/128] changed log var access --- .../fn_file_annotation_finalize/services/ApplyService.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/CacheService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/CacheService.py index 6d9ea3d7..02072d2f 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ApplyService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ApplyService.py @@ -293,7 +293,7 @@ def process_pattern_results(self, result_item: dict, file_node: Node) -> list[Ro "sourceUpdatedUser": self.FUNCTION_ID, } row: RowWrite = RowWrite( - key=f"{baseline_properties["startSourceId"]}_{baseline_properties["text"]}_{baseline_properties["startSourceId"]}_{detect_annotation["region"]["page"]}_{min(v["x"] for v in detect_annotation["region"]["vertices"])}_{min(v["y"] for v in detect_annotation["region"]["vertices"])}", + key=f"{baseline_properties['startSourceId']}_{baseline_properties['text']}_{baseline_properties['startSourceId']}_{detect_annotation['region']['page']}_{min(v['x'] for v in detect_annotation['region']['vertices'])}_{min(v['y'] for v in detect_annotation['region']['vertices'])}", columns=baseline_properties, ) doc_patterns.append(row) From 0d28807f7480e201db9328524ecc1141c6741c83 Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Tue, 19 Aug 2025 12:39:09 -0500 Subject: [PATCH 028/128] added logging of pattern mode job id --- .../fn_file_annotation_launch/services/LaunchService.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/LaunchService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/LaunchService.py index e2cc68e5..a3466516 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/LaunchService.py +++ 
b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/LaunchService.py @@ -391,7 +391,7 @@ def _process_batch(self, batch: BatchOfPairedNodes): ) self.data_model_service.update_annotation_state(batch.batch_states.apply) self.logger.info( - message=f" Updated the annotation state instances:\n- annotation status set to 'Processing'\n- job id set to {job_id}", + message=f" Updated the annotation state instances:\n- annotation status set to 'Processing'\n- job id set to {job_id}\n- pattern mode job id set to {job_id}", section="END", ) finally: @@ -443,7 +443,7 @@ def _process_batch(self, batch: BatchOfPairedNodes): ) self.data_model_service.update_annotation_state(batch.batch_states.apply) self.logger.info( - message=f" Updated the annotation state instances:\n- annotation status set to 'Processing'\n- job id set to {job_id}", + message=f" Updated the annotation state instances:\n- annotation status set to 'Processing'\n- job id set to {job_id}\n- pattern mode job id set to {job_id}", section="END", ) except CogniteAPIError as e: From 228798c6351536a8ce8ec8a648eb222abd238780 Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Tue, 19 Aug 2025 12:59:39 -0500 Subject: [PATCH 029/128] improved logging --- .../fn_file_annotation_launch/services/LaunchService.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/LaunchService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/LaunchService.py index a3466516..0a9a3c25 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/LaunchService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/LaunchService.py @@ -376,6 +376,7 @@ def _process_batch(self, batch: BatchOfPairedNodes): } # Run diagram detect on pattern mode + pattern_job_id: int | None 
= None if self.config.launch_function.pattern_mode: self.logger.info( f"Running pattern mode diagram detect on {batch.size()} files with {len(self.in_memory_patterns[0]['sample']) + len(self.in_memory_patterns[1]['sample'])} sample patterns" @@ -391,7 +392,7 @@ def _process_batch(self, batch: BatchOfPairedNodes): ) self.data_model_service.update_annotation_state(batch.batch_states.apply) self.logger.info( - message=f" Updated the annotation state instances:\n- annotation status set to 'Processing'\n- job id set to {job_id}\n- pattern mode job id set to {job_id}", + message=f"Updated the annotation state instances:\n- annotation status set to 'Processing'\n- job id set to {job_id}\n- pattern mode job id set to {pattern_job_id}", section="END", ) finally: @@ -428,6 +429,7 @@ def _process_batch(self, batch: BatchOfPairedNodes): } # Run diagram detect on pattern mode + pattern_job_id: int | None = None if self.config.launch_function.pattern_mode: self.logger.info( f"Running pattern mode diagram detect on {batch.size()} files with {len(self.in_memory_patterns[0]['sample']) + len(self.in_memory_patterns[1]['sample'])} sample patterns" @@ -443,7 +445,7 @@ def _process_batch(self, batch: BatchOfPairedNodes): ) self.data_model_service.update_annotation_state(batch.batch_states.apply) self.logger.info( - message=f" Updated the annotation state instances:\n- annotation status set to 'Processing'\n- job id set to {job_id}\n- pattern mode job id set to {job_id}", + message=f"Updated the annotation state instances:\n- annotation status set to 'Processing'\n- job id set to {job_id}\n- pattern mode job id set to {pattern_job_id}", section="END", ) except CogniteAPIError as e: From 321fcde842398923101cb06cb9a0af3b06db6335 Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Sun, 24 Aug 2025 12:38:44 -0500 Subject: [PATCH 030/128] added to README --- .../cdf_file_annotation/README.md | 196 +++++++++--------- 1 file changed, 101 insertions(+), 95 deletions(-) diff --git 
a/modules/contextualization/cdf_file_annotation/README.md b/modules/contextualization/cdf_file_annotation/README.md index ee3f4638..ba2cb6a1 100644 --- a/modules/contextualization/cdf_file_annotation/README.md +++ b/modules/contextualization/cdf_file_annotation/README.md @@ -7,6 +7,7 @@ The Annotation template is a framework designed to automate the process of annot ## Key Features - **Configuration-Driven Workflow:** The entire process is controlled by a single config.yaml file, allowing adaptation to different data models and operational parameters without code changes. +- **Dual Annotation Modes**: Simultaneously runs standard entity matching and a new pattern-based detection mode to create a comprehensive indexed reference catalog. - **Large Document Support (\>50 Pages):** Automatically handles files with more than 50 pages by breaking them into manageable chunks, processing them iteratively, and tracking the overall progress. - **Parallel Execution Ready:** Designed for concurrent execution with a robust optimistic locking mechanism to prevent race conditions when multiple finalize function instances run in parallel. - **Detailed Reporting:** Local logs and processed annotation details stored in CDF RAW tables, fucntion logs, and extraction pipeline runs for auditing and analysis. @@ -43,11 +44,12 @@ _(if videos fail to load, try loading page in incognito or re-sign into github) 1. 
**Create a CDF Project through Toolkit** - Follow the guide [here](https://docs.cognite.com/cdf/deploy/cdf_toolkit/) - (optional) Initialize the quickstart package using toolkit CLI - ```bash - poetry init - poetry add cognite-toolkit - poetry run cdf modules init - ``` + +```bash +poetry init +poetry add cognite-toolkit +poetry run cdf modules init +``` @@ -68,88 +70,89 @@ _(if videos fail to load, try loading page in incognito or re-sign into github) - (optional) Build and deploy the quickstart template modules - Build and deploy this module - ```bash - poetry run cdf build --env dev - poetry run cdf deploy --dry-run - poetry run cdf deploy - ``` - - ```yaml - # config..yaml used in examples below - environment: - name: dev - project: - validation-type: dev - selected: - - modules/ - - variables: - modules: - # stuff from quickstart package... - organization: tx - - # ... - - cdf_ingestion: - workflow: ingestion - groupSourceId: - ingestionClientId: ${IDP_CLIENT_ID} # Changed from ${INGESTION_CLIENT_ID} - ingestionClientSecret: ${IDP_CLIENT_SECRET} # Changed from ${INGESTION_CLIENT_SECRET} - pandidContextualizationFunction: contextualization_p_and_id_annotater - contextualization_connection_writer: contextualization_connection_writer - schemaSpace: sp_enterprise_process_industry - schemaSpace2: cdf_cdm - schemaSpace3: cdf_idm - instanceSpaces: - - springfield_instances - - cdf_cdm_units - runWorkflowUserIds: - - - - contextualization: - cdf_file_annotation: - # used in /data_sets, /data_models, /functions, /extraction_pipelines, and /workflows - annotationDatasetExternalId: ds_file_annotation - - # used in /data_models and /extraction_pipelines - annotationStateExternalId: FileAnnotationState - annotationStateInstanceSpace: sp_dat_cdf_annotation_states - annotationStateSchemaSpace: sp_hdm #NOTE: stands for space helper data model - annotationStateVersion: v1.0.1 - fileSchemaSpace: sp_enterprise_process_industry - fileExternalId: txFile - fileVersion: v1 - - # 
used in /raw and /extraction_pipelines - rawDb: db_file_annotation - rawTableDocTag: annotation_documents_tags - rawTableDocDoc: annotation_documents_docs - rawTableCache: annotation_entities_cache - - # used in /extraction_pipelines - extractionPipelineExternalId: ep_file_annotation - targetEntitySchemaSpace: sp_enterprise_process_industry - targetEntityExternalId: txEquipment - targetEntityVersion: v1 - - # used in /functions and /workflows - launchFunctionExternalId: fn_file_annotation_launch #NOTE: if this is changed, then the folder holding the launch function must be named the same as the new external ID - launchFunctionVersion: v1.0.0 - finalizeFunctionExternalId: fn_file_annotation_finalize #NOTE: if this is changed, then the folder holding the finalize function must be named the same as the new external ID - finalizeFunctionVersion: v1.0.0 - functionClientId: ${IDP_CLIENT_ID} - functionClientSecret: ${IDP_CLIENT_SECRET} - - # used in /workflows - workflowSchedule: "*/10 * * * *" - workflowExternalId: wf_file_annotation - workflowVersion: v1 - - # used in /auth - groupSourceId: # source ID from Azure AD for the corresponding groups - - # ... - ``` +```bash +poetry run cdf build --env dev +poetry run cdf deploy --dry-run +poetry run cdf deploy +``` + +```yaml +# config..yaml used in examples below +environment: + name: dev + project: + validation-type: dev + selected: + - modules/ + +variables: + modules: + # stuff from quickstart package... + organization: tx + + # ... 
+ + cdf_ingestion: + workflow: ingestion + groupSourceId: + ingestionClientId: ${IDP_CLIENT_ID} # Changed from ${INGESTION_CLIENT_ID} + ingestionClientSecret: ${IDP_CLIENT_SECRET} # Changed from ${INGESTION_CLIENT_SECRET} + pandidContextualizationFunction: contextualization_p_and_id_annotater + contextualization_connection_writer: contextualization_connection_writer + schemaSpace: sp_enterprise_process_industry + schemaSpace2: cdf_cdm + schemaSpace3: cdf_idm + instanceSpaces: + - springfield_instances + - cdf_cdm_units + runWorkflowUserIds: + - + + contextualization: + cdf_file_annotation: + # used in /data_sets, /data_models, /functions, /extraction_pipelines, and /workflows + annotationDatasetExternalId: ds_file_annotation + + # used in /data_models and /extraction_pipelines + annotationStateExternalId: FileAnnotationState + annotationStateInstanceSpace: sp_dat_cdf_annotation_states + annotationStateSchemaSpace: sp_hdm #NOTE: stands for space helper data model + annotationStateVersion: v1.0.1 + fileSchemaSpace: sp_enterprise_process_industry + fileExternalId: txFile + fileVersion: v1 + + # used in /raw and /extraction_pipelines + rawDb: db_file_annotation + rawTableDocTag: annotation_documents_tags + rawTableDocDoc: annotation_documents_docs + rawTableCache: annotation_entities_cache + + # used in /extraction_pipelines + extractionPipelineExternalId: ep_file_annotation + targetEntitySchemaSpace: sp_enterprise_process_industry + targetEntityExternalId: txEquipment + targetEntityVersion: v1 + + # used in /functions and /workflows + launchFunctionExternalId: fn_file_annotation_launch #NOTE: if this is changed, then the folder holding the launch function must be named the same as the new external ID + launchFunctionVersion: v1.0.0 + finalizeFunctionExternalId: fn_file_annotation_finalize #NOTE: if this is changed, then the folder holding the finalize function must be named the same as the new external ID + finalizeFunctionVersion: v1.0.0 + functionClientId: 
${IDP_CLIENT_ID} + functionClientSecret: ${IDP_CLIENT_SECRET} + + # used in /workflows + workflowSchedule: "*/10 * * * *" + workflowExternalId: wf_file_annotation + workflowVersion: v1 + + # used in /auth + groupSourceId: # source ID from Azure AD for the corresponding groups + + + # ... +``` @@ -208,8 +211,10 @@ The template operates in three main phases, orchestrated by CDF Workflows. Since 1. It queries for `AnnotationState` instances with a "New" or "Retry" status. 2. It groups these files by a primary scope to provide context. 3. For each group, it fetches the relevant file and target entity information, using a cache to avoid redundant lookups. - 4. It calls the Cognite Diagram Detect API to start the annotation job. - 5. It updates the `AnnotationState` instance with the `diagramDetectJobId` and sets the status to "Processing". + 4. It calls the Cognite Diagram Detect API to initiate two async jobs: + - A `standard annotation` job to find and link known entities. + - A `pattern mode` job to detect all potential tags and build an indexed reference catalog. + 5. It updates the `AnnotationState` instance with both the `diagramDetectJobId` and `patternModeJobId` and sets the overall `annotationStatus` to "Processing". ### Finalize Phase @@ -218,11 +223,12 @@ The template operates in three main phases, orchestrated by CDF Workflows. Since - **Goal**: Retrieve, process, and store the results of completed annotation jobs. - **Process**: 1. It queries for `AnnotationState` instances with a "Processing" status. - 2. It checks the status of the corresponding diagram detection job. - 3. Once a job is complete, it retrieves the annotation results. - 4. It applies the new annotations, optionally cleaning up old ones first. + 2. It waits until both the standard and pattern modejobs for a given file are complete. + 3. It then retrieves and merges the results from both jobs. + 4. 
It will optionally clean old annotations first and then: + - Applies the standard annotations by creating edges in the data model, writing the results to a dedicated RAW table. + - Processes the pattern mode results, writing them to a dedicated RAW table to populate the reference catalog. 5. It updates the `AnnotationState` status to "Annotated" or "Failed" and tags the file accordingly. - 6. It writes a summary of the approved annotations to a CDF RAW table for reporting. ## Configuration @@ -232,7 +238,7 @@ Key configuration sections include: - `dataModelViews`: Defines the data model views for files, annotation states, and target entities. - `prepareFunction`: Configures the queries to find files to annotate. -- `launchFunction`: Sets parameters for the annotation job, such as batch size and entity matching properties. +- `launchFunction`: Sets parameters for the annotation job, such as batch size, entity matching properties, and a new `patternMode: true` flag to enable the pattern detection feature. - `finalizeFunction`: Defines how to process and apply the final annotations. This file allows for deep customization. For example, you can use a list of query configurations to combine them with `OR` logic, or you can set `primaryScopeProperty` to `None` to process files that are not tied to a specific scope. @@ -285,6 +291,6 @@ The template is designed around a core set of abstract interfaces (e.g., `IDataM ## About Me -Hey everyone\! I'm Jack Zhao, the creator of this template. I want to give a huge shoutout to Thomas Molbach and Noah Karsky for providing invaluable input from a solution architect's point of view. I also want to thank Khaled Shaheen and Gayatri Babel for their help in building this. +Hey everyone\! I'm Jack Zhao, the creator of this template. I want to give a huge shoutout to Thomas Molbach, Noah Karsky, and Darren Downtain for providing invaluable input from a solution architect's point of view. 
I also want to thank Khaled Shaheen and Gayatri Babel for their help in building this. This code is my attempt to create a standard template that 'breaks' the cycle where projects build simple tools, outgrow them, and are then forced to build a new and often hard-to-reuse solution. My current belief is that it's impossible for a template to have long-term success if it's not built on the fundamental premise of being extended. Customer needs will evolve, and new product features will create new opportunities for optimization. From 70c5aefb92221fc8aa00b8cebfaa9b3d10b8308a Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Thu, 28 Aug 2025 11:23:55 -0500 Subject: [PATCH 031/128] storing results of pattern mode to be more in line with referenced_index_catalog standardized column names to camel case --- .../services/ApplyService.py | 83 +++++++++++-------- .../services/ConfigService.py | 4 +- .../utils/DataStructures.py | 8 +- .../services/CacheService.py | 34 ++++---- .../services/ConfigService.py | 4 +- .../utils/DataStructures.py | 8 +- 6 files changed, 78 insertions(+), 63 deletions(-) diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ApplyService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ApplyService.py index 0d8ed738..2d0e7680 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ApplyService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ApplyService.py @@ -152,9 +152,9 @@ def _detect_annotation_to_edge_applies( "startNode": file_instance_id.external_id, "endNode": entity["external_id"], "endNodeSpace": entity["space"], - "view_id": self.core_annotation_view_id.external_id, - "view_space": self.core_annotation_view_id.space, - "view_version": self.core_annotation_view_id.version, + "viewId": self.core_annotation_view_id.external_id, + "viewSpace": 
self.core_annotation_view_id.space, + "viewVersion": self.core_annotation_view_id.version, } now = datetime.now(timezone.utc).replace(microsecond=0) @@ -182,7 +182,7 @@ def _detect_annotation_to_edge_applies( existing_version=None, type=DirectRelationReference( space=annotation_schema_space, - external_id=entity["annotation_type_external_id"], + external_id=entity["annotation_type"], ), start_node=DirectRelationReference( space=file_instance_id.space, @@ -201,7 +201,7 @@ def _detect_annotation_to_edge_applies( if edge_apply_key not in diagram_annotations: diagram_annotations[edge_apply_key] = edge_apply_instance - if entity["annotation_type_external_id"] == self.file_annotation_type: + if entity["annotation_type"] == self.file_annotation_type: doc_doc.append(RowWrite(key=doc_log["externalId"], columns=doc_log)) else: doc_tag.append(RowWrite(key=doc_log["externalId"], columns=doc_log)) @@ -263,40 +263,55 @@ def process_pattern_results(self, result_item: dict, file_node: Node) -> list[Ro if not file_node: return [] - doc_patterns: list[RowWrite] = [] file_id: NodeId = file_node.as_id() source_id: str | None = cast(str, file_node.properties[self.file_view_id].get("sourceId")) - # TODO: Lots of potential here to create annotation edges from the results pattern mode + + # Step 1: Group all detections by their text content + # The key is the detected tag text, e.g., "P-101A" + aggregated_detections = {} + for detect_annotation in result_item["annotations"]: - for entity in detect_annotation["entities"]: - if detect_annotation["confidence"] >= self.approve_threshold: - annotation_status = DiagramAnnotationStatus.APPROVED.value - elif detect_annotation["confidence"] >= self.suggest_threshold: - annotation_status = DiagramAnnotationStatus.SUGGESTED.value - else: - continue - - # TODO: need to change row columns to create reference catalog -> which will live as a DM not RAW tbl-> from there create the table needed for baseline report - baseline_properties = { - 
"startSourceId": source_id, - "startNode": file_id.external_id, - "text": detect_annotation["text"], - "category": entity["category_property"], - "confidence": detect_annotation["confidence"], - "status": annotation_status, - "startNodePageNumber": detect_annotation["region"]["page"], - "startNodeXMin": min(v["x"] for v in detect_annotation["region"]["vertices"]), - "startNodeYMin": min(v["y"] for v in detect_annotation["region"]["vertices"]), - "startNodeXMax": max(v["x"] for v in detect_annotation["region"]["vertices"]), - "startNodeYMax": max(v["y"] for v in detect_annotation["region"]["vertices"]), - "sourceCreatedUser": self.FUNCTION_ID, - "sourceUpdatedUser": self.FUNCTION_ID, + tag_text = detect_annotation["text"] + + if tag_text not in aggregated_detections: + # Initialize the entry for this tag if it's the first time we've seen it + aggregated_detections[tag_text] = { + "regions": [], + "resource_type": "Unknown", # Default resource_type } - row: RowWrite = RowWrite( - key=f"{baseline_properties['startSourceId']}_{baseline_properties['text']}_{baseline_properties['startSourceId']}_{detect_annotation['region']['page']}_{min(v['x'] for v in detect_annotation['region']['vertices'])}_{min(v['y'] for v in detect_annotation['region']['vertices'])}", - columns=baseline_properties, + + # Add the location of the current detection + # The region dict contains page, vertices, etc. 
+ aggregated_detections[tag_text]["regions"].append(detect_annotation["region"]) + + # Assume the resource_type is consistent for a given tag text + if "entities" in detect_annotation and detect_annotation["entities"]: + aggregated_detections[tag_text]["resource_type"] = detect_annotation["entities"][0].get( + "resource_type", "Unknown" ) - doc_patterns.append(row) + + # Step 2: Create one RowWrite object for each unique tag + doc_patterns: list[RowWrite] = [] + for tag_text, data in aggregated_detections.items(): + # The columns for the RAW table row + catalog_properties = { + "startSourceId": source_id, + "startNode": file_id.external_id, + "text": tag_text, + "resourceType": data["resource_type"], + # Store the entire list of region dicts + # Note: The RAW table will automatically serialize this list of dicts into a JSON string + "regions": data["regions"], + "sourceCreatedUser": self.FUNCTION_ID, + "sourceUpdatedUser": self.FUNCTION_ID, + } + + # Create a deterministic key based on the tag text and file + row_key = f"{tag_text}_{source_id}" + + row = RowWrite(key=row_key, columns=catalog_properties) + doc_patterns.append(row) + return doc_patterns def _list_annotations_for_file( diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ConfigService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ConfigService.py index 453666aa..447a6d91 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ConfigService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ConfigService.py @@ -189,8 +189,8 @@ class LaunchFunction(BaseModel, alias_generator=to_camel): file_search_property: str = "aliases" target_entities_search_property: str = "aliases" pattern_mode: bool - file_category_property: Optional[str] = None - target_entities_category_property: Optional[str] = None + 
file_resource_property: Optional[str] = None + target_entities_resource_property: Optional[str] = None data_model_service: DataModelServiceConfig cache_service: CacheServiceConfig annotation_service: AnnotationServiceConfig diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/utils/DataStructures.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/utils/DataStructures.py index b3b9b7a5..658cfd90 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/utils/DataStructures.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/utils/DataStructures.py @@ -126,16 +126,16 @@ class entity: "name": file.properties[job_config.file_view.as_view_id()]["name"], "space": file.space, "annotation_type_external_id": job_config.file_view.type, - "search_property": file.properties[job_config.file_view.as_view_id()][{category_property}], - "category_property": file.properties[job_config.file_view.as_view_id()][{search_property}], + "resource_type": file.properties[job_config.file_view.as_view_id()][{resource_type}], + "search_property": file.properties[job_config.file_view.as_view_id()][{search_property}], } """ external_id: str name: str space: str - annotation_type_external_id: Literal["diagrams.FileLink", "diagrams.AssetLink"] | None - category_property: str + annotation_type: Literal["diagrams.FileLink", "diagrams.AssetLink"] | None + resource_type: str search_property: list[str] = field(default_factory=list) def to_dict(self): diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/CacheService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/CacheService.py index 02072d2f..8066f80f 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/CacheService.py +++ 
b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/CacheService.py @@ -162,23 +162,23 @@ def _convert_instances_to_entities( """ Convert the asset and file nodes into an entity """ - target_entities_category_property: str | None = self.config.launch_function.target_entities_category_property + target_entities_resource_type: str | None = self.config.launch_function.target_entities_resource_property target_entities_search_property: str = self.config.launch_function.target_entities_search_property target_entities: list[dict] = [] for instance in asset_instances: instance_properties = instance.properties.get(self.target_entities_view.as_view_id()) - if target_entities_category_property: - category: str = instance_properties[target_entities_category_property] + if target_entities_resource_type: + resource_type: str = instance_properties[target_entities_resource_type] else: - category: str = self.target_entities_view.external_id + resource_type: str = self.target_entities_view.external_id if target_entities_search_property in instance_properties: asset_entity = entity( external_id=instance.external_id, name=instance_properties.get("name"), space=instance.space, - annotation_type_external_id=self.target_entities_view.annotation_type, - category_property=category, + annotation_type=self.target_entities_view.annotation_type, + resource_type=resource_type, search_property=instance_properties.get(target_entities_search_property), ) target_entities.append(asset_entity.to_dict()) @@ -188,29 +188,29 @@ def _convert_instances_to_entities( external_id=instance.external_id, name=instance_properties.get("name"), space=instance.space, + annotation_type=self.target_entities_view.annotation_type, + resource_type=resource_type, search_property=search_value, - category_property=category, - annotation_type_external_id=self.target_entities_view.annotation_type, ) target_entities.append(asset_entity.to_dict()) - file_category_property: str | None = 
self.config.launch_function.file_category_property + file_resource_type: str | None = self.config.launch_function.file_resource_property file_search_property: str = self.config.launch_function.file_search_property file_entities: list[dict] = [] for instance in file_instances: instance_properties = instance.properties.get(self.file_view.as_view_id()) - if target_entities_category_property: - category: str = instance_properties[file_category_property] + if target_entities_resource_type: + resource_type: str = instance_properties[file_resource_type] else: - category: str = self.file_view.external_id + resource_type: str = self.file_view.external_id file_entity = entity( external_id=instance.external_id, name=instance_properties.get("name"), space=instance.space, + annotation_type=self.file_view.annotation_type, + resource_type=resource_type, search_property=instance_properties.get(file_search_property), - category_property=category, - annotation_type_external_id=self.file_view.annotation_type, ) file_entities.append(file_entity.to_dict()) @@ -268,7 +268,7 @@ def _parse_alias(alias: str, resource_type_key: str) -> tuple[str, list[list[str for entity in entities: # NOTE: - key = entity["category_property"] + key = entity["resource_type"] if key not in pattern_builders: pattern_builders[key] = {} @@ -294,7 +294,7 @@ def _parse_alias(alias: str, resource_type_key: str) -> tuple[str, list[list[str # --- Build the final result from the processed patterns --- result = [] - for category, templates in pattern_builders.items(): + for resource_type, templates in pattern_builders.items(): final_samples = [] for template_key, collected_vars in templates.items(): # Create an iterator for the collected letter groups @@ -323,5 +323,5 @@ def replace_A(match): final_samples.append("".join(final_pattern_parts)) if final_samples: - result.append({"sample": sorted(final_samples), "category_property": category}) + result.append({"sample": sorted(final_samples), "resource_type": 
resource_type}) return result diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/ConfigService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/ConfigService.py index 453666aa..447a6d91 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/ConfigService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/ConfigService.py @@ -189,8 +189,8 @@ class LaunchFunction(BaseModel, alias_generator=to_camel): file_search_property: str = "aliases" target_entities_search_property: str = "aliases" pattern_mode: bool - file_category_property: Optional[str] = None - target_entities_category_property: Optional[str] = None + file_resource_property: Optional[str] = None + target_entities_resource_property: Optional[str] = None data_model_service: DataModelServiceConfig cache_service: CacheServiceConfig annotation_service: AnnotationServiceConfig diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/utils/DataStructures.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/utils/DataStructures.py index b3b9b7a5..658cfd90 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/utils/DataStructures.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/utils/DataStructures.py @@ -126,16 +126,16 @@ class entity: "name": file.properties[job_config.file_view.as_view_id()]["name"], "space": file.space, "annotation_type_external_id": job_config.file_view.type, - "search_property": file.properties[job_config.file_view.as_view_id()][{category_property}], - "category_property": file.properties[job_config.file_view.as_view_id()][{search_property}], + "resource_type": file.properties[job_config.file_view.as_view_id()][{resource_type}], + "search_property": 
file.properties[job_config.file_view.as_view_id()][{search_property}], } """ external_id: str name: str space: str - annotation_type_external_id: Literal["diagrams.FileLink", "diagrams.AssetLink"] | None - category_property: str + annotation_type: Literal["diagrams.FileLink", "diagrams.AssetLink"] | None + resource_type: str search_property: list[str] = field(default_factory=list) def to_dict(self): From 4a1ec2df156f15e0898eb3f29634bbba7f3e98e5 Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Thu, 28 Aug 2025 11:28:33 -0500 Subject: [PATCH 032/128] improved streamlit experience for multiple extraction pipelines --- .../Extraction_Pipeline.py | 539 ++++++++--------- .../data_structures.py | 38 +- .../file_annotation_dashboard/helper.py | 546 +++++++++--------- .../pages/Status_Overview.py | 273 +++++---- .../requirements.txt | 8 +- 5 files changed, 732 insertions(+), 672 deletions(-) diff --git a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/Extraction_Pipeline.py b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/Extraction_Pipeline.py index bbb18a2b..bda688fd 100644 --- a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/Extraction_Pipeline.py +++ b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/Extraction_Pipeline.py @@ -1,262 +1,277 @@ -import streamlit as st -import pandas as pd -import altair as alt -from cognite.client import CogniteClient -from datetime import datetime, timedelta -from helper import ( - fetch_annotation_states, - fetch_pipeline_run_history, - process_runs_for_graphing, - fetch_extraction_pipeline_config, - calculate_success_failure_stats, - fetch_function_logs, - parse_run_message, -) - - -# --- Page Configuration --- -st.set_page_config( - page_title="Pipeline Run History", - page_icon="📈", - layout="wide", -) - -# --- Data Fetching --- -pipeline_runs = fetch_pipeline_run_history() - -# --- Main Application 
--- -st.title("Pipeline Run History") -st.markdown("This page provides statistics and detailed history for all extraction pipeline runs.") - - -# --- Pipeline Statistics Section --- -if pipeline_runs: - # Time window selection - time_window_map = { - "All": None, - "Last 24 Hours": 24, - "Last 7 Days": 7 * 24, - "Last 30 Days": 30 * 24, - } - time_window_option = st.sidebar.selectbox( - "Filter by Time Window:", - options=list(time_window_map.keys()), - ) - window_hours = time_window_map[time_window_option] - - if window_hours is not None: - now = pd.Timestamp.now(tz="UTC") - filter_start_time = now - timedelta(hours=window_hours) - # Filter runs based on the time window - recent_pipeline_runs = [ - run - for run in pipeline_runs - if pd.to_datetime(run.created_time, unit="ms").tz_localize("UTC") > filter_start_time - ] - else: - # If 'All' is selected, use the original unfiltered list of runs - recent_pipeline_runs = pipeline_runs - - # MODIFICATION: Check if 'recent_pipeline_runs' has data BEFORE processing. - # If it's empty, display a message. Otherwise, proceed with stats and graphs. 
- if not recent_pipeline_runs: - st.warning("No pipeline runs found in the selected time window.") - else: - # --- Calculate detailed stats for the selected time window --- - df_runs_for_graphing = process_runs_for_graphing(recent_pipeline_runs) - - launch_success = 0 - launch_failure = 0 - finalize_success = 0 - finalize_failure = 0 - - for run in recent_pipeline_runs: - # We need to parse the message to determine the caller type - parsed_message = parse_run_message(run.message) - caller = parsed_message.get("caller") - - if caller == "Launch": - if run.status == "success": - launch_success += 1 - elif run.status == "failure": - launch_failure += 1 - elif caller == "Finalize": - if run.status == "success": - finalize_success += 1 - elif run.status == "failure": - finalize_failure += 1 - - total_launched_recent = int(df_runs_for_graphing[df_runs_for_graphing["type"] == "Launch"]["count"].sum()) - total_finalized_recent = int(df_runs_for_graphing[df_runs_for_graphing["type"] == "Finalize"]["count"].sum()) - - # --- Display Metrics and Graphs in two columns --- - g_col1, g_col2 = st.columns(2) - - with g_col1: - st.subheader("Launch Runs") - m_col1, m_col2, m_col3 = st.columns(3) - m_col1.metric( - f"Files Launched", - f"{total_launched_recent:,}", - ) - m_col2.metric( - "Successful Runs", - f"{launch_success:,}", - ) - m_col3.metric( - "Failed Runs", - f"{launch_failure:,}", - delta=f"{launch_failure:,}" if launch_failure > 0 else "0", - delta_color="inverse", - ) - - with g_col2: - st.subheader("Finalize Runs") - m_col4, m_col5, m_col6 = st.columns(3) - m_col4.metric( - f"Files Finalized", - f"{total_finalized_recent:,}", - ) - m_col5.metric( - "Successful Runs", - f"{finalize_success:,}", - ) - m_col6.metric( - "Failed Runs", - f"{finalize_failure:,}", - delta=f"{finalize_failure:,}" if finalize_failure > 0 else "0", - delta_color="inverse", - ) - - # --- Graphs --- - base_chart = ( - alt.Chart(df_runs_for_graphing) - .mark_circle(size=60, opacity=0.7) - .encode( 
- x=alt.X("timestamp:T", title="Time of Run"), - y=alt.Y("count:Q", title="Files Processed"), - tooltip=["timestamp:T", "count:Q", "type:N"], - ) - .interactive() - ) - - chart_col1, chart_col2 = st.columns(2) - with chart_col1: - launch_chart = base_chart.transform_filter(alt.datum.type == "Launch").properties( - title="Files Processed per Launch Run" - ) - st.altair_chart(launch_chart, use_container_width=True) - with chart_col2: - finalize_chart = base_chart.transform_filter(alt.datum.type == "Finalize").properties( - title="Files Processed per Finalize Run" - ) - st.altair_chart(finalize_chart, use_container_width=True) - - # --- UNIFIED DETAILED RUN HISTORY --- - with st.expander("View recent runs and fetch logs", expanded=True): - if not recent_pipeline_runs: - st.info("No runs in the selected time window.") - else: - f_col1, f_col2 = st.columns(2) - with f_col1: - run_status_filter = st.radio( - "Filter by run status:", - ("All", "Success", "Failure"), - horizontal=True, - key="run_status_filter", - ) - with f_col2: - caller_type_filter = st.radio( - "Filter by caller type:", - ("All", "Launch", "Finalize"), - horizontal=True, - key="caller_type_filter", - ) - - st.divider() - - filtered_runs = recent_pipeline_runs - if run_status_filter != "All": - filtered_runs = [run for run in filtered_runs if run.status.lower() == run_status_filter.lower()] - - if caller_type_filter != "All": - filtered_runs = [ - run for run in filtered_runs if parse_run_message(run.message).get("caller") == caller_type_filter - ] - - if not filtered_runs: - st.warning(f"No runs match the selected filters.") - else: - # Pagination state - if "page_num" not in st.session_state: - st.session_state.page_num = 0 - - items_per_page = 3 - start_idx = st.session_state.page_num * items_per_page - end_idx = start_idx + items_per_page - paginated_runs = filtered_runs[start_idx:end_idx] - - # Display logic for each run - for run in paginated_runs: - - if run.status == "success": - 
st.markdown(f"**Status:** Success") - st.success( - f"Timestamp: {pd.to_datetime(run.created_time, unit='ms').tz_localize('UTC').strftime('%Y-%m-%d %H:%M:%S %Z')}" - ) - else: - st.markdown(f"**Status:** Failure") - st.error( - f"Timestamp: {pd.to_datetime(run.created_time, unit='ms').tz_localize('UTC').strftime('%Y-%m-%d %H:%M:%S %Z')}" - ) - - parsed_message = parse_run_message(run.message) - if run.message: - st.code(run.message, language="text") - - function_id = int(parsed_message.get("function_id")) - call_id = int(parsed_message.get("call_id")) - - if function_id and call_id: - button_key = f"log_btn_all_{call_id}" - if st.button("Fetch Function Logs", key=button_key): - with st.spinner("Fetching logs..."): - logs = fetch_function_logs(function_id=function_id, call_id=call_id) - if logs: - st.text_area( - "Function Logs", - "".join(logs), - height=300, - key=f"log_area_all_{call_id}", - ) - else: - st.warning("No logs found for this run.") - st.divider() - - # Pagination controls - total_pages = (len(filtered_runs) + items_per_page - 1) // items_per_page - if total_pages > 1: - p_col1, p_col2, p_col3 = st.columns([1, 2, 1]) - with p_col1: - if st.button( - "Previous", - disabled=(st.session_state.page_num == 0), - use_container_width=True, - ): - st.session_state.page_num -= 1 - st.rerun() - with p_col2: - st.markdown( - f"
Page {st.session_state.page_num + 1} of {total_pages}
", - unsafe_allow_html=True, - ) - with p_col3: - if st.button( - "Next", - disabled=(st.session_state.page_num >= total_pages - 1), - use_container_width=True, - ): - st.session_state.page_num += 1 - st.rerun() -else: - st.info("No data returned from Cognite Data Fusion. Please check your settings and data model.") +import streamlit as st +import pandas as pd +import altair as alt +from cognite.client import CogniteClient +from datetime import datetime, timedelta +from helper import ( + fetch_annotation_states, + fetch_pipeline_run_history, + process_runs_for_graphing, + fetch_extraction_pipeline_config, + calculate_success_failure_stats, + fetch_function_logs, + parse_run_message, + find_pipelines, +) + +st.set_page_config( + page_title="Pipeline Run History", + page_icon="📈", + layout="wide", +) + +# --- Sidebar for Pipeline Selection --- +st.sidebar.title("Pipeline Selection") +# The helper function now returns a pre-filtered list +pipeline_ids = find_pipelines() + +if not pipeline_ids: + st.info("No active file annotation pipelines found to monitor.") + st.stop() + +# Use session_state to remember the selection across pages +if "selected_pipeline" not in st.session_state or st.session_state.selected_pipeline not in pipeline_ids: + st.session_state.selected_pipeline = pipeline_ids[0] + +# The selectbox displays the filtered list for the user +selected_pipeline = st.sidebar.selectbox("Select a pipeline to monitor:", options=pipeline_ids, key="selected_pipeline") + +# --- Main Application --- +st.title("Pipeline Run History") +st.markdown("This page provides statistics and detailed history for the selected extraction pipeline run.") + +# Fetch data using the user's selection +pipeline_runs = fetch_pipeline_run_history(selected_pipeline) + + +# --- Pipeline Statistics Section --- +if pipeline_runs: + # Time window selection + time_window_map = { + "All": None, + "Last 24 Hours": 24, + "Last 7 Days": 7 * 24, + "Last 30 Days": 30 * 24, + } + time_window_option = 
st.sidebar.selectbox( + "Filter by Time Window:", + options=list(time_window_map.keys()), + ) + window_hours = time_window_map[time_window_option] + + if window_hours is not None: + now = pd.Timestamp.now(tz="UTC") + filter_start_time = now - timedelta(hours=window_hours) + # Filter runs based on the time window + recent_pipeline_runs = [ + run + for run in pipeline_runs + if pd.to_datetime(run.created_time, unit="ms").tz_localize("UTC") > filter_start_time + ] + else: + # If 'All' is selected, use the original unfiltered list of runs + recent_pipeline_runs = pipeline_runs + + # MODIFICATION: Check if 'recent_pipeline_runs' has data BEFORE processing. + # If it's empty, display a message. Otherwise, proceed with stats and graphs. + if not recent_pipeline_runs: + st.warning("No pipeline runs found in the selected time window.") + else: + # --- Calculate detailed stats for the selected time window --- + df_runs_for_graphing = process_runs_for_graphing(recent_pipeline_runs) + + launch_success = 0 + launch_failure = 0 + finalize_success = 0 + finalize_failure = 0 + + for run in recent_pipeline_runs: + # We need to parse the message to determine the caller type + parsed_message = parse_run_message(run.message) + caller = parsed_message.get("caller") + + if caller == "Launch": + if run.status == "success": + launch_success += 1 + elif run.status == "failure": + launch_failure += 1 + elif caller == "Finalize": + if run.status == "success": + finalize_success += 1 + elif run.status == "failure": + finalize_failure += 1 + + total_launched_recent = int(df_runs_for_graphing[df_runs_for_graphing["type"] == "Launch"]["count"].sum()) + total_finalized_recent = int(df_runs_for_graphing[df_runs_for_graphing["type"] == "Finalize"]["count"].sum()) + + # --- Display Metrics and Graphs in two columns --- + g_col1, g_col2 = st.columns(2) + + with g_col1: + st.subheader("Launch Runs") + m_col1, m_col2, m_col3 = st.columns(3) + m_col1.metric( + f"Files Launched", + 
f"{total_launched_recent:,}", + ) + m_col2.metric( + "Successful Runs", + f"{launch_success:,}", + ) + m_col3.metric( + "Failed Runs", + f"{launch_failure:,}", + delta=f"{launch_failure:,}" if launch_failure > 0 else "0", + delta_color="inverse", + ) + + with g_col2: + st.subheader("Finalize Runs") + m_col4, m_col5, m_col6 = st.columns(3) + m_col4.metric( + f"Files Finalized", + f"{total_finalized_recent:,}", + ) + m_col5.metric( + "Successful Runs", + f"{finalize_success:,}", + ) + m_col6.metric( + "Failed Runs", + f"{finalize_failure:,}", + delta=f"{finalize_failure:,}" if finalize_failure > 0 else "0", + delta_color="inverse", + ) + + # --- Graphs --- + base_chart = ( + alt.Chart(df_runs_for_graphing) + .mark_circle(size=60, opacity=0.7) + .encode( + x=alt.X("timestamp:T", title="Time of Run"), + y=alt.Y("count:Q", title="Files Processed"), + tooltip=["timestamp:T", "count:Q", "type:N"], + ) + .interactive() + ) + + chart_col1, chart_col2 = st.columns(2) + with chart_col1: + launch_chart = base_chart.transform_filter(alt.datum.type == "Launch").properties( + title="Files Processed per Launch Run" + ) + st.altair_chart(launch_chart, use_container_width=True) + with chart_col2: + finalize_chart = base_chart.transform_filter(alt.datum.type == "Finalize").properties( + title="Files Processed per Finalize Run" + ) + st.altair_chart(finalize_chart, use_container_width=True) + + # --- UNIFIED DETAILED RUN HISTORY --- + with st.expander("View recent runs and fetch logs", expanded=True): + if not recent_pipeline_runs: + st.info("No runs in the selected time window.") + else: + f_col1, f_col2 = st.columns(2) + with f_col1: + run_status_filter = st.radio( + "Filter by run status:", + ("All", "Success", "Failure"), + horizontal=True, + key="run_status_filter", + ) + with f_col2: + caller_type_filter = st.radio( + "Filter by caller type:", + ("All", "Launch", "Finalize"), + horizontal=True, + key="caller_type_filter", + ) + + st.divider() + + filtered_runs = 
recent_pipeline_runs + if run_status_filter != "All": + filtered_runs = [run for run in filtered_runs if run.status.lower() == run_status_filter.lower()] + + if caller_type_filter != "All": + filtered_runs = [ + run for run in filtered_runs if parse_run_message(run.message).get("caller") == caller_type_filter + ] + + if not filtered_runs: + st.warning(f"No runs match the selected filters.") + else: + # Pagination state + if "page_num" not in st.session_state: + st.session_state.page_num = 0 + + items_per_page = 3 + start_idx = st.session_state.page_num * items_per_page + end_idx = start_idx + items_per_page + paginated_runs = filtered_runs[start_idx:end_idx] + + # Display logic for each run + for run in paginated_runs: + + if run.status == "success": + st.markdown(f"**Status:** Success") + st.success( + f"Timestamp: {pd.to_datetime(run.created_time, unit='ms').tz_localize('UTC').strftime('%Y-%m-%d %H:%M:%S %Z')}" + ) + else: + st.markdown(f"**Status:** Failure") + st.error( + f"Timestamp: {pd.to_datetime(run.created_time, unit='ms').tz_localize('UTC').strftime('%Y-%m-%d %H:%M:%S %Z')}" + ) + + parsed_message = parse_run_message(run.message) + if run.message: + st.code(run.message, language="text") + + function_id = int(parsed_message.get("function_id")) + call_id = int(parsed_message.get("call_id")) + + if function_id and call_id: + button_key = f"log_btn_all_{call_id}" + if st.button("Fetch Function Logs", key=button_key): + with st.spinner("Fetching logs..."): + logs = fetch_function_logs(function_id=function_id, call_id=call_id) + if logs: + st.text_area( + "Function Logs", + "".join(logs), + height=300, + key=f"log_area_all_{call_id}", + ) + else: + st.warning("No logs found for this run.") + st.divider() + + # Pagination controls + total_pages = (len(filtered_runs) + items_per_page - 1) // items_per_page + if total_pages > 1: + p_col1, p_col2, p_col3 = st.columns([1, 2, 1]) + with p_col1: + if st.button( + "Previous", + disabled=(st.session_state.page_num == 
0), + use_container_width=True, + ): + st.session_state.page_num -= 1 + st.rerun() + with p_col2: + st.markdown( + f"
Page {st.session_state.page_num + 1} of {total_pages}
", + unsafe_allow_html=True, + ) + with p_col3: + if st.button( + "Next", + disabled=(st.session_state.page_num >= total_pages - 1), + use_container_width=True, + ): + st.session_state.page_num += 1 + st.rerun() +else: + st.info("No data returned from Cognite Data Fusion. Please check your settings and data model.") diff --git a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/data_structures.py b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/data_structures.py index 400cd560..7b7cb897 100644 --- a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/data_structures.py +++ b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/data_structures.py @@ -1,18 +1,20 @@ -import streamlit as st -from cognite.client.data_classes.data_modeling import ViewId -from dataclasses import dataclass - - -# Configuration Classes -@dataclass -class ViewPropertyConfig: - schema_space: str - external_id: str - version: str - instance_space: str | None = None - - def as_view_id(self) -> ViewId: - return ViewId(space=self.schema_space, external_id=self.external_id, version=self.version) - - def as_property_ref(self, property) -> list[str]: - return [self.schema_space, f"{self.external_id}/{self.version}", property] +import streamlit as st +from cognite.client.data_classes.data_modeling import ViewId +from dataclasses import dataclass + + +# Configuration Classes +@dataclass +class ViewPropertyConfig: + schema_space: str + external_id: str + version: str + instance_space: str | None = None + + def as_view_id(self) -> ViewId: + return ViewId( + space=self.schema_space, external_id=self.external_id, version=self.version + ) + + def as_property_ref(self, property) -> list[str]: + return [self.schema_space, f"{self.external_id}/{self.version}", property] diff --git a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/helper.py 
b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/helper.py index 542efa81..c2ecfb52 100644 --- a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/helper.py +++ b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/helper.py @@ -1,261 +1,285 @@ -import os -import re -import yaml -import streamlit as st -import pandas as pd -from datetime import datetime, timedelta -from cognite.client import CogniteClient -from cognite.client.data_classes.data_modeling import ViewId, NodeId -from cognite.client.data_classes.functions import FunctionCallLog -from data_structures import ViewPropertyConfig - -client = CogniteClient() - -PIPELINE_EXT_ID = "ep_file_annotation" - - -def parse_run_message(message: str) -> dict: - """Parses the structured run message and returns a dictionary of its components.""" - if not message: - return {} - - # Regex to capture all key-value pairs from the new format - pattern = re.compile( - r"\(caller:(?P\w+), function_id:(?P[\w\.-]+), call_id:(?P[\w\.-]+)\) - " - r"total files processed: (?P\d+) - " - r"successful files: (?P\d+) - " - r"failed files: (?P\d+)" - ) - match = pattern.search(message) - if match: - data = match.groupdict() - # Convert numeric strings to integers - for key in ["total", "success", "failed"]: - if key in data: - data[key] = int(data[key]) - return data - return {} - - -@st.cache_data(ttl=3600) -def fetch_extraction_pipeline_config() -> tuple[dict, ViewPropertyConfig, ViewPropertyConfig]: - """ - Fetch configurations from the latest extraction - """ - ep_configuration = client.extraction_pipelines.config.retrieve(external_id=PIPELINE_EXT_ID) - config_dict = yaml.safe_load(ep_configuration.config) - - local_annotation_state_view = config_dict["dataModelViews"]["annotationStateView"] - annotation_state_view = ViewPropertyConfig( - local_annotation_state_view["schemaSpace"], - local_annotation_state_view["externalId"], - 
local_annotation_state_view["version"], - local_annotation_state_view["instanceSpace"], - ) - - local_file_view = config_dict["dataModelViews"]["fileView"] - file_view = ViewPropertyConfig( - local_file_view["schemaSpace"], - local_file_view["externalId"], - local_file_view["version"], - local_file_view.get("instanceSpace"), - ) - - return (config_dict, annotation_state_view, file_view) - - -@st.cache_data(ttl=3600) -def fetch_annotation_states(annotation_state_view: ViewPropertyConfig, file_view: ViewPropertyConfig): - """ - Fetches annotation state instances from the specified data model view - and joins them with their corresponding file instances. - """ - # 1. Fetch all annotation state instances - annotation_instances = client.data_modeling.instances.list( - instance_type="node", - space=annotation_state_view.instance_space, - sources=annotation_state_view.as_view_id(), - limit=-1, - ) - if not annotation_instances: - st.info("No annotation state instances found in the specified view.") - return pd.DataFrame() - - # 2. 
Process annotation states and collect NodeIds for linked files - annotation_data = [] - nodes_to_fetch = [] - for instance in annotation_instances: - node_data = { - "externalId": instance.external_id, - "space": instance.space, - "createdTime": pd.to_datetime(instance.created_time, unit="ms"), - "lastUpdatedTime": pd.to_datetime(instance.last_updated_time, unit="ms"), - } - for prop_key, prop_value in instance.properties[annotation_state_view.as_view_id()].items(): - if prop_key == "linkedFile" and prop_value: - file_external_id = prop_value.get("externalId") - file_space = prop_value.get("space") - node_data["fileExternalId"] = file_external_id - node_data["fileSpace"] = file_space - if file_external_id and file_space: - nodes_to_fetch.append(NodeId(space=file_space, external_id=file_external_id)) - node_data[prop_key] = prop_value - annotation_data.append(node_data) - - df_annotations = pd.DataFrame(annotation_data) - if df_annotations.empty or not nodes_to_fetch: - return df_annotations - - # 3. Fetch corresponding file instances using the collected NodeIds - # Remove duplicates before fetching - unique_nodes_to_fetch = list(set(nodes_to_fetch)) - file_instances = client.data_modeling.instances.retrieve_nodes( - nodes=unique_nodes_to_fetch, sources=file_view.as_view_id() - ) - - # 4. Process file instances into a DataFrame - file_data = [] - for instance in file_instances: - node_data = { - "fileExternalId": instance.external_id, - "fileSpace": instance.space, - } - properties = instance.properties[file_view.as_view_id()] - - for prop_key, prop_value in properties.items(): - if isinstance(prop_value, list): - string_values = [] - for value in prop_value: - string_values.append(value) - node_data[f"file{prop_key.capitalize()}"] = ", ".join(filter(None, string_values)) - else: - node_data[f"file{prop_key.capitalize()}"] = prop_value - file_data.append(node_data) - - if not file_data: - return df_annotations - - df_files = pd.DataFrame(file_data) - - # 5. 
Merge annotation data with file data - df_merged = pd.merge(df_annotations, df_files, on=["fileExternalId", "fileSpace"], how="left") - - # 6. Final data cleaning and preparation - if "createdTime" in df_merged.columns: - df_merged["createdTime"] = df_merged["createdTime"].dt.tz_localize("UTC") - if "lastUpdatedTime" in df_merged.columns: - df_merged["lastUpdatedTime"] = df_merged["lastUpdatedTime"].dt.tz_localize("UTC") - - df_merged.rename( - columns={ - "annotationStatus": "status", - "attemptCount": "retries", - "diagramDetectJobId": "jobId", - }, - inplace=True, - ) - - for col in ["status", "fileExternalId", "retries", "jobId"]: - if col not in df_merged.columns: - df_merged[col] = None - - return df_merged - - -@st.cache_data(ttl=3600) -def fetch_pipeline_run_history(): - """Fetches the full run history for a given extraction pipeline.""" - return client.extraction_pipelines.runs.list(external_id=PIPELINE_EXT_ID, limit=-1) - - -def calculate_success_failure_stats(runs): - """Calculates success and failure counts from a list of pipeline runs.""" - success_count = sum(1 for run in runs if run.status == "success") - failure_count = sum(1 for run in runs if run.status == "failure") - return success_count, failure_count - - -def get_failed_run_details(runs): - """Filters for failed runs and extracts their details, including IDs.""" - failed_runs = [] - for run in runs: - if run.status == "failure": - parsed_message = parse_run_message(run.message) - failed_runs.append( - { - "timestamp": pd.to_datetime(run.created_time, unit="ms").tz_localize("UTC"), - "message": run.message, - "status": run.status, - "function_id": parsed_message.get("function_id"), - "call_id": parsed_message.get("call_id"), - } - ) - return sorted(failed_runs, key=lambda x: x["timestamp"], reverse=True) - - -@st.cache_data(ttl=3600) -def fetch_function_logs(function_id: int, call_id: int): - """Fetches the logs for a specific function call.""" - try: - log: FunctionCallLog = 
client.functions.calls.get_logs(call_id, function_id) - return log.to_text(with_timestamps=False) - except Exception as e: - return [f"Could not retrieve logs: {e}"] - - -def process_runs_for_graphing(runs): - """Transforms pipeline run data into a DataFrame for graphing.""" - launch_data = [] - finalize_runs_to_agg = [] - - for run in runs: - if run.status != "success": - continue - - parsed = parse_run_message(run.message) - if not parsed: - continue - - timestamp = pd.to_datetime(run.created_time, unit="ms").tz_localize("UTC") - count = parsed.get("total", 0) - caller = parsed.get("caller") - - if caller == "Launch": - launch_data.append({"timestamp": timestamp, "count": count, "type": "Launch"}) - elif caller == "Finalize": - finalize_runs_to_agg.append({"timestamp": timestamp, "count": count}) - - # --- Aggregate Finalize Runs --- - aggregated_finalize_data = [] - if finalize_runs_to_agg: - finalize_runs_to_agg.sort(key=lambda x: x["timestamp"]) - current_group_start_time = finalize_runs_to_agg[0]["timestamp"] - current_group_count = 0 - - for run in finalize_runs_to_agg: - if run["timestamp"] < current_group_start_time + timedelta(minutes=10): - current_group_count += run["count"] - else: - aggregated_finalize_data.append( - { - "timestamp": current_group_start_time, - "count": current_group_count, - "type": "Finalize", - } - ) - current_group_start_time = run["timestamp"] - current_group_count = run["count"] - - if current_group_count > 0: - aggregated_finalize_data.append( - { - "timestamp": current_group_start_time, - "count": current_group_count, - "type": "Finalize", - } - ) - - df_launch = pd.DataFrame(launch_data) - df_finalize = pd.DataFrame(aggregated_finalize_data) - - return pd.concat([df_launch, df_finalize], ignore_index=True) +import os +import re +import yaml +import streamlit as st +import pandas as pd +from datetime import datetime, timedelta +from cognite.client import CogniteClient +from cognite.client.data_classes.data_modeling import 
ViewId, NodeId +from cognite.client.data_classes.functions import FunctionCallLog +from data_structures import ViewPropertyConfig + +client = CogniteClient() + + +@st.cache_data(ttl=600) +def find_pipelines(name_filter: str = "file_annotation") -> list[str]: + """ + Finds the external IDs of all extraction pipelines in the project, + filtered by a substring in their external ID. + """ + try: + # List all pipelines in the project + all_pipelines = client.extraction_pipelines.list(limit=-1) + if not all_pipelines: + st.warning(f"No extraction pipelines found in the project.") + return [] + + # Filter pipelines where the external ID contains the name_filter string + filtered_ids = [p.external_id for p in all_pipelines if name_filter in p.external_id] + + if not filtered_ids: + st.warning(f"No pipelines matching the filter '*{name_filter}*' found in the project.") + return [] + + return sorted(filtered_ids) + except Exception as e: + st.error(f"An error occurred while searching for extraction pipelines: {e}") + return [] + + +def parse_run_message(message: str) -> dict: + """Parses the structured run message and returns a dictionary of its components.""" + if not message: + return {} + + # Regex to capture all key-value pairs from the new format + pattern = re.compile( + r"\(caller:(?P\w+), function_id:(?P[\w\.-]+), call_id:(?P[\w\.-]+)\) - " + r"total files processed: (?P\d+) - " + r"successful files: (?P\d+) - " + r"failed files: (?P\d+)" + ) + match = pattern.search(message) + if match: + data = match.groupdict() + # Convert numeric strings to integers + for key in ["total", "success", "failed"]: + if key in data: + data[key] = int(data[key]) + return data + return {} + + +@st.cache_data(ttl=3600) +def fetch_extraction_pipeline_config(pipeline_ext_id: str) -> tuple[dict, ViewPropertyConfig, ViewPropertyConfig]: + """ + Fetch configurations from the latest extraction + """ + ep_configuration = client.extraction_pipelines.config.retrieve(external_id=pipeline_ext_id) + 
config_dict = yaml.safe_load(ep_configuration.config) + + local_annotation_state_view = config_dict["dataModelViews"]["annotationStateView"] + annotation_state_view = ViewPropertyConfig( + local_annotation_state_view["schemaSpace"], + local_annotation_state_view["externalId"], + local_annotation_state_view["version"], + local_annotation_state_view["instanceSpace"], + ) + + local_file_view = config_dict["dataModelViews"]["fileView"] + file_view = ViewPropertyConfig( + local_file_view["schemaSpace"], + local_file_view["externalId"], + local_file_view["version"], + local_file_view.get("instanceSpace"), + ) + + return (config_dict, annotation_state_view, file_view) + + +@st.cache_data(ttl=3600) +def fetch_annotation_states(annotation_state_view: ViewPropertyConfig, file_view: ViewPropertyConfig): + """ + Fetches annotation state instances from the specified data model view + and joins them with their corresponding file instances. + """ + # 1. Fetch all annotation state instances + annotation_instances = client.data_modeling.instances.list( + instance_type="node", + space=annotation_state_view.instance_space, + sources=annotation_state_view.as_view_id(), + limit=-1, + ) + if not annotation_instances: + st.info("No annotation state instances found in the specified view.") + return pd.DataFrame() + + # 2. 
Process annotation states and collect NodeIds for linked files + annotation_data = [] + nodes_to_fetch = [] + for instance in annotation_instances: + node_data = { + "externalId": instance.external_id, + "space": instance.space, + "createdTime": pd.to_datetime(instance.created_time, unit="ms"), + "lastUpdatedTime": pd.to_datetime(instance.last_updated_time, unit="ms"), + } + for prop_key, prop_value in instance.properties[annotation_state_view.as_view_id()].items(): + if prop_key == "linkedFile" and prop_value: + file_external_id = prop_value.get("externalId") + file_space = prop_value.get("space") + node_data["fileExternalId"] = file_external_id + node_data["fileSpace"] = file_space + if file_external_id and file_space: + nodes_to_fetch.append(NodeId(space=file_space, external_id=file_external_id)) + node_data[prop_key] = prop_value + annotation_data.append(node_data) + + df_annotations = pd.DataFrame(annotation_data) + if df_annotations.empty or not nodes_to_fetch: + return df_annotations + + # 3. Fetch corresponding file instances using the collected NodeIds + # Remove duplicates before fetching + unique_nodes_to_fetch = list(set(nodes_to_fetch)) + file_instances = client.data_modeling.instances.retrieve_nodes( + nodes=unique_nodes_to_fetch, sources=file_view.as_view_id() + ) + + # 4. Process file instances into a DataFrame + file_data = [] + for instance in file_instances: + node_data = { + "fileExternalId": instance.external_id, + "fileSpace": instance.space, + } + properties = instance.properties[file_view.as_view_id()] + + for prop_key, prop_value in properties.items(): + if isinstance(prop_value, list): + string_values = [] + for value in prop_value: + string_values.append(str(value)) + node_data[f"file{prop_key.capitalize()}"] = ", ".join(filter(None, string_values)) + else: + node_data[f"file{prop_key.capitalize()}"] = prop_value + file_data.append(node_data) + + if not file_data: + return df_annotations + + df_files = pd.DataFrame(file_data) + + # 5. 
Merge annotation data with file data + df_merged = pd.merge(df_annotations, df_files, on=["fileExternalId", "fileSpace"], how="left") + + # 6. Final data cleaning and preparation + if "createdTime" in df_merged.columns: + df_merged["createdTime"] = df_merged["createdTime"].dt.tz_localize("UTC") + if "lastUpdatedTime" in df_merged.columns: + df_merged["lastUpdatedTime"] = df_merged["lastUpdatedTime"].dt.tz_localize("UTC") + + df_merged.rename( + columns={ + "annotationStatus": "status", + "attemptCount": "retries", + "diagramDetectJobId": "jobId", + }, + inplace=True, + ) + + for col in ["status", "fileExternalId", "retries", "jobId"]: + if col not in df_merged.columns: + df_merged[col] = None + + return df_merged + + +@st.cache_data(ttl=3600) +def fetch_pipeline_run_history(pipeline_ext_id: str): + """Fetches the full run history for a given extraction pipeline.""" + return client.extraction_pipelines.runs.list(external_id=pipeline_ext_id, limit=-1) + + +def calculate_success_failure_stats(runs): + """Calculates success and failure counts from a list of pipeline runs.""" + success_count = sum(1 for run in runs if run.status == "success") + failure_count = sum(1 for run in runs if run.status == "failure") + return success_count, failure_count + + +def get_failed_run_details(runs): + """Filters for failed runs and extracts their details, including IDs.""" + failed_runs = [] + for run in runs: + if run.status == "failure": + parsed_message = parse_run_message(run.message) + failed_runs.append( + { + "timestamp": pd.to_datetime(run.created_time, unit="ms").tz_localize("UTC"), + "message": run.message, + "status": run.status, + "function_id": parsed_message.get("function_id"), + "call_id": parsed_message.get("call_id"), + } + ) + return sorted(failed_runs, key=lambda x: x["timestamp"], reverse=True) + + +@st.cache_data(ttl=3600) +def fetch_function_logs(function_id: int, call_id: int): + """Fetches the logs for a specific function call.""" + try: + log: FunctionCallLog 
= client.functions.calls.get_logs(call_id, function_id) + return log.to_text(with_timestamps=False) + except Exception as e: + return [f"Could not retrieve logs: {e}"] + + +def process_runs_for_graphing(runs): + """Transforms pipeline run data into a DataFrame for graphing.""" + launch_data = [] + finalize_runs_to_agg = [] + + for run in runs: + if run.status != "success": + continue + + parsed = parse_run_message(run.message) + if not parsed: + continue + + timestamp = pd.to_datetime(run.created_time, unit="ms").tz_localize("UTC") + count = parsed.get("total", 0) + caller = parsed.get("caller") + + if caller == "Launch": + launch_data.append({"timestamp": timestamp, "count": count, "type": "Launch"}) + elif caller == "Finalize": + finalize_runs_to_agg.append({"timestamp": timestamp, "count": count}) + + # --- Aggregate Finalize Runs --- + aggregated_finalize_data = [] + if finalize_runs_to_agg: + finalize_runs_to_agg.sort(key=lambda x: x["timestamp"]) + current_group_start_time = finalize_runs_to_agg[0]["timestamp"] + current_group_count = 0 + + for run in finalize_runs_to_agg: + if run["timestamp"] < current_group_start_time + timedelta(minutes=10): + current_group_count += run["count"] + else: + aggregated_finalize_data.append( + { + "timestamp": current_group_start_time, + "count": current_group_count, + "type": "Finalize", + } + ) + current_group_start_time = run["timestamp"] + current_group_count = run["count"] + + if current_group_count > 0: + aggregated_finalize_data.append( + { + "timestamp": current_group_start_time, + "count": current_group_count, + "type": "Finalize", + } + ) + + df_launch = pd.DataFrame(launch_data) + df_finalize = pd.DataFrame(aggregated_finalize_data) + + return pd.concat([df_launch, df_finalize], ignore_index=True) diff --git a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/pages/Status_Overview.py 
b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/pages/Status_Overview.py index 0a8c9bdc..b3c289bc 100644 --- a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/pages/Status_Overview.py +++ b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/pages/Status_Overview.py @@ -1,127 +1,146 @@ -import streamlit as st -import pandas as pd -from helper import ( - fetch_annotation_states, - fetch_extraction_pipeline_config, -) - -# --- Page Configuration --- -st.set_page_config( - page_title="Annotation Status Overview", - page_icon="📄", - layout="wide", -) - -# --- Data Fetching --- -ep_config, annotation_state_view, file_view = fetch_extraction_pipeline_config() -df_raw = fetch_annotation_states(annotation_state_view, file_view) - - -# --- Main Application --- -st.title("Annotation Status Overview") -st.markdown("This page provides an audit trail and overview of the file annotation process.") - -if not df_raw.empty: - # --- Sidebar Filters --- - st.sidebar.title("Filters") - - # Status Filter - all_statuses = ["All"] + sorted(df_raw["status"].unique().tolist()) - selected_status = st.sidebar.selectbox("Filter by Status", options=all_statuses) - - # Date Range Filter - min_date = df_raw["lastUpdatedTime"].min().date() - max_date = df_raw["lastUpdatedTime"].max().date() - # THE FIX IS HERE: Changed max_date to max_value - date_range = st.sidebar.date_input( - "Filter by Last Updated Date", - value=(min_date, max_date), - min_value=min_date, - max_value=max_date, - ) - - # Dynamic Scope Property Filters - primary_scope_property = ep_config["launchFunction"].get("primaryScopeProperty") - secondary_scope_property = ep_config["launchFunction"].get("secondaryScopeProperty") - - selected_primary_scope = "All" - if primary_scope_property and f"file{primary_scope_property.capitalize()}" in df_raw.columns: - primary_scope_options = ["All"] + 
df_raw[f"file{primary_scope_property.capitalize()}"].unique().tolist() - selected_primary_scope = st.sidebar.selectbox( - f"Filter by {primary_scope_property}", options=primary_scope_options - ) - - selected_secondary_scope = "All" - if secondary_scope_property and f"file{secondary_scope_property.capitalize()}" in df_raw.columns: - secondary_scope_options = ["All"] + df_raw[f"file{secondary_scope_property.capitalize()}"].unique().tolist() - selected_secondary_scope = st.sidebar.selectbox( - f"Filter by {secondary_scope_property}", options=secondary_scope_options - ) - - # Apply all filters - df_filtered = df_raw.copy() - if selected_status != "All": - df_filtered = df_filtered[df_filtered["status"] == selected_status] - - if len(date_range) == 2: - start_date, end_date = date_range - df_filtered = df_filtered[ - (df_filtered["lastUpdatedTime"].dt.date >= start_date) - & (df_filtered["lastUpdatedTime"].dt.date <= end_date) - ] - - if selected_primary_scope != "All": - df_filtered = df_filtered[df_filtered[f"file{primary_scope_property.capitalize()}"] == selected_primary_scope] - - if selected_secondary_scope != "All": - df_filtered = df_filtered[ - df_filtered[f"file{secondary_scope_property.capitalize()}"] == selected_secondary_scope - ] - - # --- Dashboard Metrics --- - st.subheader("Status Overview") - - status_counts = df_filtered["status"].value_counts() - - col1, col2, col3, col4 = st.columns(4) - with col1: - st.metric("Total Files", len(df_filtered)) - with col2: - st.metric("Annotated", status_counts.get("Annotated", 0)) - with col3: - st.metric("New", status_counts.get("New", 0)) - st.metric("Processing", status_counts.get("Processing", 0)) - with col4: - st.metric("Finalizing", status_counts.get("Finalizing", 0)) - st.metric("Failed", status_counts.get("Failed", 0)) - - # --- Detailed Data View --- - default_columns = [ - "fileName", - "status", - "jobId", - "annotationMessage", - "filePageCount", - "retries", - "fileTags", - "lastUpdatedTime", - ] - - 
available_columns = df_filtered.columns.tolist() - default_selection = [col for col in default_columns if col in available_columns] - - with st.popover("Customize Columns"): - selected_columns = st.multiselect( - "Select columns to display:", - options=available_columns, - default=default_selection, - label_visibility="collapsed", - ) - - if selected_columns: - st.dataframe(df_filtered[selected_columns], use_container_width=True) - else: - st.warning("Please select at least one column to display.") - -else: - st.info("No annotation state data returned from Cognite Data Fusion. Please check your settings and data model.") +import streamlit as st +import pandas as pd +from helper import ( + fetch_annotation_states, + fetch_extraction_pipeline_config, + find_pipelines, +) + +# --- Page Configuration --- +st.set_page_config( + page_title="Annotation Status Overview", + page_icon="📄", + layout="wide", +) + +# --- Sidebar for Pipeline Selection --- +st.sidebar.title("Pipeline Selection") +pipeline_ids = find_pipelines() + +if not pipeline_ids: + st.info("No active file annotation pipelines found to monitor.") + st.stop() + +# Add an independent dropdown selector for this page +# It uses a different key to avoid conflicts +selected_pipeline = st.sidebar.selectbox( + "Select a pipeline to view status:", options=pipeline_ids, key="status_pipeline_selector" +) + +# --- Data Fetching --- +# Pass the selected pipeline ID from this page's dropdown +config_result = fetch_extraction_pipeline_config(selected_pipeline) +if not config_result: + st.error(f"Could not fetch configuration for pipeline: {selected_pipeline}") + st.stop() + +ep_config, annotation_state_view, file_view = config_result +df_raw = fetch_annotation_states(annotation_state_view, file_view) + + +# --- Main Application --- +st.title(f"Annotation Status Overview:") +st.markdown("This page provides an audit trail and overview of the file annotation process.") + +if not df_raw.empty: + # --- Sidebar Filters --- + 
st.sidebar.title("Filters") + + # Status Filter + all_statuses = ["All"] + sorted(df_raw["status"].unique().tolist()) + selected_status = st.sidebar.selectbox("Filter by Status", options=all_statuses) + + # Date Range Filter + min_date = df_raw["lastUpdatedTime"].min().date() + max_date = df_raw["lastUpdatedTime"].max().date() + date_range = st.sidebar.date_input( + "Filter by Last Updated Date", + value=(min_date, max_date), + min_value=min_date, + max_value=max_date, + ) + + # ... (The rest of your page logic for filters, metrics, and dataframes is the same) + # Dynamic Scope Property Filters + primary_scope_property = ep_config["launchFunction"].get("primaryScopeProperty") + secondary_scope_property = ep_config["launchFunction"].get("secondaryScopeProperty") + + selected_primary_scope = "All" + if primary_scope_property and f"file{primary_scope_property.capitalize()}" in df_raw.columns: + primary_scope_options = ["All"] + df_raw[f"file{primary_scope_property.capitalize()}"].unique().tolist() + selected_primary_scope = st.sidebar.selectbox( + f"Filter by {primary_scope_property}", options=primary_scope_options + ) + + selected_secondary_scope = "All" + if secondary_scope_property and f"file{secondary_scope_property.capitalize()}" in df_raw.columns: + secondary_scope_options = ["All"] + df_raw[f"file{secondary_scope_property.capitalize()}"].unique().tolist() + selected_secondary_scope = st.sidebar.selectbox( + f"Filter by {secondary_scope_property}", options=secondary_scope_options + ) + + # Apply all filters + df_filtered = df_raw.copy() + if selected_status != "All": + df_filtered = df_filtered[df_filtered["status"] == selected_status] + + if len(date_range) == 2: + start_date, end_date = date_range + df_filtered = df_filtered[ + (df_filtered["lastUpdatedTime"].dt.date >= start_date) + & (df_filtered["lastUpdatedTime"].dt.date <= end_date) + ] + + if selected_primary_scope != "All": + df_filtered = 
df_filtered[df_filtered[f"file{primary_scope_property.capitalize()}"] == selected_primary_scope] + + if selected_secondary_scope != "All": + df_filtered = df_filtered[ + df_filtered[f"file{secondary_scope_property.capitalize()}"] == selected_secondary_scope + ] + # --- Dashboard Metrics --- + st.subheader("Status Overview") + + status_counts = df_filtered["status"].value_counts() + + col1, col2, col3, col4 = st.columns(4) + with col1: + st.metric("Total Files", len(df_filtered)) + with col2: + st.metric("Annotated", status_counts.get("Annotated", 0)) + with col3: + st.metric("New", status_counts.get("New", 0)) + st.metric("Processing", status_counts.get("Processing", 0)) + with col4: + st.metric("Finalizing", status_counts.get("Finalizing", 0)) + st.metric("Failed", status_counts.get("Failed", 0)) + + # --- Detailed Data View --- + default_columns = [ + "fileName", + "status", + "jobId", + "annotationMessage", + "filePageCount", + "retries", + "fileTags", + "lastUpdatedTime", + ] + + available_columns = df_filtered.columns.tolist() + default_selection = [col for col in default_columns if col in available_columns] + + with st.popover("Customize Columns"): + selected_columns = st.multiselect( + "Select columns to display:", + options=available_columns, + default=default_selection, + label_visibility="collapsed", + ) + + if selected_columns: + st.dataframe(df_filtered[selected_columns], use_container_width=True) + else: + st.warning("Please select at least one column to display.") +else: + st.info("No annotation state data found for the selected pipeline. 
Please check its configuration and runs.") diff --git a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/requirements.txt b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/requirements.txt index be55ec1a..e3938ef4 100644 --- a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/requirements.txt +++ b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/requirements.txt @@ -1,5 +1,5 @@ -pandas -altair -PyYaml -pyodide-http==0.2.1 +pandas +altair +PyYaml +pyodide-http==0.2.1 cognite-sdk==7.73.4 \ No newline at end of file From c04079d6cb5504734e307d8faff626aa3e77dfc6 Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Thu, 28 Aug 2025 12:15:10 -0500 Subject: [PATCH 033/128] Refactored apply service Raise exception when error message is provided from diagram detect job --- .../services/ApplyService.py | 22 +++++++++----- .../services/FinalizeService.py | 30 +++++++++---------- 2 files changed, 28 insertions(+), 24 deletions(-) diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ApplyService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ApplyService.py index 2d0e7680..35b0e393 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ApplyService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ApplyService.py @@ -44,7 +44,11 @@ def process_pattern_results(self, result_item: dict, file_node: Node) -> list[Ro pass @abc.abstractmethod - def update_nodes(self, list_node_apply: list[NodeApply]) -> NodeApplyResultList: + def update_instances( + self, + list_node_apply: list[NodeApply] | NodeApply | None = None, + list_edge_apply: list[EdgeApply] | EdgeApply | None = None, + ) -> InstancesApplyResult: pass @abc.abstractmethod @@ -104,19 +108,21 @@ def 
apply_annotations(self, result_item: dict, file_node: Node) -> tuple[list[Ro ) edge_applies.extend(edge_apply_dict.values()) - self.client.data_modeling.instances.apply( - nodes=node_apply, - edges=edge_applies, - replace=False, - ) + self.update_instances(list_node_apply=node_apply, list_edge_apply=edge_applies) + return doc_doc, doc_tag - def update_nodes(self, list_node_apply: list[NodeApply]) -> NodeApplyResultList: + def update_instances( + self, + list_node_apply: list[NodeApply] | NodeApply | None = None, + list_edge_apply: list[EdgeApply] | EdgeApply | None = None, + ) -> InstancesApplyResult: update_results: InstancesApplyResult = self.client.data_modeling.instances.apply( nodes=list_node_apply, + edges=list_edge_apply, replace=False, # ensures we don't delete other properties in the view ) - return update_results.nodes + return update_results def _detect_annotation_to_edge_applies( self, diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/FinalizeService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/FinalizeService.py index 6815deee..87f4ae63 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/FinalizeService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/FinalizeService.py @@ -8,6 +8,7 @@ from cognite.client.data_classes.data_modeling import ( Node, NodeId, + NodeList, NodeApply, NodeApplyList, NodeOrEdgeData, @@ -199,7 +200,7 @@ def run(self) -> Literal["Done"] | None: count_failed = 0 count_success = 0 annotation_state_node_applies: list[NodeApply] = [] - failed_file_ids: list[NodeId] = [] + failed_files: NodeList[Node] = [] # Loop through the merged results, processing one file at a time for (space, external_id), results in merged_results.items(): @@ -234,7 +235,7 @@ def run(self) -> Literal["Done"] | None: annotation_msg: str = f"Applied 
{len(doc_add)} doc and {len(tag_add)} tag annotations." self.logger.info(f"\t- {annotation_msg}") elif regular_item and regular_item.get("errorMessage"): - annotation_msg = regular_item.get("errorMessage") + raise Exception(regular_item.get("errorMessage")) else: annotation_msg: str = "Found no annotations to apply" @@ -287,35 +288,32 @@ def run(self) -> Literal["Done"] | None: except Exception as e: # If anything fails for this file, mark it for retry or failure - msg = f"Failed to process annotations for file {str(file_id)}: {str(e)}" - self.logger.error(msg) + self.logger.error(f"Failed to process annotations for file {str(file_id)}: {str(e)}") if next_attempt_count >= self.max_retries: job_node_to_update = self._process_annotation_state( node=annotation_state_node, status=AnnotationStatus.FAILED, attempt_count=next_attempt_count, - annotation_message=msg, - pattern_mode_message=msg, + annotation_message=str(e), + pattern_mode_message=str(e), ) count_failed += 1 - failed_file_ids.append(file_id) + failed_files.append(file_id) else: job_node_to_update = self._process_annotation_state( node=annotation_state_node, status=AnnotationStatus.RETRY, attempt_count=next_attempt_count, - annotation_message=msg, - pattern_mode_message=msg, + annotation_message=str(e), + pattern_mode_message=str(e), ) count_retry += 1 if job_node_to_update: annotation_state_node_applies.append(job_node_to_update) - if failed_file_ids: - file_applies: NodeApplyList = self.client.data_modeling.instances.retrieve_nodes( - nodes=failed_file_ids, sources=self.file_view.as_view_id() - ).as_write() + if failed_files: + file_applies: NodeApplyList = failed_files.as_write() for node_apply in file_applies: node_apply.existing_version = None tags_property: list[str] = cast(list[str], node_apply.sources[0].properties["tags"]) @@ -344,7 +342,7 @@ def run(self) -> Literal["Done"] | None: section="START", ) try: - self.apply_service.update_nodes(list_node_apply=annotation_state_node_applies) + 
self.apply_service.update_instances(list_node_apply=annotation_state_node_applies) self.logger.info( f"\t- {count_success} set to Annotated\n- {count_retry} set to retry\n- {count_failed} set to failed" ) @@ -501,7 +499,7 @@ def _update_batch_state( view_id=self.annotation_state_view.as_view_id(), ) try: - update_results = self.apply_service.update_nodes(list_node_apply=batch.apply) + update_results = self.apply_service.update_instances(list_node_apply=batch.apply) self.logger.info(f"- set annotation status to {status}") except Exception as e: self.logger.error( @@ -509,5 +507,5 @@ def _update_batch_state( section="END", ) time.sleep(30) - update_results = self.apply_service.update_nodes(list_node_apply=batch.apply) + update_results = self.apply_service.update_instances(list_node_apply=batch.apply) self.logger.info(f"- set annotation status to {status}") From 33c30d3cc44020f9f541e66c0db8849d3fdb55d0 Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Fri, 29 Aug 2025 21:23:46 -0500 Subject: [PATCH 034/128] added annotation quality page to streamlit module --- .../ep_file_annotation.config.yaml | 11 +- .../services/ApplyService.py | 3 + .../services/FinalizeService.py | 4 +- .../utils/DataStructures.py | 2 +- .../utils/DataStructures.py | 2 +- .../file_annotation_dashboard/canvas.py | 184 ++++++++++ .../file_annotation_dashboard/helper.py | 110 +++++- .../pages/Annotation_Quality.py | 342 ++++++++++++++++++ 8 files changed, 632 insertions(+), 26 deletions(-) create mode 100644 modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/canvas.py create mode 100644 modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/pages/Annotation_Quality.py diff --git a/modules/contextualization/cdf_file_annotation/extraction_pipelines/ep_file_annotation.config.yaml b/modules/contextualization/cdf_file_annotation/extraction_pipelines/ep_file_annotation.config.yaml index 51b1458f..70a6a54b 100644 --- 
a/modules/contextualization/cdf_file_annotation/extraction_pipelines/ep_file_annotation.config.yaml +++ b/modules/contextualization/cdf_file_annotation/extraction_pipelines/ep_file_annotation.config.yaml @@ -54,8 +54,8 @@ config: secondaryScopeProperty: # NOTE: below configurations are used by pattern mode patternMode: True - fileCategoryProperty: # optional - targetEntitiesCategoryProperty: # optional + fileResourceProperty: # optional + targetEntitiesResourceProperty: # optional dataModelService: getFilesToProcessQuery: targetView: @@ -95,18 +95,11 @@ config: annotationService: pageRange: 50 partialMatch: True - minTokens: 2 diagramDetectConfig: connectionFlags: noTextInbetween: True naturalReadingOrder: True - customizeFuzziness: - fuzzyScore: 0.93 - maxBoxes: - minChars: 10 - minFuzzyScore: 0.915 readEmbeddedText: True - removeLeadingZeros: True finalizeFunction: cleanOldAnnotations: True maxRetryAttempts: 3 diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ApplyService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ApplyService.py index 35b0e393..2f78718e 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ApplyService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ApplyService.py @@ -156,8 +156,10 @@ def _detect_annotation_to_edge_applies( "externalId": external_id, "startSourceId": source_id, "startNode": file_instance_id.external_id, + "startNodeSpace": file_instance_id.space, "endNode": entity["external_id"], "endNodeSpace": entity["space"], + "endNodeResourceType": entity["resource_type"], "viewId": self.core_annotation_view_id.external_id, "viewSpace": self.core_annotation_view_id.space, "viewVersion": self.core_annotation_view_id.version, @@ -303,6 +305,7 @@ def process_pattern_results(self, result_item: dict, file_node: Node) -> list[Ro 
catalog_properties = { "startSourceId": source_id, "startNode": file_id.external_id, + "startNodeSpace": file_id.space, "text": tag_text, "resourceType": data["resource_type"], # Store the entire list of region dicts diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/FinalizeService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/FinalizeService.py index 87f4ae63..dd6d51b7 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/FinalizeService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/FinalizeService.py @@ -200,7 +200,7 @@ def run(self) -> Literal["Done"] | None: count_failed = 0 count_success = 0 annotation_state_node_applies: list[NodeApply] = [] - failed_files: NodeList[Node] = [] + failed_files: NodeList[Node] = NodeList(resources=[]) # Loop through the merged results, processing one file at a time for (space, external_id), results in merged_results.items(): @@ -298,7 +298,7 @@ def run(self) -> Literal["Done"] | None: pattern_mode_message=str(e), ) count_failed += 1 - failed_files.append(file_id) + failed_files.append(file_node) else: job_node_to_update = self._process_annotation_state( node=annotation_state_node, diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/utils/DataStructures.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/utils/DataStructures.py index 658cfd90..e7eff7a8 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/utils/DataStructures.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/utils/DataStructures.py @@ -125,7 +125,7 @@ class entity: "external_id": file.external_id, "name": file.properties[job_config.file_view.as_view_id()]["name"], "space": file.space, - 
"annotation_type_external_id": job_config.file_view.type, + "annotation_type": job_config.file_view.type, "resource_type": file.properties[job_config.file_view.as_view_id()][{resource_type}], "search_property": file.properties[job_config.file_view.as_view_id()][{search_property}], } diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/utils/DataStructures.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/utils/DataStructures.py index 658cfd90..e7eff7a8 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/utils/DataStructures.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/utils/DataStructures.py @@ -125,7 +125,7 @@ class entity: "external_id": file.external_id, "name": file.properties[job_config.file_view.as_view_id()]["name"], "space": file.space, - "annotation_type_external_id": job_config.file_view.type, + "annotation_type": job_config.file_view.type, "resource_type": file.properties[job_config.file_view.as_view_id()][{resource_type}], "search_property": file.properties[job_config.file_view.as_view_id()][{search_property}], } diff --git a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/canvas.py b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/canvas.py new file mode 100644 index 00000000..c03a5c00 --- /dev/null +++ b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/canvas.py @@ -0,0 +1,184 @@ +from cognite.client import CogniteClient +from cognite.client.data_classes.data_modeling import NodeOrEdgeData, NodeApply, EdgeApply, ContainerId, ViewId, Node +import datetime +import uuid +import streamlit as st + +# Settings for the Industrial Canvas Data Model +CANVAS_SPACE_CANVAS = "cdf_industrial_canvas" +CANVAS_SPACE_INSTANCE = "IndustrialCanvasInstanceSpace" +CANVAS_CONTAINER_CANVAS = "Canvas" 
+CANVAS_CONTAINER_INSTANCE = "FdmInstanceContainerReference" +CANVAS_CONTAINER_ANNOTATION = "CanvasAnnotation" + + +def get_time(): + now = datetime.datetime.now(datetime.timezone.utc) + return now.strftime("%Y-%m-%dT%H:%M:%S.%f")[:-3] + "Z" + + +def get_user_id(client: CogniteClient): + if client: + return client.iam.user_profiles.me().user_identifier + return None + + +def generate_id(): + return str(uuid.uuid4()) + + +def generate_properties(file_node: Node, file_view_id: ViewId, node_id: str, offset_x: int = 0, offset_y: int = 0): + """Generates the property dictionary for a file node to be displayed on the canvas.""" + return { + "id": node_id, + "containerReferenceType": "fdmInstance", + "label": file_node.properties[file_view_id].get("name", file_node.external_id), + "x": offset_x, + "y": offset_y, + "width": 800, # Increased default size for better viewing + "height": 600, + "maxWidth": 1600, + "maxHeight": 1200, + "instanceExternalId": file_node.external_id, + "instanceSpace": file_node.space, + "viewExternalId": file_view_id.external_id, + "viewSpace": file_view_id.space, + "viewVersion": file_view_id.version, + "properties": {"zIndex": 0}, + } + + +def create_canvas(name: str, client: CogniteClient): + """Creates the main canvas node.""" + canvas_id = generate_id() + canvas = NodeApply( + space=CANVAS_SPACE_INSTANCE, + external_id=canvas_id, + sources=[ + NodeOrEdgeData( + source=ContainerId(CANVAS_SPACE_CANVAS, CANVAS_CONTAINER_CANVAS), + properties={ + "name": name, + "visibility": "private", + "updatedAt": get_time(), + "createdBy": get_user_id(client), + "updatedBy": get_user_id(client), + }, + ) + ], + ) + return canvas, canvas_id + + +def create_objects(canvas_id: str, file_node: Node, file_view_id: ViewId): + """Creates the node and edge for the file container, returning its ID.""" + file_container_id = generate_id() + properties = generate_properties(file_node, file_view_id, file_container_id) + + node_apply = NodeApply( + 
space=CANVAS_SPACE_INSTANCE, + external_id=f"{canvas_id}_{file_container_id}", + sources=[ + NodeOrEdgeData( + source=ContainerId(CANVAS_SPACE_CANVAS, CANVAS_CONTAINER_INSTANCE), + properties=properties, + ) + ], + ) + + edge_apply = EdgeApply( + space=CANVAS_SPACE_INSTANCE, + external_id=f"{canvas_id}_{canvas_id}_{file_container_id}", + type=(CANVAS_SPACE_CANVAS, "referencesFdmInstanceContainerReference"), + start_node=(CANVAS_SPACE_INSTANCE, canvas_id), + end_node=(CANVAS_SPACE_INSTANCE, f"{canvas_id}_{file_container_id}"), + ) + return [node_apply], [edge_apply], file_container_id + + +def create_bounding_box_annotations(canvas_id: str, file_container_id: str, unmatched_tags: list[dict]): + """Creates annotation nodes and edges for unmatched tags.""" + annotation_nodes = [] + annotation_edges = [] + + for tag_info in unmatched_tags: + tag_text = tag_info["text"] + regions = tag_info.get("regions", []) + + for region in regions: + vertices = region.get("vertices", []) + if not vertices: + continue + + x_coords = [v["x"] for v in vertices] + y_coords = [v["y"] for v in vertices] + x_min, x_max = min(x_coords), max(x_coords) + y_min, y_max = min(y_coords), max(y_coords) + + annotation_id = generate_id() + properties = { + "id": annotation_id, + "annotationType": "rectangle", + "containerId": file_container_id, # <-- This is the crucial link + "isSelectable": True, + "isDraggable": True, + "isResizable": True, + "properties": { + "x": x_min, + "y": y_min, + "width": x_max - x_min, + "height": y_max - y_min, + "label": tag_text, + "zIndex": 10, + "style": { + "fill": "rgba(255, 165, 0, 0.3)", # Semi-transparent orange + "stroke": "orange", + "strokeWidth": 1, + "opacity": 1, + }, + }, + } + + annotation_node = NodeApply( + space=CANVAS_SPACE_INSTANCE, + external_id=f"{canvas_id}_{annotation_id}", + sources=[ + NodeOrEdgeData( + source=ContainerId(CANVAS_SPACE_CANVAS, CANVAS_CONTAINER_ANNOTATION), + properties=properties, + ) + ], + ) + 
annotation_nodes.append(annotation_node) + + annotation_edge = EdgeApply( + space=CANVAS_SPACE_INSTANCE, + external_id=f"{canvas_id}_{canvas_id}_{annotation_id}", + type=(CANVAS_SPACE_CANVAS, "referencesCanvasAnnotation"), + start_node=(CANVAS_SPACE_INSTANCE, canvas_id), + end_node=(CANVAS_SPACE_INSTANCE, f"{canvas_id}_{annotation_id}"), + ) + annotation_edges.append(annotation_edge) + + return annotation_nodes, annotation_edges + + +def dm_generate( + name: str, file_node: Node, file_view_id: ViewId, client: CogniteClient, unmatched_tags_with_regions: list = [] +): + """Orchestrates the creation of the canvas, its objects, and bounding box annotations.""" + canvas, canvas_id = create_canvas(name=name, client=client) + nodes, edges, file_container_id = create_objects( + canvas_id=canvas_id, file_node=file_node, file_view_id=file_view_id + ) + + if unmatched_tags_with_regions: + annotation_nodes, annotation_edges = create_bounding_box_annotations( + canvas_id, file_container_id, unmatched_tags_with_regions + ) + nodes.extend(annotation_nodes) + edges.extend(annotation_edges) + + client.data_modeling.instances.apply(nodes=[canvas] + nodes, edges=edges) + st.session_state["canvas_id"] = canvas_id + return canvas_id diff --git a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/helper.py b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/helper.py index c2ecfb52..4785dc59 100644 --- a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/helper.py +++ b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/helper.py @@ -5,13 +5,62 @@ import pandas as pd from datetime import datetime, timedelta from cognite.client import CogniteClient -from cognite.client.data_classes.data_modeling import ViewId, NodeId +from cognite.client.data_classes.data_modeling import ViewId, NodeId, Node from cognite.client.data_classes.functions import FunctionCallLog from 
data_structures import ViewPropertyConfig +from canvas import dm_generate client = CogniteClient() +@st.cache_data(ttl=3600) +def get_file_node(file_id: NodeId, file_view: ViewPropertyConfig) -> Node | None: + """Fetches a single file node from CDF.""" + try: + node = client.data_modeling.instances.retrieve_nodes(nodes=file_id, sources=file_view.as_view_id()) + return node + except Exception as e: + st.error(f"Failed to retrieve file node {file_id}: {e}") + return None + + +def generate_file_canvas( + file_id: NodeId, file_view: ViewPropertyConfig, ep_config: dict, unmatched_tags_with_regions: list = [] +): + """ + Generates an Industrial Canvas, including bounding boxes for unmatched tags, + and returns the canvas URL. + """ + file_node = get_file_node(file_id, file_view) + if not file_node: + st.error("Could not generate canvas because the file node could not be retrieved.") + return None + + canvas_name = f"Annotation Quality Analysis - {file_node.external_id}" + + try: + domain = ep_config.get("streamlitDashboard", {}).get("industrialCanvasDomain", "cog-shadow-projects") + project = client.config.project + cluster = client.config.cdf_cluster + + # Pass the unmatched tags data to dm_generate + canvas_id = dm_generate( + name=canvas_name, + file_node=file_node, + file_view_id=file_view.as_view_id(), + client=client, + unmatched_tags_with_regions=unmatched_tags_with_regions, + ) + st.success(f"Successfully generated canvas: {canvas_name}") + + canvas_url = f"https://{domain}.fusion.cognite.com/{project}/industrial-canvas/canvas?canvasId={canvas_id}&cluster={cluster}.cognitedata.com&env={cluster}&workspace=industrial-tools" + return canvas_url + + except Exception as e: + st.error(f"Failed to generate canvas: {e}") + return None + + @st.cache_data(ttl=600) def find_pipelines(name_filter: str = "file_annotation") -> list[str]: """ @@ -19,13 +68,11 @@ def find_pipelines(name_filter: str = "file_annotation") -> list[str]: filtered by a substring in their external ID. 
""" try: - # List all pipelines in the project all_pipelines = client.extraction_pipelines.list(limit=-1) if not all_pipelines: st.warning(f"No extraction pipelines found in the project.") return [] - # Filter pipelines where the external ID contains the name_filter string filtered_ids = [p.external_id for p in all_pipelines if name_filter in p.external_id] if not filtered_ids: @@ -38,12 +85,25 @@ def find_pipelines(name_filter: str = "file_annotation") -> list[str]: return [] +@st.cache_data(ttl=3600) +def fetch_raw_table_data(db_name: str, table_name: str) -> pd.DataFrame: + """Fetches all rows from a specified RAW table and returns as a DataFrame.""" + try: + rows = client.raw.rows.list(db_name=db_name, table_name=table_name, limit=-1) + if not rows: + return pd.DataFrame() + data = [row.columns for row in rows] + return pd.DataFrame(data) + except Exception as e: + st.error(f"Failed to fetch data from RAW table '{table_name}': {e}") + return pd.DataFrame() + + def parse_run_message(message: str) -> dict: """Parses the structured run message and returns a dictionary of its components.""" if not message: return {} - # Regex to capture all key-value pairs from the new format pattern = re.compile( r"\(caller:(?P\w+), function_id:(?P[\w\.-]+), call_id:(?P[\w\.-]+)\) - " r"total files processed: (?P\d+) - " @@ -53,7 +113,6 @@ def parse_run_message(message: str) -> dict: match = pattern.search(message) if match: data = match.groupdict() - # Convert numeric strings to integers for key in ["total", "success", "failed"]: if key in data: data[key] = int(data[key]) @@ -94,7 +153,7 @@ def fetch_annotation_states(annotation_state_view: ViewPropertyConfig, file_view Fetches annotation state instances from the specified data model view and joins them with their corresponding file instances. """ - # 1. Fetch all annotation state instances + # ... 
(This function remains unchanged) annotation_instances = client.data_modeling.instances.list( instance_type="node", space=annotation_state_view.instance_space, @@ -105,7 +164,6 @@ def fetch_annotation_states(annotation_state_view: ViewPropertyConfig, file_view st.info("No annotation state instances found in the specified view.") return pd.DataFrame() - # 2. Process annotation states and collect NodeIds for linked files annotation_data = [] nodes_to_fetch = [] for instance in annotation_instances: @@ -130,14 +188,11 @@ def fetch_annotation_states(annotation_state_view: ViewPropertyConfig, file_view if df_annotations.empty or not nodes_to_fetch: return df_annotations - # 3. Fetch corresponding file instances using the collected NodeIds - # Remove duplicates before fetching unique_nodes_to_fetch = list(set(nodes_to_fetch)) file_instances = client.data_modeling.instances.retrieve_nodes( nodes=unique_nodes_to_fetch, sources=file_view.as_view_id() ) - # 4. Process file instances into a DataFrame file_data = [] for instance in file_instances: node_data = { @@ -161,10 +216,8 @@ def fetch_annotation_states(annotation_state_view: ViewPropertyConfig, file_view df_files = pd.DataFrame(file_data) - # 5. Merge annotation data with file data df_merged = pd.merge(df_annotations, df_files, on=["fileExternalId", "fileSpace"], how="left") - # 6. Final data cleaning and preparation if "createdTime" in df_merged.columns: df_merged["createdTime"] = df_merged["createdTime"].dt.tz_localize("UTC") if "lastUpdatedTime" in df_merged.columns: @@ -194,6 +247,7 @@ def fetch_pipeline_run_history(pipeline_ext_id: str): def calculate_success_failure_stats(runs): """Calculates success and failure counts from a list of pipeline runs.""" + # ... 
(This function remains unchanged) success_count = sum(1 for run in runs if run.status == "success") failure_count = sum(1 for run in runs if run.status == "failure") return success_count, failure_count @@ -201,6 +255,7 @@ def calculate_success_failure_stats(runs): def get_failed_run_details(runs): """Filters for failed runs and extracts their details, including IDs.""" + # ... (This function remains unchanged) failed_runs = [] for run in runs: if run.status == "failure": @@ -229,6 +284,7 @@ def fetch_function_logs(function_id: int, call_id: int): def process_runs_for_graphing(runs): """Transforms pipeline run data into a DataFrame for graphing.""" + # ... (This function remains unchanged) launch_data = [] finalize_runs_to_agg = [] @@ -249,7 +305,6 @@ def process_runs_for_graphing(runs): elif caller == "Finalize": finalize_runs_to_agg.append({"timestamp": timestamp, "count": count}) - # --- Aggregate Finalize Runs --- aggregated_finalize_data = [] if finalize_runs_to_agg: finalize_runs_to_agg.sort(key=lambda x: x["timestamp"]) @@ -283,3 +338,32 @@ def process_runs_for_graphing(runs): df_finalize = pd.DataFrame(aggregated_finalize_data) return pd.concat([df_launch, df_finalize], ignore_index=True) + + +@st.cache_data(ttl=3600) +def fetch_pattern_catalog(db_name: str, table_name: str) -> pd.DataFrame: + """ + Fetches the entity cache and explodes it to create a complete + catalog of all generated patterns, indexed by resourceType. 
+ """ + try: + rows = client.raw.rows.list(db_name=db_name, table_name=table_name, limit=-1) + if not rows: + return pd.DataFrame() + + data = [row.columns for row in rows] + df_cache = pd.DataFrame(data) + + all_patterns = [] + for _, row in df_cache.iterrows(): + for sample_list in ["AssetPatternSamples", "FilePatternSamples"]: + if row.get(sample_list) and isinstance(row[sample_list], list): + for item in row[sample_list]: + if item.get("sample") and item.get("resource_type"): + for pattern in item["sample"]: + all_patterns.append({"resourceType": item["resource_type"], "pattern": pattern}) + + return pd.DataFrame(all_patterns) + except Exception as e: + st.error(f"Failed to fetch pattern catalog from '{table_name}': {e}") + return pd.DataFrame() diff --git a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/pages/Annotation_Quality.py b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/pages/Annotation_Quality.py new file mode 100644 index 00000000..f50fa834 --- /dev/null +++ b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/pages/Annotation_Quality.py @@ -0,0 +1,342 @@ +import streamlit as st +import pandas as pd +import altair as alt +from helper import ( + fetch_extraction_pipeline_config, + fetch_raw_table_data, + find_pipelines, + generate_file_canvas, + fetch_pattern_catalog, +) +from cognite.client.data_classes.data_modeling import NodeId + +# --- Page Configuration --- +st.set_page_config( + page_title="Annotation Quality", + page_icon="🎯", + layout="wide", +) + +# --- Initialize Session State --- +if "selected_row_index" not in st.session_state: + st.session_state.selected_row_index = None + +# --- Sidebar for Pipeline Selection --- +st.sidebar.title("Pipeline Selection") +pipeline_ids = find_pipelines() + +if not pipeline_ids: + st.info("No active file annotation pipelines found to monitor.") + st.stop() + +selected_pipeline = st.sidebar.selectbox( + 
"Select a pipeline to view quality:", options=pipeline_ids, key="quality_pipeline_selector" +) + +# --- Data Fetching & Processing --- +config_result = fetch_extraction_pipeline_config(selected_pipeline) +if not config_result: + st.error(f"Could not fetch configuration for pipeline: {selected_pipeline}") + st.stop() + +ep_config, _, _ = config_result +report_config = ep_config.get("finalizeFunction", {}).get("reportService", {}) +cache_config = ep_config.get("launchFunction", {}).get("cacheService", {}) +db_name = report_config.get("rawDb") +pattern_table = report_config.get("rawTableDocPattern") +tag_table = report_config.get("rawTableDocTag") +doc_table = report_config.get("rawTableDocDoc") +cache_table = cache_config.get("rawTableCache") + + +if not all([db_name, pattern_table, tag_table, doc_table, cache_table]): + st.error("Could not find all required RAW table names in the pipeline configuration.") + st.stop() + +df_patterns = fetch_raw_table_data(db_name, pattern_table) +df_tags = fetch_raw_table_data(db_name, tag_table) +df_docs = fetch_raw_table_data(db_name, doc_table) +df_pattern_catalog = fetch_pattern_catalog(db_name, cache_table) + + +# --- Main Application --- +st.title("Annotation Quality Dashboard") +st.markdown( + "This page measures annotation quality by comparing potential tags (from pattern mode) against actual, created annotations." +) + +if df_patterns.empty: + st.info("The pattern catalog is empty. 
Run the pipeline with patternMode enabled to generate data.") + st.stop() + +# --- Data Processing and Merging --- +df_annotations = pd.concat([df_tags, df_docs], ignore_index=True) + +df_patterns_agg = df_patterns.groupby("startNode")["text"].apply(set).reset_index(name="potentialTags") + +if not df_annotations.empty: + df_annotations_agg = ( + df_annotations.groupby("startNode")["startNodeText"].apply(set).reset_index(name="actualAnnotations") + ) + df_quality = pd.merge(df_patterns_agg, df_annotations_agg, on="startNode", how="outer").fillna(0) + df_quality["actualAnnotations"] = df_quality["actualAnnotations"].apply( + lambda x: x if isinstance(x, set) else set() + ) +else: + df_quality = df_patterns_agg + df_quality["actualAnnotations"] = [set() for _ in range(len(df_patterns_agg))] + +df_quality["potentialTags"] = df_quality["potentialTags"].apply(lambda x: x if isinstance(x, set) else set()) + +df_quality["matchedTags"] = df_quality.apply( + lambda row: len(row["potentialTags"].intersection(row["actualAnnotations"])), axis=1 +) +df_quality["unmatchedByAnnotation"] = df_quality.apply( + lambda row: len(row["potentialTags"] - row["actualAnnotations"]), axis=1 +) +df_quality["missedByPattern"] = df_quality.apply( + lambda row: len(row["actualAnnotations"] - row["potentialTags"]), axis=1 +) + +df_quality["coverageRate"] = ( + df_quality["matchedTags"] / (df_quality["matchedTags"] + df_quality["unmatchedByAnnotation"]) +) * 100 +df_quality["completenessRate"] = ( + df_quality["matchedTags"] / (df_quality["matchedTags"] + df_quality["missedByPattern"]) +) * 100 +df_quality.fillna(0, inplace=True) + +# --- Dashboard Metrics --- +st.subheader("Overall Annotation Quality") +total_matched = df_quality["matchedTags"].sum() +total_unmatched = df_quality["unmatchedByAnnotation"].sum() +total_missed = df_quality["missedByPattern"].sum() + +overall_coverage = ( + (total_matched / (total_matched + total_unmatched)) * 100 if (total_matched + total_unmatched) > 0 else 0 +) 
+overall_completeness = ( + (total_matched / (total_matched + total_missed)) * 100 if (total_matched + total_missed) > 0 else 0 +) + +kpi_col1, kpi_col2 = st.columns(2) +kpi_col1.metric( + "Overall Annotation Coverage", + f"{overall_coverage:.2f}%", + help="Of all potential tags found by patterns, this is the percentage that were successfully annotated. Formula: Matched / (Matched + Unmatched)", +) +kpi_col2.metric( + "Overall Pattern Completeness", + f"{overall_completeness:.2f}%", + help="Of all annotations created, this is the percentage that the patterns successfully predicted. Formula: Matched / (Matched + Missed by Pattern)", +) + +# --- Annotation Quality by Resource Type --- +df_merged_for_resource = pd.merge(df_patterns, df_quality, on="startNode", how="left") + +if "resourceType" in df_merged_for_resource.columns: + df_resource_quality = ( + df_merged_for_resource.groupby("resourceType") + .agg( + matchedTags=("matchedTags", "first"), + unmatchedByAnnotation=("unmatchedByAnnotation", "first"), + missedByPattern=("missedByPattern", "first"), + ) + .reset_index() + ) + + df_resource_quality["coverageRate"] = ( + df_resource_quality["matchedTags"] + / (df_resource_quality["matchedTags"] + df_resource_quality["unmatchedByAnnotation"]) + ) * 100 + df_resource_quality["completenessRate"] = ( + df_resource_quality["matchedTags"] + / (df_resource_quality["matchedTags"] + df_resource_quality["missedByPattern"]) + ) * 100 + df_resource_quality.fillna(0, inplace=True) + + chart_col1, chart_col2 = st.columns(2) + with chart_col1: + coverage_chart = ( + alt.Chart(df_resource_quality) + .mark_bar() + .encode( + x=alt.X("resourceType:N", title="Resource Type", sort="-y"), + y=alt.Y("coverageRate:Q", title="Annotation Coverage (%)", scale=alt.Scale(domain=[0, 100])), + tooltip=["resourceType", "coverageRate", "matchedTags", "unmatchedByAnnotation"], + ) + .properties(title="Annotation Coverage by Resource Type") + ) + st.altair_chart(coverage_chart, 
use_container_width=True) + with chart_col2: + completeness_chart = ( + alt.Chart(df_resource_quality) + .mark_bar() + .encode( + x=alt.X("resourceType:N", title="Resource Type", sort="-y"), + y=alt.Y("completenessRate:Q", title="Pattern Completeness (%)", scale=alt.Scale(domain=[0, 100])), + tooltip=["resourceType", "completenessRate", "matchedTags", "missedByPattern"], + ) + .properties(title="Pattern Completeness by Resource Type") + ) + st.altair_chart(completeness_chart, use_container_width=True) +else: + st.info("The 'resourceType' column is not available in the pattern data to generate this chart.") + +# --- Pattern Catalog Expander with Tabs --- +with st.expander("View Full Pattern Catalog"): + if df_pattern_catalog.empty: + st.info("Pattern catalog is empty or could not be loaded.") + else: + resource_types = sorted(df_pattern_catalog["resourceType"].unique()) + tabs = st.tabs(resource_types) + + for i, resource_type in enumerate(resource_types): + with tabs[i]: + df_filtered_patterns = df_pattern_catalog[df_pattern_catalog["resourceType"] == resource_type] + st.dataframe(df_filtered_patterns[["pattern"]], use_container_width=True, hide_index=True) + +# --- File-Level Table --- +st.subheader("Per-File Annotation Quality") +st.info("✔️ Select a file in the table below to see a detailed breakdown of its tags.") + +df_display = df_quality.sort_values(by="coverageRate").reset_index(drop=True) +df_display.insert(0, "Select", False) + +if st.session_state.get("selected_row_index") is not None and st.session_state.selected_row_index < len(df_display): + df_display.at[st.session_state.selected_row_index, "Select"] = True + +edited_df = st.data_editor( + df_display, + key="quality_table_editor", + column_config={ + "Select": st.column_config.CheckboxColumn(required=True), + "startNode": "File External ID", + "potentialTags": "Potential Tags", + "actualAnnotations": "Actual Annotations", + "coverageRate": st.column_config.ProgressColumn( + "Annotation Coverage ℹ️", 
+ help="How many of the potential tags were found? (Matched / Potential)", + format="%.2f%%", + min_value=0, + max_value=100, + ), + "completenessRate": st.column_config.ProgressColumn( + "Pattern Completeness ℹ️", + help="How many of the final annotations did the patterns find? (Matched / Actual)", + format="%.2f%%", + min_value=0, + max_value=100, + ), + }, + use_container_width=True, + column_order=("Select", "startNode", "coverageRate", "completenessRate"), + hide_index=True, + disabled=df_display.columns.difference(["Select"]), +) + +# --- Logic to enforce single selection --- +selected_indices = edited_df[edited_df.Select].index.tolist() +if len(selected_indices) > 1: + new_selection = [idx for idx in selected_indices if idx != st.session_state.get("selected_row_index")] + if new_selection: + st.session_state.selected_row_index = new_selection[0] + st.rerun() +elif len(selected_indices) == 1: + st.session_state.selected_row_index = selected_indices[0] +elif len(selected_indices) == 0 and st.session_state.get("selected_row_index") is not None: + st.session_state.selected_row_index = None + st.rerun() + +# --- Interactive Drill-Down Section --- +st.subheader("Tag Comparison Drill-Down") + +if st.session_state.get("selected_row_index") is not None: + selected_file_data = df_display.iloc[st.session_state.selected_row_index] + selected_file = selected_file_data["startNode"] + st.markdown(f"Displaying details for file: **{selected_file}**") + + file_space_series = df_patterns[df_patterns["startNode"] == selected_file]["startNodeSpace"] + if not file_space_series.empty: + file_space = file_space_series.iloc[0] + file_node_id = NodeId(space=file_space, external_id=selected_file) + + # --- Three-Column Tag Comparison (prepare dataframes first) --- + df_potential_tags_details = df_patterns[df_patterns["startNode"] == selected_file][ + ["text", "resourceType", "regions"] + ] + + if not df_annotations.empty: + df_actual_annotations_details = 
df_annotations[df_annotations["startNode"] == selected_file][ + ["startNodeText", "endNodeResourceType"] + ].rename(columns={"startNodeText": "text", "endNodeResourceType": "resourceType"}) + else: + df_actual_annotations_details = pd.DataFrame(columns=["text", "resourceType"]) + + potential_tags_set = set(df_potential_tags_details["text"]) + actual_tags_set = set(df_actual_annotations_details["text"]) + + matched_tags_set = potential_tags_set.intersection(actual_tags_set) + unmatched_tags_set = potential_tags_set - actual_tags_set + missed_tags_set = actual_tags_set - potential_tags_set + + matched_df = df_potential_tags_details[ + df_potential_tags_details["text"].isin(matched_tags_set) + ].drop_duplicates(subset=["text", "resourceType"]) + unmatched_df = df_potential_tags_details[ + df_potential_tags_details["text"].isin(unmatched_tags_set) + ].drop_duplicates(subset=["text", "resourceType"]) + missed_df = df_actual_annotations_details[ + df_actual_annotations_details["text"].isin(missed_tags_set) + ].drop_duplicates() + + if st.button("Create in Canvas", key=f"canvas_btn_{selected_file}"): + with st.spinner("Generating Industrial Canvas with bounding boxes..."): + _, _, file_view_config = fetch_extraction_pipeline_config(selected_pipeline) + + unmatched_tags_for_canvas = unmatched_df[["text", "regions"]].to_dict("records") + + canvas_url = generate_file_canvas( + file_id=file_node_id, + file_view=file_view_config, + ep_config=ep_config, + unmatched_tags_with_regions=unmatched_tags_for_canvas, + ) + if canvas_url: + st.session_state["generated_canvas_url"] = canvas_url + else: + st.session_state.pop("generated_canvas_url", None) + + if "generated_canvas_url" in st.session_state and st.session_state.generated_canvas_url: + st.markdown( + f"**[Open Last Generated Canvas]({st.session_state.generated_canvas_url})**", unsafe_allow_html=True + ) + + col1, col2, col3 = st.columns(3) + + with col1: + st.metric("✅ Matched Tags", len(matched_df)) + st.dataframe( + 
matched_df[["text", "resourceType"]], + column_config={"text": "Tag", "resourceType": "Resource Type"}, + use_container_width=True, + hide_index=True, + ) + with col2: + st.metric("❓ Unmatched by Annotation", len(unmatched_df)) + st.dataframe( + unmatched_df[["text", "resourceType"]], + column_config={"text": "Tag", "resourceType": "Resource Type"}, + use_container_width=True, + hide_index=True, + ) + with col3: + st.metric("❗️ Missed by Pattern", len(missed_df)) + st.dataframe( + missed_df, + column_config={"text": "Tag", "resourceType": "Resource Type"}, + use_container_width=True, + hide_index=True, + ) +else: + st.info("✔️ Select a file in the table above to see a detailed breakdown of its tags.") From 56960659c0ff7b3b853a4377083a1d9c3242f85c Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Tue, 2 Sep 2025 10:37:14 -0500 Subject: [PATCH 035/128] changed the color of pattern mode results to green --- .../streamlit/file_annotation_dashboard/canvas.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/canvas.py b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/canvas.py index c03a5c00..6263820e 100644 --- a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/canvas.py +++ b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/canvas.py @@ -131,8 +131,8 @@ def create_bounding_box_annotations(canvas_id: str, file_container_id: str, unma "label": tag_text, "zIndex": 10, "style": { - "fill": "rgba(255, 165, 0, 0.3)", # Semi-transparent orange - "stroke": "orange", + "fill": "rgba(40, 167, 69, 0.3)", # Semi-transparent vibrant green + "stroke": "rgb(40, 167, 69)", # Solid vibrant green for the border "strokeWidth": 1, "opacity": 1, }, From 9ade99939d3d71ca7fb634858173a2fd423dc877 Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Tue, 2 Sep 2025 17:26:38 -0500 Subject: [PATCH 036/128] 
added manual pattern mode module, integrated to cache service, updated key generation of apply service --- .../cdf_file_annotation/default.config.yaml | 1 + ...ep_file_annotation.ExtractionPipeline.yaml | 2 + .../ep_file_annotation.config.yaml | 1 + .../services/ApplyService.py | 3 +- .../services/ConfigService.py | 1 + .../services/CacheService.py | 79 ++++++++++- .../services/ConfigService.py | 1 + .../raw/tbl_file_annotation.Tables.yaml | 5 +- .../file_annotation_dashboard/helper.py | 105 ++++++++++++++ .../pages/Pattern_Management.py | 132 ++++++++++++++++++ 10 files changed, 321 insertions(+), 9 deletions(-) create mode 100644 modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/pages/Pattern_Management.py diff --git a/modules/contextualization/cdf_file_annotation/default.config.yaml b/modules/contextualization/cdf_file_annotation/default.config.yaml index aa4bb008..897990fd 100644 --- a/modules/contextualization/cdf_file_annotation/default.config.yaml +++ b/modules/contextualization/cdf_file_annotation/default.config.yaml @@ -16,6 +16,7 @@ rawTableDocTag: annotation_documents_tags rawTableDocDoc: annotation_documents_docs rawTableDocPattern: annotation_documents_patterns rawTableCache: annotation_entities_cache +rawManualPatternsCatalog: manual_patterns_catalog # used in /extraction_pipelines extractionPipelineExternalId: ep_file_annotation diff --git a/modules/contextualization/cdf_file_annotation/extraction_pipelines/ep_file_annotation.ExtractionPipeline.yaml b/modules/contextualization/cdf_file_annotation/extraction_pipelines/ep_file_annotation.ExtractionPipeline.yaml index dcdbc901..7dd00a93 100644 --- a/modules/contextualization/cdf_file_annotation/extraction_pipelines/ep_file_annotation.ExtractionPipeline.yaml +++ b/modules/contextualization/cdf_file_annotation/extraction_pipelines/ep_file_annotation.ExtractionPipeline.yaml @@ -9,6 +9,8 @@ rawTables: tableName: {{ rawTableDocDoc }} - dbName: {{ rawDb }} tableName: {{ 
rawTableCache }} + - dbName: {{ rawDb }} + tableName: {{ rawManualPatternsCatalog }} source: "Files" documentation: > # Guide to Configuring the Annotation Function via YAML diff --git a/modules/contextualization/cdf_file_annotation/extraction_pipelines/ep_file_annotation.config.yaml b/modules/contextualization/cdf_file_annotation/extraction_pipelines/ep_file_annotation.config.yaml index 70a6a54b..948163f1 100644 --- a/modules/contextualization/cdf_file_annotation/extraction_pipelines/ep_file_annotation.config.yaml +++ b/modules/contextualization/cdf_file_annotation/extraction_pipelines/ep_file_annotation.config.yaml @@ -92,6 +92,7 @@ config: cacheTimeLimit: 24 # hours rawDb: {{ rawDb }} rawTableCache: {{ rawTableCache }} + rawManualPatternsCatalog: {{ rawManualPatternsCatalog }} annotationService: pageRange: 50 partialMatch: True diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ApplyService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ApplyService.py index 2f78718e..69867416 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ApplyService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ApplyService.py @@ -316,8 +316,7 @@ def process_pattern_results(self, result_item: dict, file_node: Node) -> list[Ro } # Create a deterministic key based on the tag text and file - row_key = f"{tag_text}_{source_id}" - + row_key = f"{tag_text}:{file_id.space}:{file_id.external_id}" row = RowWrite(key=row_key, columns=catalog_properties) doc_patterns.append(row) diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ConfigService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ConfigService.py index 447a6d91..7556b016 100644 --- 
a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ConfigService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ConfigService.py @@ -168,6 +168,7 @@ class CacheServiceConfig(BaseModel, alias_generator=to_camel): cache_time_limit: int raw_db: str raw_table_cache: str + raw_manual_patterns_catalog: str class AnnotationServiceConfig(BaseModel, alias_generator=to_camel): diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/CacheService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/CacheService.py index 8066f80f..df27e0d8 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/CacheService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/CacheService.py @@ -1,9 +1,11 @@ import abc import re from typing import Iterator +from collections import defaultdict from datetime import datetime, timezone, timedelta from cognite.client import CogniteClient from cognite.client.data_classes import RowWrite, Row +from cognite.client.exceptions import CogniteNotFoundError from cognite.client.data_classes.data_modeling import ( Node, NodeList, @@ -57,6 +59,7 @@ def __init__(self, config: Config, client: CogniteClient, logger: CogniteFunctio self.db_name: str = config.launch_function.cache_service.raw_db self.tbl_name: str = config.launch_function.cache_service.raw_table_cache + self.manual_patterns_tbl_name: str = config.launch_function.cache_service.raw_manual_patterns_catalog self.cache_time_limit: int = config.launch_function.cache_service.cache_time_limit # in hours self.file_view: ViewPropertyConfig = config.data_model_views.file_view @@ -89,10 +92,8 @@ def get_entities( self.logger.debug(f"Cache valid for key: {key}. 
Retrieving entities and patterns.") asset_entities: list[dict] = row.columns.get("AssetEntities", []) file_entities: list[dict] = row.columns.get("FileEntities", []) - asset_pattern_samples: list[dict] = row.columns.get("AssetPatternSamples", []) # Get patterns from cache - file_pattern_samples: list[dict] = row.columns.get("FilePatternSamples", []) # Get patterns from cache - - return (asset_entities + file_entities), (asset_pattern_samples + file_pattern_samples) + combined_pattern_samples: list[dict] = row.columns.get("CombinedPatternSamples", []) + return (asset_entities + file_entities), combined_pattern_samples self.logger.info(f"Refreshing RAW entities cache and patterns cache for key: {key}") @@ -108,7 +109,13 @@ def get_entities( # Generate pattern samples from the same entities asset_pattern_samples = self._generate_tag_samples_from_entities(asset_entities) file_pattern_samples = self._generate_tag_samples_from_entities(file_entities) - pattern_samples = asset_pattern_samples + file_pattern_samples + auto_pattern_samples = asset_pattern_samples + file_pattern_samples + + # Grab the manual pattern samples + manual_pattern_samples = self._get_manual_patterns(primary_scope_value, secondary_scope_value) + + # Merge the auto and manual patterns + combined_pattern_samples = self._merge_patterns(auto_pattern_samples, manual_pattern_samples) # Update cache new_row = RowWrite( @@ -118,11 +125,13 @@ def get_entities( "FileEntities": file_entities, "AssetPatternSamples": asset_pattern_samples, "FilePatternSamples": file_pattern_samples, + "ManualPatternSamples": manual_pattern_samples, + "CombinedPatternSamples": combined_pattern_samples, "LastUpdateTimeUtcIso": datetime.now(timezone.utc).isoformat(), }, ) self._update_cache(new_row) - return entities, pattern_samples + return entities, combined_pattern_samples def _update_cache(self, row_to_write: RowWrite) -> None: """ @@ -325,3 +334,61 @@ def replace_A(match): if final_samples: result.append({"sample": 
sorted(final_samples), "resource_type": resource_type}) return result + + def _get_manual_patterns(self, primary_scope: str, secondary_scope: str | None) -> list[dict]: + """Fetches and combines manual patterns from GLOBAL, primary, and secondary scopes.""" + keys_to_fetch = ["GLOBAL"] + if primary_scope: + keys_to_fetch.append(primary_scope) + if primary_scope and secondary_scope: + keys_to_fetch.append(f"{primary_scope}_{secondary_scope}") + + self.logger.info(f"Fetching manual patterns for keys: {keys_to_fetch}") + all_manual_patterns = [] + for key in keys_to_fetch: + try: + row: Row | None = self.client.raw.rows.retrieve( + db_name=self.db_name, table_name=self.manual_patterns_tbl_name, key=key + ) + if row: + patterns = (row.columns or {}).get("patterns", []) + all_manual_patterns.extend(patterns) + except CogniteNotFoundError: + self.logger.info(f"No manual patterns found for keys: {keys_to_fetch}. This may be expected.") + except Exception as e: + self.logger.error(f"Failed to retrieve manual patterns: {e}") + + return all_manual_patterns + + def _merge_patterns(self, auto_patterns: list[dict], manual_patterns: list[dict]) -> list[dict]: + """Merges auto-generated and manual patterns, de-duplicating samples.""" + # The structure of manual_patterns is [{"sample": "P-1", "resource_type": "A"}, ...] + # The structure of auto_patterns is [{"sample": ["P-2", "P-3"], "resource_type": "A"}, ...] 
+ + # Use a dictionary with sets for efficient merging and de-duplication + merged = defaultdict(set) + + # Process auto-generated patterns + for item in auto_patterns: + resource_type = item.get("resource_type") + samples = item.get("sample", []) + if resource_type and isinstance(samples, list): + merged[resource_type].update(samples) + + # Process manual patterns + for item in manual_patterns: + resource_type = item.get("resource_type") + sample = item.get("sample") + if resource_type and sample: + merged[resource_type].add(sample) + + # Convert the merged dictionary back to the required list format + final_list = [ + {"resource_type": resource_type, "sample": sorted(list(samples))} + for resource_type, samples in merged.items() + ] + + self.logger.info( + f"Merged {len(auto_patterns)} auto-patterns and {len(manual_patterns)} manual patterns into {len(final_list)} resource types." + ) + return final_list diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/ConfigService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/ConfigService.py index 447a6d91..7556b016 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/ConfigService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/ConfigService.py @@ -168,6 +168,7 @@ class CacheServiceConfig(BaseModel, alias_generator=to_camel): cache_time_limit: int raw_db: str raw_table_cache: str + raw_manual_patterns_catalog: str class AnnotationServiceConfig(BaseModel, alias_generator=to_camel): diff --git a/modules/contextualization/cdf_file_annotation/raw/tbl_file_annotation.Tables.yaml b/modules/contextualization/cdf_file_annotation/raw/tbl_file_annotation.Tables.yaml index 5c63a3e1..7f6e9a2b 100644 --- a/modules/contextualization/cdf_file_annotation/raw/tbl_file_annotation.Tables.yaml +++ 
b/modules/contextualization/cdf_file_annotation/raw/tbl_file_annotation.Tables.yaml @@ -5,4 +5,7 @@ tableName: {{ rawTableDocDoc }} - dbName: {{ rawDb }} - tableName: {{ rawTableCache }} \ No newline at end of file + tableName: {{ rawTableCache }} + +- dbName: {{ rawDb }} + tableName: {{ rawManualPatternsCatalog }} \ No newline at end of file diff --git a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/helper.py b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/helper.py index 4785dc59..a70584c7 100644 --- a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/helper.py +++ b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/helper.py @@ -5,6 +5,7 @@ import pandas as pd from datetime import datetime, timedelta from cognite.client import CogniteClient +from cognite.client.data_classes import RowWrite from cognite.client.data_classes.data_modeling import ViewId, NodeId, Node from cognite.client.data_classes.functions import FunctionCallLog from data_structures import ViewPropertyConfig @@ -367,3 +368,107 @@ def fetch_pattern_catalog(db_name: str, table_name: str) -> pd.DataFrame: except Exception as e: st.error(f"Failed to fetch pattern catalog from '{table_name}': {e}") return pd.DataFrame() + +def fetch_manual_patterns(db_name: str, table_name: str) -> pd.DataFrame: + """ + Fetches all manual patterns from the RAW table and explodes them + into a tidy DataFrame for display and editing. 
+ """ + all_patterns = [] + expected_columns = [ + "key", + "scope_level", + "primary_scope", + "secondary_scope", + "sample", + "resource_type", + "created_by", + ] + + try: + rows = client.raw.rows.list(db_name=db_name, table_name=table_name, limit=-1) + for row in rows: + key = row.key + patterns_list = row.columns.get("patterns", []) + + scope_level = "Global" + primary_scope = "" + secondary_scope = "" + if key != "GLOBAL": + parts = key.split("_") + if len(parts) == 2: + scope_level = "Secondary Scope" + primary_scope, secondary_scope = parts + else: + scope_level = "Primary Scope" + primary_scope = key + + for p in patterns_list: + all_patterns.append( + { + "key": key, + "scope_level": scope_level, + "primary_scope": primary_scope, + "secondary_scope": secondary_scope, + "sample": p.get("sample"), + "resource_type": p.get("resource_type"), + "created_by": p.get("created_by"), + } + ) + + if not all_patterns: + return pd.DataFrame(columns=expected_columns) + + df = pd.DataFrame(all_patterns) + + for col in expected_columns: + if col not in df.columns: + df[col] = "" + df[col] = df[col].fillna("").astype(str) + + return df + + except Exception as e: + if "NotFoundError" in str(type(e)): + return pd.DataFrame(columns=expected_columns) + st.error(f"Failed to fetch manual patterns: {e}") + return pd.DataFrame(columns=expected_columns) + + +def save_manual_patterns(df: pd.DataFrame, db_name: str, table_name: str): + """ + Takes a tidy DataFrame of patterns, groups them by scope key, + and writes them back to the RAW table. 
+ """ + + def create_key(row): + if row["scope_level"] == "Global": + return "GLOBAL" + elif row["scope_level"] == "Primary Scope" and row["primary_scope"]: + return row["primary_scope"] + elif row["scope_level"] == "Secondary Scope" and row["primary_scope"] and row["secondary_scope"]: + return f"{row['primary_scope']}_{row['secondary_scope']}" + return None + + df["key"] = df.apply(create_key, axis=1) + + df.dropna(subset=["key"], inplace=True) + + grouped = df.groupby("key") + + rows_to_write = [] + for key, group in grouped: + patterns_list = group[["sample", "resource_type", "created_by"]].to_dict("records") + + row = RowWrite(key=key, columns={"patterns": patterns_list}) + rows_to_write.append(row) + + existing_keys = {r.key for r in client.raw.rows.list(db_name, table_name, limit=-1)} + new_keys = {r.key for r in rows_to_write} + keys_to_delete = list(existing_keys - new_keys) + + if keys_to_delete: + client.raw.rows.delete(db_name=db_name, table_name=table_name, key=keys_to_delete) + + if rows_to_write: + client.raw.rows.insert(db_name=db_name, table_name=table_name, row=rows_to_write, ensure_parent=True) diff --git a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/pages/Pattern_Management.py b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/pages/Pattern_Management.py new file mode 100644 index 00000000..d8f383ec --- /dev/null +++ b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/pages/Pattern_Management.py @@ -0,0 +1,132 @@ +import streamlit as st +import pandas as pd +from datetime import datetime, timezone + +from helper import ( + fetch_extraction_pipeline_config, + find_pipelines, + fetch_manual_patterns, + save_manual_patterns, +) + +st.set_page_config(page_title="Pattern Management", page_icon="✏️", layout="wide") + +st.title("Pattern Management") +st.markdown("Add, edit, or delete manual patterns to improve the quality of the pattern detection 
job.") + +# --- Sidebar for Pipeline Selection --- +st.sidebar.title("Pipeline Selection") +pipeline_ids = find_pipelines() + +if not pipeline_ids: + st.info("No active file annotation pipelines found to monitor.") + st.stop() + +selected_pipeline = st.sidebar.selectbox( + "Select a pipeline to manage patterns for:", options=pipeline_ids, key="pattern_pipeline_selector" +) + +# --- Data Fetching --- +config_result = fetch_extraction_pipeline_config(selected_pipeline) +if not config_result: + st.error(f"Could not fetch configuration for pipeline: {selected_pipeline}") + st.stop() + +ep_config, _, _ = config_result +cache_config = ep_config.get("launchFunction", {}).get("cacheService", {}) +db_name = cache_config.get("rawDb") +manual_patterns_table = cache_config.get("rawManualPatternsCatalog") +primary_scope_prop = ep_config.get("launchFunction", {}).get("primaryScopeProperty") +secondary_scope_prop = ep_config.get("launchFunction", {}).get("secondaryScopeProperty") + + +if not all([db_name, manual_patterns_table]): + st.error("RAW DB name or manual patterns table name is not configured in the extraction pipeline.") + st.stop() + +# --- Load and Display Existing Patterns --- +st.subheader("Existing Manual Patterns") + +df_patterns = fetch_manual_patterns(db_name, manual_patterns_table) + +edited_df = st.data_editor( + df_patterns, + num_rows="dynamic", + use_container_width=True, + column_config={ + "key": st.column_config.TextColumn("Scope Key", disabled=True), + "sample": st.column_config.TextColumn("Pattern String", required=True), + "resource_type": st.column_config.TextColumn("Resource Type", required=True), + "scope_level": st.column_config.TextColumn("Scope Level", required=True), + "primary_scope": st.column_config.TextColumn("Primary Scope", required=False), + "secondary_scope": st.column_config.TextColumn("Secondary Scope", required=False), + "created_by": st.column_config.TextColumn("Created By", required=True), + }, +) + +if st.button("Save Changes", 
type="primary"): + with st.spinner("Saving changes to RAW..."): + try: + save_manual_patterns(edited_df, db_name, manual_patterns_table) + st.success("Changes saved successfully!") + st.cache_data.clear() + st.rerun() + except Exception as e: + st.error(f"Failed to save changes: {e}") + + +st.divider() + +# --- Add New Pattern Form --- +st.subheader("Add a New Pattern") + +scope_level = st.selectbox( + "1. Select Scope Level", ["Global", "Primary Scope", "Secondary Scope"], key="scope_level_selector" +) + +with st.form(key="new_pattern_form", clear_on_submit=True): + st.write("2. Enter Pattern Details") + new_pattern = st.text_input("Pattern String", placeholder="e.g., [PI]-00000") + new_resource_type = st.text_input("Resource Type", placeholder="e.g., Asset") + + primary_scope_value = "" + if scope_level in ["Primary Scope", "Secondary Scope"]: + primary_scope_value = st.text_input(f"Primary Scope Value ({primary_scope_prop or 'not configured'})") + + secondary_scope_value = "" + if scope_level == "Secondary Scope": + secondary_scope_value = st.text_input(f"Secondary Scope Value ({secondary_scope_prop or 'not configured'})") + + submit_button = st.form_submit_button(label="Add New Pattern") + + if submit_button: + if not all([new_pattern, new_resource_type]): + st.warning("Pattern String and Resource Type are required.") + else: + with st.spinner("Adding new pattern..."): + try: + updated_df = pd.concat( + [ + edited_df, + pd.DataFrame( + [ + { + "sample": new_pattern, + "resource_type": new_resource_type, + "scope_level": scope_level, + "primary_scope": primary_scope_value, + "secondary_scope": secondary_scope_value, + "created_by": "streamlit", + } + ] + ), + ], + ignore_index=True, + ) + + save_manual_patterns(updated_df, db_name, manual_patterns_table) + st.success("New pattern added successfully!") + st.cache_data.clear() + st.rerun() + except Exception as e: + st.error(f"Failed to add pattern: {e}") From 0c9ecf910c2a79408400ecbded5688fc20c6655b Mon Sep 17 
00:00:00 2001 From: Jack Zhao Date: Wed, 3 Sep 2025 14:42:58 -0500 Subject: [PATCH 037/128] improved annotation percentage calculations --- .../file_annotation_dashboard/helper.py | 1 + .../pages/Annotation_Quality.py | 172 ++++++++++++++---- 2 files changed, 138 insertions(+), 35 deletions(-) diff --git a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/helper.py b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/helper.py index a70584c7..4df145b1 100644 --- a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/helper.py +++ b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/helper.py @@ -369,6 +369,7 @@ def fetch_pattern_catalog(db_name: str, table_name: str) -> pd.DataFrame: st.error(f"Failed to fetch pattern catalog from '{table_name}': {e}") return pd.DataFrame() + def fetch_manual_patterns(db_name: str, table_name: str) -> pd.DataFrame: """ Fetches all manual patterns from the RAW table and explodes them diff --git a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/pages/Annotation_Quality.py b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/pages/Annotation_Quality.py index f50fa834..eceb1d6c 100644 --- a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/pages/Annotation_Quality.py +++ b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/pages/Annotation_Quality.py @@ -7,6 +7,7 @@ find_pipelines, generate_file_canvas, fetch_pattern_catalog, + fetch_manual_patterns, # Import the function to get manual patterns ) from cognite.client.data_classes.data_modeling import NodeId @@ -47,17 +48,20 @@ tag_table = report_config.get("rawTableDocTag") doc_table = report_config.get("rawTableDocDoc") cache_table = cache_config.get("rawTableCache") +manual_patterns_table = cache_config.get("rawManualPatternsCatalog") -if 
not all([db_name, pattern_table, tag_table, doc_table, cache_table]): +if not all([db_name, pattern_table, tag_table, doc_table, cache_table, manual_patterns_table]): st.error("Could not find all required RAW table names in the pipeline configuration.") st.stop() df_patterns = fetch_raw_table_data(db_name, pattern_table) df_tags = fetch_raw_table_data(db_name, tag_table) df_docs = fetch_raw_table_data(db_name, doc_table) -df_pattern_catalog = fetch_pattern_catalog(db_name, cache_table) +# Fetch both auto-generated and manual patterns +df_auto_patterns = fetch_pattern_catalog(db_name, cache_table) +df_manual_patterns = fetch_manual_patterns(db_name, manual_patterns_table) # --- Main Application --- st.title("Annotation Quality Dashboard") @@ -107,18 +111,50 @@ df_quality.fillna(0, inplace=True) # --- Dashboard Metrics --- -st.subheader("Overall Annotation Quality") -total_matched = df_quality["matchedTags"].sum() -total_unmatched = df_quality["unmatchedByAnnotation"].sum() -total_missed = df_quality["missedByPattern"].sum() +st.header("Overall Annotation Quality") +# Get a unique, sorted list of resource types for the filter +all_resource_types = ["All"] + sorted(df_patterns["resourceType"].unique().tolist()) + +selected_resource_type = st.selectbox("Filter by Resource Type:", options=all_resource_types) + +# --- Filter the data based on selection --- +if selected_resource_type == "All": + df_metrics_input = df_patterns + df_annotations_input = df_annotations +else: + df_metrics_input = df_patterns[df_patterns["resourceType"] == selected_resource_type] + if not df_annotations.empty and "endNodeResourceType" in df_annotations.columns: + df_annotations_input = df_annotations[df_annotations["endNodeResourceType"] == selected_resource_type] + else: + df_annotations_input = pd.DataFrame() + + +# --- Recalculate metrics based on the filtered data --- +potential_tags_set = set(df_metrics_input["text"]) +if not df_annotations_input.empty and "startNodeText" in 
df_annotations_input.columns: + actual_annotations_set = set(df_annotations_input["startNodeText"]) +else: + actual_annotations_set = set() + + +matched_tags_set = potential_tags_set.intersection(actual_annotations_set) +unmatched_by_annotation_set = potential_tags_set - actual_annotations_set +missed_by_pattern_set = actual_annotations_set - potential_tags_set + +total_matched = len(matched_tags_set) +total_unmatched = len(unmatched_by_annotation_set) +total_missed = len(missed_by_pattern_set) + +# Calculate overall rates overall_coverage = ( (total_matched / (total_matched + total_unmatched)) * 100 if (total_matched + total_unmatched) > 0 else 0 ) overall_completeness = ( - (total_matched / (total_matched + total_missed)) * 100 if (total_matched + total_missed) > 0 else 0 + (total_matched / (total_missed + total_matched)) * 100 if (total_missed + total_matched) > 0 else 0 ) +# Display KPIs kpi_col1, kpi_col2 = st.columns(2) kpi_col1.metric( "Overall Annotation Coverage", @@ -131,34 +167,60 @@ help="Of all annotations created, this is the percentage that the patterns successfully predicted. 
Formula: Matched / (Matched + Missed by Pattern)", ) -# --- Annotation Quality by Resource Type --- -df_merged_for_resource = pd.merge(df_patterns, df_quality, on="startNode", how="left") +st.divider() -if "resourceType" in df_merged_for_resource.columns: - df_resource_quality = ( - df_merged_for_resource.groupby("resourceType") - .agg( - matchedTags=("matchedTags", "first"), - unmatchedByAnnotation=("unmatchedByAnnotation", "first"), - missedByPattern=("missedByPattern", "first"), - ) - .reset_index() +# --- Prepare data for charts --- +chart_data = [] +# Use all_resource_types[1:] to skip the "All" option +for resource_type in all_resource_types[1:]: + # Filter data for this specific resource type + df_patterns_filtered = df_patterns[df_patterns["resourceType"] == resource_type] + + if not df_annotations.empty and "endNodeResourceType" in df_annotations.columns: + df_annotations_filtered = df_annotations[df_annotations["endNodeResourceType"] == resource_type] + else: + df_annotations_filtered = pd.DataFrame() + + # Calculate metrics using global unique tags for THIS resource type + potential = set(df_patterns_filtered["text"]) + + if not df_annotations_filtered.empty and "startNodeText" in df_annotations_filtered.columns: + actual = set(df_annotations_filtered["startNodeText"]) + else: + actual = set() + + matched = len(potential.intersection(actual)) + unmatched = len(potential - actual) + missed = len(actual - potential) + + coverage = (matched / (matched + unmatched)) * 100 if (matched + unmatched) > 0 else 0 + completeness = (matched / (matched + missed)) * 100 if (matched + missed) > 0 else 0 + + chart_data.append( + { + "resourceType": resource_type, + "coverageRate": coverage, + "completenessRate": completeness, + "matchedTags": matched, + "unmatchedByAnnotation": unmatched, + "missedByPattern": missed, + } ) - df_resource_quality["coverageRate"] = ( - df_resource_quality["matchedTags"] - / (df_resource_quality["matchedTags"] + 
df_resource_quality["unmatchedByAnnotation"]) - ) * 100 - df_resource_quality["completenessRate"] = ( - df_resource_quality["matchedTags"] - / (df_resource_quality["matchedTags"] + df_resource_quality["missedByPattern"]) - ) * 100 - df_resource_quality.fillna(0, inplace=True) +df_chart_data = pd.DataFrame(chart_data) + +# --- Filter chart data based on dropdown selection --- +if selected_resource_type != "All": + df_chart_display = df_chart_data[df_chart_data["resourceType"] == selected_resource_type] +else: + df_chart_display = df_chart_data +# --- Render Charts --- +if not df_chart_display.empty: chart_col1, chart_col2 = st.columns(2) with chart_col1: coverage_chart = ( - alt.Chart(df_resource_quality) + alt.Chart(df_chart_display) .mark_bar() .encode( x=alt.X("resourceType:N", title="Resource Type", sort="-y"), @@ -170,7 +232,7 @@ st.altair_chart(coverage_chart, use_container_width=True) with chart_col2: completeness_chart = ( - alt.Chart(df_resource_quality) + alt.Chart(df_chart_display) .mark_bar() .encode( x=alt.X("resourceType:N", title="Resource Type", sort="-y"), @@ -181,20 +243,60 @@ ) st.altair_chart(completeness_chart, use_container_width=True) else: - st.info("The 'resourceType' column is not available in the pattern data to generate this chart.") + st.info("No data available for the selected resource type to generate charts.") -# --- Pattern Catalog Expander with Tabs --- +# --- Combine auto and manual patterns for display --- with st.expander("View Full Pattern Catalog"): - if df_pattern_catalog.empty: + # Standardize column names for merging + df_auto_patterns.rename(columns={"resourceType": "resource_type", "pattern": "sample"}, inplace=True) + + # Select and combine relevant columns + df_combined_patterns = ( + pd.concat([df_auto_patterns[["resource_type", "sample"]], df_manual_patterns[["resource_type", "sample"]]]) + .drop_duplicates() + .sort_values(by=["resource_type", "sample"]) + ) + + if df_combined_patterns.empty: st.info("Pattern catalog 
is empty or could not be loaded.") else: - resource_types = sorted(df_pattern_catalog["resourceType"].unique()) + resource_types = sorted(df_combined_patterns["resource_type"].unique()) tabs = st.tabs(resource_types) for i, resource_type in enumerate(resource_types): with tabs[i]: - df_filtered_patterns = df_pattern_catalog[df_pattern_catalog["resourceType"] == resource_type] - st.dataframe(df_filtered_patterns[["pattern"]], use_container_width=True, hide_index=True) + df_filtered_patterns = df_combined_patterns[df_combined_patterns["resource_type"] == resource_type] + st.dataframe( + df_filtered_patterns[["sample"]], + use_container_width=True, + hide_index=True, + column_config={"sample": "Pattern"}, + ) + +# --- Display Matched, Unmatched and Missed Tags --- +st.subheader("Tag Details") +tag_col1, tag_col2, tag_col3 = st.columns(3) + +with tag_col1: + st.metric("✅ Matched Tags", f"{total_matched}") + st.dataframe( + pd.DataFrame(sorted(list(matched_tags_set)), columns=["Tag"]), use_container_width=True, hide_index=True + ) + +with tag_col2: + st.metric("❓ Unmatched by Annotation", f"{total_unmatched}") + st.dataframe( + pd.DataFrame(sorted(list(unmatched_by_annotation_set)), columns=["Tag"]), + use_container_width=True, + hide_index=True, + ) + +with tag_col3: + st.metric("❗️ Missed by Pattern", f"{total_missed}") + st.dataframe( + pd.DataFrame(sorted(list(missed_by_pattern_set)), columns=["Tag"]), use_container_width=True, hide_index=True + ) + # --- File-Level Table --- st.subheader("Per-File Annotation Quality") From 8acda8db33ba4a2616eb885b5af488e60ea55135 Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Wed, 3 Sep 2025 14:45:39 -0500 Subject: [PATCH 038/128] made per-file-annotation a header instead of subheader --- .../file_annotation_dashboard/pages/Annotation_Quality.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/pages/Annotation_Quality.py 
b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/pages/Annotation_Quality.py index eceb1d6c..e41f12b1 100644 --- a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/pages/Annotation_Quality.py +++ b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/pages/Annotation_Quality.py @@ -299,7 +299,7 @@ # --- File-Level Table --- -st.subheader("Per-File Annotation Quality") +st.header("Per-File Annotation Quality") st.info("✔️ Select a file in the table below to see a detailed breakdown of its tags.") df_display = df_quality.sort_values(by="coverageRate").reset_index(drop=True) From 55515aa123659779f12ae161dc01404534674d23 Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Fri, 5 Sep 2025 10:00:43 -0500 Subject: [PATCH 039/128] updated annotation quality report for better filtering of large data sets --- .../pages/Annotation_Quality.py | 358 ++++++++++++------ 1 file changed, 242 insertions(+), 116 deletions(-) diff --git a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/pages/Annotation_Quality.py b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/pages/Annotation_Quality.py index e41f12b1..12702abc 100644 --- a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/pages/Annotation_Quality.py +++ b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/pages/Annotation_Quality.py @@ -7,7 +7,8 @@ find_pipelines, generate_file_canvas, fetch_pattern_catalog, - fetch_manual_patterns, # Import the function to get manual patterns + fetch_manual_patterns, + fetch_annotation_states, ) from cognite.client.data_classes.data_modeling import NodeId @@ -18,6 +19,12 @@ layout="wide", ) + +# --- Callback function to reset selection --- +def reset_selection(): + st.session_state.selected_row_index = None + + # --- Initialize Session State --- if "selected_row_index" not in 
st.session_state: st.session_state.selected_row_index = None @@ -40,7 +47,7 @@ st.error(f"Could not fetch configuration for pipeline: {selected_pipeline}") st.stop() -ep_config, _, _ = config_result +ep_config, annotation_state_view, file_view = config_result report_config = ep_config.get("finalizeFunction", {}).get("reportService", {}) cache_config = ep_config.get("launchFunction", {}).get("cacheService", {}) db_name = report_config.get("rawDb") @@ -116,7 +123,7 @@ # Get a unique, sorted list of resource types for the filter all_resource_types = ["All"] + sorted(df_patterns["resourceType"].unique().tolist()) -selected_resource_type = st.selectbox("Filter by Resource Type:", options=all_resource_types) +selected_resource_type = st.selectbox("Filter by Resource Type:", options=all_resource_types, on_change=reset_selection) # --- Filter the data based on selection --- if selected_resource_type == "All": @@ -169,6 +176,9 @@ st.divider() +# --- Annotation Quality by Resource Type --- +st.subheader("Analysis by Resource Type") + # --- Prepare data for charts --- chart_data = [] # Use all_resource_types[1:] to skip the "All" option @@ -245,7 +255,31 @@ else: st.info("No data available for the selected resource type to generate charts.") -# --- Combine auto and manual patterns for display --- +# --- Display Matched, Unmatched and Missed Tags --- +st.subheader("Tag Details") +tag_col1, tag_col2, tag_col3 = st.columns(3) + +with tag_col1: + st.metric("✅ Matched Tags", f"{total_matched}") + st.dataframe( + pd.DataFrame(sorted(list(matched_tags_set)), columns=["Tag"]), use_container_width=True, hide_index=True + ) + +with tag_col2: + st.metric("❓ Unmatched by Annotation", f"{total_unmatched}") + st.dataframe( + pd.DataFrame(sorted(list(unmatched_by_annotation_set)), columns=["Tag"]), + use_container_width=True, + hide_index=True, + ) + +with tag_col3: + st.metric("❗️ Missed by Pattern", f"{total_missed}") + st.dataframe( + pd.DataFrame(sorted(list(missed_by_pattern_set)), 
columns=["Tag"]), use_container_width=True, hide_index=True + ) + +# --- Pattern Catalog Expander with Tabs --- with st.expander("View Full Pattern Catalog"): # Standardize column names for merging df_auto_patterns.rename(columns={"resourceType": "resource_type", "pattern": "sample"}, inplace=True) @@ -273,47 +307,135 @@ column_config={"sample": "Pattern"}, ) -# --- Display Matched, Unmatched and Missed Tags --- -st.subheader("Tag Details") -tag_col1, tag_col2, tag_col3 = st.columns(3) +# --- File-Level Table --- +st.header("Per-File Annotation Quality") -with tag_col1: - st.metric("✅ Matched Tags", f"{total_matched}") - st.dataframe( - pd.DataFrame(sorted(list(matched_tags_set)), columns=["Tag"]), use_container_width=True, hide_index=True +# --- Fetch file metadata and merge it with the quality data --- +df_file_meta = fetch_annotation_states(annotation_state_view, file_view) +if not df_file_meta.empty: + df_display_unfiltered = pd.merge( + df_quality, df_file_meta, left_on="startNode", right_on="fileExternalId", how="left" ) +else: + df_display_unfiltered = df_quality + +# --- Advanced Filtering Section --- +with st.expander("Filter Per-File Quality Table"): + + # --- DYNAMIC METADATA FILTER --- + # Define columns that should not be offered as metadata filters + excluded_columns = [ + "Select", + "startNode", + "potentialTags", + "actualAnnotations", + "matchedTags", + "unmatchedByAnnotation", + "missedByPattern", + "coverageRate", + "completenessRate", + "fileExternalId", + "externalId", + "space", + "annotatedPageCount", + "annotationMessage", + "fileAliases", + "fileAssets", + "fileIsuploaded", + "jobId", + "linkedFile", + "pageCount", + "patternModeJobId", + "sourceCreatedUser", + "sourceCreatedTime", + "sourceUpdatedTime", + "sourceUpdatedUser", + "fileSourceupdateduser", + "fileSourcecreatedUser", + "fileSourceId", + "createdTime", + "fileSourcecreateduser", + "patternModeMessage", + "fileSourceupdatedtime", + "fileSourcecreatedtime", + "fileUploadedtime", 
+ ] + + # Get the list of available metadata columns for filtering + filterable_columns = sorted([col for col in df_display_unfiltered.columns if col not in excluded_columns]) + + filter_col1, filter_col2 = st.columns(2) + + with filter_col1: + selected_column = st.selectbox( + "Filter by Metadata Property", options=["None"] + filterable_columns, on_change=reset_selection + ) -with tag_col2: - st.metric("❓ Unmatched by Annotation", f"{total_unmatched}") - st.dataframe( - pd.DataFrame(sorted(list(unmatched_by_annotation_set)), columns=["Tag"]), - use_container_width=True, - hide_index=True, - ) + selected_values = [] + if selected_column != "None": + unique_values = sorted(df_display_unfiltered[selected_column].dropna().unique().tolist()) + with filter_col2: + selected_values = st.multiselect( + f"Select Value(s) for {selected_column}", options=unique_values, on_change=reset_selection + ) -with tag_col3: - st.metric("❗️ Missed by Pattern", f"{total_missed}") - st.dataframe( - pd.DataFrame(sorted(list(missed_by_pattern_set)), columns=["Tag"]), use_container_width=True, hide_index=True - ) + coverage_range = st.slider("Filter by Annotation Coverage (%)", 0, 100, (0, 100), on_change=reset_selection) + completeness_range = st.slider("Filter by Pattern Completeness (%)", 0, 100, (0, 100), on_change=reset_selection) +df_display = df_display_unfiltered.copy() +# Apply filters +if selected_column != "None" and selected_values: + df_display = df_display[df_display[selected_column].isin(selected_values)] -# --- File-Level Table --- -st.header("Per-File Annotation Quality") -st.info("✔️ Select a file in the table below to see a detailed breakdown of its tags.") +df_display = df_display[ + (df_display["coverageRate"] >= coverage_range[0]) & (df_display["coverageRate"] <= coverage_range[1]) +] +df_display = df_display[ + (df_display["completenessRate"] >= completeness_range[0]) + & (df_display["completenessRate"] <= completeness_range[1]) +] + +# --- Reset the index after all 
filtering is complete --- +df_display = df_display.reset_index(drop=True) -df_display = df_quality.sort_values(by="coverageRate").reset_index(drop=True) df_display.insert(0, "Select", False) -if st.session_state.get("selected_row_index") is not None and st.session_state.selected_row_index < len(df_display): - df_display.at[st.session_state.selected_row_index, "Select"] = True +# --- Column configuration for the data editor --- +default_columns = [ + "Select", + "fileName", + "fileSourceid", + "fileMimetype", + "coverageRate", + "completenessRate", + "lastUpdatedTime", +] +all_columns = df_display.columns.tolist() + +with st.popover("Customize Table Columns"): + selected_columns = st.multiselect( + "Select columns to display:", + options=all_columns, + default=[col for col in default_columns if col in all_columns], # Ensure default is valid + ) + +if not selected_columns: + st.warning("Please select at least one column to display.") + st.stop() + + +if st.session_state.get("selected_row_index") is not None: + if st.session_state.selected_row_index < len(df_display): + df_display.at[st.session_state.selected_row_index, "Select"] = True edited_df = st.data_editor( - df_display, + df_display[selected_columns], key="quality_table_editor", column_config={ "Select": st.column_config.CheckboxColumn(required=True), - "startNode": "File External ID", + "fileName": "File Name", + "fileSourceid": "Source ID", + "fileMimetype": "Mime Type", "potentialTags": "Potential Tags", "actualAnnotations": "Actual Annotations", "coverageRate": st.column_config.ProgressColumn( @@ -330,9 +452,9 @@ min_value=0, max_value=100, ), + "lastUpdatedTime": "Last Updated Time", }, use_container_width=True, - column_order=("Select", "startNode", "coverageRate", "completenessRate"), hide_index=True, disabled=df_display.columns.difference(["Select"]), ) @@ -354,91 +476,95 @@ st.subheader("Tag Comparison Drill-Down") if st.session_state.get("selected_row_index") is not None: - selected_file_data = 
df_display.iloc[st.session_state.selected_row_index] - selected_file = selected_file_data["startNode"] - st.markdown(f"Displaying details for file: **{selected_file}**") - - file_space_series = df_patterns[df_patterns["startNode"] == selected_file]["startNodeSpace"] - if not file_space_series.empty: - file_space = file_space_series.iloc[0] - file_node_id = NodeId(space=file_space, external_id=selected_file) - - # --- Three-Column Tag Comparison (prepare dataframes first) --- - df_potential_tags_details = df_patterns[df_patterns["startNode"] == selected_file][ - ["text", "resourceType", "regions"] - ] - - if not df_annotations.empty: - df_actual_annotations_details = df_annotations[df_annotations["startNode"] == selected_file][ - ["startNodeText", "endNodeResourceType"] - ].rename(columns={"startNodeText": "text", "endNodeResourceType": "resourceType"}) - else: - df_actual_annotations_details = pd.DataFrame(columns=["text", "resourceType"]) - - potential_tags_set = set(df_potential_tags_details["text"]) - actual_tags_set = set(df_actual_annotations_details["text"]) - - matched_tags_set = potential_tags_set.intersection(actual_tags_set) - unmatched_tags_set = potential_tags_set - actual_tags_set - missed_tags_set = actual_tags_set - potential_tags_set - - matched_df = df_potential_tags_details[ - df_potential_tags_details["text"].isin(matched_tags_set) - ].drop_duplicates(subset=["text", "resourceType"]) - unmatched_df = df_potential_tags_details[ - df_potential_tags_details["text"].isin(unmatched_tags_set) - ].drop_duplicates(subset=["text", "resourceType"]) - missed_df = df_actual_annotations_details[ - df_actual_annotations_details["text"].isin(missed_tags_set) - ].drop_duplicates() - - if st.button("Create in Canvas", key=f"canvas_btn_{selected_file}"): - with st.spinner("Generating Industrial Canvas with bounding boxes..."): - _, _, file_view_config = fetch_extraction_pipeline_config(selected_pipeline) - - unmatched_tags_for_canvas = unmatched_df[["text", 
"regions"]].to_dict("records") - - canvas_url = generate_file_canvas( - file_id=file_node_id, - file_view=file_view_config, - ep_config=ep_config, - unmatched_tags_with_regions=unmatched_tags_for_canvas, - ) - if canvas_url: - st.session_state["generated_canvas_url"] = canvas_url - else: - st.session_state.pop("generated_canvas_url", None) - - if "generated_canvas_url" in st.session_state and st.session_state.generated_canvas_url: - st.markdown( - f"**[Open Last Generated Canvas]({st.session_state.generated_canvas_url})**", unsafe_allow_html=True - ) + if st.session_state.selected_row_index < len(df_display): + selected_file_data = df_display.iloc[st.session_state.selected_row_index] + selected_file = selected_file_data["startNode"] + st.markdown(f"Displaying details for file: **{selected_file}**") + + file_space_series = df_patterns[df_patterns["startNode"] == selected_file]["startNodeSpace"] + if not file_space_series.empty: + file_space = file_space_series.iloc[0] + file_node_id = NodeId(space=file_space, external_id=selected_file) + + # --- Three-Column Tag Comparison (prepare dataframes first) --- + df_potential_tags_details = df_patterns[df_patterns["startNode"] == selected_file][ + ["text", "resourceType", "regions"] + ] + + if not df_annotations.empty: + df_actual_annotations_details = df_annotations[df_annotations["startNode"] == selected_file][ + ["startNodeText", "endNodeResourceType"] + ].rename(columns={"startNodeText": "text", "endNodeResourceType": "resourceType"}) + else: + df_actual_annotations_details = pd.DataFrame(columns=["text", "resourceType"]) + + potential_tags_set = set(df_potential_tags_details["text"]) + actual_tags_set = set(df_actual_annotations_details["text"]) + + matched_tags_set = potential_tags_set.intersection(actual_tags_set) + unmatched_tags_set = potential_tags_set - actual_tags_set + missed_tags_set = actual_tags_set - potential_tags_set + + matched_df = df_potential_tags_details[ + 
df_potential_tags_details["text"].isin(matched_tags_set) + ].drop_duplicates(subset=["text", "resourceType"]) + unmatched_df = df_potential_tags_details[ + df_potential_tags_details["text"].isin(unmatched_tags_set) + ].drop_duplicates(subset=["text", "resourceType"]) + missed_df = df_actual_annotations_details[ + df_actual_annotations_details["text"].isin(missed_tags_set) + ].drop_duplicates() + + if st.button("Create in Canvas", key=f"canvas_btn_{selected_file}"): + with st.spinner("Generating Industrial Canvas with bounding boxes..."): + _, _, file_view_config = fetch_extraction_pipeline_config(selected_pipeline) + + unmatched_tags_for_canvas = unmatched_df[["text", "regions"]].to_dict("records") + + canvas_url = generate_file_canvas( + file_id=file_node_id, + file_view=file_view_config, + ep_config=ep_config, + unmatched_tags_with_regions=unmatched_tags_for_canvas, + ) + if canvas_url: + st.session_state["generated_canvas_url"] = canvas_url + else: + st.session_state.pop("generated_canvas_url", None) + + if "generated_canvas_url" in st.session_state and st.session_state.generated_canvas_url: + st.markdown( + f"**[Open Last Generated Canvas]({st.session_state.generated_canvas_url})**", unsafe_allow_html=True + ) - col1, col2, col3 = st.columns(3) + col1, col2, col3 = st.columns(3) + + with col1: + st.metric("✅ Matched Tags", len(matched_df)) + st.dataframe( + matched_df[["text", "resourceType"]], + column_config={"text": "Tag", "resourceType": "Resource Type"}, + use_container_width=True, + hide_index=True, + ) + with col2: + st.metric("❓ Unmatched by Annotation", len(unmatched_df)) + st.dataframe( + unmatched_df[["text", "resourceType"]], + column_config={"text": "Tag", "resourceType": "Resource Type"}, + use_container_width=True, + hide_index=True, + ) + with col3: + st.metric("❗️ Missed by Pattern", len(missed_df)) + st.dataframe( + missed_df, + column_config={"text": "Tag", "resourceType": "Resource Type"}, + use_container_width=True, + hide_index=True, + ) + 
else: + st.info("✔️ Previous selection is not in the filtered view. Please select a new file.") - with col1: - st.metric("✅ Matched Tags", len(matched_df)) - st.dataframe( - matched_df[["text", "resourceType"]], - column_config={"text": "Tag", "resourceType": "Resource Type"}, - use_container_width=True, - hide_index=True, - ) - with col2: - st.metric("❓ Unmatched by Annotation", len(unmatched_df)) - st.dataframe( - unmatched_df[["text", "resourceType"]], - column_config={"text": "Tag", "resourceType": "Resource Type"}, - use_container_width=True, - hide_index=True, - ) - with col3: - st.metric("❗️ Missed by Pattern", len(missed_df)) - st.dataframe( - missed_df, - column_config={"text": "Tag", "resourceType": "Resource Type"}, - use_container_width=True, - hide_index=True, - ) else: st.info("✔️ Select a file in the table above to see a detailed breakdown of its tags.") From cbba680b7f2b69d3a2d68d6246f30ba14fe782ac Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Mon, 8 Sep 2025 09:14:45 -0500 Subject: [PATCH 040/128] added to the default columns displayed --- .../file_annotation_dashboard/pages/Annotation_Quality.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/pages/Annotation_Quality.py b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/pages/Annotation_Quality.py index 12702abc..9e58af47 100644 --- a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/pages/Annotation_Quality.py +++ b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/pages/Annotation_Quality.py @@ -408,6 +408,8 @@ def reset_selection(): "fileMimetype", "coverageRate", "completenessRate", + "annotationMessage", + "patternModeMessage", "lastUpdatedTime", ] all_columns = df_display.columns.tolist() @@ -452,6 +454,8 @@ def reset_selection(): min_value=0, max_value=100, ), + "annotationMessage": "Annotation Message", + 
"patternModeMessage": "Pattern Mode Message", "lastUpdatedTime": "Last Updated Time", }, use_container_width=True, From 9d5e5b9793a5531cb8e0e9d7511f3accfaf50e1c Mon Sep 17 00:00:00 2001 From: "Nguyen, Hieu" Date: Mon, 8 Sep 2025 10:42:48 -0500 Subject: [PATCH 041/128] refactored pattern mode logging to accurately reflect total sample patterns based on available in-memory patterns --- .../services/LaunchService.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/LaunchService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/LaunchService.py index 0a9a3c25..156404c8 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/LaunchService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/LaunchService.py @@ -378,8 +378,13 @@ def _process_batch(self, batch: BatchOfPairedNodes): # Run diagram detect on pattern mode pattern_job_id: int | None = None if self.config.launch_function.pattern_mode: + total_patterns = 0 + if self.in_memory_patterns and len(self.in_memory_patterns) >= 2: + total_patterns = len(self.in_memory_patterns[0].get('sample', [])) + len(self.in_memory_patterns[1].get('sample', [])) + elif self.in_memory_patterns and len(self.in_memory_patterns) >= 1: + total_patterns = len(self.in_memory_patterns[0].get('sample', [])) self.logger.info( - f"Running pattern mode diagram detect on {batch.size()} files with {len(self.in_memory_patterns[0]['sample']) + len(self.in_memory_patterns[1]['sample'])} sample patterns" + f"Running pattern mode diagram detect on {batch.size()} files with {total_patterns} sample patterns" ) pattern_job_id = self.annotation_service.run_pattern_mode_detect( files=batch.file_references, pattern_samples=self.in_memory_patterns @@ -431,8 +436,13 @@ def _process_batch(self, 
batch: BatchOfPairedNodes): # Run diagram detect on pattern mode pattern_job_id: int | None = None if self.config.launch_function.pattern_mode: + total_patterns = 0 + if self.in_memory_patterns and len(self.in_memory_patterns) >= 2: + total_patterns = len(self.in_memory_patterns[0].get('sample', [])) + len(self.in_memory_patterns[1].get('sample', [])) + elif self.in_memory_patterns and len(self.in_memory_patterns) >= 1: + total_patterns = len(self.in_memory_patterns[0].get('sample', [])) self.logger.info( - f"Running pattern mode diagram detect on {batch.size()} files with {len(self.in_memory_patterns[0]['sample']) + len(self.in_memory_patterns[1]['sample'])} sample patterns" + f"Running pattern mode diagram detect on {batch.size()} files with {total_patterns} sample patterns" ) pattern_job_id = self.annotation_service.run_pattern_mode_detect( files=batch.file_references, pattern_samples=self.in_memory_patterns From bc8ed62b2996b759b6ba576d2f23f6581733017a Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Mon, 8 Sep 2025 12:36:08 -0500 Subject: [PATCH 042/128] handled potential type error from the function_id_str and call_id_str --- .../file_annotation_dashboard/Extraction_Pipeline.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/Extraction_Pipeline.py b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/Extraction_Pipeline.py index bda688fd..569ae3ce 100644 --- a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/Extraction_Pipeline.py +++ b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/Extraction_Pipeline.py @@ -229,10 +229,13 @@ if run.message: st.code(run.message, language="text") - function_id = int(parsed_message.get("function_id")) - call_id = int(parsed_message.get("call_id")) + # Check if the IDs exist and are not None + function_id_str = 
parsed_message.get("function_id") + call_id_str = parsed_message.get("call_id") - if function_id and call_id: + if function_id_str and call_id_str: + function_id = int(parsed_message.get("function_id")) + call_id = int(parsed_message.get("call_id")) button_key = f"log_btn_all_{call_id}" if st.button("Fetch Function Logs", key=button_key): with st.spinner("Fetching logs..."): From 8c20695f8d8315ac8bf8d4b2dd3d9cad6f49be7a Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Mon, 8 Sep 2025 14:04:02 -0500 Subject: [PATCH 043/128] added new properties to the data model --- .../data_models/hdm.container.yaml | 53 +++++++++++++++++++ .../data_models/hdm.view.yaml | 40 ++++++++++++-- 2 files changed, 89 insertions(+), 4 deletions(-) diff --git a/modules/contextualization/cdf_file_annotation/data_models/hdm.container.yaml b/modules/contextualization/cdf_file_annotation/data_models/hdm.container.yaml index 08cf585a..707754e3 100644 --- a/modules/contextualization/cdf_file_annotation/data_models/hdm.container.yaml +++ b/modules/contextualization/cdf_file_annotation/data_models/hdm.container.yaml @@ -11,6 +11,7 @@ type: list: false type: int64 + name: Annotated page count annotationMessage: autoIncrement: false immutable: false @@ -22,6 +23,7 @@ collation: ucs_basic list: false type: text + name: Annotation message annotationStatus: autoIncrement: false immutable: false @@ -30,6 +32,7 @@ collation: ucs_basic list: false type: text + name: Annotation status patternModeMessage: autoIncrement: false immutable: false @@ -38,6 +41,7 @@ type: text collation: ucs_basic list: false + name: Pattern mode message attemptCount: autoIncrement: false immutable: false @@ -45,6 +49,7 @@ type: list: false type: int64 + name: Attempt count diagramDetectJobId: autoIncrement: false immutable: false @@ -52,6 +57,7 @@ type: list: false type: int64 + name: Diagram detect job Id patternModeJobId: autoIncrement: false immutable: false @@ -59,6 +65,7 @@ type: list: false type: int64 + name: Pattern mode job 
Id linkedFile: autoIncrement: false immutable: false @@ -66,6 +73,7 @@ type: list: false type: direct + name: Linked file pageCount: autoIncrement: false immutable: false @@ -73,6 +81,39 @@ type: list: false type: int64 + name: Page count + launchFunctionId: # NOTE: Id of the function that was called. Will be useful as an index for query calls. B-tree + type: + list: false + type: int64 + immutable: false + nullable: true + autoIncrement: false + name: Launch function Id + launchFunctionCallId: # NOTE: specific Id that points to the function log. Will be useful as an index for query calls. B-tree + type: + list: false + type: int64 + immutable: false + nullable: true + autoIncrement: false + name: Launch function call Id + finalizeFunctionId: # NOTE: Id of the function that was called. Will be useful as an index for query calls. B-tree + type: + list: false + type: int64 + immutable: false + nullable: true + autoIncrement: false + name: Finalize function Id + finalizeFunctionCallId: # NOTE: specific Id that points to the function log. Will be useful as an index for query calls. 
B-tree + type: + list: false + type: int64 + immutable: false + nullable: true + autoIncrement: false + name: Finalize function call Id space: {{ annotationStateSchemaSpace }} usedFor: node indexes: @@ -85,4 +126,16 @@ indexType: btree properties: - diagramDetectJobId + cursorable: true + launchFunction: + indexType: btree + properties: + - launchFunctionId + - launchFunctionCallId + cursorable: true + finalizeFunction: + indexType: btree + properties: + - finalizeFunctionId + - finalizeFunctionCallId cursorable: true \ No newline at end of file diff --git a/modules/contextualization/cdf_file_annotation/data_models/hdm.view.yaml b/modules/contextualization/cdf_file_annotation/data_models/hdm.view.yaml index d0b8f326..e501ef2d 100644 --- a/modules/contextualization/cdf_file_annotation/data_models/hdm.view.yaml +++ b/modules/contextualization/cdf_file_annotation/data_models/hdm.view.yaml @@ -49,16 +49,16 @@ space: {{ annotationStateSchemaSpace }} type: container containerPropertyIdentifier: diagramDetectJobId - description: Diagram detect job ID - name: Diagram detect job ID + description: Diagram detect job Id + name: Diagram detect job Id patternModeJobId: container: externalId: {{ annotationStateExternalId }} space: {{ annotationStateSchemaSpace }} type: container containerPropertyIdentifier: patternModeJobId - description: Diagram detect job ID with pattern mode - name: Pattern mode job ID + description: Diagram detect job Id with pattern mode + name: Pattern mode job Id patternModeMessage: container: externalId: {{ annotationStateExternalId }} @@ -112,5 +112,37 @@ containerPropertyIdentifier: sourceUpdatedTime description: Last updated time name: Last updated time + launchFunctionId: + container: + externalId: {{ annotationStateExternalId }} + space: {{ annotationStateSchemaSpace }} + type: container + containerPropertyIdentifier: launchFunctionId + description: Id of the launch function that was called + name: Launch function Id + launchFunctionCallId: + 
container: + externalId: {{ annotationStateExternalId }} + space: {{ annotationStateSchemaSpace }} + type: container + containerPropertyIdentifier: launchFunctionCallId + description: Specific Id that points to the function log that created the diagram detect job for the file + name: Launch function call Id + finalizeFunctionId: + container: + externalId: {{ annotationStateExternalId }} + space: {{ annotationStateSchemaSpace }} + type: container + containerPropertyIdentifier: finalizeFunctionId + description: Id of the finalize function that was called + name: Finalize function Id + finalizeFunctionCallId: + container: + externalId: {{ annotationStateExternalId }} + space: {{ annotationStateSchemaSpace }} + type: container + containerPropertyIdentifier: finalizeFunctionCallId + description: Specific Id that points to the function log that applied annotations to the file + name: Finalize function call Id space: {{ annotationStateSchemaSpace }} version: {{ annotationStateVersion }} From 31ecef5d679257056b4da94535f13b0bdaf72e80 Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Mon, 8 Sep 2025 21:09:35 -0500 Subject: [PATCH 044/128] adding function call info to the annotation state --- .../fn_file_annotation_finalize/handler.py | 13 ++++++++++--- .../services/FinalizeService.py | 9 +++++++++ .../functions/fn_file_annotation_launch/handler.py | 8 ++++++-- .../services/LaunchService.py | 8 ++++++++ 4 files changed, 33 insertions(+), 5 deletions(-) diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/handler.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/handler.py index feee0dc3..26da2556 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/handler.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/handler.py @@ -45,7 +45,7 @@ def handle(data: dict, function_call_info: dict, client: CogniteClient) -> 
dict: ) finalize_instance, report_instance = _create_finalize_service( - config_instance, client, logger_instance, tracker_instance + config_instance, client, logger_instance, tracker_instance, function_call_info ) run_status: str = "success" @@ -95,7 +95,11 @@ def run_locally(config_file: dict[str, str], log_path: str | None = None): tracker_instance = PerformanceTracker() finalize_instance, report_instance = _create_finalize_service( - config_instance, client, logger_instance, tracker_instance + config_instance, + client, + logger_instance, + tracker_instance, + function_call_info={"function_id": None, "call_id": None}, ) try: @@ -138,7 +142,9 @@ def run_locally_parallel( thread_4.join() -def _create_finalize_service(config, client, logger, tracker) -> tuple[AbstractFinalizeService, IReportService]: +def _create_finalize_service( + config, client, logger, tracker, function_call_info +) -> tuple[AbstractFinalizeService, IReportService]: """ Instantiate Finalize with interfaces. """ @@ -153,6 +159,7 @@ def _create_finalize_service(config, client, logger, tracker) -> tuple[AbstractF retrieve_service=retrieve_instance, apply_service=apply_instance, report_service=report_instance, + function_call_info=function_call_info, ) return finalize_instance, report_instance diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/FinalizeService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/FinalizeService.py index dd6d51b7..0a4f99b9 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/FinalizeService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/FinalizeService.py @@ -72,6 +72,7 @@ def __init__( retrieve_service: IRetrieveService, apply_service: IApplyService, report_service: IReportService, + function_call_info: dict, ): super().__init__( client, @@ -85,10 +86,14 @@ def 
__init__( self.annotation_state_view: ViewPropertyConfig = config.data_model_views.annotation_state_view self.file_view: ViewPropertyConfig = config.data_model_views.file_view + self.page_range: int = config.launch_function.annotation_service.page_range self.max_retries: int = config.finalize_function.max_retry_attempts self.clean_old_annotations: bool = config.finalize_function.clean_old_annotations + self.function_id: int | None = function_call_info.get("function_id") + self.call_id: int | None = function_call_info.get("call_id") + def run(self) -> Literal["Done"] | None: """ Retrieves the result of a diagram detect job and then pushes the annotation to mpcFile. @@ -388,6 +393,8 @@ def _process_annotation_state( "annotationMessage": annotation_message, "patternModeMessage": pattern_mode_message, "attemptCount": attempt_count, + "finalizeFunctionId": self.function_id, + "finalizeFunctionCallId": self.call_id, } else: update_properties = { @@ -398,6 +405,8 @@ def _process_annotation_state( "attemptCount": attempt_count, "annotatedPageCount": annotated_page_count, "pageCount": page_count, + "finalizeFunctionId": self.function_id, + "finalizeFunctionCallId": self.call_id, } node_apply = NodeApply( diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/handler.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/handler.py index 4ff0be16..fedd7396 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/handler.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/handler.py @@ -47,6 +47,7 @@ def handle(data: dict, function_call_info: dict, client: CogniteClient) -> dict: client=client, logger=logger_instance, tracker=tracker_instance, + function_call_info=function_call_info, ) run_status: str = "success" @@ -105,6 +106,7 @@ def run_locally(config_file: dict[str, str], log_path: str | None = None): client=client, 
logger=logger_instance, tracker=tracker_instance, + function_call_info={"function_id": None, "call_id": None}, ) try: while True: @@ -129,7 +131,7 @@ def run_locally(config_file: dict[str, str], log_path: str | None = None): logger_instance.close() -def _create_launch_service(config, client, logger, tracker) -> AbstractLaunchService: +def _create_launch_service(config, client, logger, tracker, function_call_info) -> AbstractLaunchService: cache_instance: ICacheService = create_general_cache_service(config, client, logger) data_model_instance: IDataModelService = create_general_data_model_service(config, client, logger) annotation_instance: IAnnotationService = create_general_annotation_service(config, client, logger) @@ -141,11 +143,12 @@ def _create_launch_service(config, client, logger, tracker) -> AbstractLaunchSer data_model_service=data_model_instance, cache_service=cache_instance, annotation_service=annotation_instance, + function_call_info=function_call_info, ) return launch_instance -def _create_local_launch_service(config, client, logger, tracker) -> AbstractLaunchService: +def _create_local_launch_service(config, client, logger, tracker, function_call_info) -> AbstractLaunchService: cache_instance: ICacheService = create_general_cache_service(config, client, logger) data_model_instance: IDataModelService = create_general_data_model_service(config, client, logger) annotation_instance: IAnnotationService = create_general_annotation_service(config, client, logger) @@ -157,6 +160,7 @@ def _create_local_launch_service(config, client, logger, tracker) -> AbstractLau data_model_service=data_model_instance, cache_service=cache_instance, annotation_service=annotation_instance, + function_call_info=function_call_info, ) return launch_instance diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/LaunchService.py 
b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/LaunchService.py index 156404c8..420d07de 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/LaunchService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/LaunchService.py @@ -74,6 +74,7 @@ def __init__( data_model_service: IDataModelService, cache_service: ICacheService, annotation_service: IAnnotationService, + function_call_info: dict, ): super().__init__( client, @@ -98,6 +99,9 @@ def __init__( self.primary_scope_property: str = self.config.launch_function.primary_scope_property self.secondary_scope_property: str | None = self.config.launch_function.secondary_scope_property + self.function_id: int | None = function_call_info.get("function_id") + self.call_id: int | None = function_call_info.get("call_id") + self.reset_files: bool = False if self.config.prepare_function.get_files_for_annotation_reset_query: self.reset_files = True @@ -373,6 +377,8 @@ def _process_batch(self, batch: BatchOfPairedNodes): "annotationStatus": AnnotationStatus.PROCESSING, "sourceUpdatedTime": datetime.now(timezone.utc).replace(microsecond=0).isoformat(), "diagramDetectJobId": job_id, + "launchFunctionId": self.function_id, + "launchFunctionCallId": self.call_id, } # Run diagram detect on pattern mode @@ -431,6 +437,8 @@ def _process_batch(self, batch: BatchOfPairedNodes): "annotationStatus": AnnotationStatus.PROCESSING, "sourceUpdatedTime": datetime.now(timezone.utc).replace(microsecond=0).isoformat(), "diagramDetectJobId": job_id, + "launchFunctionId": self.function_id, + "launchFunctionCallId": self.call_id, } # Run diagram detect on pattern mode From ebe05b63532d0a80c11b222d887fd68194de18c3 Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Wed, 10 Sep 2025 21:06:29 -0500 Subject: [PATCH 045/128] Improved streamlit dashboard to account for new logging properties --- 
.../file_annotation_dashboard.Streamlit.yaml | 2 +- .../Extraction_Pipeline.py | 280 ----------- .../Pipeline_Health.py | 456 ++++++++++++++++++ .../file_annotation_dashboard/helper.py | 293 ++++++----- .../pages/Status_Overview.py | 146 ------ 5 files changed, 593 insertions(+), 584 deletions(-) delete mode 100644 modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/Extraction_Pipeline.py create mode 100644 modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/Pipeline_Health.py delete mode 100644 modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/pages/Status_Overview.py diff --git a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard.Streamlit.yaml b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard.Streamlit.yaml index fbb9e422..7a60f61e 100644 --- a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard.Streamlit.yaml +++ b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard.Streamlit.yaml @@ -5,4 +5,4 @@ description: Dashboard to inspect the health of the File Annotation Pipeline published: true theme: Light dataSetExternalId: {{ annotationDatasetExternalId }} -entrypoint: Extraction_Pipeline.py \ No newline at end of file +entrypoint: Pipeline_Health.py \ No newline at end of file diff --git a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/Extraction_Pipeline.py b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/Extraction_Pipeline.py deleted file mode 100644 index 569ae3ce..00000000 --- a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/Extraction_Pipeline.py +++ /dev/null @@ -1,280 +0,0 @@ -import streamlit as st -import pandas as pd -import altair as alt -from cognite.client import CogniteClient -from datetime import datetime, timedelta -from helper 
import ( - fetch_annotation_states, - fetch_pipeline_run_history, - process_runs_for_graphing, - fetch_extraction_pipeline_config, - calculate_success_failure_stats, - fetch_function_logs, - parse_run_message, - find_pipelines, -) - -st.set_page_config( - page_title="Pipeline Run History", - page_icon="📈", - layout="wide", -) - -# --- Sidebar for Pipeline Selection --- -st.sidebar.title("Pipeline Selection") -# The helper function now returns a pre-filtered list -pipeline_ids = find_pipelines() - -if not pipeline_ids: - st.info("No active file annotation pipelines found to monitor.") - st.stop() - -# Use session_state to remember the selection across pages -if "selected_pipeline" not in st.session_state or st.session_state.selected_pipeline not in pipeline_ids: - st.session_state.selected_pipeline = pipeline_ids[0] - -# The selectbox displays the filtered list for the user -selected_pipeline = st.sidebar.selectbox("Select a pipeline to monitor:", options=pipeline_ids, key="selected_pipeline") - -# --- Main Application --- -st.title("Pipeline Run History") -st.markdown("This page provides statistics and detailed history for the selected extraction pipeline run.") - -# Fetch data using the user's selection -pipeline_runs = fetch_pipeline_run_history(selected_pipeline) - - -# --- Pipeline Statistics Section --- -if pipeline_runs: - # Time window selection - time_window_map = { - "All": None, - "Last 24 Hours": 24, - "Last 7 Days": 7 * 24, - "Last 30 Days": 30 * 24, - } - time_window_option = st.sidebar.selectbox( - "Filter by Time Window:", - options=list(time_window_map.keys()), - ) - window_hours = time_window_map[time_window_option] - - if window_hours is not None: - now = pd.Timestamp.now(tz="UTC") - filter_start_time = now - timedelta(hours=window_hours) - # Filter runs based on the time window - recent_pipeline_runs = [ - run - for run in pipeline_runs - if pd.to_datetime(run.created_time, unit="ms").tz_localize("UTC") > filter_start_time - ] - else: - # If 
'All' is selected, use the original unfiltered list of runs - recent_pipeline_runs = pipeline_runs - - # MODIFICATION: Check if 'recent_pipeline_runs' has data BEFORE processing. - # If it's empty, display a message. Otherwise, proceed with stats and graphs. - if not recent_pipeline_runs: - st.warning("No pipeline runs found in the selected time window.") - else: - # --- Calculate detailed stats for the selected time window --- - df_runs_for_graphing = process_runs_for_graphing(recent_pipeline_runs) - - launch_success = 0 - launch_failure = 0 - finalize_success = 0 - finalize_failure = 0 - - for run in recent_pipeline_runs: - # We need to parse the message to determine the caller type - parsed_message = parse_run_message(run.message) - caller = parsed_message.get("caller") - - if caller == "Launch": - if run.status == "success": - launch_success += 1 - elif run.status == "failure": - launch_failure += 1 - elif caller == "Finalize": - if run.status == "success": - finalize_success += 1 - elif run.status == "failure": - finalize_failure += 1 - - total_launched_recent = int(df_runs_for_graphing[df_runs_for_graphing["type"] == "Launch"]["count"].sum()) - total_finalized_recent = int(df_runs_for_graphing[df_runs_for_graphing["type"] == "Finalize"]["count"].sum()) - - # --- Display Metrics and Graphs in two columns --- - g_col1, g_col2 = st.columns(2) - - with g_col1: - st.subheader("Launch Runs") - m_col1, m_col2, m_col3 = st.columns(3) - m_col1.metric( - f"Files Launched", - f"{total_launched_recent:,}", - ) - m_col2.metric( - "Successful Runs", - f"{launch_success:,}", - ) - m_col3.metric( - "Failed Runs", - f"{launch_failure:,}", - delta=f"{launch_failure:,}" if launch_failure > 0 else "0", - delta_color="inverse", - ) - - with g_col2: - st.subheader("Finalize Runs") - m_col4, m_col5, m_col6 = st.columns(3) - m_col4.metric( - f"Files Finalized", - f"{total_finalized_recent:,}", - ) - m_col5.metric( - "Successful Runs", - f"{finalize_success:,}", - ) - m_col6.metric( 
- "Failed Runs", - f"{finalize_failure:,}", - delta=f"{finalize_failure:,}" if finalize_failure > 0 else "0", - delta_color="inverse", - ) - - # --- Graphs --- - base_chart = ( - alt.Chart(df_runs_for_graphing) - .mark_circle(size=60, opacity=0.7) - .encode( - x=alt.X("timestamp:T", title="Time of Run"), - y=alt.Y("count:Q", title="Files Processed"), - tooltip=["timestamp:T", "count:Q", "type:N"], - ) - .interactive() - ) - - chart_col1, chart_col2 = st.columns(2) - with chart_col1: - launch_chart = base_chart.transform_filter(alt.datum.type == "Launch").properties( - title="Files Processed per Launch Run" - ) - st.altair_chart(launch_chart, use_container_width=True) - with chart_col2: - finalize_chart = base_chart.transform_filter(alt.datum.type == "Finalize").properties( - title="Files Processed per Finalize Run" - ) - st.altair_chart(finalize_chart, use_container_width=True) - - # --- UNIFIED DETAILED RUN HISTORY --- - with st.expander("View recent runs and fetch logs", expanded=True): - if not recent_pipeline_runs: - st.info("No runs in the selected time window.") - else: - f_col1, f_col2 = st.columns(2) - with f_col1: - run_status_filter = st.radio( - "Filter by run status:", - ("All", "Success", "Failure"), - horizontal=True, - key="run_status_filter", - ) - with f_col2: - caller_type_filter = st.radio( - "Filter by caller type:", - ("All", "Launch", "Finalize"), - horizontal=True, - key="caller_type_filter", - ) - - st.divider() - - filtered_runs = recent_pipeline_runs - if run_status_filter != "All": - filtered_runs = [run for run in filtered_runs if run.status.lower() == run_status_filter.lower()] - - if caller_type_filter != "All": - filtered_runs = [ - run for run in filtered_runs if parse_run_message(run.message).get("caller") == caller_type_filter - ] - - if not filtered_runs: - st.warning(f"No runs match the selected filters.") - else: - # Pagination state - if "page_num" not in st.session_state: - st.session_state.page_num = 0 - - items_per_page = 3 
- start_idx = st.session_state.page_num * items_per_page - end_idx = start_idx + items_per_page - paginated_runs = filtered_runs[start_idx:end_idx] - - # Display logic for each run - for run in paginated_runs: - - if run.status == "success": - st.markdown(f"**Status:** Success") - st.success( - f"Timestamp: {pd.to_datetime(run.created_time, unit='ms').tz_localize('UTC').strftime('%Y-%m-%d %H:%M:%S %Z')}" - ) - else: - st.markdown(f"**Status:** Failure") - st.error( - f"Timestamp: {pd.to_datetime(run.created_time, unit='ms').tz_localize('UTC').strftime('%Y-%m-%d %H:%M:%S %Z')}" - ) - - parsed_message = parse_run_message(run.message) - if run.message: - st.code(run.message, language="text") - - # Check if the IDs exist and are not None - function_id_str = parsed_message.get("function_id") - call_id_str = parsed_message.get("call_id") - - if function_id_str and call_id_str: - function_id = int(parsed_message.get("function_id")) - call_id = int(parsed_message.get("call_id")) - button_key = f"log_btn_all_{call_id}" - if st.button("Fetch Function Logs", key=button_key): - with st.spinner("Fetching logs..."): - logs = fetch_function_logs(function_id=function_id, call_id=call_id) - if logs: - st.text_area( - "Function Logs", - "".join(logs), - height=300, - key=f"log_area_all_{call_id}", - ) - else: - st.warning("No logs found for this run.") - st.divider() - - # Pagination controls - total_pages = (len(filtered_runs) + items_per_page - 1) // items_per_page - if total_pages > 1: - p_col1, p_col2, p_col3 = st.columns([1, 2, 1]) - with p_col1: - if st.button( - "Previous", - disabled=(st.session_state.page_num == 0), - use_container_width=True, - ): - st.session_state.page_num -= 1 - st.rerun() - with p_col2: - st.markdown( - f"
<p style='text-align: center;'>Page {st.session_state.page_num + 1} of {total_pages}</p>
", - unsafe_allow_html=True, - ) - with p_col3: - if st.button( - "Next", - disabled=(st.session_state.page_num >= total_pages - 1), - use_container_width=True, - ): - st.session_state.page_num += 1 - st.rerun() -else: - st.info("No data returned from Cognite Data Fusion. Please check your settings and data model.") diff --git a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/Pipeline_Health.py b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/Pipeline_Health.py new file mode 100644 index 00000000..8f42d063 --- /dev/null +++ b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/Pipeline_Health.py @@ -0,0 +1,456 @@ +import streamlit as st +import pandas as pd +import altair as alt +from datetime import timedelta +from helper import ( + fetch_annotation_states, + fetch_pipeline_run_history, + process_runs_for_graphing, + fetch_extraction_pipeline_config, + fetch_function_logs, + parse_run_message, + find_pipelines, + get_files_by_call_id, + calculate_overview_kpis, + filter_log_lines, +) + +# --- Page Configuration --- +st.set_page_config( + page_title="Pipeline Health", + page_icon="🩺", + layout="wide", +) + +# --- Session State and Callbacks --- +if "selected_pipeline" not in st.session_state: + st.session_state.selected_pipeline = None +if "selected_status_file_index" not in st.session_state: + st.session_state.selected_status_file_index = None +if "page_num" not in st.session_state: # For run history pagination + st.session_state.page_num = 0 + + +def reset_table_selection(): + st.session_state.selected_status_file_index = None + + +# --- Sidebar --- +st.sidebar.title("Pipeline Selection") +pipeline_ids = find_pipelines() + +if not pipeline_ids: + st.info("No active file annotation pipelines found to monitor.") + st.stop() + +if st.session_state.selected_pipeline not in pipeline_ids: + st.session_state.selected_pipeline = pipeline_ids[0] + +selected_pipeline = 
st.sidebar.selectbox("Select a pipeline to monitor:", options=pipeline_ids, key="selected_pipeline") + +# --- Main Application --- +st.title("Pipeline Health Dashboard") + +# --- Data Fetching --- +config_result = fetch_extraction_pipeline_config(selected_pipeline) +if not config_result: + st.error(f"Could not fetch configuration for pipeline: {selected_pipeline}") + st.stop() + +ep_config, annotation_state_view, file_view = config_result +df_annotation_states = fetch_annotation_states(annotation_state_view, file_view) +pipeline_runs = fetch_pipeline_run_history(selected_pipeline) + +# --- Create Tabs --- +overview_tab, explorer_tab, history_tab = st.tabs(["Overview", "File Explorer", "Run History"]) + +# ========================================== +# OVERVIEW TAB +# ========================================== +with overview_tab: + st.subheader("Live Pipeline KPIs") + + kpis = calculate_overview_kpis(df_annotation_states) + + kpi_col1, kpi_col2, kpi_col3 = st.columns(3) + kpi_col1.metric("Files Awaiting Processing", f"{kpis['awaiting_processing']:,}") + kpi_col2.metric("Total Files Processed", f"{kpis['processed_total']:,}") + kpi_col3.metric( + "Overall Failure Rate", + f"{kpis['failure_rate_total']:.2f}%", + delta=f"{kpis['failed_total']:,} failed files", + delta_color="inverse" if kpis["failed_total"] > 0 else "off", + ) + + st.divider() + st.subheader("Pipeline Throughput") + time_agg = st.radio("Aggregate by:", options=["Daily", "Hourly", "Weekly"], horizontal=True, key="time_agg_radio") + + if not df_annotation_states.empty: + df_finalized = df_annotation_states[df_annotation_states["status"].isin(["Annotated", "Failed"])].copy() + if not df_finalized.empty: + if time_agg == "Hourly": + df_finalized["time_bucket"] = df_finalized["lastUpdatedTime"].dt.floor("H") + elif time_agg == "Weekly": + df_finalized["time_bucket"] = ( + df_finalized["lastUpdatedTime"].dt.to_period("W").apply(lambda p: p.start_time) + ) + else: # Daily + df_finalized["time_bucket"] = 
df_finalized["lastUpdatedTime"].dt.date + + daily_counts = df_finalized.groupby("time_bucket").size().reset_index(name="count") + + throughput_chart = ( + alt.Chart(daily_counts) + .mark_bar() + .encode( + x=alt.X("time_bucket:T", title=f"Time ({time_agg})"), + y=alt.Y("count:Q", title="Number of Files Finalized"), + tooltip=["time_bucket:T", "count:Q"], + ) + .properties(title=f"Files Finalized {time_agg}") + .interactive() + ) + st.altair_chart(throughput_chart, use_container_width=True) + else: + st.info("No files have been finalized yet.") + +# ========================================== +# FILE EXPLORER TAB +# ========================================== +with explorer_tab: + st.subheader("File-Centric Debugging") + if df_annotation_states.empty: + st.info("No annotation state data found for this pipeline.") + else: + with st.expander("Filter and Slice Data"): + excluded_columns = [ + "externalId", + "space", + "createdTime", + "lastUpdatedTime", + "fileExternalId", + "fileSpace", + "retries", + "linkedFile", + ] + filterable_columns = sorted( + [ + col + for col in df_annotation_states.columns + if col not in excluded_columns and df_annotation_states[col].nunique() < 100 + ] + ) + + filter_col1, filter_col2 = st.columns(2) + selected_column = filter_col1.selectbox( + "Filter by Metadata Property", + ["None"] + filterable_columns, + on_change=reset_table_selection, + key="meta_filter", + ) + + selected_values = [] + if selected_column != "None": + unique_values = sorted(df_annotation_states[selected_column].dropna().unique().tolist()) + selected_values = filter_col2.multiselect( + f"Select Value(s) for {selected_column}", + unique_values, + on_change=reset_table_selection, + key="value_filter", + ) + + df_display = df_annotation_states.copy() + if selected_column != "None" and selected_values: + df_display = df_display[df_display[selected_column].isin(selected_values)] + + df_display = df_display.sort_values(by="lastUpdatedTime", 
ascending=False).reset_index(drop=True) + df_display.insert(0, "Select", False) + + if ( + st.session_state.selected_status_file_index is not None + and st.session_state.selected_status_file_index < len(df_display) + ): + df_display.at[st.session_state.selected_status_file_index, "Select"] = True + + edited_df = st.data_editor( + df_display, + key="status_table_editor", + use_container_width=True, + hide_index=True, + disabled=df_display.columns.difference(["Select"]), + ) + + selected_indices = edited_df[edited_df.Select].index.tolist() + if len(selected_indices) > 1: + new_selection = [ + idx for idx in selected_indices if idx != st.session_state.get("selected_status_file_index") + ] + st.session_state.selected_status_file_index = new_selection[0] if new_selection else None + st.rerun() + elif len(selected_indices) == 1: + st.session_state.selected_status_file_index = selected_indices[0] + elif len(selected_indices) == 0 and st.session_state.selected_status_file_index is not None: + st.session_state.selected_status_file_index = None + st.rerun() + + if ( + st.session_state.selected_status_file_index is not None + and st.session_state.selected_status_file_index < len(df_display) + ): + st.divider() + st.subheader("Function Log Viewer") + selected_row = df_display.iloc[st.session_state.selected_status_file_index] + file_ext_id = selected_row.get("fileExternalId", "") + + launch_tab, finalize_tab = st.tabs(["Launch Log", "Finalize Log"]) + with launch_tab: + launch_func_id = selected_row.get("launchFunctionId") + launch_call_id = selected_row.get("launchFunctionCallId") + if pd.notna(launch_func_id) and pd.notna(launch_call_id): + with st.spinner("Fetching launch log..."): + launch_logs_raw = "".join( + fetch_function_logs(function_id=int(launch_func_id), call_id=int(launch_call_id)) + ) + if launch_logs_raw: + st.download_button( + "Download Full Launch Log", launch_logs_raw, f"{file_ext_id}_launch_log.txt" + ) + filtered_log = filter_log_lines(launch_logs_raw, 
file_ext_id) + st.write("**Relevant Log Entries:**") + st.code( + filtered_log if filtered_log else "No log entries found for this specific file.", + language="log", + ) + with st.expander("View Full Log"): + st.code(launch_logs_raw, language="log") + else: + st.warning("No launch logs found.") + else: + st.info("No Launch Function call information available for this file.") + + with finalize_tab: + finalize_func_id = selected_row.get("finalizeFunctionId") + finalize_call_id = selected_row.get("finalizeFunctionCallId") + if pd.notna(finalize_func_id) and pd.notna(finalize_call_id): + with st.spinner("Fetching finalize log..."): + finalize_logs_raw = "".join( + fetch_function_logs(function_id=int(finalize_func_id), call_id=int(finalize_call_id)) + ) + if finalize_logs_raw: + st.download_button( + "Download Full Finalize Log", finalize_logs_raw, f"{file_ext_id}_finalize_log.txt" + ) + filtered_log = filter_log_lines(finalize_logs_raw, file_ext_id) + st.write("**Relevant Log Entries:**") + st.code( + filtered_log if filtered_log else "No log entries found for this specific file.", + language="log", + ) + with st.expander("View Full Log"): + st.code(finalize_logs_raw, language="log") + else: + st.warning("No finalize logs found.") + else: + st.info("No Finalize Function call information available for this file.") + +# ========================================== +# RUN HISTORY TAB +# ========================================== +with history_tab: + st.subheader("Run-Centric Analysis") + if not pipeline_runs: + st.info("No pipeline runs found for this pipeline.") + else: + time_window_map = {"All": None, "Last 24 Hours": 24, "Last 7 Days": 7 * 24, "Last 30 Days": 30 * 24} + time_window_option = st.selectbox( + "Filter by Time Window:", options=list(time_window_map.keys()), key="time_window_history" + ) + window_hours = time_window_map[time_window_option] + + if window_hours is not None: + now = pd.Timestamp.now(tz="UTC") + filter_start_time = now - 
timedelta(hours=window_hours) + recent_pipeline_runs = [ + run + for run in pipeline_runs + if pd.to_datetime(run.created_time, unit="ms").tz_localize("UTC") > filter_start_time + ] + else: + recent_pipeline_runs = pipeline_runs + + if not recent_pipeline_runs: + st.warning("No pipeline runs found in the selected time window.") + else: + df_runs_for_graphing = process_runs_for_graphing(recent_pipeline_runs) + launch_success, launch_failure, finalize_success, finalize_failure = 0, 0, 0, 0 + for run in recent_pipeline_runs: + parsed_message = parse_run_message(run.message) + caller = parsed_message.get("caller") + if caller == "Launch": + if run.status == "success": + launch_success += 1 + elif run.status == "failure": + launch_failure += 1 + elif caller == "Finalize": + if run.status == "success": + finalize_success += 1 + elif run.status == "failure": + finalize_failure += 1 + + total_launched_recent = int(df_runs_for_graphing[df_runs_for_graphing["type"] == "Launch"]["count"].sum()) + total_finalized_recent = int( + df_runs_for_graphing[df_runs_for_graphing["type"] == "Finalize"]["count"].sum() + ) + + g_col1, g_col2 = st.columns(2) + with g_col1: + st.subheader("Launch Runs") + m_col1, m_col2, m_col3 = st.columns(3) + m_col1.metric("Files Launched", f"{total_launched_recent:,}") + m_col2.metric("Successful Runs", f"{launch_success:,}") + m_col3.metric( + "Failed Runs", + f"{launch_failure:,}", + delta=f"{launch_failure:,}" if launch_failure > 0 else "0", + delta_color="inverse", + ) + with g_col2: + st.subheader("Finalize Runs") + m_col4, m_col5, m_col6 = st.columns(3) + m_col4.metric("Files Finalized", f"{total_finalized_recent:,}") + m_col5.metric("Successful Runs", f"{finalize_success:,}") + m_col6.metric( + "Failed Runs", + f"{finalize_failure:,}", + delta=f"{finalize_failure:,}" if finalize_failure > 0 else "0", + delta_color="inverse", + ) + + st.divider() + + base_chart = ( + alt.Chart(df_runs_for_graphing) + .mark_circle(size=60, opacity=0.7) + .encode( + 
x=alt.X("timestamp:T", title="Time of Run"), + y=alt.Y("count:Q", title="Files Processed"), + tooltip=["timestamp:T", "count:Q", "type:N"], + ) + .interactive() + ) + + chart_col1, chart_col2 = st.columns(2) + with chart_col1: + st.altair_chart( + base_chart.transform_filter(alt.datum.type == "Launch").properties( + title="Files Processed per Launch Run" + ), + use_container_width=True, + ) + with chart_col2: + st.altair_chart( + base_chart.transform_filter(alt.datum.type == "Finalize").properties( + title="Files Processed per Finalize Run" + ), + use_container_width=True, + ) + + st.divider() + st.subheader("Detailed Run History") + + f_col1, f_col2 = st.columns(2) + run_status_filter = f_col1.radio( + "Filter by run status:", ("All", "Success", "Failure"), horizontal=True, key="run_status_filter" + ) + caller_type_filter = f_col2.radio( + "Filter by caller type:", ("All", "Launch", "Finalize"), horizontal=True, key="caller_type_filter" + ) + + filtered_runs = recent_pipeline_runs + if run_status_filter != "All": + filtered_runs = [run for run in filtered_runs if run.status.lower() == run_status_filter.lower()] + if caller_type_filter != "All": + filtered_runs = [ + run for run in filtered_runs if parse_run_message(run.message).get("caller") == caller_type_filter + ] + + if not filtered_runs: + st.warning("No runs match the selected filters.") + else: + items_per_page = 5 + start_idx = st.session_state.page_num * items_per_page + end_idx = start_idx + items_per_page + paginated_runs = filtered_runs[start_idx:end_idx] + + if not paginated_runs: + st.warning("No runs match the selected filters.") + else: + for run in paginated_runs: + st.markdown( + f"**Status:** {run.status.capitalize()} at {pd.to_datetime(run.created_time, unit='ms').tz_localize('UTC').strftime('%Y-%m-%d %H:%M:%S')}" + ) + st.code(run.message, language="text") + + parsed_message = parse_run_message(run.message) + function_id_str = parsed_message.get("function_id") + call_id_str = 
parsed_message.get("call_id") + + expander_col1, expander_col2 = st.columns(2) + + with expander_col1: + with st.expander("View Function Log"): + st.write("**Function Log**") + log_key = f"log_{run.id}" + + if function_id_str and call_id_str: + # Show the log if it has been fetched, otherwise show the load button + if log_key in st.session_state: + st.download_button( + "Download Log", st.session_state[log_key], f"run_{run.id}_log.txt" + ) + st.code(st.session_state[log_key], language="log") + else: + if st.button("Load Log", key=f"load_btn_{run.id}"): + with st.spinner("Fetching logs..."): + logs = "".join( + fetch_function_logs( + function_id=int(function_id_str), call_id=int(call_id_str) + ) + ) + st.session_state[log_key] = ( + logs if logs else "No logs found for this run." + ) + st.rerun() + else: + st.info("No log information in run message.") + + with expander_col2: + with st.expander("View Files Processed"): + st.write("External ID(s):") + if call_id_str: + df_files_in_run = get_files_by_call_id(int(call_id_str), annotation_state_view) + if not df_files_in_run.empty: + file_list = df_files_in_run["File External ID"].tolist() + st.text("\n".join(file_list)) + else: + st.write("No associated files found.") + else: + st.info("No call_id found in run message.") + st.divider() + + total_pages = (len(filtered_runs) + items_per_page - 1) // items_per_page + if total_pages > 1: + p_col1, p_col2, p_col3 = st.columns([1, 2, 1]) + if p_col1.button("Previous", disabled=(st.session_state.page_num == 0), use_container_width=True): + st.session_state.page_num -= 1 + st.rerun() + p_col2.markdown( + f"
Page {st.session_state.page_num + 1} of {total_pages}
", + unsafe_allow_html=True, + ) + if p_col3.button( + "Next", disabled=(st.session_state.page_num >= total_pages - 1), use_container_width=True + ): + st.session_state.page_num += 1 + st.rerun() diff --git a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/helper.py b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/helper.py index 4df145b1..bfffee0e 100644 --- a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/helper.py +++ b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/helper.py @@ -6,7 +6,7 @@ from datetime import datetime, timedelta from cognite.client import CogniteClient from cognite.client.data_classes import RowWrite -from cognite.client.data_classes.data_modeling import ViewId, NodeId, Node +from cognite.client.data_classes.data_modeling import ViewId, NodeId, Node, filters from cognite.client.data_classes.functions import FunctionCallLog from data_structures import ViewPropertyConfig from canvas import dm_generate @@ -44,7 +44,6 @@ def generate_file_canvas( project = client.config.project cluster = client.config.cdf_cluster - # Pass the unmatched tags data to dm_generate canvas_id = dm_generate( name=canvas_name, file_node=file_node, @@ -154,7 +153,6 @@ def fetch_annotation_states(annotation_state_view: ViewPropertyConfig, file_view Fetches annotation state instances from the specified data model view and joins them with their corresponding file instances. """ - # ... 
(This function remains unchanged) annotation_instances = client.data_modeling.instances.list( instance_type="node", space=annotation_state_view.instance_space, @@ -162,7 +160,6 @@ def fetch_annotation_states(annotation_state_view: ViewPropertyConfig, file_view limit=-1, ) if not annotation_instances: - st.info("No annotation state instances found in the specified view.") return pd.DataFrame() annotation_data = [] @@ -196,46 +193,26 @@ def fetch_annotation_states(annotation_state_view: ViewPropertyConfig, file_view file_data = [] for instance in file_instances: - node_data = { - "fileExternalId": instance.external_id, - "fileSpace": instance.space, - } + node_data = {"fileExternalId": instance.external_id, "fileSpace": instance.space} properties = instance.properties[file_view.as_view_id()] for prop_key, prop_value in properties.items(): - if isinstance(prop_value, list): - string_values = [] - for value in prop_value: - string_values.append(str(value)) - node_data[f"file{prop_key.capitalize()}"] = ", ".join(filter(None, string_values)) - else: - node_data[f"file{prop_key.capitalize()}"] = prop_value + node_data[f"file{prop_key.capitalize()}"] = ( + ", ".join(map(str, prop_value)) if isinstance(prop_value, list) else prop_value + ) file_data.append(node_data) if not file_data: return df_annotations df_files = pd.DataFrame(file_data) - df_merged = pd.merge(df_annotations, df_files, on=["fileExternalId", "fileSpace"], how="left") - if "createdTime" in df_merged.columns: - df_merged["createdTime"] = df_merged["createdTime"].dt.tz_localize("UTC") - if "lastUpdatedTime" in df_merged.columns: - df_merged["lastUpdatedTime"] = df_merged["lastUpdatedTime"].dt.tz_localize("UTC") - - df_merged.rename( - columns={ - "annotationStatus": "status", - "attemptCount": "retries", - "diagramDetectJobId": "jobId", - }, - inplace=True, - ) + for col in ["createdTime", "lastUpdatedTime"]: + if col in df_merged.columns: + df_merged[col] = df_merged[col].dt.tz_localize("UTC") - for col in 
["status", "fileExternalId", "retries", "jobId"]: - if col not in df_merged.columns: - df_merged[col] = None + df_merged.rename(columns={"annotationStatus": "status", "attemptCount": "retries"}, inplace=True) return df_merged @@ -246,33 +223,6 @@ def fetch_pipeline_run_history(pipeline_ext_id: str): return client.extraction_pipelines.runs.list(external_id=pipeline_ext_id, limit=-1) -def calculate_success_failure_stats(runs): - """Calculates success and failure counts from a list of pipeline runs.""" - # ... (This function remains unchanged) - success_count = sum(1 for run in runs if run.status == "success") - failure_count = sum(1 for run in runs if run.status == "failure") - return success_count, failure_count - - -def get_failed_run_details(runs): - """Filters for failed runs and extracts their details, including IDs.""" - # ... (This function remains unchanged) - failed_runs = [] - for run in runs: - if run.status == "failure": - parsed_message = parse_run_message(run.message) - failed_runs.append( - { - "timestamp": pd.to_datetime(run.created_time, unit="ms").tz_localize("UTC"), - "message": run.message, - "status": run.status, - "function_id": parsed_message.get("function_id"), - "call_id": parsed_message.get("call_id"), - } - ) - return sorted(failed_runs, key=lambda x: x["timestamp"], reverse=True) - - @st.cache_data(ttl=3600) def fetch_function_logs(function_id: int, call_id: int): """Fetches the logs for a specific function call.""" @@ -285,22 +235,18 @@ def fetch_function_logs(function_id: int, call_id: int): def process_runs_for_graphing(runs): """Transforms pipeline run data into a DataFrame for graphing.""" - # ... 
(This function remains unchanged) - launch_data = [] - finalize_runs_to_agg = [] - + launch_data, finalize_runs_to_agg = [], [] for run in runs: if run.status != "success": continue - parsed = parse_run_message(run.message) if not parsed: continue - - timestamp = pd.to_datetime(run.created_time, unit="ms").tz_localize("UTC") - count = parsed.get("total", 0) - caller = parsed.get("caller") - + timestamp, count, caller = ( + pd.to_datetime(run.created_time, unit="ms").tz_localize("UTC"), + parsed.get("total", 0), + parsed.get("caller"), + ) if caller == "Launch": launch_data.append({"timestamp": timestamp, "count": count, "type": "Launch"}) elif caller == "Finalize": @@ -309,36 +255,21 @@ def process_runs_for_graphing(runs): aggregated_finalize_data = [] if finalize_runs_to_agg: finalize_runs_to_agg.sort(key=lambda x: x["timestamp"]) - current_group_start_time = finalize_runs_to_agg[0]["timestamp"] - current_group_count = 0 - + current_group_start_time, current_group_count = finalize_runs_to_agg[0]["timestamp"], 0 for run in finalize_runs_to_agg: if run["timestamp"] < current_group_start_time + timedelta(minutes=10): current_group_count += run["count"] else: aggregated_finalize_data.append( - { - "timestamp": current_group_start_time, - "count": current_group_count, - "type": "Finalize", - } + {"timestamp": current_group_start_time, "count": current_group_count, "type": "Finalize"} ) - current_group_start_time = run["timestamp"] - current_group_count = run["count"] - + current_group_start_time, current_group_count = run["timestamp"], run["count"] if current_group_count > 0: aggregated_finalize_data.append( - { - "timestamp": current_group_start_time, - "count": current_group_count, - "type": "Finalize", - } + {"timestamp": current_group_start_time, "count": current_group_count, "type": "Finalize"} ) - df_launch = pd.DataFrame(launch_data) - df_finalize = pd.DataFrame(aggregated_finalize_data) - - return pd.concat([df_launch, df_finalize], ignore_index=True) + return 
pd.concat([pd.DataFrame(launch_data), pd.DataFrame(aggregated_finalize_data)], ignore_index=True) @st.cache_data(ttl=3600) @@ -351,19 +282,18 @@ def fetch_pattern_catalog(db_name: str, table_name: str) -> pd.DataFrame: rows = client.raw.rows.list(db_name=db_name, table_name=table_name, limit=-1) if not rows: return pd.DataFrame() - - data = [row.columns for row in rows] - df_cache = pd.DataFrame(data) - all_patterns = [] - for _, row in df_cache.iterrows(): + for row in pd.DataFrame([row.columns for row in rows]).itertuples(): for sample_list in ["AssetPatternSamples", "FilePatternSamples"]: - if row.get(sample_list) and isinstance(row[sample_list], list): - for item in row[sample_list]: + if hasattr(row, sample_list) and isinstance(getattr(row, sample_list), list): + for item in getattr(row, sample_list): if item.get("sample") and item.get("resource_type"): - for pattern in item["sample"]: - all_patterns.append({"resourceType": item["resource_type"], "pattern": pattern}) - + all_patterns.extend( + [ + {"resourceType": item["resource_type"], "pattern": pattern} + for pattern in item["sample"] + ] + ) return pd.DataFrame(all_patterns) except Exception as e: st.error(f"Failed to fetch pattern catalog from '{table_name}': {e}") @@ -376,36 +306,18 @@ def fetch_manual_patterns(db_name: str, table_name: str) -> pd.DataFrame: into a tidy DataFrame for display and editing. 
""" all_patterns = [] - expected_columns = [ - "key", - "scope_level", - "primary_scope", - "secondary_scope", - "sample", - "resource_type", - "created_by", - ] - try: - rows = client.raw.rows.list(db_name=db_name, table_name=table_name, limit=-1) - for row in rows: - key = row.key - patterns_list = row.columns.get("patterns", []) - - scope_level = "Global" - primary_scope = "" - secondary_scope = "" + for row in client.raw.rows.list(db_name=db_name, table_name=table_name, limit=-1): + key, patterns_list = row.key, row.columns.get("patterns", []) + scope_level, primary_scope, secondary_scope = "Global", "", "" if key != "GLOBAL": parts = key.split("_") if len(parts) == 2: - scope_level = "Secondary Scope" - primary_scope, secondary_scope = parts + scope_level, primary_scope, secondary_scope = "Secondary Scope", parts[0], parts[1] else: - scope_level = "Primary Scope" - primary_scope = key - - for p in patterns_list: - all_patterns.append( + scope_level, primary_scope = "Primary Scope", key + all_patterns.extend( + [ { "key": key, "scope_level": scope_level, @@ -415,25 +327,32 @@ def fetch_manual_patterns(db_name: str, table_name: str) -> pd.DataFrame: "resource_type": p.get("resource_type"), "created_by": p.get("created_by"), } - ) - - if not all_patterns: - return pd.DataFrame(columns=expected_columns) - - df = pd.DataFrame(all_patterns) - - for col in expected_columns: - if col not in df.columns: - df[col] = "" - df[col] = df[col].fillna("").astype(str) - - return df + for p in patterns_list + ] + ) + df = ( + pd.DataFrame(all_patterns) + if all_patterns + else pd.DataFrame( + columns=[ + "key", + "scope_level", + "primary_scope", + "secondary_scope", + "sample", + "resource_type", + "created_by", + ] + ) + ) + return df.fillna("").astype(str) except Exception as e: - if "NotFoundError" in str(type(e)): - return pd.DataFrame(columns=expected_columns) - st.error(f"Failed to fetch manual patterns: {e}") - return pd.DataFrame(columns=expected_columns) + if 
"NotFoundError" not in str(type(e)): + st.error(f"Failed to fetch manual patterns: {e}") + return pd.DataFrame( + columns=["key", "scope_level", "primary_scope", "secondary_scope", "sample", "resource_type", "created_by"] + ) def save_manual_patterns(df: pd.DataFrame, db_name: str, table_name: str): @@ -445,31 +364,91 @@ def save_manual_patterns(df: pd.DataFrame, db_name: str, table_name: str): def create_key(row): if row["scope_level"] == "Global": return "GLOBAL" - elif row["scope_level"] == "Primary Scope" and row["primary_scope"]: + if row["scope_level"] == "Primary Scope" and row["primary_scope"]: return row["primary_scope"] - elif row["scope_level"] == "Secondary Scope" and row["primary_scope"] and row["secondary_scope"]: + if row["scope_level"] == "Secondary Scope" and row["primary_scope"] and row["secondary_scope"]: return f"{row['primary_scope']}_{row['secondary_scope']}" return None df["key"] = df.apply(create_key, axis=1) - df.dropna(subset=["key"], inplace=True) - - grouped = df.groupby("key") - - rows_to_write = [] - for key, group in grouped: - patterns_list = group[["sample", "resource_type", "created_by"]].to_dict("records") - - row = RowWrite(key=key, columns={"patterns": patterns_list}) - rows_to_write.append(row) + rows_to_write = [ + RowWrite(key=key, columns={"patterns": group[["sample", "resource_type", "created_by"]].to_dict("records")}) + for key, group in df.groupby("key") + ] existing_keys = {r.key for r in client.raw.rows.list(db_name, table_name, limit=-1)} - new_keys = {r.key for r in rows_to_write} - keys_to_delete = list(existing_keys - new_keys) - + keys_to_delete = list(existing_keys - {r.key for r in rows_to_write}) if keys_to_delete: client.raw.rows.delete(db_name=db_name, table_name=table_name, key=keys_to_delete) - if rows_to_write: client.raw.rows.insert(db_name=db_name, table_name=table_name, row=rows_to_write, ensure_parent=True) + + +@st.cache_data(ttl=600) +def get_files_by_call_id(call_id: int, annotation_state_view: 
ViewPropertyConfig) -> pd.DataFrame: + """ + Finds all files associated with a specific function call ID by querying + the AnnotationState data model. + """ + if not call_id: + return pd.DataFrame() + try: + call_id_filter = filters.Or( + filters.Equals(annotation_state_view.as_property_ref("launchFunctionCallId"), call_id), + filters.Equals(annotation_state_view.as_property_ref("finalizeFunctionCallId"), call_id), + ) + instances = client.data_modeling.instances.list( + instance_type="node", sources=annotation_state_view.as_view_id(), filter=call_id_filter, limit=-1 + ) + if not instances: + return pd.DataFrame() + + view_id_tuple = annotation_state_view.as_view_id() + file_ids = [ + instance.properties.get(view_id_tuple, {}).get("linkedFile", {}).get("externalId") + for instance in instances + if instance.properties.get(view_id_tuple, {}).get("linkedFile", {}).get("externalId") + ] + return pd.DataFrame(file_ids, columns=["File External ID"]) + except Exception as e: + st.error(f"Failed to query files by call ID: {e}") + return pd.DataFrame() + + +def calculate_overview_kpis(df: pd.DataFrame) -> dict: + """Calculates high-level KPIs from the AnnotationState dataframe.""" + kpis = {"awaiting_processing": 0, "processed_total": 0, "failed_total": 0, "failure_rate_total": 0} + if df.empty: + return kpis + kpis["awaiting_processing"] = len(df[df["status"].isin(["New", "Retry"])]) + finalized_all_time = df[df["status"].isin(["Annotated", "Failed"])] + kpis["processed_total"] = len(finalized_all_time) + kpis["failed_total"] = len(finalized_all_time[finalized_all_time["status"] == "Failed"]) + if kpis["processed_total"] > 0: + kpis["failure_rate_total"] = (kpis["failed_total"] / kpis["processed_total"]) * 100 + return kpis + + +def filter_log_lines(log_text: str, search_string: str) -> str: + """ + Takes a block of log text and a search string, returning a new string + containing the lines that include the search string, plus the subsequent + indented lines that provide 
context. + """ + if not log_text or not isinstance(log_text, str): + return "Log content is not available or in an invalid format." + relevant_blocks, lines = [], log_text.splitlines() + for i, line in enumerate(lines): + if search_string in line: + current_block = [line] + next_line_index = i + 1 + while next_line_index < len(lines): + next_line = lines[next_line_index] + if next_line.strip().startswith("-") or "\t" in next_line or " " in next_line: + current_block.append(next_line) + next_line_index += 1 + else: + break + relevant_blocks.append("\n".join(current_block)) + return "\n\n".join(relevant_blocks) diff --git a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/pages/Status_Overview.py b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/pages/Status_Overview.py deleted file mode 100644 index b3c289bc..00000000 --- a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/pages/Status_Overview.py +++ /dev/null @@ -1,146 +0,0 @@ -import streamlit as st -import pandas as pd -from helper import ( - fetch_annotation_states, - fetch_extraction_pipeline_config, - find_pipelines, -) - -# --- Page Configuration --- -st.set_page_config( - page_title="Annotation Status Overview", - page_icon="📄", - layout="wide", -) - -# --- Sidebar for Pipeline Selection --- -st.sidebar.title("Pipeline Selection") -pipeline_ids = find_pipelines() - -if not pipeline_ids: - st.info("No active file annotation pipelines found to monitor.") - st.stop() - -# Add an independent dropdown selector for this page -# It uses a different key to avoid conflicts -selected_pipeline = st.sidebar.selectbox( - "Select a pipeline to view status:", options=pipeline_ids, key="status_pipeline_selector" -) - -# --- Data Fetching --- -# Pass the selected pipeline ID from this page's dropdown -config_result = fetch_extraction_pipeline_config(selected_pipeline) -if not config_result: - st.error(f"Could not fetch 
configuration for pipeline: {selected_pipeline}") - st.stop() - -ep_config, annotation_state_view, file_view = config_result -df_raw = fetch_annotation_states(annotation_state_view, file_view) - - -# --- Main Application --- -st.title(f"Annotation Status Overview:") -st.markdown("This page provides an audit trail and overview of the file annotation process.") - -if not df_raw.empty: - # --- Sidebar Filters --- - st.sidebar.title("Filters") - - # Status Filter - all_statuses = ["All"] + sorted(df_raw["status"].unique().tolist()) - selected_status = st.sidebar.selectbox("Filter by Status", options=all_statuses) - - # Date Range Filter - min_date = df_raw["lastUpdatedTime"].min().date() - max_date = df_raw["lastUpdatedTime"].max().date() - date_range = st.sidebar.date_input( - "Filter by Last Updated Date", - value=(min_date, max_date), - min_value=min_date, - max_value=max_date, - ) - - # ... (The rest of your page logic for filters, metrics, and dataframes is the same) - # Dynamic Scope Property Filters - primary_scope_property = ep_config["launchFunction"].get("primaryScopeProperty") - secondary_scope_property = ep_config["launchFunction"].get("secondaryScopeProperty") - - selected_primary_scope = "All" - if primary_scope_property and f"file{primary_scope_property.capitalize()}" in df_raw.columns: - primary_scope_options = ["All"] + df_raw[f"file{primary_scope_property.capitalize()}"].unique().tolist() - selected_primary_scope = st.sidebar.selectbox( - f"Filter by {primary_scope_property}", options=primary_scope_options - ) - - selected_secondary_scope = "All" - if secondary_scope_property and f"file{secondary_scope_property.capitalize()}" in df_raw.columns: - secondary_scope_options = ["All"] + df_raw[f"file{secondary_scope_property.capitalize()}"].unique().tolist() - selected_secondary_scope = st.sidebar.selectbox( - f"Filter by {secondary_scope_property}", options=secondary_scope_options - ) - - # Apply all filters - df_filtered = df_raw.copy() - if 
selected_status != "All": - df_filtered = df_filtered[df_filtered["status"] == selected_status] - - if len(date_range) == 2: - start_date, end_date = date_range - df_filtered = df_filtered[ - (df_filtered["lastUpdatedTime"].dt.date >= start_date) - & (df_filtered["lastUpdatedTime"].dt.date <= end_date) - ] - - if selected_primary_scope != "All": - df_filtered = df_filtered[df_filtered[f"file{primary_scope_property.capitalize()}"] == selected_primary_scope] - - if selected_secondary_scope != "All": - df_filtered = df_filtered[ - df_filtered[f"file{secondary_scope_property.capitalize()}"] == selected_secondary_scope - ] - # --- Dashboard Metrics --- - st.subheader("Status Overview") - - status_counts = df_filtered["status"].value_counts() - - col1, col2, col3, col4 = st.columns(4) - with col1: - st.metric("Total Files", len(df_filtered)) - with col2: - st.metric("Annotated", status_counts.get("Annotated", 0)) - with col3: - st.metric("New", status_counts.get("New", 0)) - st.metric("Processing", status_counts.get("Processing", 0)) - with col4: - st.metric("Finalizing", status_counts.get("Finalizing", 0)) - st.metric("Failed", status_counts.get("Failed", 0)) - - # --- Detailed Data View --- - default_columns = [ - "fileName", - "status", - "jobId", - "annotationMessage", - "filePageCount", - "retries", - "fileTags", - "lastUpdatedTime", - ] - - available_columns = df_filtered.columns.tolist() - default_selection = [col for col in default_columns if col in available_columns] - - with st.popover("Customize Columns"): - selected_columns = st.multiselect( - "Select columns to display:", - options=available_columns, - default=default_selection, - label_visibility="collapsed", - ) - - if selected_columns: - st.dataframe(df_filtered[selected_columns], use_container_width=True) - else: - st.warning("Please select at least one column to display.") -else: - st.info("No annotation state data found for the selected pipeline. 
Please check its configuration and runs.") From 4a20ed4118af82214c7bf1eabb80c3869a1d6aa6 Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Wed, 10 Sep 2025 21:41:24 -0500 Subject: [PATCH 046/128] improved the UI of the pipeline health dashboard --- .../Pipeline_Health.py | 112 ++++++++++++------ .../file_annotation_dashboard/helper.py | 2 +- 2 files changed, 80 insertions(+), 34 deletions(-) diff --git a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/Pipeline_Health.py b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/Pipeline_Health.py index 8f42d063..cad1aece 100644 --- a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/Pipeline_Health.py +++ b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/Pipeline_Health.py @@ -124,15 +124,31 @@ def reset_table_selection(): st.info("No annotation state data found for this pipeline.") else: with st.expander("Filter and Slice Data"): + # ... (your existing filter logic remains unchanged here) ... 
excluded_columns = [ "externalId", "space", - "createdTime", - "lastUpdatedTime", - "fileExternalId", - "fileSpace", - "retries", + "annotationMessage", + "fileAliases", + "fileAssets", + "fileIsuploaded", + "diagramDetectJobId", "linkedFile", + "patternModeJobId", + "sourceCreatedUser", + "sourceCreatedTime", + "sourceUpdatedTime", + "sourceUpdatedUser", + "fileSpace", + "fileSourceupdateduser", + "fileSourcecreatedUser", + "fileSourceId", + "createdTime", + "fileSourcecreateduser", + "patternModeMessage", + "fileSourceupdatedtime", + "fileSourcecreatedtime", + "fileUploadedtime", ] filterable_columns = sorted( [ @@ -173,9 +189,46 @@ def reset_table_selection(): ): df_display.at[st.session_state.selected_status_file_index, "Select"] = True + # --- START: New additions for customizable and readable columns --- + default_columns = [ + "Select", + "fileName", + "fileExternalId", + "fileSourceid", + "status", + "fileMimetype", + "pageCount", + "annotatedPageCount", + ] + all_columns = df_display.columns.tolist() + + with st.popover("Customize Table Columns"): + selected_columns = st.multiselect( + "Select columns to display:", + options=all_columns, + default=[col for col in default_columns if col in all_columns], + ) + + if not selected_columns: + st.warning("Please select at least one column to display.") + st.stop() + edited_df = st.data_editor( - df_display, + df_display[selected_columns], # Display only selected columns key="status_table_editor", + column_config={ + "Select": st.column_config.CheckboxColumn(required=True), + "fileName": "File Name", + "fileExternalId": "File External ID", + "status": "Annotation Status", + "retries": "Retries", + "fileSourceid": "Source ID", + "fileMimetype": "Mime Type", + "annotationMessage": "Annotation Message", + "patternModeMessage": "Pattern Mode Message", + "pageCount": "Page Count", + "annotatedPageCount": "Annotated Page Count", + }, use_container_width=True, hide_index=True, 
disabled=df_display.columns.difference(["Select"]), @@ -203,32 +256,7 @@ def reset_table_selection(): selected_row = df_display.iloc[st.session_state.selected_status_file_index] file_ext_id = selected_row.get("fileExternalId", "") - launch_tab, finalize_tab = st.tabs(["Launch Log", "Finalize Log"]) - with launch_tab: - launch_func_id = selected_row.get("launchFunctionId") - launch_call_id = selected_row.get("launchFunctionCallId") - if pd.notna(launch_func_id) and pd.notna(launch_call_id): - with st.spinner("Fetching launch log..."): - launch_logs_raw = "".join( - fetch_function_logs(function_id=int(launch_func_id), call_id=int(launch_call_id)) - ) - if launch_logs_raw: - st.download_button( - "Download Full Launch Log", launch_logs_raw, f"{file_ext_id}_launch_log.txt" - ) - filtered_log = filter_log_lines(launch_logs_raw, file_ext_id) - st.write("**Relevant Log Entries:**") - st.code( - filtered_log if filtered_log else "No log entries found for this specific file.", - language="log", - ) - with st.expander("View Full Log"): - st.code(launch_logs_raw, language="log") - else: - st.warning("No launch logs found.") - else: - st.info("No Launch Function call information available for this file.") - + finalize_tab, launch_tab = st.tabs(["Finalize Log", "Launch Log"]) with finalize_tab: finalize_func_id = selected_row.get("finalizeFunctionId") finalize_call_id = selected_row.get("finalizeFunctionCallId") @@ -239,7 +267,7 @@ def reset_table_selection(): ) if finalize_logs_raw: st.download_button( - "Download Full Finalize Log", finalize_logs_raw, f"{file_ext_id}_finalize_log.txt" + "Download Full Log", finalize_logs_raw, f"{file_ext_id}_finalize_log.txt" ) filtered_log = filter_log_lines(finalize_logs_raw, file_ext_id) st.write("**Relevant Log Entries:**") @@ -254,6 +282,24 @@ def reset_table_selection(): else: st.info("No Finalize Function call information available for this file.") + with launch_tab: + launch_func_id = selected_row.get("launchFunctionId") + 
launch_call_id = selected_row.get("launchFunctionCallId") + if pd.notna(launch_func_id) and pd.notna(launch_call_id): + with st.spinner("Fetching launch log..."): + launch_logs_raw = "".join( + fetch_function_logs(function_id=int(launch_func_id), call_id=int(launch_call_id)) + ) + # NOTE: launch log doesn't provide log lines with individual Node Id's of files processed + if launch_logs_raw: + st.download_button("Download Full Log", launch_logs_raw, f"{file_ext_id}_launch_log.txt") + with st.expander("View Full Log"): + st.code(launch_logs_raw, language="log") + else: + st.warning("No launch logs found.") + else: + st.info("No Launch Function call information available for this file.") + # ========================================== # RUN HISTORY TAB # ========================================== diff --git a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/helper.py b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/helper.py index bfffee0e..3b15a679 100644 --- a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/helper.py +++ b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/helper.py @@ -421,7 +421,7 @@ def calculate_overview_kpis(df: pd.DataFrame) -> dict: kpis = {"awaiting_processing": 0, "processed_total": 0, "failed_total": 0, "failure_rate_total": 0} if df.empty: return kpis - kpis["awaiting_processing"] = len(df[df["status"].isin(["New", "Retry"])]) + kpis["awaiting_processing"] = len(df[df["status"].isin(["New", "Retry", "Processing", "Finalizing"])]) finalized_all_time = df[df["status"].isin(["Annotated", "Failed"])] kpis["processed_total"] = len(finalized_all_time) kpis["failed_total"] = len(finalized_all_time[finalized_all_time["status"] == "Failed"]) From 0af0e7ea4d3b36e59f0aa756c3ee95dab2a46328 Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Wed, 10 Sep 2025 22:43:56 -0500 Subject: [PATCH 047/128] improved the UI of the 
annotation quality dashboard --- .../Pipeline_Health.py | 15 +- .../pages/Annotation_Quality.py | 1042 +++++++++-------- .../pages/Pattern_Management.py | 132 --- 3 files changed, 580 insertions(+), 609 deletions(-) delete mode 100644 modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/pages/Pattern_Management.py diff --git a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/Pipeline_Health.py b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/Pipeline_Health.py index cad1aece..023fb6fb 100644 --- a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/Pipeline_Health.py +++ b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/Pipeline_Health.py @@ -68,7 +68,10 @@ def reset_table_selection(): # OVERVIEW TAB # ========================================== with overview_tab: - st.subheader("Live Pipeline KPIs") + st.subheader( + "Live Pipeline KPIs", + help="Provides a high-level summary of the pipeline's current state and historical throughput. The KPIs are calculated directly from the AnnotationState data model for real-time accuracy.", + ) kpis = calculate_overview_kpis(df_annotation_states) @@ -119,7 +122,10 @@ def reset_table_selection(): # FILE EXPLORER TAB # ========================================== with explorer_tab: - st.subheader("File-Centric Debugging") + st.subheader( + "File-Centric Debugging", + help="A file-centric debugging tool for deep-dive analysis. 
Filter and select any file to view its current status, metadata, and the specific Launch and Finalize function logs associated with it.", + ) if df_annotation_states.empty: st.info("No annotation state data found for this pipeline.") else: @@ -304,7 +310,10 @@ def reset_table_selection(): # RUN HISTORY TAB # ========================================== with history_tab: - st.subheader("Run-Centric Analysis") + st.subheader( + "Run-Centric Analysis", + help="A run-centric view for analyzing the execution history of the pipeline functions. Review the status, logs, and a list of files processed for each individual pipeline run.", + ) if not pipeline_runs: st.info("No pipeline runs found for this pipeline.") else: diff --git a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/pages/Annotation_Quality.py b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/pages/Annotation_Quality.py index 9e58af47..c01267e9 100644 --- a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/pages/Annotation_Quality.py +++ b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/pages/Annotation_Quality.py @@ -1,6 +1,7 @@ import streamlit as st import pandas as pd import altair as alt +from datetime import datetime, timezone from helper import ( fetch_extraction_pipeline_config, fetch_raw_table_data, @@ -9,6 +10,7 @@ fetch_pattern_catalog, fetch_manual_patterns, fetch_annotation_states, + save_manual_patterns, ) from cognite.client.data_classes.data_modeling import NodeId @@ -37,9 +39,7 @@ def reset_selection(): st.info("No active file annotation pipelines found to monitor.") st.stop() -selected_pipeline = st.sidebar.selectbox( - "Select a pipeline to view quality:", options=pipeline_ids, key="quality_pipeline_selector" -) +selected_pipeline = st.sidebar.selectbox("Select a pipeline:", options=pipeline_ids, key="quality_pipeline_selector") # --- Data Fetching & Processing 
--- config_result = fetch_extraction_pipeline_config(selected_pipeline) @@ -62,513 +62,607 @@ def reset_selection(): st.error("Could not find all required RAW table names in the pipeline configuration.") st.stop() -df_patterns = fetch_raw_table_data(db_name, pattern_table) -df_tags = fetch_raw_table_data(db_name, tag_table) -df_docs = fetch_raw_table_data(db_name, doc_table) - -# Fetch both auto-generated and manual patterns -df_auto_patterns = fetch_pattern_catalog(db_name, cache_table) -df_manual_patterns = fetch_manual_patterns(db_name, manual_patterns_table) - # --- Main Application --- st.title("Annotation Quality Dashboard") -st.markdown( - "This page measures annotation quality by comparing potential tags (from pattern mode) against actual, created annotations." -) - -if df_patterns.empty: - st.info("The pattern catalog is empty. Run the pipeline with patternMode enabled to generate data.") - st.stop() - -# --- Data Processing and Merging --- -df_annotations = pd.concat([df_tags, df_docs], ignore_index=True) - -df_patterns_agg = df_patterns.groupby("startNode")["text"].apply(set).reset_index(name="potentialTags") -if not df_annotations.empty: - df_annotations_agg = ( - df_annotations.groupby("startNode")["startNodeText"].apply(set).reset_index(name="actualAnnotations") - ) - df_quality = pd.merge(df_patterns_agg, df_annotations_agg, on="startNode", how="outer").fillna(0) - df_quality["actualAnnotations"] = df_quality["actualAnnotations"].apply( - lambda x: x if isinstance(x, set) else set() - ) -else: - df_quality = df_patterns_agg - df_quality["actualAnnotations"] = [set() for _ in range(len(df_patterns_agg))] - -df_quality["potentialTags"] = df_quality["potentialTags"].apply(lambda x: x if isinstance(x, set) else set()) - -df_quality["matchedTags"] = df_quality.apply( - lambda row: len(row["potentialTags"].intersection(row["actualAnnotations"])), axis=1 -) -df_quality["unmatchedByAnnotation"] = df_quality.apply( - lambda row: len(row["potentialTags"] - 
row["actualAnnotations"]), axis=1 -) -df_quality["missedByPattern"] = df_quality.apply( - lambda row: len(row["actualAnnotations"] - row["potentialTags"]), axis=1 +# --- Create Tabs --- +overall_tab, per_file_tab, management_tab = st.tabs( + ["Overall Quality Metrics", "Per-File Analysis", "Pattern Management"] ) -df_quality["coverageRate"] = ( - df_quality["matchedTags"] / (df_quality["matchedTags"] + df_quality["unmatchedByAnnotation"]) -) * 100 -df_quality["completenessRate"] = ( - df_quality["matchedTags"] / (df_quality["matchedTags"] + df_quality["missedByPattern"]) -) * 100 -df_quality.fillna(0, inplace=True) - -# --- Dashboard Metrics --- -st.header("Overall Annotation Quality") - -# Get a unique, sorted list of resource types for the filter -all_resource_types = ["All"] + sorted(df_patterns["resourceType"].unique().tolist()) - -selected_resource_type = st.selectbox("Filter by Resource Type:", options=all_resource_types, on_change=reset_selection) - -# --- Filter the data based on selection --- -if selected_resource_type == "All": - df_metrics_input = df_patterns - df_annotations_input = df_annotations -else: - df_metrics_input = df_patterns[df_patterns["resourceType"] == selected_resource_type] - if not df_annotations.empty and "endNodeResourceType" in df_annotations.columns: - df_annotations_input = df_annotations[df_annotations["endNodeResourceType"] == selected_resource_type] - else: - df_annotations_input = pd.DataFrame() +# ========================================== +# OVERALL QUALITY METRICS TAB +# ========================================== +with overall_tab: + df_patterns = fetch_raw_table_data(db_name, pattern_table) + df_tags = fetch_raw_table_data(db_name, tag_table) + df_docs = fetch_raw_table_data(db_name, doc_table) + if df_patterns.empty: + st.info("The pattern catalog is empty. 
Run the pipeline with patternMode enabled to generate data.") + else: + df_annotations = pd.concat([df_tags, df_docs], ignore_index=True) -# --- Recalculate metrics based on the filtered data --- -potential_tags_set = set(df_metrics_input["text"]) -if not df_annotations_input.empty and "startNodeText" in df_annotations_input.columns: - actual_annotations_set = set(df_annotations_input["startNodeText"]) -else: - actual_annotations_set = set() - + st.subheader( + "Overall Annotation Quality", + help="Provides a high-level summary of pattern performance across all files. Use these aggregate metrics, charts, and tag lists to understand the big picture and identify systemic trends or gaps in the pattern catalog.", + ) + all_resource_types = ["All"] + sorted(df_patterns["resourceType"].unique().tolist()) + selected_resource_type = st.selectbox( + "Filter by Resource Type:", + options=all_resource_types, + on_change=reset_selection, + key="resource_type_filter", + ) -matched_tags_set = potential_tags_set.intersection(actual_annotations_set) -unmatched_by_annotation_set = potential_tags_set - actual_annotations_set -missed_by_pattern_set = actual_annotations_set - potential_tags_set + if selected_resource_type == "All": + df_metrics_input = df_patterns + df_annotations_input = df_annotations + else: + df_metrics_input = df_patterns[df_patterns["resourceType"] == selected_resource_type] + if not df_annotations.empty and "endNodeResourceType" in df_annotations.columns: + df_annotations_input = df_annotations[df_annotations["endNodeResourceType"] == selected_resource_type] + else: + df_annotations_input = pd.DataFrame() -total_matched = len(matched_tags_set) -total_unmatched = len(unmatched_by_annotation_set) -total_missed = len(missed_by_pattern_set) + potential_tags_set = set(df_metrics_input["text"]) + actual_annotations_set = ( + set(df_annotations_input["startNodeText"]) + if not df_annotations_input.empty and "startNodeText" in df_annotations_input.columns + else set() 
+ ) + matched_tags_set = potential_tags_set.intersection(actual_annotations_set) + unmatched_by_annotation_set = potential_tags_set - actual_annotations_set + missed_by_pattern_set = actual_annotations_set - potential_tags_set + total_matched = len(matched_tags_set) + total_unmatched = len(unmatched_by_annotation_set) + total_missed = len(missed_by_pattern_set) + overall_coverage = ( + (total_matched / (total_matched + total_unmatched)) * 100 if (total_matched + total_unmatched) > 0 else 0 + ) + overall_completeness = ( + (total_matched / (total_missed + total_matched)) * 100 if (total_missed + total_matched) > 0 else 0 + ) -# Calculate overall rates -overall_coverage = ( - (total_matched / (total_matched + total_unmatched)) * 100 if (total_matched + total_unmatched) > 0 else 0 -) -overall_completeness = ( - (total_matched / (total_missed + total_matched)) * 100 if (total_missed + total_matched) > 0 else 0 -) + kpi_col1, kpi_col2 = st.columns(2) + kpi_col1.metric( + "Overall Annotation Coverage", + f"{overall_coverage:.2f}%", + help="Of all potential tags found by patterns, this is the percentage that were successfully annotated. Formula: Matched / (Matched + Unmatched)", + ) + kpi_col2.metric( + "Overall Pattern Completeness", + f"{overall_completeness:.2f}%", + help="Of all annotations created, this is the percentage that the patterns successfully predicted. Formula: Matched / (Matched + Missed by Pattern)", + ) -# Display KPIs -kpi_col1, kpi_col2 = st.columns(2) -kpi_col1.metric( - "Overall Annotation Coverage", - f"{overall_coverage:.2f}%", - help="Of all potential tags found by patterns, this is the percentage that were successfully annotated. Formula: Matched / (Matched + Unmatched)", -) -kpi_col2.metric( - "Overall Pattern Completeness", - f"{overall_completeness:.2f}%", - help="Of all annotations created, this is the percentage that the patterns successfully predicted. 
Formula: Matched / (Matched + Missed by Pattern)", -) + st.divider() + chart_data = [] + for resource_type in all_resource_types[1:]: + df_patterns_filtered = df_patterns[df_patterns["resourceType"] == resource_type] + df_annotations_filtered = ( + df_annotations[df_annotations["endNodeResourceType"] == resource_type] + if not df_annotations.empty and "endNodeResourceType" in df_annotations.columns + else pd.DataFrame() + ) + potential = set(df_patterns_filtered["text"]) + actual = ( + set(df_annotations_filtered["startNodeText"]) + if not df_annotations_filtered.empty and "startNodeText" in df_annotations_filtered.columns + else set() + ) + matched = len(potential.intersection(actual)) + unmatched = len(potential - actual) + missed = len(actual - potential) + coverage = (matched / (matched + unmatched)) * 100 if (matched + unmatched) > 0 else 0 + completeness = (matched / (matched + missed)) * 100 if (matched + missed) > 0 else 0 + chart_data.append( + { + "resourceType": resource_type, + "coverageRate": coverage, + "completenessRate": completeness, + "matchedTags": matched, + "unmatchedByAnnotation": unmatched, + "missedByPattern": missed, + } + ) -st.divider() + df_chart_data = pd.DataFrame(chart_data) + df_chart_display = ( + df_chart_data[df_chart_data["resourceType"] == selected_resource_type] + if selected_resource_type != "All" + else df_chart_data + ) -# --- Annotation Quality by Resource Type --- -st.subheader("Analysis by Resource Type") + if not df_chart_display.empty: + chart_col1, chart_col2 = st.columns(2) + with chart_col1: + coverage_chart = ( + alt.Chart(df_chart_display) + .mark_bar() + .encode( + x=alt.X("resourceType:N", title="Resource Type", sort="-y"), + y=alt.Y("coverageRate:Q", title="Annotation Coverage (%)", scale=alt.Scale(domain=[0, 100])), + tooltip=["resourceType", "coverageRate", "matchedTags", "unmatchedByAnnotation"], + ) + .properties(title="Annotation Coverage by Resource Type") + ) + st.altair_chart(coverage_chart, 
use_container_width=True) + with chart_col2: + completeness_chart = ( + alt.Chart(df_chart_display) + .mark_bar() + .encode( + x=alt.X("resourceType:N", title="Resource Type", sort="-y"), + y=alt.Y( + "completenessRate:Q", title="Pattern Completeness (%)", scale=alt.Scale(domain=[0, 100]) + ), + tooltip=["resourceType", "completenessRate", "matchedTags", "missedByPattern"], + ) + .properties(title="Pattern Completeness by Resource Type") + ) + st.altair_chart(completeness_chart, use_container_width=True) + else: + st.info("No data available for the selected resource type to generate charts.") + + st.divider() + # --- Pattern Catalog --- + with st.expander("View Full Pattern Catalog"): + df_auto_patterns = fetch_pattern_catalog(db_name, cache_table) + df_manual_patterns = fetch_manual_patterns(db_name, manual_patterns_table) + + df_auto_patterns.rename(columns={"resourceType": "resource_type", "pattern": "sample"}, inplace=True) + df_combined_patterns = ( + pd.concat( + [df_auto_patterns[["resource_type", "sample"]], df_manual_patterns[["resource_type", "sample"]]] + ) + .drop_duplicates() + .sort_values(by=["resource_type", "sample"]) + ) -# --- Prepare data for charts --- -chart_data = [] -# Use all_resource_types[1:] to skip the "All" option -for resource_type in all_resource_types[1:]: - # Filter data for this specific resource type - df_patterns_filtered = df_patterns[df_patterns["resourceType"] == resource_type] + if df_combined_patterns.empty: + st.info("Pattern catalog is empty or could not be loaded.") + else: + resource_types = sorted(df_combined_patterns["resource_type"].unique()) + tabs = st.tabs(resource_types) + for i, resource_type in enumerate(resource_types): + with tabs[i]: + df_filtered_patterns = df_combined_patterns[ + df_combined_patterns["resource_type"] == resource_type + ] + st.dataframe( + df_filtered_patterns[["sample"]], + use_container_width=True, + hide_index=True, + column_config={"sample": "Pattern"}, + ) + tag_col1, tag_col2, 
tag_col3 = st.columns(3) + with tag_col1: + st.metric( + "✅ Matched Tags", + f"{total_matched}", + help="Tags that were correctly identified by the pattern catalog and were also created as final annotations. This represents the successful overlap between the two processes.", + ) + st.dataframe( + pd.DataFrame(sorted(list(matched_tags_set)), columns=["Tag"]), + use_container_width=True, + hide_index=True, + ) + with tag_col2: + st.metric( + "❓ Unmatched by Annotation", + f"{total_unmatched}", + help="Tags that were found by the pattern catalog but do not exist as final annotations. This can help identify if patterns are too broad (false positives) or if the standard annotation process missed them.", + ) + st.dataframe( + pd.DataFrame(sorted(list(unmatched_by_annotation_set)), columns=["Tag"]), + use_container_width=True, + hide_index=True, + ) + with tag_col3: + st.metric( + "❗️ Missed by Pattern", + f"{total_missed}", + help="Created annotations that were not found by the pattern catalog. This can help us measure the reliability of pattern mode as a denominator.", + ) + st.dataframe( + pd.DataFrame(sorted(list(missed_by_pattern_set)), columns=["Tag"]), + use_container_width=True, + hide_index=True, + ) - if not df_annotations.empty and "endNodeResourceType" in df_annotations.columns: - df_annotations_filtered = df_annotations[df_annotations["endNodeResourceType"] == resource_type] - else: - df_annotations_filtered = pd.DataFrame() +# ========================================== +# PER-FILE ANALYSIS TAB +# ========================================== +with per_file_tab: + st.subheader( + "Per-File Annotation Quality", + help="A deep-dive tool for investigating the quality scores of individual files. 
Filter the table to find specific examples of high or low performance, then select a file to see a detailed breakdown of its specific matched, unmatched, and missed tags.", + ) - # Calculate metrics using global unique tags for THIS resource type - potential = set(df_patterns_filtered["text"]) + df_patterns_file = fetch_raw_table_data(db_name, pattern_table) + df_tags_file = fetch_raw_table_data(db_name, tag_table) + df_docs_file = fetch_raw_table_data(db_name, doc_table) - if not df_annotations_filtered.empty and "startNodeText" in df_annotations_filtered.columns: - actual = set(df_annotations_filtered["startNodeText"]) + if df_patterns_file.empty: + st.info("The pattern catalog is empty. Run the pipeline with patternMode enabled to generate data.") else: - actual = set() - - matched = len(potential.intersection(actual)) - unmatched = len(potential - actual) - missed = len(actual - potential) - - coverage = (matched / (matched + unmatched)) * 100 if (matched + unmatched) > 0 else 0 - completeness = (matched / (matched + missed)) * 100 if (matched + missed) > 0 else 0 - - chart_data.append( - { - "resourceType": resource_type, - "coverageRate": coverage, - "completenessRate": completeness, - "matchedTags": matched, - "unmatchedByAnnotation": unmatched, - "missedByPattern": missed, - } - ) + df_annotations_file = pd.concat([df_tags_file, df_docs_file], ignore_index=True) + df_patterns_agg_file = ( + df_patterns_file.groupby("startNode")["text"].apply(set).reset_index(name="potentialTags") + ) + df_annotations_agg_file = ( + df_annotations_file.groupby("startNode")["startNodeText"].apply(set).reset_index(name="actualAnnotations") + if not df_annotations_file.empty + else pd.DataFrame(columns=["startNode", "actualAnnotations"]) + ) -df_chart_data = pd.DataFrame(chart_data) - -# --- Filter chart data based on dropdown selection --- -if selected_resource_type != "All": - df_chart_display = df_chart_data[df_chart_data["resourceType"] == selected_resource_type] -else: - 
df_chart_display = df_chart_data - -# --- Render Charts --- -if not df_chart_display.empty: - chart_col1, chart_col2 = st.columns(2) - with chart_col1: - coverage_chart = ( - alt.Chart(df_chart_display) - .mark_bar() - .encode( - x=alt.X("resourceType:N", title="Resource Type", sort="-y"), - y=alt.Y("coverageRate:Q", title="Annotation Coverage (%)", scale=alt.Scale(domain=[0, 100])), - tooltip=["resourceType", "coverageRate", "matchedTags", "unmatchedByAnnotation"], - ) - .properties(title="Annotation Coverage by Resource Type") + df_quality_file = pd.merge(df_patterns_agg_file, df_annotations_agg_file, on="startNode", how="left") + df_quality_file["actualAnnotations"] = df_quality_file["actualAnnotations"].apply( + lambda x: x if isinstance(x, set) else set() + ) + df_quality_file["matchedTags"] = df_quality_file.apply( + lambda row: len(row["potentialTags"].intersection(row["actualAnnotations"])), axis=1 ) - st.altair_chart(coverage_chart, use_container_width=True) - with chart_col2: - completeness_chart = ( - alt.Chart(df_chart_display) - .mark_bar() - .encode( - x=alt.X("resourceType:N", title="Resource Type", sort="-y"), - y=alt.Y("completenessRate:Q", title="Pattern Completeness (%)", scale=alt.Scale(domain=[0, 100])), - tooltip=["resourceType", "completenessRate", "matchedTags", "missedByPattern"], + df_quality_file["unmatchedByAnnotation"] = df_quality_file.apply( + lambda row: len(row["potentialTags"] - row["actualAnnotations"]), axis=1 + ) + df_quality_file["missedByPattern"] = df_quality_file.apply( + lambda row: len(row["actualAnnotations"] - row["potentialTags"]), axis=1 + ) + df_quality_file["coverageRate"] = ( + ( + df_quality_file["matchedTags"] + / (df_quality_file["matchedTags"] + df_quality_file["unmatchedByAnnotation"]) ) - .properties(title="Pattern Completeness by Resource Type") + * 100 + ).fillna(0) + df_quality_file["completenessRate"] = ( + (df_quality_file["matchedTags"] / (df_quality_file["matchedTags"] + 
df_quality_file["missedByPattern"])) + * 100 + ).fillna(0) + + df_file_meta = fetch_annotation_states(annotation_state_view, file_view) + df_display_unfiltered = ( + pd.merge(df_quality_file, df_file_meta, left_on="startNode", right_on="fileExternalId", how="left") + if not df_file_meta.empty + else df_quality_file ) - st.altair_chart(completeness_chart, use_container_width=True) -else: - st.info("No data available for the selected resource type to generate charts.") - -# --- Display Matched, Unmatched and Missed Tags --- -st.subheader("Tag Details") -tag_col1, tag_col2, tag_col3 = st.columns(3) - -with tag_col1: - st.metric("✅ Matched Tags", f"{total_matched}") - st.dataframe( - pd.DataFrame(sorted(list(matched_tags_set)), columns=["Tag"]), use_container_width=True, hide_index=True - ) -with tag_col2: - st.metric("❓ Unmatched by Annotation", f"{total_unmatched}") - st.dataframe( - pd.DataFrame(sorted(list(unmatched_by_annotation_set)), columns=["Tag"]), - use_container_width=True, - hide_index=True, - ) - -with tag_col3: - st.metric("❗️ Missed by Pattern", f"{total_missed}") - st.dataframe( - pd.DataFrame(sorted(list(missed_by_pattern_set)), columns=["Tag"]), use_container_width=True, hide_index=True - ) - -# --- Pattern Catalog Expander with Tabs --- -with st.expander("View Full Pattern Catalog"): - # Standardize column names for merging - df_auto_patterns.rename(columns={"resourceType": "resource_type", "pattern": "sample"}, inplace=True) + with st.expander("Filter Per-File Quality Table"): + excluded_columns = [ + "Select", + "startNode", + "potentialTags", + "actualAnnotations", + "matchedTags", + "unmatchedByAnnotation", + "missedByPattern", + "coverageRate", + "completenessRate", + "externalId", + "space", + "annotatedPageCount", + "annotationMessage", + "fileAliases", + "fileAssets", + "fileIsuploaded", + "jobId", + "linkedFile", + "pageCount", + "patternModeJobId", + "sourceCreatedUser", + "sourceCreatedTime", + "sourceUpdatedTime", + "sourceUpdatedUser", 
+ "fileSourceupdateduser", + "fileSourcecreatedUser", + "fileSourceId", + "createdTime", + "fileSourcecreateduser", + "patternModeMessage", + "fileSourceupdatedtime", + "fileSourcecreatedtime", + "fileUploadedtime", + ] + filterable_columns = sorted([col for col in df_display_unfiltered.columns if col not in excluded_columns]) + filter_col1, filter_col2 = st.columns(2) + with filter_col1: + selected_column = st.selectbox( + "Filter by Metadata Property", + options=["None"] + filterable_columns, + on_change=reset_selection, + key="per_file_filter", + ) + selected_values = [] + if selected_column != "None": + unique_values = sorted(df_display_unfiltered[selected_column].dropna().unique().tolist()) + with filter_col2: + selected_values = st.multiselect( + f"Select Value(s) for {selected_column}", options=unique_values, on_change=reset_selection + ) + coverage_range = st.slider( + "Filter by Annotation Coverage (%)", 0, 100, (0, 100), on_change=reset_selection, key="coverage_slider" + ) + completeness_range = st.slider( + "Filter by Pattern Completeness (%)", + 0, + 100, + (0, 100), + on_change=reset_selection, + key="completeness_slider", + ) - # Select and combine relevant columns - df_combined_patterns = ( - pd.concat([df_auto_patterns[["resource_type", "sample"]], df_manual_patterns[["resource_type", "sample"]]]) - .drop_duplicates() - .sort_values(by=["resource_type", "sample"]) - ) + df_display = df_display_unfiltered.copy() + if selected_column != "None" and selected_values: + df_display = df_display[df_display[selected_column].isin(selected_values)] + df_display = df_display[ + (df_display["coverageRate"] >= coverage_range[0]) & (df_display["coverageRate"] <= coverage_range[1]) + ] + df_display = df_display[ + (df_display["completenessRate"] >= completeness_range[0]) + & (df_display["completenessRate"] <= completeness_range[1]) + ] + df_display = df_display.reset_index(drop=True) + df_display.insert(0, "Select", False) + + default_columns = [ + "Select", + 
"fileName", + "fileSourceid", + "fileMimetype", + "coverageRate", + "completenessRate", + "annotationMessage", + "patternModeMessage", + "lastUpdatedTime", + ] + all_columns = df_display.columns.tolist() + + with st.popover("Customize Table Columns"): + selected_columns = st.multiselect( + "Select columns to display:", + options=all_columns, + default=[col for col in default_columns if col in all_columns], + ) + if not selected_columns: + st.warning("Please select at least one column to display.") + st.stop() + + if st.session_state.get("selected_row_index") is not None and st.session_state.selected_row_index < len( + df_display + ): + df_display.at[st.session_state.selected_row_index, "Select"] = True + + edited_df = st.data_editor( + df_display[selected_columns], + key="quality_table_editor", + column_config={ + "Select": st.column_config.CheckboxColumn(required=True), + "fileName": "File Name", + "fileSourceid": "Source ID", + "fileMimetype": "Mime Type", + "fileExternalId": "File External ID", + "coverageRate": st.column_config.ProgressColumn( + "Annotation Coverage ℹ️", + help="How many potential tags were found? (Matched / Potential)", + format="%.2f%%", + min_value=0, + max_value=100, + ), + "completenessRate": st.column_config.ProgressColumn( + "Pattern Completeness ℹ️", + help="How many final annotations did patterns find? 
(Matched / Actual)", + format="%.2f%%", + min_value=0, + max_value=100, + ), + "annotationMessage": "Annotation Message", + "patternModeMessage": "Pattern Mode Message", + "lastUpdatedTime": "Last Updated Time", + }, + use_container_width=True, + hide_index=True, + disabled=df_display.columns.difference(["Select"]), + ) - if df_combined_patterns.empty: - st.info("Pattern catalog is empty or could not be loaded.") - else: - resource_types = sorted(df_combined_patterns["resource_type"].unique()) - tabs = st.tabs(resource_types) - - for i, resource_type in enumerate(resource_types): - with tabs[i]: - df_filtered_patterns = df_combined_patterns[df_combined_patterns["resource_type"] == resource_type] - st.dataframe( - df_filtered_patterns[["sample"]], - use_container_width=True, - hide_index=True, - column_config={"sample": "Pattern"}, + selected_indices = edited_df[edited_df.Select].index.tolist() + if len(selected_indices) > 1: + new_selection = [idx for idx in selected_indices if idx != st.session_state.get("selected_row_index")] + st.session_state.selected_row_index = new_selection[0] if new_selection else None + st.rerun() + elif len(selected_indices) == 1: + st.session_state.selected_row_index = selected_indices[0] + elif len(selected_indices) == 0 and st.session_state.get("selected_row_index") is not None: + st.session_state.selected_row_index = None + st.rerun() + + st.divider() + if st.session_state.get("selected_row_index") is not None and st.session_state.selected_row_index < len( + df_display + ): + selected_file_data = df_display.iloc[st.session_state.selected_row_index] + selected_file = selected_file_data["startNode"] + st.markdown(f"**Displaying Tag Comparison for file:** `{selected_file}`") + file_space_series = df_patterns_file[df_patterns_file["startNode"] == selected_file]["startNodeSpace"] + if not file_space_series.empty: + file_space = file_space_series.iloc[0] + file_node_id = NodeId(space=file_space, external_id=selected_file) + 
df_potential_tags_details = df_patterns_file[df_patterns_file["startNode"] == selected_file][ + ["text", "resourceType", "regions"] + ] + df_actual_annotations_details = ( + df_annotations_file[df_annotations_file["startNode"] == selected_file][ + ["startNodeText", "endNodeResourceType"] + ].rename(columns={"startNodeText": "text", "endNodeResourceType": "resourceType"}) + if not df_annotations_file.empty + else pd.DataFrame(columns=["text", "resourceType"]) ) + potential_set = set(df_potential_tags_details["text"]) + actual_set = set(df_actual_annotations_details["text"]) + matched_set = potential_set.intersection(actual_set) + unmatched_set = potential_set - actual_set + missed_set = actual_set - potential_set + matched_df = df_potential_tags_details[ + df_potential_tags_details["text"].isin(matched_set) + ].drop_duplicates(subset=["text", "resourceType"]) + unmatched_df = df_potential_tags_details[ + df_potential_tags_details["text"].isin(unmatched_set) + ].drop_duplicates(subset=["text", "resourceType"]) + missed_df = df_actual_annotations_details[ + df_actual_annotations_details["text"].isin(missed_set) + ].drop_duplicates() + + if st.button("Create in Canvas", key=f"canvas_btn_{selected_file}"): + with st.spinner("Generating Industrial Canvas with bounding boxes..."): + _, _, file_view_config = fetch_extraction_pipeline_config(selected_pipeline) + unmatched_tags_for_canvas = unmatched_df[["text", "regions"]].to_dict("records") + canvas_url = generate_file_canvas( + file_id=file_node_id, + file_view=file_view_config, + ep_config=ep_config, + unmatched_tags_with_regions=unmatched_tags_for_canvas, + ) + if canvas_url: + st.session_state["generated_canvas_url"] = canvas_url + else: + st.session_state.pop("generated_canvas_url", None) + + if "generated_canvas_url" in st.session_state and st.session_state.generated_canvas_url: + st.markdown( + f"**[Open Last Generated Canvas]({st.session_state.generated_canvas_url})**", + unsafe_allow_html=True, + ) -# --- 
File-Level Table --- -st.header("Per-File Annotation Quality") + st.divider() + col1, col2, col3 = st.columns(3) + with col1: + st.metric( + "✅ Matched Tags", + len(matched_df), + help="Tags that were correctly identified by the pattern catalog and were also created as final annotations. This represents the successful overlap between the two processes.", + ) + st.dataframe( + matched_df[["text", "resourceType"]], + column_config={"text": "Tag", "resourceType": "Resource Type"}, + use_container_width=True, + hide_index=True, + ) + with col2: + st.metric( + "❓ Unmatched by Annotation", + len(unmatched_df), + help="Tags that were found by the pattern catalog but do not exist as final annotations. This can help identify if patterns are too broad (false positives) or if the standard annotation process missed them.", + ) + st.dataframe( + unmatched_df[["text", "resourceType"]], + column_config={"text": "Tag", "resourceType": "Resource Type"}, + use_container_width=True, + hide_index=True, + ) + with col3: + st.metric( + "❗️ Missed by Pattern", + len(missed_df), + help="Created annotations that were not found by the pattern catalog. 
This can help us measure the reliability of pattern mode as a denominator.", + ) + st.dataframe( + missed_df, + column_config={"text": "Tag", "resourceType": "Resource Type"}, + use_container_width=True, + hide_index=True, + ) + else: + st.info("✔️ Select a file in the table above to see a detailed breakdown of its tags.") -# --- Fetch file metadata and merge it with the quality data --- -df_file_meta = fetch_annotation_states(annotation_state_view, file_view) -if not df_file_meta.empty: - df_display_unfiltered = pd.merge( - df_quality, df_file_meta, left_on="startNode", right_on="fileExternalId", how="left" - ) -else: - df_display_unfiltered = df_quality - -# --- Advanced Filtering Section --- -with st.expander("Filter Per-File Quality Table"): - - # --- DYNAMIC METADATA FILTER --- - # Define columns that should not be offered as metadata filters - excluded_columns = [ - "Select", - "startNode", - "potentialTags", - "actualAnnotations", - "matchedTags", - "unmatchedByAnnotation", - "missedByPattern", - "coverageRate", - "completenessRate", - "fileExternalId", - "externalId", - "space", - "annotatedPageCount", - "annotationMessage", - "fileAliases", - "fileAssets", - "fileIsuploaded", - "jobId", - "linkedFile", - "pageCount", - "patternModeJobId", - "sourceCreatedUser", - "sourceCreatedTime", - "sourceUpdatedTime", - "sourceUpdatedUser", - "fileSourceupdateduser", - "fileSourcecreatedUser", - "fileSourceId", - "createdTime", - "fileSourcecreateduser", - "patternModeMessage", - "fileSourceupdatedtime", - "fileSourcecreatedtime", - "fileUploadedtime", - ] - - # Get the list of available metadata columns for filtering - filterable_columns = sorted([col for col in df_display_unfiltered.columns if col not in excluded_columns]) - - filter_col1, filter_col2 = st.columns(2) - - with filter_col1: - selected_column = st.selectbox( - "Filter by Metadata Property", options=["None"] + filterable_columns, on_change=reset_selection - ) - selected_values = [] - if selected_column 
!= "None": - unique_values = sorted(df_display_unfiltered[selected_column].dropna().unique().tolist()) - with filter_col2: - selected_values = st.multiselect( - f"Select Value(s) for {selected_column}", options=unique_values, on_change=reset_selection - ) +# ========================================== +# PATTERN MANAGEMENT TAB +# ========================================== +with management_tab: + primary_scope_prop = ep_config.get("launchFunction", {}).get("primaryScopeProperty") + secondary_scope_prop = ep_config.get("launchFunction", {}).get("secondaryScopeProperty") - coverage_range = st.slider("Filter by Annotation Coverage (%)", 0, 100, (0, 100), on_change=reset_selection) - completeness_range = st.slider("Filter by Pattern Completeness (%)", 0, 100, (0, 100), on_change=reset_selection) - -df_display = df_display_unfiltered.copy() -# Apply filters -if selected_column != "None" and selected_values: - df_display = df_display[df_display[selected_column].isin(selected_values)] - -df_display = df_display[ - (df_display["coverageRate"] >= coverage_range[0]) & (df_display["coverageRate"] <= coverage_range[1]) -] -df_display = df_display[ - (df_display["completenessRate"] >= completeness_range[0]) - & (df_display["completenessRate"] <= completeness_range[1]) -] - -# --- Reset the index after all filtering is complete --- -df_display = df_display.reset_index(drop=True) - -df_display.insert(0, "Select", False) - -# --- Column configuration for the data editor --- -default_columns = [ - "Select", - "fileName", - "fileSourceid", - "fileMimetype", - "coverageRate", - "completenessRate", - "annotationMessage", - "patternModeMessage", - "lastUpdatedTime", -] -all_columns = df_display.columns.tolist() - -with st.popover("Customize Table Columns"): - selected_columns = st.multiselect( - "Select columns to display:", - options=all_columns, - default=[col for col in default_columns if col in all_columns], # Ensure default is valid + st.subheader( + "Existing Manual Patterns", + 
help="An action-oriented tool for improving pattern quality. After identifying missed tags in the other tabs, come here to add new manual patterns or edit existing ones to enhance the detection logic for future pipeline runs.", ) + df_manual_patterns_manage = fetch_manual_patterns(db_name, manual_patterns_table) -if not selected_columns: - st.warning("Please select at least one column to display.") - st.stop() - - -if st.session_state.get("selected_row_index") is not None: - if st.session_state.selected_row_index < len(df_display): - df_display.at[st.session_state.selected_row_index, "Select"] = True - -edited_df = st.data_editor( - df_display[selected_columns], - key="quality_table_editor", - column_config={ - "Select": st.column_config.CheckboxColumn(required=True), - "fileName": "File Name", - "fileSourceid": "Source ID", - "fileMimetype": "Mime Type", - "potentialTags": "Potential Tags", - "actualAnnotations": "Actual Annotations", - "coverageRate": st.column_config.ProgressColumn( - "Annotation Coverage ℹ️", - help="How many of the potential tags were found? (Matched / Potential)", - format="%.2f%%", - min_value=0, - max_value=100, - ), - "completenessRate": st.column_config.ProgressColumn( - "Pattern Completeness ℹ️", - help="How many of the final annotations did the patterns find? 
(Matched / Actual)", - format="%.2f%%", - min_value=0, - max_value=100, - ), - "annotationMessage": "Annotation Message", - "patternModeMessage": "Pattern Mode Message", - "lastUpdatedTime": "Last Updated Time", - }, - use_container_width=True, - hide_index=True, - disabled=df_display.columns.difference(["Select"]), -) + edited_df_manage = st.data_editor( + df_manual_patterns_manage, + num_rows="dynamic", + use_container_width=True, + column_config={ + "key": st.column_config.TextColumn("Scope Key", disabled=True), + "sample": st.column_config.TextColumn("Pattern String", required=True), + "resource_type": st.column_config.TextColumn("Resource Type", required=True), + "scope_level": st.column_config.SelectboxColumn( + "Scope Level", + options=["Global", "Primary Scope", "Secondary Scope"], + required=True, + ), + "primary_scope": st.column_config.TextColumn("Primary Scope"), + "secondary_scope": st.column_config.TextColumn("Secondary Scope"), + "created_by": st.column_config.TextColumn("Created By", required=True), + }, + ) -# --- Logic to enforce single selection --- -selected_indices = edited_df[edited_df.Select].index.tolist() -if len(selected_indices) > 1: - new_selection = [idx for idx in selected_indices if idx != st.session_state.get("selected_row_index")] - if new_selection: - st.session_state.selected_row_index = new_selection[0] - st.rerun() -elif len(selected_indices) == 1: - st.session_state.selected_row_index = selected_indices[0] -elif len(selected_indices) == 0 and st.session_state.get("selected_row_index") is not None: - st.session_state.selected_row_index = None - st.rerun() + if st.button("Save Changes", type="primary", key="save_patterns"): + with st.spinner("Saving changes to RAW..."): + try: + save_manual_patterns(edited_df_manage, db_name, manual_patterns_table) + st.success("Changes saved successfully!") + st.cache_data.clear() + st.rerun() + except Exception as e: + st.error(f"Failed to save changes: {e}") + + st.divider() + + 
st.subheader("Add a New Pattern") + scope_level = st.selectbox( + "1. Select Scope Level", ["Global", "Primary Scope", "Secondary Scope"], key="scope_level_selector" + ) -# --- Interactive Drill-Down Section --- -st.subheader("Tag Comparison Drill-Down") + with st.form(key="new_pattern_form", clear_on_submit=True): + st.write("2. Enter Pattern Details") + new_pattern = st.text_input("Pattern String", placeholder="e.g., [PI]-00000") + new_resource_type = st.text_input("Resource Type", placeholder="e.g., Asset") -if st.session_state.get("selected_row_index") is not None: - if st.session_state.selected_row_index < len(df_display): - selected_file_data = df_display.iloc[st.session_state.selected_row_index] - selected_file = selected_file_data["startNode"] - st.markdown(f"Displaying details for file: **{selected_file}**") + primary_scope_value = "" + if scope_level in ["Primary Scope", "Secondary Scope"]: + primary_scope_value = st.text_input(f"Primary Scope Value ({primary_scope_prop or 'not configured'})") - file_space_series = df_patterns[df_patterns["startNode"] == selected_file]["startNodeSpace"] - if not file_space_series.empty: - file_space = file_space_series.iloc[0] - file_node_id = NodeId(space=file_space, external_id=selected_file) + secondary_scope_value = "" + if scope_level == "Secondary Scope": + secondary_scope_value = st.text_input(f"Secondary Scope Value ({secondary_scope_prop or 'not configured'})") - # --- Three-Column Tag Comparison (prepare dataframes first) --- - df_potential_tags_details = df_patterns[df_patterns["startNode"] == selected_file][ - ["text", "resourceType", "regions"] - ] + submit_button = st.form_submit_button(label="Add New Pattern") - if not df_annotations.empty: - df_actual_annotations_details = df_annotations[df_annotations["startNode"] == selected_file][ - ["startNodeText", "endNodeResourceType"] - ].rename(columns={"startNodeText": "text", "endNodeResourceType": "resourceType"}) + if submit_button: + if not all([new_pattern, 
new_resource_type]): + st.warning("Pattern String and Resource Type are required.") else: - df_actual_annotations_details = pd.DataFrame(columns=["text", "resourceType"]) - - potential_tags_set = set(df_potential_tags_details["text"]) - actual_tags_set = set(df_actual_annotations_details["text"]) - - matched_tags_set = potential_tags_set.intersection(actual_tags_set) - unmatched_tags_set = potential_tags_set - actual_tags_set - missed_tags_set = actual_tags_set - potential_tags_set - - matched_df = df_potential_tags_details[ - df_potential_tags_details["text"].isin(matched_tags_set) - ].drop_duplicates(subset=["text", "resourceType"]) - unmatched_df = df_potential_tags_details[ - df_potential_tags_details["text"].isin(unmatched_tags_set) - ].drop_duplicates(subset=["text", "resourceType"]) - missed_df = df_actual_annotations_details[ - df_actual_annotations_details["text"].isin(missed_tags_set) - ].drop_duplicates() - - if st.button("Create in Canvas", key=f"canvas_btn_{selected_file}"): - with st.spinner("Generating Industrial Canvas with bounding boxes..."): - _, _, file_view_config = fetch_extraction_pipeline_config(selected_pipeline) - - unmatched_tags_for_canvas = unmatched_df[["text", "regions"]].to_dict("records") - - canvas_url = generate_file_canvas( - file_id=file_node_id, - file_view=file_view_config, - ep_config=ep_config, - unmatched_tags_with_regions=unmatched_tags_for_canvas, - ) - if canvas_url: - st.session_state["generated_canvas_url"] = canvas_url - else: - st.session_state.pop("generated_canvas_url", None) - - if "generated_canvas_url" in st.session_state and st.session_state.generated_canvas_url: - st.markdown( - f"**[Open Last Generated Canvas]({st.session_state.generated_canvas_url})**", unsafe_allow_html=True - ) - - col1, col2, col3 = st.columns(3) - - with col1: - st.metric("✅ Matched Tags", len(matched_df)) - st.dataframe( - matched_df[["text", "resourceType"]], - column_config={"text": "Tag", "resourceType": "Resource Type"}, - 
use_container_width=True, - hide_index=True, - ) - with col2: - st.metric("❓ Unmatched by Annotation", len(unmatched_df)) - st.dataframe( - unmatched_df[["text", "resourceType"]], - column_config={"text": "Tag", "resourceType": "Resource Type"}, - use_container_width=True, - hide_index=True, - ) - with col3: - st.metric("❗️ Missed by Pattern", len(missed_df)) - st.dataframe( - missed_df, - column_config={"text": "Tag", "resourceType": "Resource Type"}, - use_container_width=True, - hide_index=True, - ) - else: - st.info("✔️ Previous selection is not in the filtered view. Please select a new file.") - -else: - st.info("✔️ Select a file in the table above to see a detailed breakdown of its tags.") + with st.spinner("Adding new pattern..."): + try: + new_row = pd.DataFrame( + [ + { + "sample": new_pattern, + "resource_type": new_resource_type, + "scope_level": scope_level, + "primary_scope": primary_scope_value, + "secondary_scope": secondary_scope_value, + "created_by": "streamlit", + } + ] + ) + updated_df = pd.concat([edited_df_manage, new_row], ignore_index=True) + + save_manual_patterns(updated_df, db_name, manual_patterns_table) + st.success("New pattern added successfully!") + st.cache_data.clear() + st.rerun() + except Exception as e: + st.error(f"Failed to add pattern: {e}") diff --git a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/pages/Pattern_Management.py b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/pages/Pattern_Management.py deleted file mode 100644 index d8f383ec..00000000 --- a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/pages/Pattern_Management.py +++ /dev/null @@ -1,132 +0,0 @@ -import streamlit as st -import pandas as pd -from datetime import datetime, timezone - -from helper import ( - fetch_extraction_pipeline_config, - find_pipelines, - fetch_manual_patterns, - save_manual_patterns, -) - -st.set_page_config(page_title="Pattern 
Management", page_icon="✏️", layout="wide") - -st.title("Pattern Management") -st.markdown("Add, edit, or delete manual patterns to improve the quality of the pattern detection job.") - -# --- Sidebar for Pipeline Selection --- -st.sidebar.title("Pipeline Selection") -pipeline_ids = find_pipelines() - -if not pipeline_ids: - st.info("No active file annotation pipelines found to monitor.") - st.stop() - -selected_pipeline = st.sidebar.selectbox( - "Select a pipeline to manage patterns for:", options=pipeline_ids, key="pattern_pipeline_selector" -) - -# --- Data Fetching --- -config_result = fetch_extraction_pipeline_config(selected_pipeline) -if not config_result: - st.error(f"Could not fetch configuration for pipeline: {selected_pipeline}") - st.stop() - -ep_config, _, _ = config_result -cache_config = ep_config.get("launchFunction", {}).get("cacheService", {}) -db_name = cache_config.get("rawDb") -manual_patterns_table = cache_config.get("rawManualPatternsCatalog") -primary_scope_prop = ep_config.get("launchFunction", {}).get("primaryScopeProperty") -secondary_scope_prop = ep_config.get("launchFunction", {}).get("secondaryScopeProperty") - - -if not all([db_name, manual_patterns_table]): - st.error("RAW DB name or manual patterns table name is not configured in the extraction pipeline.") - st.stop() - -# --- Load and Display Existing Patterns --- -st.subheader("Existing Manual Patterns") - -df_patterns = fetch_manual_patterns(db_name, manual_patterns_table) - -edited_df = st.data_editor( - df_patterns, - num_rows="dynamic", - use_container_width=True, - column_config={ - "key": st.column_config.TextColumn("Scope Key", disabled=True), - "sample": st.column_config.TextColumn("Pattern String", required=True), - "resource_type": st.column_config.TextColumn("Resource Type", required=True), - "scope_level": st.column_config.TextColumn("Scope Level", required=True), - "primary_scope": st.column_config.TextColumn("Primary Scope", required=False), - "secondary_scope": 
st.column_config.TextColumn("Secondary Scope", required=False), - "created_by": st.column_config.TextColumn("Created By", required=True), - }, -) - -if st.button("Save Changes", type="primary"): - with st.spinner("Saving changes to RAW..."): - try: - save_manual_patterns(edited_df, db_name, manual_patterns_table) - st.success("Changes saved successfully!") - st.cache_data.clear() - st.rerun() - except Exception as e: - st.error(f"Failed to save changes: {e}") - - -st.divider() - -# --- Add New Pattern Form --- -st.subheader("Add a New Pattern") - -scope_level = st.selectbox( - "1. Select Scope Level", ["Global", "Primary Scope", "Secondary Scope"], key="scope_level_selector" -) - -with st.form(key="new_pattern_form", clear_on_submit=True): - st.write("2. Enter Pattern Details") - new_pattern = st.text_input("Pattern String", placeholder="e.g., [PI]-00000") - new_resource_type = st.text_input("Resource Type", placeholder="e.g., Asset") - - primary_scope_value = "" - if scope_level in ["Primary Scope", "Secondary Scope"]: - primary_scope_value = st.text_input(f"Primary Scope Value ({primary_scope_prop or 'not configured'})") - - secondary_scope_value = "" - if scope_level == "Secondary Scope": - secondary_scope_value = st.text_input(f"Secondary Scope Value ({secondary_scope_prop or 'not configured'})") - - submit_button = st.form_submit_button(label="Add New Pattern") - - if submit_button: - if not all([new_pattern, new_resource_type]): - st.warning("Pattern String and Resource Type are required.") - else: - with st.spinner("Adding new pattern..."): - try: - updated_df = pd.concat( - [ - edited_df, - pd.DataFrame( - [ - { - "sample": new_pattern, - "resource_type": new_resource_type, - "scope_level": scope_level, - "primary_scope": primary_scope_value, - "secondary_scope": secondary_scope_value, - "created_by": "streamlit", - } - ] - ), - ], - ignore_index=True, - ) - - save_manual_patterns(updated_df, db_name, manual_patterns_table) - st.success("New pattern added 
successfully!") - st.cache_data.clear() - st.rerun() - except Exception as e: - st.error(f"Failed to add pattern: {e}") From ddeb478a404ab500c96e06772536a7aef8b89233 Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Thu, 11 Sep 2025 08:18:40 -0500 Subject: [PATCH 048/128] fixed bug where multi-page documents that have clean old annotations as true will have the annotations on prior runs deleted --- .../services/FinalizeService.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/FinalizeService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/FinalizeService.py index 0a4f99b9..833327eb 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/FinalizeService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/FinalizeService.py @@ -230,7 +230,13 @@ def run(self) -> Literal["Done"] | None: regular_item = results.get("regular") if regular_item and regular_item.get("annotations"): self.logger.info(f"Applying annotations to file {str(file_id)}") - if self.clean_old_annotations: + # NOTE: Only clean annotations on the very first run as to not delete past annotations for multi-page files + if self.clean_old_annotations and ( + annotation_state_node.properties[self.annotation_state_view.as_view_id()].get( + "annotatedPageCount" + ) + is None + ): # This should only run once, so we tie it to the regular annotation processing doc_delete, tag_delete = self.apply_service.delete_annotations_for_file(file_id) self.report_service.delete_annotations(doc_delete, tag_delete) From e41a18e17c864136893cc54632186a1e2457e2cd Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Thu, 11 Sep 2025 08:29:01 -0500 Subject: [PATCH 049/128] log deleted annotations --- .../fn_file_annotation_finalize/services/FinalizeService.py | 3 ++- 1 file 
changed, 2 insertions(+), 1 deletion(-) diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/FinalizeService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/FinalizeService.py index 833327eb..f6c09db4 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/FinalizeService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/FinalizeService.py @@ -240,7 +240,8 @@ def run(self) -> Literal["Done"] | None: # This should only run once, so we tie it to the regular annotation processing doc_delete, tag_delete = self.apply_service.delete_annotations_for_file(file_id) self.report_service.delete_annotations(doc_delete, tag_delete) - + self.logger.info(f"\t- Deleted {len(doc_delete)} doc and {len(tag_delete)} tag annotations.") + doc_add, tag_add = self.apply_service.apply_annotations(regular_item, file_node) self.report_service.add_annotations(doc_rows=doc_add, tag_rows=tag_add) annotation_msg: str = f"Applied {len(doc_add)} doc and {len(tag_add)} tag annotations." 
From 651a060059e1bfef17a597dffb94ebe792a23c45 Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Fri, 12 Sep 2025 15:55:10 -0500 Subject: [PATCH 050/128] skip empty and unhashable columns in pipeline health dashboard --- .../Pipeline_Health.py | 21 ++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/Pipeline_Health.py b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/Pipeline_Health.py index 023fb6fb..5f0ebc7e 100644 --- a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/Pipeline_Health.py +++ b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/Pipeline_Health.py @@ -156,13 +156,20 @@ def reset_table_selection(): "fileSourcecreatedtime", "fileUploadedtime", ] - filterable_columns = sorted( - [ - col - for col in df_annotation_states.columns - if col not in excluded_columns and df_annotation_states[col].nunique() < 100 - ] - ) + potential_columns = [col for col in df_annotation_states.columns if col not in excluded_columns] + filterable_columns = [] + for col in potential_columns: + # Skip empty columns or columns where the first item is a list/dict + if df_annotation_states[col].dropna().empty or isinstance( + df_annotation_states[col].dropna().iloc[0], (list, dict) + ): + continue + + # Final check to ensure the column is suitable for filtering + if df_annotation_states[col].nunique() < 100: + filterable_columns.append(col) + + filterable_columns = sorted(filterable_columns) filter_col1, filter_col2 = st.columns(2) selected_column = filter_col1.selectbox( From fe4795501dbcb67f49b82a31fe6bb8806738874c Mon Sep 17 00:00:00 2001 From: lucasguimaraes-rdx Date: Thu, 25 Sep 2025 08:46:09 -0300 Subject: [PATCH 051/128] Making canvas URL generation dynamic and environment-aware --- .../streamlit/file_annotation_dashboard/helper.py | 2 +- 1 file changed, 1 insertion(+), 
1 deletion(-) diff --git a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/helper.py b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/helper.py index 3b15a679..c40ff3be 100644 --- a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/helper.py +++ b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/helper.py @@ -40,7 +40,7 @@ def generate_file_canvas( canvas_name = f"Annotation Quality Analysis - {file_node.external_id}" try: - domain = ep_config.get("streamlitDashboard", {}).get("industrialCanvasDomain", "cog-shadow-projects") + domain = os.getenv("COGNITE_ORGANIZATION", "cog-shadow-projects") project = client.config.project cluster = client.config.cdf_cluster From ec3627d23ef7b6f654e5d44165ec539b79ea1e1d Mon Sep 17 00:00:00 2001 From: lucasguimaraes-rdx Date: Fri, 26 Sep 2025 13:11:58 -0300 Subject: [PATCH 052/128] Removing default domain value when generating file canvas --- .../streamlit/file_annotation_dashboard/helper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/helper.py b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/helper.py index c40ff3be..2391253e 100644 --- a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/helper.py +++ b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/helper.py @@ -40,7 +40,7 @@ def generate_file_canvas( canvas_name = f"Annotation Quality Analysis - {file_node.external_id}" try: - domain = os.getenv("COGNITE_ORGANIZATION", "cog-shadow-projects") + domain = os.getenv("COGNITE_ORGANIZATION") project = client.config.project cluster = client.config.cdf_cluster From 0fffed8a66b67e1687dda7632c752974c0076d5a Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Fri, 26 Sep 2025 16:52:59 -0500 Subject: [PATCH 
053/128] added instance space in default config --- .../cdf_file_annotation/default.config.yaml | 5 +++-- .../extraction_pipelines/ep_file_annotation.config.yaml | 9 +++++---- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/modules/contextualization/cdf_file_annotation/default.config.yaml b/modules/contextualization/cdf_file_annotation/default.config.yaml index 897990fd..5eab5036 100644 --- a/modules/contextualization/cdf_file_annotation/default.config.yaml +++ b/modules/contextualization/cdf_file_annotation/default.config.yaml @@ -3,10 +3,10 @@ annotationDatasetExternalId: ds_file_annotation # used in /data_models and /extraction_pipelines annotationStateExternalId: FileAnnotationState -annotationStateInstanceSpace: sp_dat_cdf_annotation_states annotationStateSchemaSpace: sp_hdm #NOTE: stands for space helper data model annotationStateVersion: v1.0.0 fileSchemaSpace: +fileInstanceSpace: fileExternalId: fileVersion: @@ -21,6 +21,7 @@ rawManualPatternsCatalog: manual_patterns_catalog # used in /extraction_pipelines extractionPipelineExternalId: ep_file_annotation targetEntitySchemaSpace: +targetEntityInstanceSpace: targetEntityExternalId: targetEntityVersion: @@ -33,7 +34,7 @@ functionClientId: ${IDP_CLIENT_ID} functionClientSecret: ${IDP_CLIENT_SECRET} # used in /workflows -workflowSchedule: "3-59/10 * * * *" # NOTE: runs every 10 minutes with a 3 minute offset +workflowSchedule: "3-59/15 * * * *" # NOTE: runs every 15 minutes with a 3 minute offset workflowExternalId: wf_file_annotation workflowVersion: v1 diff --git a/modules/contextualization/cdf_file_annotation/extraction_pipelines/ep_file_annotation.config.yaml b/modules/contextualization/cdf_file_annotation/extraction_pipelines/ep_file_annotation.config.yaml index 948163f1..0ae7999e 100644 --- a/modules/contextualization/cdf_file_annotation/extraction_pipelines/ep_file_annotation.config.yaml +++ b/modules/contextualization/cdf_file_annotation/extraction_pipelines/ep_file_annotation.config.yaml 
@@ -7,16 +7,18 @@ config: version: v1 annotationStateView: schemaSpace: {{ annotationStateSchemaSpace }} - instanceSpace: {{annotationStateInstanceSpace}} + instanceSpace: {{fileInstanceSpace}} externalId: {{ annotationStateExternalId }} version: {{ annotationStateVersion }} fileView: schemaSpace: {{ fileSchemaSpace }} + instanceSpace: {{fileInstanceSpace}} externalId: {{ fileExternalId }} version: {{ fileVersion }} annotationType: diagrams.FileLink targetEntitiesView: schemaSpace: {{ targetEntitySchemaSpace }} + instanceSpace: {{targetEntityInstanceSpace}} externalId: {{ targetEntityExternalId }} version: {{ targetEntityVersion }} annotationType: diagrams.AssetLink @@ -52,10 +54,9 @@ config: targetEntitiesSearchProperty: aliases primaryScopeProperty: None secondaryScopeProperty: - # NOTE: below configurations are used by pattern mode patternMode: True - fileResourceProperty: # optional - targetEntitiesResourceProperty: # optional + fileResourceProperty: + targetEntitiesResourceProperty: dataModelService: getFilesToProcessQuery: targetView: From 0025b3062350e30164308cd3a3aa20eedd8e505d Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Fri, 26 Sep 2025 16:53:13 -0500 Subject: [PATCH 054/128] deleted auto creation of annotation state instance space --- .../cdf_file_annotation/data_models/hdm.space.yaml | 3 --- 1 file changed, 3 deletions(-) diff --git a/modules/contextualization/cdf_file_annotation/data_models/hdm.space.yaml b/modules/contextualization/cdf_file_annotation/data_models/hdm.space.yaml index 4c110ed6..ac152224 100644 --- a/modules/contextualization/cdf_file_annotation/data_models/hdm.space.yaml +++ b/modules/contextualization/cdf_file_annotation/data_models/hdm.space.yaml @@ -1,6 +1,3 @@ - description: Helper data model space name: {{ annotationStateSchemaSpace }} space: {{ annotationStateSchemaSpace }} -- description: Instance space for contextualization pipeline annotation states - name: {{ annotationStateInstanceSpace }} - space: {{ 
annotationStateInstanceSpace }} From 0ac7dc282d36824cec7e1079980804c0b375695f Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Mon, 29 Sep 2025 14:23:10 -0500 Subject: [PATCH 055/128] normalized results of diagram detect and pattern detect such that comparisons to create percentages is accurate --- .../file_annotation_dashboard/helper.py | 8 ++ .../pages/Annotation_Quality.py | 84 ++++++++++++++----- 2 files changed, 72 insertions(+), 20 deletions(-) diff --git a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/helper.py b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/helper.py index 2391253e..ece1182d 100644 --- a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/helper.py +++ b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/helper.py @@ -452,3 +452,11 @@ def filter_log_lines(log_text: str, search_string: str) -> str: break relevant_blocks.append("\n".join(current_block)) return "\n\n".join(relevant_blocks) + + +# --- Remove all non-alphanumeric characters and convert to lowercase --- +def normalize(s): + # Ensure input is a string before applying regex + if not isinstance(s, str): + return "" + return re.sub(r"[^a-zA-Z0-9]", "", s).lower() diff --git a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/pages/Annotation_Quality.py b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/pages/Annotation_Quality.py index c01267e9..76c69fd4 100644 --- a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/pages/Annotation_Quality.py +++ b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/pages/Annotation_Quality.py @@ -11,6 +11,7 @@ fetch_manual_patterns, fetch_annotation_states, save_manual_patterns, + normalize, ) from cognite.client.data_classes.data_modeling import NodeId @@ -105,18 +106,37 @@ def 
reset_selection(): else: df_annotations_input = pd.DataFrame() - potential_tags_set = set(df_metrics_input["text"]) - actual_annotations_set = ( + # 1. Get the original, un-normalized sets of strings + potential_tags_original = set(df_metrics_input["text"]) + actual_annotations_original = ( set(df_annotations_input["startNodeText"]) if not df_annotations_input.empty and "startNodeText" in df_annotations_input.columns else set() ) - matched_tags_set = potential_tags_set.intersection(actual_annotations_set) - unmatched_by_annotation_set = potential_tags_set - actual_annotations_set - missed_by_pattern_set = actual_annotations_set - potential_tags_set + + # 2. Create a mapping from normalized text back to an original version for display + text_map = { + normalize(text): text for text in potential_tags_original.union(actual_annotations_original) if text + } + + # 3. Create fully normalized sets for logical comparison + normalized_potential_set = {normalize(t) for t in potential_tags_original} + normalized_actual_set = {normalize(t) for t in actual_annotations_original} + + # 4. Perform all set operations on the normalized data for accurate logic + normalized_matched = normalized_potential_set.intersection(normalized_actual_set) + normalized_unmatched = normalized_potential_set - normalized_actual_set + normalized_missed = normalized_actual_set - normalized_potential_set + + # 5. 
Use the map to get the final sets with original text for display + matched_tags_set = {text_map[t] for t in normalized_matched if t in text_map} + unmatched_by_annotation_set = {text_map[t] for t in normalized_unmatched if t in text_map} + missed_by_pattern_set = {text_map[t] for t in normalized_missed if t in text_map} + total_matched = len(matched_tags_set) total_unmatched = len(unmatched_by_annotation_set) total_missed = len(missed_by_pattern_set) + overall_coverage = ( (total_matched / (total_matched + total_unmatched)) * 100 if (total_matched + total_unmatched) > 0 else 0 ) @@ -151,9 +171,12 @@ def reset_selection(): if not df_annotations_filtered.empty and "startNodeText" in df_annotations_filtered.columns else set() ) - matched = len(potential.intersection(actual)) - unmatched = len(potential - actual) - missed = len(actual - potential) + # Use normalized comparison for chart data as well + norm_potential = {normalize(p) for p in potential} + norm_actual = {normalize(a) for a in actual} + matched = len(norm_potential.intersection(norm_actual)) + unmatched = len(norm_potential - norm_actual) + missed = len(norm_actual - norm_potential) coverage = (matched / (matched + unmatched)) * 100 if (matched + unmatched) > 0 else 0 completeness = (matched / (matched + missed)) * 100 if (matched + missed) > 0 else 0 chart_data.append( @@ -301,15 +324,22 @@ def reset_selection(): df_quality_file["actualAnnotations"] = df_quality_file["actualAnnotations"].apply( lambda x: x if isinstance(x, set) else set() ) - df_quality_file["matchedTags"] = df_quality_file.apply( - lambda row: len(row["potentialTags"].intersection(row["actualAnnotations"])), axis=1 - ) - df_quality_file["unmatchedByAnnotation"] = df_quality_file.apply( - lambda row: len(row["potentialTags"] - row["actualAnnotations"]), axis=1 - ) - df_quality_file["missedByPattern"] = df_quality_file.apply( - lambda row: len(row["actualAnnotations"] - row["potentialTags"]), axis=1 - ) + + # Apply normalized comparison 
for per-file metrics + def calculate_metrics(row): + potential = row["potentialTags"] + actual = row["actualAnnotations"] + norm_potential = {normalize(p) for p in potential} + norm_actual = {normalize(a) for a in actual} + + matched = len(norm_potential.intersection(norm_actual)) + unmatched = len(norm_potential - norm_actual) + missed = len(norm_actual - norm_potential) + return matched, unmatched, missed + + metrics = df_quality_file.apply(calculate_metrics, axis=1, result_type="expand") + df_quality_file[["matchedTags", "unmatchedByAnnotation", "missedByPattern"]] = metrics + df_quality_file["coverageRate"] = ( ( df_quality_file["matchedTags"] @@ -498,11 +528,25 @@ def reset_selection(): if not df_annotations_file.empty else pd.DataFrame(columns=["text", "resourceType"]) ) + + # Use normalized comparison for per-file detail view potential_set = set(df_potential_tags_details["text"]) actual_set = set(df_actual_annotations_details["text"]) - matched_set = potential_set.intersection(actual_set) - unmatched_set = potential_set - actual_set - missed_set = actual_set - potential_set + norm_potential = {normalize(p) for p in potential_set} + norm_actual = {normalize(a) for a in actual_set} + + # We need a map from normalized text back to original for accurate filtering + potential_map = {normalize(text): text for text in potential_set} + actual_map = {normalize(text): text for text in actual_set} + + norm_matched = norm_potential.intersection(norm_actual) + norm_unmatched = norm_potential - norm_actual + norm_missed = norm_actual - norm_potential + + matched_set = {potential_map[t] for t in norm_matched} + unmatched_set = {potential_map[t] for t in norm_unmatched} + missed_set = {actual_map[t] for t in norm_missed} + matched_df = df_potential_tags_details[ df_potential_tags_details["text"].isin(matched_set) ].drop_duplicates(subset=["text", "resourceType"]) From 44070bb905c75634e7f8d05aee0b2348d4b155a6 Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Tue, 30 Sep 2025 
15:56:36 -0500 Subject: [PATCH 056/128] remove leading zero before comparison --- .../file_annotation_dashboard/helper.py | 23 ++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/helper.py b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/helper.py index ece1182d..e5f05d8e 100644 --- a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/helper.py +++ b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/helper.py @@ -454,9 +454,26 @@ def filter_log_lines(log_text: str, search_string: str) -> str: return "\n\n".join(relevant_blocks) -# --- Remove all non-alphanumeric characters and convert to lowercase --- +# --- Remove all non-alphanumeric characters, convert to lowercase, and strip leading zeros from numbers --- def normalize(s): - # Ensure input is a string before applying regex + """ + Normalizes a string by: + 1. Ensuring it's a string. + 2. Removing all non-alphanumeric characters. + 3. Converting to lowercase. + 4. Removing leading zeros from any sequence of digits found within the string. 
+ """ if not isinstance(s, str): return "" - return re.sub(r"[^a-zA-Z0-9]", "", s).lower() + + # Step 1: Basic cleaning (e.g., "V-0912" -> "v0912") + s = re.sub(r"[^a-zA-Z0-9]", "", s).lower() + + # Step 2: Define a replacer function that converts any matched number to an int and back to a string + def strip_leading_zeros(match): + # match.group(0) is the matched string (e.g., "0912") + return str(int(match.group(0))) + + # Step 3: Apply the replacer function to all sequences of digits (\d+) in the string + # This turns "v0912" into "v912" + return re.sub(r"\d+", strip_leading_zeros, s) From 9a05fecd9006b921b9ece7a527acdc469b9e13a7 Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Tue, 30 Sep 2025 16:04:42 -0500 Subject: [PATCH 057/128] always turn remove leading zero off for pattern detect --- .../services/AnnotationService.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/AnnotationService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/AnnotationService.py index c1480e05..2969461b 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/AnnotationService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/AnnotationService.py @@ -1,4 +1,5 @@ import abc +import copy from typing import Any from cognite.client import CogniteClient from services.ConfigService import Config @@ -41,6 +42,12 @@ def __init__(self, config: Config, client: CogniteClient, logger: CogniteFunctio self.diagram_detect_config: DiagramDetectConfig | None = None if config.launch_function.annotation_service.diagram_detect_config: self.diagram_detect_config = config.launch_function.annotation_service.diagram_detect_config.as_config() + # NOTE: Remove Leading Zeros has a weird interaction with pattern mode so will always turn off + if 
config.launch_function.pattern_mode: + # NOTE: Shallow copy that still references Mutable objects in self.diagram_detect_config. + # Since RemoveLeadingZeros is a boolean value, it is immutable and we can modify the copy without effecting the original. + self.pattern_detect_config = copy.copy(self.diagram_detect_config) + self.pattern_detect_config.remove_leading_zeros = False def run_diagram_detect(self, files: list[FileReference], entities: list[dict[str, Any]]) -> int: detect_job: DiagramDetectResults = self.client.diagrams.detect( @@ -54,9 +61,9 @@ def run_diagram_detect(self, files: list[FileReference], entities: list[dict[str if detect_job.job_id: return detect_job.job_id else: - raise Exception(f"API call to diagram/detect in pattern mode did not return a job ID") + raise Exception(f"API call to diagram/detect did not return a job ID") - def run_pattern_mode_detect(self, files: list, pattern_samples: list[dict[str, Any]]) -> int: + def run_pattern_mode_detect(self, files: list[FileReference], pattern_samples: list[dict[str, Any]]) -> int: """Generates patterns and runs the diagram detection job in pattern mode.""" detect_job: DiagramDetectResults = self.client.diagrams.detect( file_references=files, @@ -64,7 +71,7 @@ def run_pattern_mode_detect(self, files: list, pattern_samples: list[dict[str, A partial_match=self.annotation_config.partial_match, min_tokens=self.annotation_config.min_tokens, search_field="sample", - configuration=self.diagram_detect_config, + configuration=self.pattern_detect_config, pattern_mode=True, ) if detect_job.job_id: From 73c681755e7bb950e59297d4f8593b25a523eadf Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Wed, 1 Oct 2025 11:32:55 -0500 Subject: [PATCH 058/128] create a CogniteSolutionTag instance used for canvas labels --- .../cdf_file_annotation/data_models/canvas.node.yaml | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 modules/contextualization/cdf_file_annotation/data_models/canvas.node.yaml diff --git 
a/modules/contextualization/cdf_file_annotation/data_models/canvas.node.yaml b/modules/contextualization/cdf_file_annotation/data_models/canvas.node.yaml new file mode 100644 index 00000000..1abfdeb0 --- /dev/null +++ b/modules/contextualization/cdf_file_annotation/data_models/canvas.node.yaml @@ -0,0 +1,12 @@ +- space: SolutionTagsInstanceSpace # NOTE: space that comes from enabling labels in Canvas UI + externalId: file_annotations_solution_tag + sources: + - source: + space: cdf_apps_shared + externalId: CogniteSolutionTag + version: 'v1' + type: view + properties: + name: 'File Annotations' + description: 'Label is used by canvases generated by the file annotation streamlit module. Can be used for any canvas related to file annotatons.' + color: Green # NOTE: can't seem to get this working \ No newline at end of file From d1793a70adef118c2fbaa76d1c57ce620e51e172 Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Wed, 1 Oct 2025 11:44:11 -0500 Subject: [PATCH 059/128] add the file annotation label to the generated canvas --- .../streamlit/file_annotation_dashboard/canvas.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/canvas.py b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/canvas.py index 6263820e..337415a0 100644 --- a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/canvas.py +++ b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/canvas.py @@ -51,6 +51,7 @@ def generate_properties(file_node: Node, file_view_id: ViewId, node_id: str, off def create_canvas(name: str, client: CogniteClient): """Creates the main canvas node.""" canvas_id = generate_id() + file_annotation_label = {"externalId": "file_annotations_solution_tag", "space": "SolutionTagsInstanceSpace"} canvas = NodeApply( space=CANVAS_SPACE_INSTANCE, external_id=canvas_id, @@ -63,6 +64,7 @@ def create_canvas(name: 
str, client: CogniteClient): "updatedAt": get_time(), "createdBy": get_user_id(client), "updatedBy": get_user_id(client), + "solutionTags": [file_annotation_label], }, ) ], From aed58ce369a078162703927b5b17dbf63ca2305d Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Mon, 6 Oct 2025 09:40:09 -0500 Subject: [PATCH 060/128] updated cache storage to support creation of edges --- .../services/CacheService.py | 102 +++++++++--------- .../services/ConfigService.py | 2 + 2 files changed, 52 insertions(+), 52 deletions(-) diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/CacheService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/CacheService.py index df27e0d8..f44fd9ac 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/CacheService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/CacheService.py @@ -209,7 +209,7 @@ def _convert_instances_to_entities( for instance in file_instances: instance_properties = instance.properties.get(self.file_view.as_view_id()) - if target_entities_resource_type: + if file_resource_type: resource_type: str = instance_properties[file_resource_type] else: resource_type: str = self.file_view.external_id @@ -233,9 +233,8 @@ def _generate_tag_samples_from_entities(self, entities: list[dict]) -> list[dict - Letters are grouped into bracketed alternatives, even when mixed with numbers. - Example: '629P' and '629X' will merge to create a pattern piece '000[P|X]'. """ - # Structure: { resource_type: { full_template_key: list_of_collected_variable_parts } } - # where list_of_collected_variable_parts is [ [{'L1_alt1', 'L1_alt2'}], [{'L2_alt1'}], ... 
] - pattern_builders: dict[str, dict[str, list[list[set[str]]]]] = {} + # Structure: { resource_type: { 'templates': { template_key: list_of_variable_parts }, 'annotation_type': str } } + pattern_builders = defaultdict(lambda: {"templates": defaultdict(list), "annotation_type": None}) self.logger.info(f"Generating pattern samples from {len(entities)} entities.") def _parse_alias(alias: str, resource_type_key: str) -> tuple[str, list[list[str]]]: @@ -251,66 +250,58 @@ def _parse_alias(alias: str, resource_type_key: str) -> tuple[str, list[list[str for i, part in enumerate(alias_parts): if not part: continue - # Handle delimiters if part in [" ", "-"]: full_template_key_parts.append(part) continue - - # Handle fixed constants (override everything else) left_ok = (i == 0) or (alias_parts[i - 1] in [" ", "-"]) right_ok = (i == len(alias_parts) - 1) or (alias_parts[i + 1] in [" ", "-"]) if left_ok and right_ok and part == resource_type_key: full_template_key_parts.append(f"[{part}]") continue - - # --- Dissect the segment to create its template and find variable letters --- - # 1. Create the structural template for the segment (e.g., '629P' -> '000A') segment_template = re.sub(r"\d", "0", part) segment_template = re.sub(r"[A-Za-z]", "A", segment_template) full_template_key_parts.append(segment_template) - - # 2. 
Extract all groups of letters from the segment variable_letters = re.findall(r"[A-Za-z]+", part) if variable_letters: all_variable_parts.append(variable_letters) return "".join(full_template_key_parts), all_variable_parts for entity in entities: - # NOTE: - key = entity["resource_type"] - if key not in pattern_builders: - pattern_builders[key] = {} - + resource_type = entity["resource_type"] + annotation_type = entity.get("annotation_type") aliases = entity.get("search_property", []) + + if pattern_builders[resource_type]["annotation_type"] is None: + pattern_builders[resource_type]["annotation_type"] = annotation_type + for alias in aliases: if not alias: continue + template_key, variable_parts_from_alias = _parse_alias(alias, resource_type) - template_key, variable_parts_from_alias = _parse_alias(alias, key) - - if template_key in pattern_builders[key]: - # Merge with existing variable parts - existing_variable_sets = pattern_builders[key][template_key] - for i, part_group in enumerate(variable_parts_from_alias): - for j, letter_group in enumerate(part_group): - existing_variable_sets[i][j].add(letter_group) - else: - # Create a new entry with the correct structure (list of lists of sets) + if not pattern_builders[resource_type]["templates"][template_key]: new_variable_sets = [] for part_group in variable_parts_from_alias: new_variable_sets.append([set([lg]) for lg in part_group]) - pattern_builders[key][template_key] = new_variable_sets + pattern_builders[resource_type]["templates"][template_key] = new_variable_sets + else: + existing_variable_sets = pattern_builders[resource_type]["templates"][template_key] + for i, part_group in enumerate(variable_parts_from_alias): + for j, letter_group in enumerate(part_group): + while i >= len(existing_variable_sets): + existing_variable_sets.append([]) + while j >= len(existing_variable_sets[i]): + existing_variable_sets[i].append(set()) + existing_variable_sets[i][j].add(letter_group) - # --- Build the final result from 
the processed patterns --- result = [] - for resource_type, templates in pattern_builders.items(): + for resource_type, data in pattern_builders.items(): final_samples = [] - for template_key, collected_vars in templates.items(): - # Create an iterator for the collected letter groups + annotation_type = data["annotation_type"] + for template_key, collected_vars in data["templates"].items(): var_iter: Iterator[list[set[str]]] = iter(collected_vars) def build_segment(segment_template: str) -> str: - # This function rebuilds one segment, substituting 'A's with bracketed alternatives if "A" not in segment_template: return segment_template try: @@ -319,20 +310,25 @@ def build_segment(segment_template: str) -> str: def replace_A(match): alternatives = sorted(list(next(letter_group_iter))) - return f"[{'|'.join(alternatives)}]" + return f"[{'|'.join(alternatives)}]" if len(alternatives) > 1 else alternatives[0] return re.sub(r"A+", replace_A, segment_template) except StopIteration: - return segment_template # Should not happen in normal flow + return segment_template - # Split the full template by delimiters, process each part, then rejoin final_pattern_parts = [ build_segment(p) if p not in " -" else p for p in re.split(r"([ -])", template_key) ] final_samples.append("".join(final_pattern_parts)) if final_samples: - result.append({"sample": sorted(final_samples), "resource_type": resource_type}) + result.append( + { + "sample": sorted(final_samples), + "resource_type": resource_type, + "annotation_type": annotation_type, + } + ) return result def _get_manual_patterns(self, primary_scope: str, secondary_scope: str | None) -> list[dict]: @@ -357,38 +353,40 @@ def _get_manual_patterns(self, primary_scope: str, secondary_scope: str | None) self.logger.info(f"No manual patterns found for keys: {keys_to_fetch}. 
This may be expected.") except Exception as e: self.logger.error(f"Failed to retrieve manual patterns: {e}") - return all_manual_patterns def _merge_patterns(self, auto_patterns: list[dict], manual_patterns: list[dict]) -> list[dict]: """Merges auto-generated and manual patterns, de-duplicating samples.""" - # The structure of manual_patterns is [{"sample": "P-1", "resource_type": "A"}, ...] - # The structure of auto_patterns is [{"sample": ["P-2", "P-3"], "resource_type": "A"}, ...] - - # Use a dictionary with sets for efficient merging and de-duplication - merged = defaultdict(set) + merged = defaultdict(lambda: {"samples": set(), "annotation_type": None}) # Process auto-generated patterns for item in auto_patterns: resource_type = item.get("resource_type") samples = item.get("sample", []) - if resource_type and isinstance(samples, list): - merged[resource_type].update(samples) + annotation_type = item.get("annotation_type") + if resource_type: + merged[resource_type]["samples"].update(samples) + if not merged[resource_type]["annotation_type"]: + merged[resource_type]["annotation_type"] = annotation_type # Process manual patterns for item in manual_patterns: resource_type = item.get("resource_type") sample = item.get("sample") + annotation_type = item.get("annotation_type") if resource_type and sample: - merged[resource_type].add(sample) + merged[resource_type]["samples"].add(sample) + if not merged[resource_type]["annotation_type"]: + merged[resource_type]["annotation_type"] = annotation_type - # Convert the merged dictionary back to the required list format final_list = [ - {"resource_type": resource_type, "sample": sorted(list(samples))} - for resource_type, samples in merged.items() + { + "resource_type": resource_type, + "sample": sorted(list(data["samples"])), + "annotation_type": data["annotation_type"], + } + for resource_type, data in merged.items() ] - self.logger.info( - f"Merged {len(auto_patterns)} auto-patterns and {len(manual_patterns)} manual 
patterns into {len(final_list)} resource types." - ) + self.logger.info(f"Merged auto-generated and manual patterns into {len(final_list)} resource types.") return final_list diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/ConfigService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/ConfigService.py index 7556b016..15f71ede 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/ConfigService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/ConfigService.py @@ -8,6 +8,7 @@ CustomizeFuzziness, DirectionWeights, ) +from cognite.client.data_classes.data_modeling import NodeId from cognite.client.data_classes.filters import Filter from cognite.client import CogniteClient from cognite.client import data_modeling as dm @@ -203,6 +204,7 @@ class RetrieveServiceConfig(BaseModel, alias_generator=to_camel): class ApplyServiceConfig(BaseModel, alias_generator=to_camel): + sink_node: NodeId auto_approval_threshold: float = Field(gt=0.0, le=1.0) auto_suggest_threshold: float = Field(gt=0.0, le=1.0) From c536f07f984bd3c171b911c8b09e48c04a457ccd Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Mon, 6 Oct 2025 09:40:36 -0500 Subject: [PATCH 061/128] updated finalize function to support creation of edges --- .../dependencies.py | 7 - .../fn_file_annotation_finalize/handler.py | 17 +- .../services/ApplyService.py | 352 ++++++++++-------- .../services/ConfigService.py | 2 + .../services/FinalizeService.py | 317 ++++------------ .../services/ReportService.py | 145 -------- 6 files changed, 288 insertions(+), 552 deletions(-) delete mode 100644 modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ReportService.py diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/dependencies.py 
b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/dependencies.py index 4bffc16b..33adf057 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/dependencies.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/dependencies.py @@ -12,7 +12,6 @@ from services.RetrieveService import GeneralRetrieveService from services.ApplyService import GeneralApplyService from services.LoggerService import CogniteFunctionLogger -from services.ReportService import GeneralReportService from services.PipelineService import GeneralPipelineService @@ -101,12 +100,6 @@ def create_general_retrieve_service( return GeneralRetrieveService(client, config, logger) -def create_general_report_service( - client: CogniteClient, config: Config, logger: CogniteFunctionLogger -) -> GeneralReportService: - return GeneralReportService(client, config, logger) - - def create_general_apply_service( client: CogniteClient, config: Config, logger: CogniteFunctionLogger ) -> GeneralApplyService: diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/handler.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/handler.py index 26da2556..63f70303 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/handler.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/handler.py @@ -7,14 +7,12 @@ create_config_service, create_logger_service, create_write_logger_service, - create_general_report_service, create_general_retrieve_service, create_general_apply_service, create_general_pipeline_service, ) from services.FinalizeService import AbstractFinalizeService, GeneralFinalizeService from services.ApplyService import IApplyService -from services.ReportService import IReportService from services.RetrieveService import IRetrieveService from 
services.PipelineService import IPipelineService from utils.DataStructures import PerformanceTracker @@ -44,7 +42,7 @@ def handle(data: dict, function_call_info: dict, client: CogniteClient) -> dict: client, pipeline_ext_id=data["ExtractionPipelineExtId"] ) - finalize_instance, report_instance = _create_finalize_service( + finalize_instance = _create_finalize_service( config_instance, client, logger_instance, tracker_instance, function_call_info ) @@ -61,7 +59,6 @@ def handle(data: dict, function_call_info: dict, client: CogniteClient) -> dict: logger_instance.error(message=msg, section="BOTH") return {"status": run_status, "message": msg} finally: - logger_instance.info(report_instance.update_report()) logger_instance.info(tracker_instance.generate_overall_report(), "BOTH") # only want to report on the count of successful and failed files in ep_logs if there were files that were processed or an error occured # else run log will be too messy @@ -94,7 +91,7 @@ def run_locally(config_file: dict[str, str], log_path: str | None = None): tracker_instance = PerformanceTracker() - finalize_instance, report_instance = _create_finalize_service( + finalize_instance = _create_finalize_service( config_instance, client, logger_instance, @@ -113,8 +110,6 @@ def run_locally(config_file: dict[str, str], log_path: str | None = None): section="BOTH", ) finally: - result = report_instance.update_report() - logger_instance.info(result) logger_instance.info(tracker_instance.generate_overall_report(), "BOTH") logger_instance.close() @@ -142,13 +137,10 @@ def run_locally_parallel( thread_4.join() -def _create_finalize_service( - config, client, logger, tracker, function_call_info -) -> tuple[AbstractFinalizeService, IReportService]: +def _create_finalize_service(config, client, logger, tracker, function_call_info) -> AbstractFinalizeService: """ Instantiate Finalize with interfaces. 
""" - report_instance: IReportService = create_general_report_service(client, config, logger) retrieve_instance: IRetrieveService = create_general_retrieve_service(client, config, logger) apply_instance: IApplyService = create_general_apply_service(client, config, logger) finalize_instance = GeneralFinalizeService( @@ -158,10 +150,9 @@ def _create_finalize_service( tracker=tracker, retrieve_service=retrieve_instance, apply_service=apply_instance, - report_service=report_instance, function_call_info=function_call_info, ) - return finalize_instance, report_instance + return finalize_instance if __name__ == "__main__": diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ApplyService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ApplyService.py index 69867416..a20b996e 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ApplyService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ApplyService.py @@ -14,16 +14,11 @@ Node, NodeId, NodeApply, - NodeApplyResultList, EdgeId, InstancesApplyResult, ) - -from cognite.client.data_classes.filters import ( - In, - Or, -) - +from cognite.client.data_classes.filters import And, Equals, Not +from cognite.client import data_modeling as dm from services.ConfigService import Config, ViewPropertyConfig from utils.DataStructures import DiagramAnnotationStatus @@ -36,11 +31,9 @@ class IApplyService(abc.ABC): """ @abc.abstractmethod - def apply_annotations(self, result_item: dict, file_node: Node) -> tuple[list, list]: - pass - - @abc.abstractmethod - def process_pattern_results(self, result_item: dict, file_node: Node) -> list[RowWrite]: + def process_and_apply_annotations_for_file( + self, file_node: Node, regular_item: dict | None, pattern_item: dict | None, clean_old: bool + ) -> tuple[str, str]: pass @abc.abstractmethod @@ 
-51,18 +44,13 @@ def update_instances( ) -> InstancesApplyResult: pass - @abc.abstractmethod - def delete_annotations_for_file(self, file_id: NodeId) -> tuple[list[str], list[str]]: - pass - class GeneralApplyService(IApplyService): """ - Interface for applying/deleting annotations to a node + Implementation of the ApplyService interface. """ EXTERNAL_ID_LIMIT = 256 - FUNCTION_ID = "fn_file_annotation_finalize" def __init__(self, client: CogniteClient, config: Config, logger: CogniteFunctionLogger): @@ -77,52 +65,196 @@ def __init__(self, client: CogniteClient, config: Config, logger: CogniteFunctio self.approve_threshold = self.config.finalize_function.apply_service.auto_approval_threshold self.suggest_threshold = self.config.finalize_function.apply_service.auto_suggest_threshold - # NOTE: could implement annotation edges to be updated in batches for performance gains but leaning towards no. Since it will over complicate error handling. - def apply_annotations(self, result_item: dict, file_node: Node) -> tuple[list[RowWrite], list[RowWrite]]: + self.sink_node_id = DirectRelationReference( + space=config.finalize_function.apply_service.sink_node.space, + external_id=config.finalize_function.apply_service.sink_node.external_id, + ) + + def process_and_apply_annotations_for_file( + self, file_node: Node, regular_item: dict | None, pattern_item: dict | None, clean_old: bool + ) -> tuple[str, str]: """ - Push the annotations to the file and set the "AnnotationInProcess" tag to "Annotated" + Performs the entire annotation transaction for a single file. 
""" - node_apply: NodeApply = file_node.as_write() - node_apply.existing_version = None + file_id = file_node.as_id() + source_id: str | None = cast(str, file_node.properties[self.file_view_id].get("sourceId")) - tags_property: list[str] = cast(list[str], node_apply.sources[0].properties["tags"]) + # Step 1: Clean old annotations if required + if clean_old: + deleted_counts = self._delete_annotations_for_file(file_id) + self.logger.info( + f"\t- Deleted {deleted_counts['doc']} doc and {deleted_counts['tag']} tag annotations\n\t- Deleted {deleted_counts['pattern']} pattern annotations." + ) - # NOTE: There are cases where the 'annotated' tag is set but a job was queued up again for the file. - # This is because the rate at which the jobs are processed by finalize is slower than the rate at which launch fills up the queue. - # So if the wait time that was set in the extractor config file goes passed the time it takes for the finalize function to get to the job. Annotate will appear in the tags list. 
- if "AnnotationInProcess" in tags_property: - index = tags_property.index("AnnotationInProcess") - tags_property[index] = "Annotated" - elif "Annotated" not in tags_property: - raise ValueError("Annotated and AnnotationInProcess not found in tag property of file node") - source_id: str | None = cast(str, file_node.properties[self.file_view_id].get("sourceId")) - doc_doc, doc_tag = [], [] - edge_applies: list[EdgeApply] = [] - for detect_annotation in result_item["annotations"]: - edge_apply_dict: dict[tuple, EdgeApply] = self._detect_annotation_to_edge_applies( - file_node.as_id(), - source_id, - doc_doc, - doc_tag, - detect_annotation, + # Step 2: Process and apply regular annotations + regular_edges, doc_rows, tag_rows = [], [], [] + if regular_item and regular_item.get("annotations"): + for annotation in regular_item["annotations"]: + edges = self._detect_annotation_to_edge_applies(file_id, source_id, doc_rows, tag_rows, annotation) + regular_edges.extend(edges.values()) + + # Step 3: Process and apply pattern annotations + pattern_edges, pattern_rows = [], [] + if pattern_item and pattern_item.get("annotations"): + pattern_edges, pattern_rows = self._process_pattern_results(pattern_item, file_node) + + # Step 4: Apply all changes in batches + node_apply = file_node.as_write() + node_apply.existing_version = None + tags = cast(list[str], node_apply.sources[0].properties["tags"]) + if "AnnotationInProcess" in tags: + tags[tags.index("AnnotationInProcess")] = "Annotated" + elif "Annotated" not in tags: + raise ValueError("Annotated and AnnotationInProcess not found in tag property") + + self.update_instances(list_node_apply=node_apply, list_edge_apply=regular_edges + pattern_edges) + + if doc_rows: + self.client.raw.rows.insert( + db_name=self.config.finalize_function.report_service.raw_db, + table_name=self.config.finalize_function.report_service.raw_table_doc_doc, + row=doc_rows, + ensure_parent=True, + ) + if tag_rows: + self.client.raw.rows.insert( + 
db_name=self.config.finalize_function.report_service.raw_db, + table_name=self.config.finalize_function.report_service.raw_table_doc_tag, + row=tag_rows, + ensure_parent=True, + ) + if pattern_rows: + self.client.raw.rows.insert( + db_name=self.config.finalize_function.report_service.raw_db, + table_name=self.config.finalize_function.report_service.raw_table_doc_pattern, + row=pattern_rows, + ensure_parent=True, ) - edge_applies.extend(edge_apply_dict.values()) - self.update_instances(list_node_apply=node_apply, list_edge_apply=edge_applies) + annotation_msg = f"Applied {len(doc_rows)} doc and {len(tag_rows)} tag annotations." + pattern_msg = f"Applied {len(pattern_rows)} pattern detections." - return doc_doc, doc_tag + return annotation_msg, pattern_msg def update_instances( self, list_node_apply: list[NodeApply] | NodeApply | None = None, list_edge_apply: list[EdgeApply] | EdgeApply | None = None, ) -> InstancesApplyResult: - update_results: InstancesApplyResult = self.client.data_modeling.instances.apply( - nodes=list_node_apply, - edges=list_edge_apply, - replace=False, # ensures we don't delete other properties in the view - ) - return update_results + return self.client.data_modeling.instances.apply(nodes=list_node_apply, edges=list_edge_apply, replace=False) + + def _delete_annotations_for_file(self, file_id: NodeId) -> dict[str, int]: + """Deletes all standard and pattern edges and their corresponding RAW rows for a file.""" + counts = {"doc": 0, "tag": 0, "pattern": 0} + + # Standard annotations + std_edges = self._list_annotations_for_file(file_id, self.sink_node_id, negate=True) + if std_edges: + edge_ids, doc_keys, tag_keys = [], [], [] + for edge in std_edges: + edge_ids.append(edge.as_id()) + if edge.type.external_id == self.file_annotation_type: + doc_keys.append(edge.external_id) + else: + tag_keys.append(edge.external_id) + + if edge_ids: + self.client.data_modeling.instances.delete(edges=edge_ids) + if doc_keys: + self.client.raw.rows.delete( + 
db_name=self.config.finalize_function.report_service.raw_db, + table_name=self.config.finalize_function.report_service.raw_table_doc_doc, + key=doc_keys, + ) + if tag_keys: + self.client.raw.rows.delete( + db_name=self.config.finalize_function.report_service.raw_db, + table_name=self.config.finalize_function.report_service.raw_table_doc_tag, + key=tag_keys, + ) + counts["doc"], counts["tag"] = len(doc_keys), len(tag_keys) + + # Pattern annotations + pattern_edges = self._list_annotations_for_file(file_id, self.sink_node_id, negate=False) + if pattern_edges: + edge_ids = [edge.as_id() for edge in pattern_edges] + row_keys = [edge.external_id for edge in pattern_edges] + if edge_ids: + self.client.data_modeling.instances.delete(edges=edge_ids) + if row_keys: + self.client.raw.rows.delete( + db_name=self.config.finalize_function.report_service.raw_db, + table_name=self.config.finalize_function.report_service.raw_table_doc_pattern, + key=row_keys, + ) + counts["pattern"] = len(row_keys) + + return counts + + def _process_pattern_results(self, result_item: dict, file_node: Node) -> tuple[list[EdgeApply], list[RowWrite]]: + # ... 
(This method's internal logic remains the same as the previous version) + file_id: NodeId = file_node.as_id() + source_id: str | None = cast(str, file_node.properties[self.file_view_id].get("sourceId")) + + doc_patterns, edge_applies = [], [] + for detect_annotation in result_item["annotations"]: + for entity in detect_annotation.get("entities", []): + external_id = self._create_pattern_annotation_id(file_id, detect_annotation) + now = datetime.now(timezone.utc).replace(microsecond=0) + annotation_type = entity.get( + "annotation_type", self.config.data_model_views.target_entities_view.annotation_type + ) + + annotation_properties = { + "name": file_id.external_id, + "confidence": detect_annotation.get("confidence", 0.0), + "status": DiagramAnnotationStatus.SUGGESTED.value, + "startNodePageNumber": detect_annotation["region"]["page"], + "startNodeXMin": min(v["x"] for v in detect_annotation["region"]["vertices"]), + "startNodeYMin": min(v["y"] for v in detect_annotation["region"]["vertices"]), + "startNodeXMax": max(v["x"] for v in detect_annotation["region"]["vertices"]), + "startNodeYMax": max(v["y"] for v in detect_annotation["region"]["vertices"]), + "startNodeText": detect_annotation["text"], + "sourceCreatedUser": self.FUNCTION_ID, + "sourceUpdatedUser": self.FUNCTION_ID, + "sourceCreatedTime": now.isoformat(), + "sourceUpdatedTime": now.isoformat(), + } + + edge_apply = EdgeApply( + space=file_id.space, + external_id=external_id, + type=DirectRelationReference( + space=self.core_annotation_view_id.space, + external_id=annotation_type, + ), + start_node=DirectRelationReference(space=file_id.space, external_id=file_id.external_id), + end_node=self.sink_node_id, + sources=[ + NodeOrEdgeData( + source=self.core_annotation_view_id, + properties=annotation_properties, + ) + ], + ) + edge_applies.append(edge_apply) + + row_columns = { + "externalId": external_id, + "startSourceId": source_id, + "startNode": file_id.external_id, + "startNodeSpace": file_id.space, + 
"endNode": self.sink_node_id.external_id, + "endNodeSpace": self.sink_node_id.space, + "endNodeResourceType": entity.get("resource_type", "Unknown"), + "viewId": self.core_annotation_view_id.external_id, + "viewSpace": self.core_annotation_view_id.space, + "viewVersion": self.core_annotation_view_id.version, + } + row_columns.update(annotation_properties) + doc_patterns.append(RowWrite(key=external_id, columns=row_columns)) + + return edge_applies, doc_patterns def _detect_annotation_to_edge_applies( self, @@ -133,7 +265,6 @@ def _detect_annotation_to_edge_applies( detect_annotation: dict[str, Any], ) -> dict[tuple, EdgeApply]: - # NOTE: Using a set to ensure uniqueness and solve the duplicate external edge ID problem diagram_annotations: dict[tuple, EdgeApply] = {} annotation_schema_space: str = self.config.data_model_views.core_annotation_view.schema_space @@ -235,122 +366,39 @@ def _create_annotation_id( return prefix[: self.EXTERNAL_ID_LIMIT - 10] + hash_ - def delete_annotations_for_file( - self, - file_id: NodeId, - ) -> tuple[list[str], list[str]]: - """ - Delete all annotation edges for a file node. - - Args: - client (CogniteClient): The Cognite client instance. - annotation_view_id (ViewId): The ViewId of the annotation view. - node (NodeId): The NodeId of the file node. 
- """ - annotations = self._list_annotations_for_file(file_id) - - if not annotations: - return [], [] - - doc_annotations_delete: list[str] = [] - tag_annotations_delete: list[str] = [] - edge_ids = [] - for edge in annotations: - edge_ids.append(EdgeId(space=file_id.space, external_id=edge.external_id)) - if edge.type.external_id == self.file_annotation_type: - doc_annotations_delete.append(edge.external_id) - else: - tag_annotations_delete.append(edge.external_id) - self.client.data_modeling.instances.delete(edges=edge_ids) - - return doc_annotations_delete, tag_annotations_delete - - def process_pattern_results(self, result_item: dict, file_node: Node) -> list[RowWrite]: - if not result_item.get("annotations"): - return [] - if not file_node: - return [] - - file_id: NodeId = file_node.as_id() - source_id: str | None = cast(str, file_node.properties[self.file_view_id].get("sourceId")) - - # Step 1: Group all detections by their text content - # The key is the detected tag text, e.g., "P-101A" - aggregated_detections = {} - - for detect_annotation in result_item["annotations"]: - tag_text = detect_annotation["text"] - - if tag_text not in aggregated_detections: - # Initialize the entry for this tag if it's the first time we've seen it - aggregated_detections[tag_text] = { - "regions": [], - "resource_type": "Unknown", # Default resource_type - } - - # Add the location of the current detection - # The region dict contains page, vertices, etc. 
- aggregated_detections[tag_text]["regions"].append(detect_annotation["region"]) - - # Assume the resource_type is consistent for a given tag text - if "entities" in detect_annotation and detect_annotation["entities"]: - aggregated_detections[tag_text]["resource_type"] = detect_annotation["entities"][0].get( - "resource_type", "Unknown" - ) - - # Step 2: Create one RowWrite object for each unique tag - doc_patterns: list[RowWrite] = [] - for tag_text, data in aggregated_detections.items(): - # The columns for the RAW table row - catalog_properties = { - "startSourceId": source_id, - "startNode": file_id.external_id, - "startNodeSpace": file_id.space, - "text": tag_text, - "resourceType": data["resource_type"], - # Store the entire list of region dicts - # Note: The RAW table will automatically serialize this list of dicts into a JSON string - "regions": data["regions"], - "sourceCreatedUser": self.FUNCTION_ID, - "sourceUpdatedUser": self.FUNCTION_ID, - } + def _create_pattern_annotation_id(self, file_id: NodeId, raw_annotation: dict[str, Any]) -> str: + text = raw_annotation["text"] + hash_ = sha256(json.dumps(raw_annotation, sort_keys=True).encode()).hexdigest()[:10] + prefix = f"pattern:{file_id.external_id}:{text}" - # Create a deterministic key based on the tag text and file - row_key = f"{tag_text}:{file_id.space}:{file_id.external_id}" - row = RowWrite(key=row_key, columns=catalog_properties) - doc_patterns.append(row) + if len(prefix) > self.EXTERNAL_ID_LIMIT - 11: + prefix = prefix[: self.EXTERNAL_ID_LIMIT - 11] - return doc_patterns + return f"{prefix}:{hash_}" - def _list_annotations_for_file( - self, - node_id: NodeId, - ): + def _list_annotations_for_file(self, node_id: NodeId, end_node: DirectRelationReference, negate: bool = False): """ - List all annotation edges for a file node. + List all annotation edges for a file node, optionally filtering by the end node. 
+ """ + start_node_filter = Equals(["edge", "startNode"], {"space": node_id.space, "externalId": node_id.external_id}) + end_node_filter = Equals(["edge", "endNode"], {"space": end_node.space, "externalId": end_node.external_id}) - Args: - client (CogniteClient): The Cognite client instance. - annotation_view_id (ViewId): The ViewId of the annotation view. - node (NodeId): The NodeId of the file node. + if negate: + final_filter = And(start_node_filter, dm.filters.Not(end_node_filter)) + else: + final_filter = And(start_node_filter, end_node_filter) - Returns: - list: A list of edges (annotations) linked to the file node. - """ annotations = self.client.data_modeling.instances.list( instance_type="edge", sources=[self.core_annotation_view_id], space=node_id.space, - filter=Or(In(["edge", "startNode"], [node_id])), + filter=final_filter, limit=-1, ) return annotations def _get_edge_apply_unique_key(self, edge_apply_instance: EdgeApply) -> tuple: - """ - Create a hashable value for EdgeApply objects to use as a key for any hashable collection - """ start_node_key = ( edge_apply_instance.start_node.space, edge_apply_instance.start_node.external_id, diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ConfigService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ConfigService.py index 7556b016..15f71ede 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ConfigService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ConfigService.py @@ -8,6 +8,7 @@ CustomizeFuzziness, DirectionWeights, ) +from cognite.client.data_classes.data_modeling import NodeId from cognite.client.data_classes.filters import Filter from cognite.client import CogniteClient from cognite.client import data_modeling as dm @@ -203,6 +204,7 @@ class RetrieveServiceConfig(BaseModel, 
alias_generator=to_camel): class ApplyServiceConfig(BaseModel, alias_generator=to_camel): + sink_node: NodeId auto_approval_threshold: float = Field(gt=0.0, le=1.0) auto_suggest_threshold: float = Field(gt=0.0, le=1.0) diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/FinalizeService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/FinalizeService.py index f6c09db4..2ee6aed8 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/FinalizeService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/FinalizeService.py @@ -4,13 +4,11 @@ from datetime import datetime, timezone from cognite.client import CogniteClient from cognite.client.exceptions import CogniteAPIError -from cognite.client.data_classes import RowWrite from cognite.client.data_classes.data_modeling import ( Node, NodeId, NodeList, NodeApply, - NodeApplyList, NodeOrEdgeData, ) @@ -18,7 +16,6 @@ from services.LoggerService import CogniteFunctionLogger from services.RetrieveService import IRetrieveService from services.ApplyService import IApplyService -from services.ReportService import IReportService from utils.DataStructures import ( BatchOfNodes, PerformanceTracker, @@ -29,8 +26,6 @@ class AbstractFinalizeService(abc.ABC): """ Orchestrates the file annotation finalize process. - This service retrieves the results of the diagram detect jobs from the launch function and then applies annotations to the file. - Additionally, it captures the file and asset annotations into separate RAW tables. 
""" def __init__( @@ -41,7 +36,6 @@ def __init__( tracker: PerformanceTracker, retrieve_service: IRetrieveService, apply_service: IApplyService, - report_service: IReportService, ): self.client: CogniteClient = client self.config: Config = config @@ -49,7 +43,6 @@ def __init__( self.tracker: PerformanceTracker = tracker self.retrieve_service: IRetrieveService = retrieve_service self.apply_service: IApplyService = apply_service - self.report_service: IReportService = report_service @abc.abstractmethod def run(self) -> str | None: @@ -58,9 +51,7 @@ def run(self) -> str | None: class GeneralFinalizeService(AbstractFinalizeService): """ - Orchestrates the file annotation finalize process. - This service retrieves the results of the diagram detect jobs from the launch function and then applies annotations to the file. - Additionally, it captures the file and asset annotations into separate RAW tables. + Implementation of the FinalizeService. """ def __init__( @@ -71,7 +62,6 @@ def __init__( tracker: PerformanceTracker, retrieve_service: IRetrieveService, apply_service: IApplyService, - report_service: IReportService, function_call_info: dict, ): super().__init__( @@ -81,58 +71,26 @@ def __init__( tracker, retrieve_service, apply_service, - report_service, ) self.annotation_state_view: ViewPropertyConfig = config.data_model_views.annotation_state_view self.file_view: ViewPropertyConfig = config.data_model_views.file_view - self.page_range: int = config.launch_function.annotation_service.page_range self.max_retries: int = config.finalize_function.max_retry_attempts self.clean_old_annotations: bool = config.finalize_function.clean_old_annotations - self.function_id: int | None = function_call_info.get("function_id") self.call_id: int | None = function_call_info.get("call_id") def run(self) -> Literal["Done"] | None: - """ - Retrieves the result of a diagram detect job and then pushes the annotation to mpcFile. - Specifically, - 1. 
Get a unique jobId and all instances of mpcAnnotationState that share that jobId - 2. If an error occurs - - Retrieve another job - 3. If no error occurs - - Continue - 4. Check the status of the job - 5. If the job is complete - - Iterate through all items in the diagram detect job results push the annotation to mpcFile - 6. If a file does have annotations - - Push the annotations to the file - - Update status of FileAnnotationState to "Annotated" - - Add annotations to the annotations report - 7. If a file doesn't have any annotations or an error occurs - - Update status of mpcAnnotationState to "Retry" or "Fail" - 8. If the job isn't complete - - Update status of FileAnnotationState to "Processing" - - End the run - """ - self.logger.info( - message="Starting Finalize Function", - section="START", - ) + self.logger.info("Starting Finalize Function", section="START") try: job_id, pattern_mode_job_id, file_to_state_map = self.retrieve_service.get_job_id() if not job_id or not file_to_state_map: - self.logger.info(message="No diagram detect jobs found", section="END") + self.logger.info("No diagram detect jobs found", section="END") return "Done" - else: - self.logger.info( - message=f"Retrieved job id ({job_id}) and claimed {len(file_to_state_map.values())} files" - ) + self.logger.info(f"Retrieved job id ({job_id}) and claimed {len(file_to_state_map.values())} files") except CogniteAPIError as e: - # NOTE: Reliant on the CogniteAPI message to stay the same across new releases. If unexpected changes were to occur please refer to this section of the code and check if error message is now different. if e.code == 400 and e.message == "A version conflict caused the ingest to fail.": - # NOTE: Expected behavior. Means jobs has been claimed already. self.logger.info( message=f"Retrieved job id that has already been claimed. 
Grabbing another job.", section="END", @@ -142,7 +100,6 @@ def run(self) -> Literal["Done"] | None: e.code == 408 and e.message == "Graph query timed out. Reduce load or contention, or optimise your query." ): - # NOTE: 408 indicates a timeout error. Keep retrying the query if a timeout occurs. self.logger.error(message=f"Ran into the following error:\n{str(e)}", section="END") return else: @@ -165,9 +122,17 @@ def run(self) -> Literal["Done"] | None: failed=True, ) - if not job_results or not pattern_mode_job_results: + # A job is considered complete if: + # 1. The main job is finished, AND + # 2. EITHER pattern mode was not enabled (no pattern job ID) + # OR pattern mode was enabled AND its job is also finished. + jobs_complete: bool = job_results is not None and ( + not pattern_mode_job_id or pattern_mode_job_results is not None + ) + + if not jobs_complete: self.logger.info( - message=f"Unfinalizing {len(file_to_state_map.keys())} files - job id ({job_id}) and/or pattern id ({pattern_mode_job_id} not complete)", + message=f"Unfinalizing {len(file_to_state_map.keys())} files - job id ({job_id}) and/or pattern id ({pattern_mode_job_id}) not complete", section="END", ) self._update_batch_state( @@ -179,192 +144,117 @@ def run(self) -> Literal["Done"] | None: return self.logger.info( - message=f"Both jobs ({job_id}, {pattern_mode_job_id}) complete. Applying all annotations.", - section="END", + f"Both jobs ({job_id}, {pattern_mode_job_id}) complete. Applying all annotations.", section="END" ) - # NOTE: Merge the results by file ID for easier processing - # Ensures that for each job, both the regular annotations and its pattern results will be updated within the same transaction. - # This prevents a scenario where the regular annotation is successfully processed but an error occurs before the pattern results are successfully processed. - # That would leave the file in a partially completed state. 
merged_results = { (item["fileInstanceId"]["space"], item["fileInstanceId"]["externalId"]): {"regular": item} for item in job_results["items"] } - if pattern_mode_job_results: for item in pattern_mode_job_results["items"]: - # FIX: Use the same tuple format for the key when adding pattern results. key = (item["fileInstanceId"]["space"], item["fileInstanceId"]["externalId"]) if key in merged_results: merged_results[key]["pattern"] = item else: merged_results[key] = {"pattern": item} - count_retry = 0 - count_failed = 0 - count_success = 0 - annotation_state_node_applies: list[NodeApply] = [] - failed_files: NodeList[Node] = NodeList(resources=[]) + count_retry, count_failed, count_success = 0, 0, 0 + annotation_state_node_applies = [] - # Loop through the merged results, processing one file at a time for (space, external_id), results in merged_results.items(): - file_id: NodeId = NodeId(space, external_id) - file_node: Node | None = self.client.data_modeling.instances.retrieve_nodes( + file_id = NodeId(space, external_id) + file_node = self.client.data_modeling.instances.retrieve_nodes( nodes=file_id, sources=self.file_view.as_view_id() ) if not file_node: - self.logger.debug(f"No file node found for file id {str(file_id)}") continue - annotation_state_node: Node = file_to_state_map[file_id] - current_attempt_count: int = cast( - int, - annotation_state_node.properties[self.annotation_state_view.as_view_id()]["attemptCount"], + annotation_state_node = file_to_state_map[file_id] + current_attempt = cast( + int, annotation_state_node.properties[self.annotation_state_view.as_view_id()]["attemptCount"] ) - next_attempt_count = current_attempt_count + 1 - job_node_to_update: NodeApply | None = None + next_attempt = current_attempt + 1 try: - # Process Regular Annotations - regular_item = results.get("regular") - if regular_item and regular_item.get("annotations"): - self.logger.info(f"Applying annotations to file {str(file_id)}") - # NOTE: Only clean annotations on the 
very first run as to not delete past annotations for multi-page files - if self.clean_old_annotations and ( - annotation_state_node.properties[self.annotation_state_view.as_view_id()].get( - "annotatedPageCount" - ) - is None - ): - # This should only run once, so we tie it to the regular annotation processing - doc_delete, tag_delete = self.apply_service.delete_annotations_for_file(file_id) - self.report_service.delete_annotations(doc_delete, tag_delete) - self.logger.info(f"\t- Deleted {len(doc_delete)} doc and {len(tag_delete)} tag annotations.") - - doc_add, tag_add = self.apply_service.apply_annotations(regular_item, file_node) - self.report_service.add_annotations(doc_rows=doc_add, tag_rows=tag_add) - annotation_msg: str = f"Applied {len(doc_add)} doc and {len(tag_add)} tag annotations." - self.logger.info(f"\t- {annotation_msg}") - elif regular_item and regular_item.get("errorMessage"): - raise Exception(regular_item.get("errorMessage")) - else: - annotation_msg: str = "Found no annotations to apply" - - # Process Pattern Mode Annotations - pattern_item = results.get("pattern") - if pattern_item and pattern_item.get("annotations"): - self.logger.info(f"Processing pattern mode results for file {str(file_id)}") - # responsible for converting pattern results into RAW rows and adding them to its internal batch for later upload. - pattern_add: list[RowWrite] = self.apply_service.process_pattern_results(pattern_item, file_node) - self.report_service.add_pattern_tags(pattern_rows=pattern_add) - pattern_msg: str = f"Found {len(pattern_item['annotations'])} pattern annotations." 
- self.logger.info(f"\t- {pattern_msg}") - elif pattern_item and pattern_item.get("errorMessage"): - pattern_msg = pattern_item.get("errorMessage") - else: - pattern_msg: str = "Found no tags from pattern samples" + self.logger.info(f"Processing file {file_id}:") + annotation_msg, pattern_msg = self.apply_service.process_and_apply_annotations_for_file( + file_node, + results.get("regular"), + results.get("pattern"), + self.clean_old_annotations + and annotation_state_node.properties[self.annotation_state_view.as_view_id()].get( + "annotatedPageCount" + ) + is None, + ) + self.logger.info(f"\t- {annotation_msg}\n\t- {pattern_msg}") - # Determine Final State - if regular_item and regular_item.get("pageCount"): - page_count: int = regular_item["pageCount"] - annotated_page_count: int = self._check_all_pages_annotated(annotation_state_node, page_count) - else: - page_count = 1 - annotated_page_count = page_count + # Logic to handle multi-page files + page_count = results.get("regular", {}).get("pageCount", 1) + annotated_pages = self._check_all_pages_annotated(annotation_state_node, page_count) - if annotated_page_count == page_count: + if annotated_pages == page_count: job_node_to_update = self._process_annotation_state( - node=annotation_state_node, - status=AnnotationStatus.ANNOTATED, - attempt_count=next_attempt_count, - annotated_page_count=annotated_page_count, - page_count=page_count, - annotation_message=annotation_msg, - pattern_mode_message=pattern_msg, + annotation_state_node, + AnnotationStatus.ANNOTATED, + next_attempt, + annotated_pages, + page_count, + annotation_msg, + pattern_msg, ) count_success += 1 else: - # File has more pages to process job_node_to_update = self._process_annotation_state( - node=annotation_state_node, - status=AnnotationStatus.NEW, - attempt_count=current_attempt_count, # Do not increment attempt count - annotated_page_count=annotated_page_count, - page_count=page_count, - annotation_message="Processed page batch, more pages 
remaining", - pattern_mode_message=pattern_msg, + annotation_state_node, + AnnotationStatus.NEW, + current_attempt, + annotated_pages, + page_count, + "Processed page batch, more pages remaining", + pattern_msg, ) - # This is still a success for the current batch - count_success += 1 + count_success += 1 # Still a success for this batch except Exception as e: - # If anything fails for this file, mark it for retry or failure - self.logger.error(f"Failed to process annotations for file {str(file_id)}: {str(e)}") - if next_attempt_count >= self.max_retries: + self.logger.error(f"Failed to process annotations for file {file_id}: {e}") + if next_attempt >= self.max_retries: job_node_to_update = self._process_annotation_state( - node=annotation_state_node, - status=AnnotationStatus.FAILED, - attempt_count=next_attempt_count, + annotation_state_node, + AnnotationStatus.FAILED, + next_attempt, annotation_message=str(e), pattern_mode_message=str(e), ) count_failed += 1 - failed_files.append(file_node) else: job_node_to_update = self._process_annotation_state( - node=annotation_state_node, - status=AnnotationStatus.RETRY, - attempt_count=next_attempt_count, + annotation_state_node, + AnnotationStatus.RETRY, + next_attempt, annotation_message=str(e), pattern_mode_message=str(e), ) count_retry += 1 - if job_node_to_update: - annotation_state_node_applies.append(job_node_to_update) - - if failed_files: - file_applies: NodeApplyList = failed_files.as_write() - for node_apply in file_applies: - node_apply.existing_version = None - tags_property: list[str] = cast(list[str], node_apply.sources[0].properties["tags"]) - if "AnnotationInProcess" in tags_property: - index = tags_property.index("AnnotationInProcess") - tags_property[index] = "AnnotationFailed" - elif "Annotated" in tags_property: - self.logger.debug( - f"Annotated is in the tags property of {node_apply.as_id()}\nTherefore, this set of pages does not contain any annotations while the prior pages do" - ) - elif 
"AnnotationFailed" not in tags_property: - self.logger.error( - f"AnnotationFailed and AnnotationInProcess not found in tag property of {node_apply.as_id()}" - ) - try: - self.client.data_modeling.instances.apply(nodes=file_applies, replace=False) - except CogniteAPIError as e: - self.logger.error(f"Ran into the following error:\n\t{str(e)}\nTrying again in 30 seconds") - time.sleep(30) - self.client.data_modeling.instances.apply(nodes=file_applies, replace=False) + annotation_state_node_applies.append(job_node_to_update) + # Batch update the state nodes at the end if annotation_state_node_applies: - node_count = len(annotation_state_node_applies) self.logger.info( - message=f"Updating {node_count} annotation state instances", - section="START", + f"Updating {len(annotation_state_node_applies)} annotation state instances", section="START" ) try: self.apply_service.update_instances(list_node_apply=annotation_state_node_applies) self.logger.info( - f"\t- {count_success} set to Annotated\n- {count_retry} set to retry\n- {count_failed} set to failed" + f"\t- {count_success} set to Annotated/New\n\t- {count_retry} set to Retry\n\t- {count_failed} set to Failed" ) except Exception as e: - self.logger.error( - message=f"Error during batch update of individual annotation states: \n{e}", - section="END", - ) + self.logger.error(f"Error during batch update of annotation states: {e}", section="END") self.tracker.add_files(success=count_success, failed=(count_failed + count_retry)) + return None def _process_annotation_state( self, @@ -378,48 +268,24 @@ def _process_annotation_state( ) -> NodeApply: """ Create a node apply from the node passed into the function. - The annotatedPageCount and pageCount properties won't be set if this is the first time the job has been run for the specific node. - Thus, we set it here and include logic to handle the scneario where it is set. - NOTE: Always want to use the latest page count from the diagram detect results - e.g.) 
let page_range = 50 - - If the pdf has less than 50 pages, say 3 pages, then... - - annotationStatus property will get set to 'complete' - - annotatedPageCount and pageCount properties will be set to 3. - - Elif the pdf has more than 50 pages, say 80, then... - - annotationStatus property will get set to 'new' - - annotatedPageCount set to 50 - - pageCount set to 80 - - attemptCount doesn't get incremented - - If an error occurs, the annotated_page_count and page_count won't be passed - - Don't want to touch the pageCount and annotatedPageCount properties in this scenario """ - if not annotated_page_count or not page_count: - update_properties = { - "annotationStatus": status, - "sourceUpdatedTime": datetime.now(timezone.utc).replace(microsecond=0).isoformat(), - "annotationMessage": annotation_message, - "patternModeMessage": pattern_mode_message, - "attemptCount": attempt_count, - "finalizeFunctionId": self.function_id, - "finalizeFunctionCallId": self.call_id, - } - else: - update_properties = { - "annotationStatus": status, - "sourceUpdatedTime": datetime.now(timezone.utc).replace(microsecond=0).isoformat(), - "annotationMessage": annotation_message, - "patternModeMessage": pattern_mode_message, - "attemptCount": attempt_count, - "annotatedPageCount": annotated_page_count, - "pageCount": page_count, - "finalizeFunctionId": self.function_id, - "finalizeFunctionCallId": self.call_id, - } + update_properties = { + "annotationStatus": status, + "sourceUpdatedTime": datetime.now(timezone.utc).replace(microsecond=0).isoformat(), + "annotationMessage": annotation_message, + "patternModeMessage": pattern_mode_message, + "attemptCount": attempt_count, + "finalizeFunctionId": self.function_id, + "finalizeFunctionCallId": self.call_id, + } + if annotated_page_count and page_count: + update_properties["annotatedPageCount"] = annotated_page_count + update_properties["pageCount"] = page_count node_apply = NodeApply( space=node.space, external_id=node.external_id, - 
existing_version=None, # update the node regardless of existing version + existing_version=None, sources=[ NodeOrEdgeData( source=self.annotation_state_view.as_view_id(), @@ -432,18 +298,7 @@ def _process_annotation_state( def _check_all_pages_annotated(self, node: Node, page_count: int) -> int: """ - The annotatedPageCount and pageCount properties won't be set if this is the first time the job has been run for the specific node. - - - if annotated_page_count is not set (first run): - - if page_range >= to the page count: - - annotated_page_count = page_count b/c all of the pages were passed into the FileReference during LaunchService - - else: - - annotated_page_count = page_range b/c there are more pages to annotate - - else the annotation_page_count property is set: - - if (annotated_page_count + page_range) >= page_count: - - annotated_page_count = page_count b/c all of the pages were passed into the FileReference during LaunchService - else: - - annotated_page_count = self.page_range + annotated_page_count b/c there are more pages to annotate + Checks if all pages have been annotated and returns the new annotated page count. """ annotated_page_count: int | None = cast( int, @@ -474,14 +329,6 @@ def _update_batch_state( ): """ Updates the properties of FileAnnnotationState - 1. If failed set to True - - update the status and delete the diagram detect jobId of the nodes - 2. 
If there's an annoatation message and attempt count - - if status is "Processing": - - Update the status of the nodes - - Set 'sourceUpdateTime' to the time it was claimed so that the jobs first in line for pickup again - - else: - - Update the status of the nodes """ if len(batch.nodes) == 0: return diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ReportService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ReportService.py deleted file mode 100644 index 1f3243b9..00000000 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ReportService.py +++ /dev/null @@ -1,145 +0,0 @@ -import abc - -from cognite.client import CogniteClient -from cognite.client.data_classes import RowWrite - -from services.ConfigService import Config -from services.LoggerService import CogniteFunctionLogger - - -class IReportService(abc.ABC): - """ - Interface for reporting the annotations that have been applied - e.g.) Used as the numerator for annotation link rate at Marathon - """ - - @abc.abstractmethod - def add_annotations(self, doc_rows: list[RowWrite], tag_rows: list[RowWrite]) -> None: - pass - - @abc.abstractmethod - def add_pattern_tags(self, pattern_rows: list[RowWrite]) -> None: - pass - - @abc.abstractmethod - def delete_annotations(self, doc_row_keys: list[str], tag_row_keys: list[str]) -> None: - pass - - @abc.abstractmethod - def update_report(self) -> str: - pass - - -class GeneralReportService(IReportService): - """ - Interface for reporting the annotations that have been applied - e.g.) 
Used as the numerator for annotation link rate at Marathon - """ - - def __init__(self, client: CogniteClient, config: Config, logger: CogniteFunctionLogger): - self.client = client - self.config = config - self.logger = logger - - self.db: str = config.finalize_function.report_service.raw_db - self.doc_table: tuple[str, list[RowWrite], list[str]] = ( - config.finalize_function.report_service.raw_table_doc_doc, - [], # NOTE: rows to upload - [], # NOTE: rows to delete -> holds list of keys - ) - self.tag_table: tuple[str, list[RowWrite], list[str]] = ( - config.finalize_function.report_service.raw_table_doc_tag, - [], # NOTE: rows to upload - [], # NOTE: rows to delete -> holds list of keys - ) - self.pattern_table: tuple[str, list[RowWrite], list[str]] = ( - config.finalize_function.report_service.raw_table_doc_pattern, - [], # NOTE: rows to upload - [], # TODO: figure out best way of implementing this. Hard to generate deterministic key without effecting performance. No edges to retrieve and delete like in clean old annotations function. - ) - self.batch_size: int = config.finalize_function.report_service.raw_batch_size - self.delete: bool = self.config.finalize_function.clean_old_annotations - - def add_annotations(self, doc_rows: list[RowWrite], tag_rows: list[RowWrite]) -> None: - """ - NOTE: Using batch size to ensure that we're writing to raw efficiently. IMO report doesn't need to be pushed to raw at the end of every diagram detect job. - Though we don't want to be too efficient to where we lose out on data in case anything happens to the thread. Thus this balances efficiency with data secureness. - Updating report at the end of every job with 50 files that's processed leads to around 15 seconds of additional time added. - Thus, for 61,000 files / 50 files per job = 1220 jobs * 15 seconds added = 18300 seconds = 305 minutes saved by writing to RAW more efficiently. 
- """ - self.doc_table[1].extend(doc_rows) - self.tag_table[1].extend(tag_rows) - if len(self.doc_table[1]) + len(self.tag_table[1]) > self.batch_size: - msg = self.update_report() - self.logger.info(f"{msg}", "BOTH") - return - - def delete_annotations( - self, - doc_row_keys: list[str], - tag_row_keys: list[str], - ) -> None: - self.doc_table[2].extend(doc_row_keys) - self.tag_table[2].extend(tag_row_keys) - return - - def update_report(self) -> str: - """ - Upload annotation edges to RAW for reporting. - If clean old annotations is set to true, delete the rows before uploading the rows in RAW. - NOTE: tuple meaning -> self.doc_table[0] = tbl_name, [1] = rows to upload, [2] = keys of the rows to delete - """ - delete_msg = None - if self.delete: - self.client.raw.rows.delete( - db_name=self.db, - table_name=self.doc_table[0], - key=self.doc_table[2], - ) - self.client.raw.rows.delete( - db_name=self.db, - table_name=self.tag_table[0], - key=self.tag_table[2], - ) - delete_msg = f"Deleted annotations from db: {self.db}\n- deleted {len(self.doc_table[2])} rows from tbl: {self.doc_table[0]}\n- deleted {len(self.tag_table[2])} rows from tbl: {self.tag_table[0]}" - - update_msg = "No annotations to upload" - if len(self.doc_table[1]) > 0 or len(self.tag_table[1]) > 0: - update_msg = f"Uploaded annotations to db: {self.db}\n- added {len(self.doc_table[1])} rows to tbl: {self.doc_table[0]}\n- added {len(self.tag_table[1])} rows to tbl: {self.tag_table[0]}\n- added {len(self.pattern_table[1])} rows to tbl: {self.pattern_table[0]}" - self.client.raw.rows.insert( - db_name=self.db, - table_name=self.doc_table[0], - row=self.doc_table[1], - ensure_parent=True, - ) - self.client.raw.rows.insert( - db_name=self.db, - table_name=self.tag_table[0], - row=self.tag_table[1], - ensure_parent=True, - ) - if self.pattern_table[1]: - self.client.raw.rows.insert( - db_name=self.db, - table_name=self.pattern_table[0], - row=self.pattern_table[1], - ensure_parent=True, - ) - 
self._clear_tables() - - if delete_msg: - return f" {delete_msg}\n{update_msg}" - return f" {update_msg}" - - def add_pattern_tags(self, pattern_rows: list[RowWrite]): - self.pattern_table[1].extend(pattern_rows) - return - - def _clear_tables(self) -> None: - self.doc_table[1].clear() - self.tag_table[1].clear() - self.pattern_table[1].clear() - if self.delete: - self.doc_table[2].clear() - self.tag_table[2].clear() - # self.pattern_table[2].clear() # TODO: figure out best approach From 20a9e46ff91d06cb02bb3fe378ee2c4ba12a662c Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Mon, 6 Oct 2025 13:16:27 -0500 Subject: [PATCH 062/128] added new instance space for holding cogniteDiagramAnnotation instances --- .../data_models/canvas.node.yaml | 14 ++++++++++- .../data_models/hdm.space.yaml | 3 +++ .../cdf_file_annotation/default.config.yaml | 2 ++ .../ep_file_annotation.config.yaml | 3 +++ .../services/ApplyService.py | 24 +++++++++++++------ .../utils/DataStructures.py | 1 + .../utils/DataStructures.py | 1 + 7 files changed, 40 insertions(+), 8 deletions(-) diff --git a/modules/contextualization/cdf_file_annotation/data_models/canvas.node.yaml b/modules/contextualization/cdf_file_annotation/data_models/canvas.node.yaml index 1abfdeb0..1c46051c 100644 --- a/modules/contextualization/cdf_file_annotation/data_models/canvas.node.yaml +++ b/modules/contextualization/cdf_file_annotation/data_models/canvas.node.yaml @@ -9,4 +9,16 @@ properties: name: 'File Annotations' description: 'Label is used by canvases generated by the file annotation streamlit module. Can be used for any canvas related to file annotatons.' 
- color: Green # NOTE: can't seem to get this working \ No newline at end of file + color: Green # NOTE: can't seem to get this working + +- space: {{patternModeInstanceSpace}} + externalId: {{patternDetectSink}} + sources: + - source: + space: cdf_cdm + externalId: CogniteFile # Using CogniteFile as a base type for simplicity + version: 'v1' + type: view + properties: + name: 'Pattern Detection Sink Node' + description: 'A single, static node used as the end target for all pattern detection edges. The actual detection details are stored on the edge itself.' \ No newline at end of file diff --git a/modules/contextualization/cdf_file_annotation/data_models/hdm.space.yaml b/modules/contextualization/cdf_file_annotation/data_models/hdm.space.yaml index ac152224..51e2d2c1 100644 --- a/modules/contextualization/cdf_file_annotation/data_models/hdm.space.yaml +++ b/modules/contextualization/cdf_file_annotation/data_models/hdm.space.yaml @@ -1,3 +1,6 @@ - description: Helper data model space name: {{ annotationStateSchemaSpace }} space: {{ annotationStateSchemaSpace }} +- description: Pattern mode results instance space + name: {{ patternModeInstanceSpace }} + space: {{ patternModeInstanceSpace }} \ No newline at end of file diff --git a/modules/contextualization/cdf_file_annotation/default.config.yaml b/modules/contextualization/cdf_file_annotation/default.config.yaml index 5eab5036..9389ec03 100644 --- a/modules/contextualization/cdf_file_annotation/default.config.yaml +++ b/modules/contextualization/cdf_file_annotation/default.config.yaml @@ -5,6 +5,8 @@ annotationDatasetExternalId: ds_file_annotation annotationStateExternalId: FileAnnotationState annotationStateSchemaSpace: sp_hdm #NOTE: stands for space helper data model annotationStateVersion: v1.0.0 +patternModeInstanceSpace: sp_dat_pattern_mode_results +patternDetectSink: pattern_detection_sink_node fileSchemaSpace: fileInstanceSpace: fileExternalId: diff --git 
a/modules/contextualization/cdf_file_annotation/extraction_pipelines/ep_file_annotation.config.yaml b/modules/contextualization/cdf_file_annotation/extraction_pipelines/ep_file_annotation.config.yaml index 0ae7999e..17e43f98 100644 --- a/modules/contextualization/cdf_file_annotation/extraction_pipelines/ep_file_annotation.config.yaml +++ b/modules/contextualization/cdf_file_annotation/extraction_pipelines/ep_file_annotation.config.yaml @@ -120,6 +120,9 @@ config: operator: Exists targetProperty: diagramDetectJobId applyService: + sinkNode: + space: {{ patternModeInstanceSpace }} + externalId: {{patternDetectSink}} autoApprovalThreshold: 1.0 autoSuggestThreshold: 1.0 reportService: diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ApplyService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ApplyService.py index a20b996e..8e2c4287 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ApplyService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ApplyService.py @@ -65,7 +65,7 @@ def __init__(self, client: CogniteClient, config: Config, logger: CogniteFunctio self.approve_threshold = self.config.finalize_function.apply_service.auto_approval_threshold self.suggest_threshold = self.config.finalize_function.apply_service.auto_suggest_threshold - self.sink_node_id = DirectRelationReference( + self.sink_node_id = NodeId( space=config.finalize_function.apply_service.sink_node.space, external_id=config.finalize_function.apply_service.sink_node.external_id, ) @@ -192,13 +192,18 @@ def _delete_annotations_for_file(self, file_id: NodeId) -> dict[str, int]: return counts def _process_pattern_results(self, result_item: dict, file_node: Node) -> tuple[list[EdgeApply], list[RowWrite]]: - # ... 
(This method's internal logic remains the same as the previous version) file_id: NodeId = file_node.as_id() source_id: str | None = cast(str, file_node.properties[self.file_view_id].get("sourceId")) doc_patterns, edge_applies = [], [] for detect_annotation in result_item["annotations"]: for entity in detect_annotation.get("entities", []): + if detect_annotation["confidence"] >= self.approve_threshold: + annotation_status = DiagramAnnotationStatus.APPROVED.value + elif detect_annotation["confidence"] >= self.suggest_threshold: + annotation_status = DiagramAnnotationStatus.SUGGESTED.value + else: + continue external_id = self._create_pattern_annotation_id(file_id, detect_annotation) now = datetime.now(timezone.utc).replace(microsecond=0) annotation_type = entity.get( @@ -207,8 +212,8 @@ def _process_pattern_results(self, result_item: dict, file_node: Node) -> tuple[ annotation_properties = { "name": file_id.external_id, - "confidence": detect_annotation.get("confidence", 0.0), - "status": DiagramAnnotationStatus.SUGGESTED.value, + "confidence": detect_annotation["confidence"], + "status": annotation_status, "startNodePageNumber": detect_annotation["region"]["page"], "startNodeXMin": min(v["x"] for v in detect_annotation["region"]["vertices"]), "startNodeYMin": min(v["y"] for v in detect_annotation["region"]["vertices"]), @@ -222,14 +227,17 @@ def _process_pattern_results(self, result_item: dict, file_node: Node) -> tuple[ } edge_apply = EdgeApply( - space=file_id.space, + # NOTE: Don't want to store the edge in the instance space that the files are in. Should prevent these edges from being seen when using the location configurations ~ e.g. 
the search and canvas UI + space=self.sink_node_id.space, external_id=external_id, type=DirectRelationReference( space=self.core_annotation_view_id.space, external_id=annotation_type, ), start_node=DirectRelationReference(space=file_id.space, external_id=file_id.external_id), - end_node=self.sink_node_id, + end_node=DirectRelationReference( + space=self.sink_node_id.space, external_id=self.sink_node_id.external_id + ), sources=[ NodeOrEdgeData( source=self.core_annotation_view_id, @@ -385,13 +393,15 @@ def _list_annotations_for_file(self, node_id: NodeId, end_node: DirectRelationRe if negate: final_filter = And(start_node_filter, dm.filters.Not(end_node_filter)) + space = node_id.space else: + space = self.sink_node_id.space final_filter = And(start_node_filter, end_node_filter) annotations = self.client.data_modeling.instances.list( instance_type="edge", sources=[self.core_annotation_view_id], - space=node_id.space, + space=space, filter=final_filter, limit=-1, ) diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/utils/DataStructures.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/utils/DataStructures.py index e7eff7a8..f1db22d9 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/utils/DataStructures.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/utils/DataStructures.py @@ -31,6 +31,7 @@ class EnvConfig: class DiagramAnnotationStatus(str, Enum): SUGGESTED = "Suggested" APPROVED = "Approved" + REJECTED = "Rejected" class AnnotationStatus(str, Enum): diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/utils/DataStructures.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/utils/DataStructures.py index e7eff7a8..f1db22d9 100644 --- 
a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/utils/DataStructures.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/utils/DataStructures.py @@ -31,6 +31,7 @@ class EnvConfig: class DiagramAnnotationStatus(str, Enum): SUGGESTED = "Suggested" APPROVED = "Approved" + REJECTED = "Rejected" class AnnotationStatus(str, Enum): From 0eef9d8abee943fc745f32b77cc1e8f9131b2f1c Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Mon, 6 Oct 2025 17:57:39 -0500 Subject: [PATCH 063/128] only capture non-duplicate edges between pattern mode and regular diagram detect --- .../services/ApplyService.py | 325 +++++++----------- 1 file changed, 132 insertions(+), 193 deletions(-) diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ApplyService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ApplyService.py index 8e2c4287..bf16a0e2 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ApplyService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ApplyService.py @@ -26,10 +26,6 @@ class IApplyService(abc.ABC): - """ - Interface for applying/deleting annotations to a node - """ - @abc.abstractmethod def process_and_apply_annotations_for_file( self, file_node: Node, regular_item: dict | None, pattern_item: dict | None, clean_old: bool @@ -46,10 +42,6 @@ def update_instances( class GeneralApplyService(IApplyService): - """ - Implementation of the ApplyService interface. 
- """ - EXTERNAL_ID_LIMIT = 256 FUNCTION_ID = "fn_file_annotation_finalize" @@ -57,15 +49,12 @@ def __init__(self, client: CogniteClient, config: Config, logger: CogniteFunctio self.client: CogniteClient = client self.config: Config = config self.logger: CogniteFunctionLogger = logger - - self.core_annotation_view_id: ViewId = self.config.data_model_views.core_annotation_view.as_view_id() - self.file_view_id: ViewId = self.config.data_model_views.file_view.as_view_id() + self.core_annotation_view_id: ViewId = config.data_model_views.core_annotation_view.as_view_id() + self.file_view_id: ViewId = config.data_model_views.file_view.as_view_id() self.file_annotation_type = config.data_model_views.file_view.annotation_type - - self.approve_threshold = self.config.finalize_function.apply_service.auto_approval_threshold - self.suggest_threshold = self.config.finalize_function.apply_service.auto_suggest_threshold - - self.sink_node_id = NodeId( + self.approve_threshold = config.finalize_function.apply_service.auto_approval_threshold + self.suggest_threshold = config.finalize_function.apply_service.auto_suggest_threshold + self.sink_node_ref = DirectRelationReference( space=config.finalize_function.apply_service.sink_node.space, external_id=config.finalize_function.apply_service.sink_node.external_id, ) @@ -73,82 +62,77 @@ def __init__(self, client: CogniteClient, config: Config, logger: CogniteFunctio def process_and_apply_annotations_for_file( self, file_node: Node, regular_item: dict | None, pattern_item: dict | None, clean_old: bool ) -> tuple[str, str]: - """ - Performs the entire annotation transaction for a single file. 
- """ file_id = file_node.as_id() - source_id: str | None = cast(str, file_node.properties[self.file_view_id].get("sourceId")) + source_id = cast(str, file_node.properties.get(self.file_view_id, {}).get("sourceId")) - # Step 1: Clean old annotations if required if clean_old: deleted_counts = self._delete_annotations_for_file(file_id) self.logger.info( - f"\t- Deleted {deleted_counts['doc']} doc and {deleted_counts['tag']} tag annotations\n\t- Deleted {deleted_counts['pattern']} pattern annotations." + f"\t- Deleted {deleted_counts['doc']} doc, {deleted_counts['tag']} tag, and {deleted_counts['pattern']} pattern annotations." ) - # Step 2: Process and apply regular annotations + # Step 1: Process regular annotations and collect their stable hashes regular_edges, doc_rows, tag_rows = [], [], [] + processed_hashes = set() if regular_item and regular_item.get("annotations"): for annotation in regular_item["annotations"]: + stable_hash = self._create_stable_hash(annotation) + processed_hashes.add(stable_hash) edges = self._detect_annotation_to_edge_applies(file_id, source_id, doc_rows, tag_rows, annotation) regular_edges.extend(edges.values()) - # Step 3: Process and apply pattern annotations + # Step 2: Process pattern annotations, skipping any that were already processed pattern_edges, pattern_rows = [], [] if pattern_item and pattern_item.get("annotations"): - pattern_edges, pattern_rows = self._process_pattern_results(pattern_item, file_node) + pattern_edges, pattern_rows = self._process_pattern_results(pattern_item, file_node, processed_hashes) - # Step 4: Apply all changes in batches + # Step 3: Update the file node tag node_apply = file_node.as_write() node_apply.existing_version = None tags = cast(list[str], node_apply.sources[0].properties["tags"]) if "AnnotationInProcess" in tags: tags[tags.index("AnnotationInProcess")] = "Annotated" elif "Annotated" not in tags: - raise ValueError("Annotated and AnnotationInProcess not found in tag property") + 
self.logger.warning( + f"File {file_id.external_id} was processed, but 'AnnotationInProcess' tag was not found." + ) + # Step 4: Apply all data model and RAW changes self.update_instances(list_node_apply=node_apply, list_edge_apply=regular_edges + pattern_edges) - + db_name = self.config.finalize_function.report_service.raw_db if doc_rows: self.client.raw.rows.insert( - db_name=self.config.finalize_function.report_service.raw_db, + db_name=db_name, table_name=self.config.finalize_function.report_service.raw_table_doc_doc, row=doc_rows, ensure_parent=True, ) if tag_rows: self.client.raw.rows.insert( - db_name=self.config.finalize_function.report_service.raw_db, + db_name=db_name, table_name=self.config.finalize_function.report_service.raw_table_doc_tag, row=tag_rows, ensure_parent=True, ) if pattern_rows: self.client.raw.rows.insert( - db_name=self.config.finalize_function.report_service.raw_db, + db_name=db_name, table_name=self.config.finalize_function.report_service.raw_table_doc_pattern, row=pattern_rows, ensure_parent=True, ) - annotation_msg = f"Applied {len(doc_rows)} doc and {len(tag_rows)} tag annotations." - pattern_msg = f"Applied {len(pattern_rows)} pattern detections." 
- - return annotation_msg, pattern_msg + return ( + f"Applied {len(doc_rows)} doc and {len(tag_rows)} tag annotations.", + f"Created {len(pattern_rows)} new pattern detections.", + ) - def update_instances( - self, - list_node_apply: list[NodeApply] | NodeApply | None = None, - list_edge_apply: list[EdgeApply] | EdgeApply | None = None, - ) -> InstancesApplyResult: + def update_instances(self, list_node_apply=None, list_edge_apply=None) -> InstancesApplyResult: return self.client.data_modeling.instances.apply(nodes=list_node_apply, edges=list_edge_apply, replace=False) def _delete_annotations_for_file(self, file_id: NodeId) -> dict[str, int]: - """Deletes all standard and pattern edges and their corresponding RAW rows for a file.""" counts = {"doc": 0, "tag": 0, "pattern": 0} - - # Standard annotations - std_edges = self._list_annotations_for_file(file_id, self.sink_node_id, negate=True) + std_edges = self._list_annotations_for_file(file_id, self.sink_node_ref, negate=True) if std_edges: edge_ids, doc_keys, tag_keys = [], [], [] for edge in std_edges: @@ -157,7 +141,6 @@ def _delete_annotations_for_file(self, file_id: NodeId) -> dict[str, int]: doc_keys.append(edge.external_id) else: tag_keys.append(edge.external_id) - if edge_ids: self.client.data_modeling.instances.delete(edges=edge_ids) if doc_keys: @@ -174,8 +157,7 @@ def _delete_annotations_for_file(self, file_id: NodeId) -> dict[str, int]: ) counts["doc"], counts["tag"] = len(doc_keys), len(tag_keys) - # Pattern annotations - pattern_edges = self._list_annotations_for_file(file_id, self.sink_node_id, negate=False) + pattern_edges = self._list_annotations_for_file(file_id, self.sink_node_ref, negate=False) if pattern_edges: edge_ids = [edge.as_id() for edge in pattern_edges] row_keys = [edge.external_id for edge in pattern_edges] @@ -188,80 +170,72 @@ def _delete_annotations_for_file(self, file_id: NodeId) -> dict[str, int]: key=row_keys, ) counts["pattern"] = len(row_keys) - return counts - def 
_process_pattern_results(self, result_item: dict, file_node: Node) -> tuple[list[EdgeApply], list[RowWrite]]: - file_id: NodeId = file_node.as_id() - source_id: str | None = cast(str, file_node.properties[self.file_view_id].get("sourceId")) - + def _process_pattern_results( + self, result_item: dict, file_node: Node, existing_hashes: set + ) -> tuple[list[EdgeApply], list[RowWrite]]: + file_id = file_node.as_id() + source_id = cast(str, file_node.properties.get(self.file_view_id, {}).get("sourceId")) doc_patterns, edge_applies = [], [] - for detect_annotation in result_item["annotations"]: + for detect_annotation in result_item.get("annotations", []): + stable_hash = self._create_stable_hash(detect_annotation) + if stable_hash in existing_hashes: + continue # Skip creating a pattern edge if a regular one already exists for this detection + for entity in detect_annotation.get("entities", []): - if detect_annotation["confidence"] >= self.approve_threshold: - annotation_status = DiagramAnnotationStatus.APPROVED.value - elif detect_annotation["confidence"] >= self.suggest_threshold: - annotation_status = DiagramAnnotationStatus.SUGGESTED.value - else: - continue external_id = self._create_pattern_annotation_id(file_id, detect_annotation) now = datetime.now(timezone.utc).replace(microsecond=0) annotation_type = entity.get( "annotation_type", self.config.data_model_views.target_entities_view.annotation_type ) - annotation_properties = { "name": file_id.external_id, - "confidence": detect_annotation["confidence"], - "status": annotation_status, - "startNodePageNumber": detect_annotation["region"]["page"], - "startNodeXMin": min(v["x"] for v in detect_annotation["region"]["vertices"]), - "startNodeYMin": min(v["y"] for v in detect_annotation["region"]["vertices"]), - "startNodeXMax": max(v["x"] for v in detect_annotation["region"]["vertices"]), - "startNodeYMax": max(v["y"] for v in detect_annotation["region"]["vertices"]), - "startNodeText": detect_annotation["text"], + 
"confidence": detect_annotation.get("confidence", 0.0), + "status": DiagramAnnotationStatus.SUGGESTED.value, + "tags": [], + "startNodePageNumber": detect_annotation.get("region", {}).get("page"), + "startNodeXMin": min( + v.get("x", 0) for v in detect_annotation.get("region", {}).get("vertices", []) + ), + "startNodeYMin": min( + v.get("y", 0) for v in detect_annotation.get("region", {}).get("vertices", []) + ), + "startNodeXMax": max( + v.get("x", 0) for v in detect_annotation.get("region", {}).get("vertices", []) + ), + "startNodeYMax": max( + v.get("y", 0) for v in detect_annotation.get("region", {}).get("vertices", []) + ), + "startNodeText": detect_annotation.get("text"), "sourceCreatedUser": self.FUNCTION_ID, "sourceUpdatedUser": self.FUNCTION_ID, "sourceCreatedTime": now.isoformat(), "sourceUpdatedTime": now.isoformat(), } - edge_apply = EdgeApply( - # NOTE: Don't want to store the edge in the instance space that the files are in. Should prevent these edges from being seen when using the location configurations ~ e.g. 
the search and canvas UI - space=self.sink_node_id.space, + space=self.sink_node_ref.space, external_id=external_id, - type=DirectRelationReference( - space=self.core_annotation_view_id.space, - external_id=annotation_type, - ), + type=DirectRelationReference(space=self.core_annotation_view_id.space, external_id=annotation_type), start_node=DirectRelationReference(space=file_id.space, external_id=file_id.external_id), - end_node=DirectRelationReference( - space=self.sink_node_id.space, external_id=self.sink_node_id.external_id - ), - sources=[ - NodeOrEdgeData( - source=self.core_annotation_view_id, - properties=annotation_properties, - ) - ], + end_node=self.sink_node_ref, + sources=[NodeOrEdgeData(source=self.core_annotation_view_id, properties=annotation_properties)], ) edge_applies.append(edge_apply) - row_columns = { "externalId": external_id, "startSourceId": source_id, "startNode": file_id.external_id, "startNodeSpace": file_id.space, - "endNode": self.sink_node_id.external_id, - "endNodeSpace": self.sink_node_id.space, + "endNode": self.sink_node_ref.external_id, + "endNodeSpace": self.sink_node_ref.space, "endNodeResourceType": entity.get("resource_type", "Unknown"), "viewId": self.core_annotation_view_id.external_id, "viewSpace": self.core_annotation_view_id.space, "viewVersion": self.core_annotation_view_id.version, + **annotation_properties, } - row_columns.update(annotation_properties) doc_patterns.append(RowWrite(key=external_id, columns=row_columns)) - return edge_applies, doc_patterns def _detect_annotation_to_edge_applies( @@ -272,133 +246,105 @@ def _detect_annotation_to_edge_applies( doc_tag: list[RowWrite], detect_annotation: dict[str, Any], ) -> dict[tuple, EdgeApply]: - - diagram_annotations: dict[tuple, EdgeApply] = {} - annotation_schema_space: str = self.config.data_model_views.core_annotation_view.schema_space - - for entity in detect_annotation["entities"]: - if detect_annotation["confidence"] >= self.approve_threshold: - 
annotation_status = DiagramAnnotationStatus.APPROVED.value - elif detect_annotation["confidence"] >= self.suggest_threshold: - annotation_status = DiagramAnnotationStatus.SUGGESTED.value + # ... (This method remains largely the same) + diagram_annotations = {} + for entity in detect_annotation.get("entities", []): + if detect_annotation.get("confidence", 0.0) >= self.approve_threshold: + status = DiagramAnnotationStatus.APPROVED.value + elif detect_annotation.get("confidence", 0.0) >= self.suggest_threshold: + status = DiagramAnnotationStatus.SUGGESTED.value else: continue - external_id = self._create_annotation_id( - file_instance_id, - entity, - detect_annotation["text"], - detect_annotation, - ) - - doc_log = { - "externalId": external_id, - "startSourceId": source_id, - "startNode": file_instance_id.external_id, - "startNodeSpace": file_instance_id.space, - "endNode": entity["external_id"], - "endNodeSpace": entity["space"], - "endNodeResourceType": entity["resource_type"], - "viewId": self.core_annotation_view_id.external_id, - "viewSpace": self.core_annotation_view_id.space, - "viewVersion": self.core_annotation_view_id.version, - } + external_id = self._create_annotation_id(file_instance_id, entity, detect_annotation) now = datetime.now(timezone.utc).replace(microsecond=0) - annotation_properties = { "name": file_instance_id.external_id, - "confidence": detect_annotation["confidence"], - "status": annotation_status, - "startNodePageNumber": detect_annotation["region"]["page"], - "startNodeXMin": min(v["x"] for v in detect_annotation["region"]["vertices"]), - "startNodeYMin": min(v["y"] for v in detect_annotation["region"]["vertices"]), - "startNodeXMax": max(v["x"] for v in detect_annotation["region"]["vertices"]), - "startNodeYMax": max(v["y"] for v in detect_annotation["region"]["vertices"]), - "startNodeText": detect_annotation["text"], + "confidence": detect_annotation.get("confidence"), + "status": status, + "startNodePageNumber": 
detect_annotation.get("region", {}).get("page"), + "startNodeXMin": min(v.get("x", 0) for v in detect_annotation.get("region", {}).get("vertices", [])), + "startNodeYMin": min(v.get("y", 0) for v in detect_annotation.get("region", {}).get("vertices", [])), + "startNodeXMax": max(v.get("x", 0) for v in detect_annotation.get("region", {}).get("vertices", [])), + "startNodeYMax": max(v.get("y", 0) for v in detect_annotation.get("region", {}).get("vertices", [])), + "startNodeText": detect_annotation.get("text"), "sourceCreatedUser": self.FUNCTION_ID, "sourceUpdatedUser": self.FUNCTION_ID, + "sourceCreatedTime": now.isoformat(), + "sourceUpdatedTime": now.isoformat(), } - - doc_log.update(annotation_properties) - annotation_properties["sourceCreatedTime"] = now.isoformat() - annotation_properties["sourceUpdatedTime"] = now.isoformat() - - edge_apply_instance = EdgeApply( + edge = EdgeApply( space=file_instance_id.space, external_id=external_id, - existing_version=None, type=DirectRelationReference( - space=annotation_schema_space, - external_id=entity["annotation_type"], + space=self.core_annotation_view_id.space, external_id=entity.get("annotation_type") ), start_node=DirectRelationReference( - space=file_instance_id.space, - external_id=file_instance_id.external_id, + space=file_instance_id.space, external_id=file_instance_id.external_id ), - end_node=DirectRelationReference(space=entity["space"], external_id=entity["external_id"]), - sources=[ - NodeOrEdgeData( - source=self.core_annotation_view_id, - properties=annotation_properties, - ) - ], + end_node=DirectRelationReference(space=entity.get("space"), external_id=entity.get("external_id")), + sources=[NodeOrEdgeData(source=self.core_annotation_view_id, properties=annotation_properties)], ) + key = self._get_edge_apply_unique_key(edge) + if key not in diagram_annotations: + diagram_annotations[key] = edge - edge_apply_key = self._get_edge_apply_unique_key(edge_apply_instance) - if edge_apply_key not in 
diagram_annotations: - diagram_annotations[edge_apply_key] = edge_apply_instance - - if entity["annotation_type"] == self.file_annotation_type: - doc_doc.append(RowWrite(key=doc_log["externalId"], columns=doc_log)) + doc_log = { + "externalId": external_id, + "startSourceId": source_id, + "startNode": file_instance_id.external_id, + "startNodeSpace": file_instance_id.space, + "endNode": entity.get("external_id"), + "endNodeSpace": entity.get("space"), + "endNodeResourceType": entity.get("resource_type"), + "viewId": self.core_annotation_view_id.external_id, + "viewSpace": self.core_annotation_view_id.space, + "viewVersion": self.core_annotation_view_id.version, + **annotation_properties, + } + if entity.get("annotation_type") == self.file_annotation_type: + doc_doc.append(RowWrite(key=external_id, columns=doc_log)) else: - doc_tag.append(RowWrite(key=doc_log["externalId"], columns=doc_log)) - + doc_tag.append(RowWrite(key=external_id, columns=doc_log)) return diagram_annotations - def _create_annotation_id( - self, - file_id: NodeId, - entity: dict[str, Any], - text: str, - raw_annotation: dict[str, Any], - ) -> str: - hash_ = sha256(json.dumps(raw_annotation, sort_keys=True).encode()).hexdigest()[:10] - naive = f"{file_id.space}:{file_id.external_id}:{entity['space']}:{entity['external_id']}:{text}:{hash_}" + def _create_stable_hash(self, raw_annotation: dict[str, Any]) -> str: + text = raw_annotation.get("text", "") + region = raw_annotation.get("region", {}) + vertices = region.get("vertices", []) + sorted_vertices = sorted(vertices, key=lambda v: (v.get("x", 0), v.get("y", 0))) + stable_representation = {"text": text, "page": region.get("page"), "vertices": sorted_vertices} + return sha256(json.dumps(stable_representation, sort_keys=True).encode()).hexdigest()[:10] + + def _create_annotation_id(self, file_id: NodeId, entity: dict[str, Any], raw_annotation: dict[str, Any]) -> str: + hash_ = self._create_stable_hash(raw_annotation) + text = 
raw_annotation.get("text", "") + naive = f"{file_id.external_id}:{entity.get('external_id')}:{text}:{hash_}" if len(naive) < self.EXTERNAL_ID_LIMIT: return naive - - prefix = f"{file_id.external_id}:{entity['external_id']}:{text}" - shorten = f"{prefix}:{hash_}" - if len(shorten) < self.EXTERNAL_ID_LIMIT: - return shorten - - return prefix[: self.EXTERNAL_ID_LIMIT - 10] + hash_ + prefix = f"{file_id.external_id}:{entity.get('external_id')}:{text}" + if len(prefix) > self.EXTERNAL_ID_LIMIT - 11: + prefix = prefix[: self.EXTERNAL_ID_LIMIT - 11] + return f"{prefix}:{hash_}" def _create_pattern_annotation_id(self, file_id: NodeId, raw_annotation: dict[str, Any]) -> str: - text = raw_annotation["text"] - hash_ = sha256(json.dumps(raw_annotation, sort_keys=True).encode()).hexdigest()[:10] + hash_ = self._create_stable_hash(raw_annotation) + text = raw_annotation.get("text", "") prefix = f"pattern:{file_id.external_id}:{text}" - if len(prefix) > self.EXTERNAL_ID_LIMIT - 11: prefix = prefix[: self.EXTERNAL_ID_LIMIT - 11] - return f"{prefix}:{hash_}" def _list_annotations_for_file(self, node_id: NodeId, end_node: DirectRelationReference, negate: bool = False): - """ - List all annotation edges for a file node, optionally filtering by the end node. 
- """ start_node_filter = Equals(["edge", "startNode"], {"space": node_id.space, "externalId": node_id.external_id}) end_node_filter = Equals(["edge", "endNode"], {"space": end_node.space, "externalId": end_node.external_id}) - if negate: final_filter = And(start_node_filter, dm.filters.Not(end_node_filter)) space = node_id.space else: - space = self.sink_node_id.space + space = self.sink_node_ref.space final_filter = And(start_node_filter, end_node_filter) - - annotations = self.client.data_modeling.instances.list( + return self.client.data_modeling.instances.list( instance_type="edge", sources=[self.core_annotation_view_id], space=space, @@ -406,19 +352,12 @@ def _list_annotations_for_file(self, node_id: NodeId, end_node: DirectRelationRe limit=-1, ) - return annotations - def _get_edge_apply_unique_key(self, edge_apply_instance: EdgeApply) -> tuple: - start_node_key = ( - edge_apply_instance.start_node.space, - edge_apply_instance.start_node.external_id, - ) - end_node_key = ( - edge_apply_instance.end_node.space, - edge_apply_instance.end_node.external_id, - ) - type_key = ( - edge_apply_instance.type.space, - edge_apply_instance.type.external_id, + start_node = edge_apply_instance.start_node + end_node = edge_apply_instance.end_node + type_ = edge_apply_instance.type + return ( + (start_node.space, start_node.external_id) if start_node else None, + (end_node.space, end_node.external_id) if end_node else None, + (type_.space, type_.external_id) if type_ else None, ) - return (start_node_key, end_node_key, type_key) From 59fb057484a70be86e8b1d6b0c9f45fde2a24db0 Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Mon, 6 Oct 2025 19:38:08 -0500 Subject: [PATCH 064/128] updated comments --- .../services/ApplyService.py | 20 ++++++++++++- .../services/FinalizeService.py | 28 ++++++++++++++++++- 2 files changed, 46 insertions(+), 2 deletions(-) diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ApplyService.py 
b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ApplyService.py index bf16a0e2..b0cc9d43 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ApplyService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ApplyService.py @@ -26,6 +26,10 @@ class IApplyService(abc.ABC): + """ + Interface for applying/deleting annotations to a node + """ + @abc.abstractmethod def process_and_apply_annotations_for_file( self, file_node: Node, regular_item: dict | None, pattern_item: dict | None, clean_old: bool @@ -42,6 +46,10 @@ def update_instances( class GeneralApplyService(IApplyService): + """ + Implementation of the ApplyService interface. + """ + EXTERNAL_ID_LIMIT = 256 FUNCTION_ID = "fn_file_annotation_finalize" @@ -62,6 +70,9 @@ def __init__(self, client: CogniteClient, config: Config, logger: CogniteFunctio def process_and_apply_annotations_for_file( self, file_node: Node, regular_item: dict | None, pattern_item: dict | None, clean_old: bool ) -> tuple[str, str]: + """ + Performs the entire annotation transaction for a single file. + """ file_id = file_node.as_id() source_id = cast(str, file_node.properties.get(self.file_view_id, {}).get("sourceId")) @@ -131,6 +142,8 @@ def update_instances(self, list_node_apply=None, list_edge_apply=None) -> Instan return self.client.data_modeling.instances.apply(nodes=list_node_apply, edges=list_edge_apply, replace=False) def _delete_annotations_for_file(self, file_id: NodeId) -> dict[str, int]: + """Deletes all standard and pattern edges and their corresponding RAW rows for a file.""" + counts = {"doc": 0, "tag": 0, "pattern": 0} std_edges = self._list_annotations_for_file(file_id, self.sink_node_ref, negate=True) if std_edges: @@ -246,7 +259,6 @@ def _detect_annotation_to_edge_applies( doc_tag: list[RowWrite], detect_annotation: dict[str, Any], ) -> dict[tuple, EdgeApply]: - # ... 
(This method remains largely the same) diagram_annotations = {} for entity in detect_annotation.get("entities", []): if detect_annotation.get("confidence", 0.0) >= self.approve_threshold: @@ -309,6 +321,9 @@ def _detect_annotation_to_edge_applies( return diagram_annotations def _create_stable_hash(self, raw_annotation: dict[str, Any]) -> str: + """ + Creates a hash based off items of a potential annotation. This is used such that we don't create duplicate annotations for pattern mode and regular results. + """ text = raw_annotation.get("text", "") region = raw_annotation.get("region", {}) vertices = region.get("vertices", []) @@ -336,6 +351,9 @@ def _create_pattern_annotation_id(self, file_id: NodeId, raw_annotation: dict[st return f"{prefix}:{hash_}" def _list_annotations_for_file(self, node_id: NodeId, end_node: DirectRelationReference, negate: bool = False): + """ + List all annotation edges for a file node, optionally filtering by the end node. + """ start_node_filter = Equals(["edge", "startNode"], {"space": node_id.space, "externalId": node_id.external_id}) end_node_filter = Equals(["edge", "endNode"], {"space": end_node.space, "externalId": end_node.external_id}) if negate: diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/FinalizeService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/FinalizeService.py index 2ee6aed8..15bb077b 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/FinalizeService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/FinalizeService.py @@ -26,6 +26,8 @@ class AbstractFinalizeService(abc.ABC): """ Orchestrates the file annotation finalize process. + This service retrieves the results of the diagram detect jobs from the launch function and then applies annotations to the file. 
+ Additionally, it captures the file and asset annotations into separate RAW tables. """ def __init__( @@ -268,6 +270,20 @@ def _process_annotation_state( ) -> NodeApply: """ Create a node apply from the node passed into the function. + The annotatedPageCount and pageCount properties won't be set if this is the first time the job has been run for the specific node. + Thus, we set it here and include logic to handle the scneario where it is set. + NOTE: Always want to use the latest page count from the diagram detect results + e.g.) let page_range = 50 + - If the pdf has less than 50 pages, say 3 pages, then... + - annotationStatus property will get set to 'complete' + - annotatedPageCount and pageCount properties will be set to 3. + - Elif the pdf has more than 50 pages, say 80, then... + - annotationStatus property will get set to 'new' + - annotatedPageCount set to 50 + - pageCount set to 80 + - attemptCount doesn't get incremented + - If an error occurs, the annotated_page_count and page_count won't be passed + - Don't want to touch the pageCount and annotatedPageCount properties in this scenario """ update_properties = { "annotationStatus": status, @@ -298,7 +314,17 @@ def _process_annotation_state( def _check_all_pages_annotated(self, node: Node, page_count: int) -> int: """ - Checks if all pages have been annotated and returns the new annotated page count. + The annotatedPageCount and pageCount properties won't be set if this is the first time the job has been run for the specific node. 
+ - if annotated_page_count is not set (first run): + - if page_range >= to the page count: + - annotated_page_count = page_count b/c all of the pages were passed into the FileReference during LaunchService + - else: + - annotated_page_count = page_range b/c there are more pages to annotate + - else the annotation_page_count property is set: + - if (annotated_page_count + page_range) >= page_count: + - annotated_page_count = page_count b/c all of the pages were passed into the FileReference during LaunchService + else: + - annotated_page_count = self.page_range + annotated_page_count b/c there are more pages to annotate """ annotated_page_count: int | None = cast( int, From 4f8dbb8fa76c66b7d8c70d04178a9a320d04ce80 Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Tue, 7 Oct 2025 16:50:18 -0500 Subject: [PATCH 065/128] updated streamlit module --- .../file_annotation_dashboard/helper.py | 27 +- .../pages/Annotation_Quality.py | 243 ++++++------------ 2 files changed, 103 insertions(+), 167 deletions(-) diff --git a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/helper.py b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/helper.py index e5f05d8e..5f790b22 100644 --- a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/helper.py +++ b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/helper.py @@ -5,8 +5,16 @@ import pandas as pd from datetime import datetime, timedelta from cognite.client import CogniteClient -from cognite.client.data_classes import RowWrite -from cognite.client.data_classes.data_modeling import ViewId, NodeId, Node, filters +from cognite.client.data_classes import RowWrite, Asset, AssetFilter +from cognite.client.data_classes.data_modeling import ( + ViewId, + NodeId, + Node, + filters, + EdgeApply, + NodeOrEdgeData, + DirectRelationReference, +) from cognite.client.data_classes.functions import FunctionCallLog from 
data_structures import ViewPropertyConfig from canvas import dm_generate @@ -477,3 +485,18 @@ def strip_leading_zeros(match): # Step 3: Apply the replacer function to all sequences of digits (\d+) in the string # This turns "v0912" into "v912" return re.sub(r"\d+", strip_leading_zeros, s) + + +@st.cache_data(ttl=600) +def fetch_potential_annotations(db_name: str, table_name: str, file_external_id: str) -> pd.DataFrame: + """Fetches potential annotations for a specific file from the patterns RAW table.""" + try: + rows = client.raw.rows.list( + db_name=db_name, table_name=table_name, limit=-1, filter={"startNode": file_external_id} + ) + if not rows: + return pd.DataFrame() + return pd.DataFrame([row.columns for row in rows]) + except Exception as e: + st.error(f"Failed to fetch potential annotations: {e}") + return pd.DataFrame() diff --git a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/pages/Annotation_Quality.py b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/pages/Annotation_Quality.py index 76c69fd4..09ae8114 100644 --- a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/pages/Annotation_Quality.py +++ b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/pages/Annotation_Quality.py @@ -88,7 +88,7 @@ def reset_selection(): "Overall Annotation Quality", help="Provides a high-level summary of pattern performance across all files. 
Use these aggregate metrics, charts, and tag lists to understand the big picture and identify systemic trends or gaps in the pattern catalog.", ) - all_resource_types = ["All"] + sorted(df_patterns["resourceType"].unique().tolist()) + all_resource_types = ["All"] + sorted(df_patterns["endNodeResourceType"].unique().tolist()) selected_resource_type = st.selectbox( "Filter by Resource Type:", options=all_resource_types, @@ -100,14 +100,14 @@ def reset_selection(): df_metrics_input = df_patterns df_annotations_input = df_annotations else: - df_metrics_input = df_patterns[df_patterns["resourceType"] == selected_resource_type] + df_metrics_input = df_patterns[df_patterns["endNodeResourceType"] == selected_resource_type] if not df_annotations.empty and "endNodeResourceType" in df_annotations.columns: df_annotations_input = df_annotations[df_annotations["endNodeResourceType"] == selected_resource_type] else: df_annotations_input = pd.DataFrame() # 1. Get the original, un-normalized sets of strings - potential_tags_original = set(df_metrics_input["text"]) + potential_tags_original = set(df_metrics_input["startNodeText"]) actual_annotations_original = ( set(df_annotations_input["startNodeText"]) if not df_annotations_input.empty and "startNodeText" in df_annotations_input.columns @@ -124,48 +124,35 @@ def reset_selection(): normalized_actual_set = {normalize(t) for t in actual_annotations_original} # 4. Perform all set operations on the normalized data for accurate logic - normalized_matched = normalized_potential_set.intersection(normalized_actual_set) normalized_unmatched = normalized_potential_set - normalized_actual_set - normalized_missed = normalized_actual_set - normalized_potential_set # 5. 
Use the map to get the final sets with original text for display - matched_tags_set = {text_map[t] for t in normalized_matched if t in text_map} - unmatched_by_annotation_set = {text_map[t] for t in normalized_unmatched if t in text_map} - missed_by_pattern_set = {text_map[t] for t in normalized_missed if t in text_map} + actual_annotations_set = {text_map[t] for t in normalized_actual_set if t in text_map} + potential_new_annotations_set = {text_map[t] for t in normalized_unmatched if t in text_map} - total_matched = len(matched_tags_set) - total_unmatched = len(unmatched_by_annotation_set) - total_missed = len(missed_by_pattern_set) + total_actual = len(actual_annotations_set) + total_potential = len(potential_new_annotations_set) overall_coverage = ( - (total_matched / (total_matched + total_unmatched)) * 100 if (total_matched + total_unmatched) > 0 else 0 - ) - overall_completeness = ( - (total_matched / (total_missed + total_matched)) * 100 if (total_missed + total_matched) > 0 else 0 + (total_actual / (total_actual + total_potential)) * 100 if (total_actual + total_potential) > 0 else 0 ) - kpi_col1, kpi_col2 = st.columns(2) - kpi_col1.metric( + st.metric( "Overall Annotation Coverage", f"{overall_coverage:.2f}%", - help="Of all potential tags found by patterns, this is the percentage that were successfully annotated. Formula: Matched / (Matched + Unmatched)", - ) - kpi_col2.metric( - "Overall Pattern Completeness", - f"{overall_completeness:.2f}%", - help="Of all annotations created, this is the percentage that the patterns successfully predicted. Formula: Matched / (Matched + Missed by Pattern)", + help="The percentage of all unique tags (both actual and potential) that have been successfully annotated. 
Formula: Total Actual Annotations / (Total Actual Annotations + Total Potential New Annotations)", ) st.divider() chart_data = [] for resource_type in all_resource_types[1:]: - df_patterns_filtered = df_patterns[df_patterns["resourceType"] == resource_type] + df_patterns_filtered = df_patterns[df_patterns["endNodeResourceType"] == resource_type] df_annotations_filtered = ( df_annotations[df_annotations["endNodeResourceType"] == resource_type] if not df_annotations.empty and "endNodeResourceType" in df_annotations.columns else pd.DataFrame() ) - potential = set(df_patterns_filtered["text"]) + potential = set(df_patterns_filtered["startNodeText"]) actual = ( set(df_annotations_filtered["startNodeText"]) if not df_annotations_filtered.empty and "startNodeText" in df_annotations_filtered.columns @@ -174,19 +161,21 @@ def reset_selection(): # Use normalized comparison for chart data as well norm_potential = {normalize(p) for p in potential} norm_actual = {normalize(a) for a in actual} - matched = len(norm_potential.intersection(norm_actual)) - unmatched = len(norm_potential - norm_actual) - missed = len(norm_actual - norm_potential) - coverage = (matched / (matched + unmatched)) * 100 if (matched + unmatched) > 0 else 0 - completeness = (matched / (matched + missed)) * 100 if (matched + missed) > 0 else 0 + + total_actual_rt = len(norm_actual) + total_potential_rt = len(norm_potential - norm_actual) + + coverage = ( + (total_actual_rt / (total_actual_rt + total_potential_rt)) * 100 + if (total_actual_rt + total_potential_rt) > 0 + else 0 + ) chart_data.append( { "resourceType": resource_type, "coverageRate": coverage, - "completenessRate": completeness, - "matchedTags": matched, - "unmatchedByAnnotation": unmatched, - "missedByPattern": missed, + "actualAnnotations": total_actual_rt, + "potentialNewAnnotations": total_potential_rt, } ) @@ -198,33 +187,18 @@ def reset_selection(): ) if not df_chart_display.empty: - chart_col1, chart_col2 = st.columns(2) - with 
chart_col1: - coverage_chart = ( - alt.Chart(df_chart_display) - .mark_bar() - .encode( - x=alt.X("resourceType:N", title="Resource Type", sort="-y"), - y=alt.Y("coverageRate:Q", title="Annotation Coverage (%)", scale=alt.Scale(domain=[0, 100])), - tooltip=["resourceType", "coverageRate", "matchedTags", "unmatchedByAnnotation"], - ) - .properties(title="Annotation Coverage by Resource Type") - ) - st.altair_chart(coverage_chart, use_container_width=True) - with chart_col2: - completeness_chart = ( - alt.Chart(df_chart_display) - .mark_bar() - .encode( - x=alt.X("resourceType:N", title="Resource Type", sort="-y"), - y=alt.Y( - "completenessRate:Q", title="Pattern Completeness (%)", scale=alt.Scale(domain=[0, 100]) - ), - tooltip=["resourceType", "completenessRate", "matchedTags", "missedByPattern"], - ) - .properties(title="Pattern Completeness by Resource Type") + coverage_chart = ( + alt.Chart(df_chart_display) + .mark_bar() + .encode( + x=alt.X("resourceType:N", title="Resource Type", sort="-y"), + y=alt.Y("coverageRate:Q", title="Annotation Coverage (%)", scale=alt.Scale(domain=[0, 100])), + tooltip=["resourceType", "coverageRate", "actualAnnotations", "potentialNewAnnotations"], ) - st.altair_chart(completeness_chart, use_container_width=True) + .properties(title="Annotation Coverage by Resource Type") + ) + st.altair_chart(coverage_chart, use_container_width=True) + else: st.info("No data available for the selected resource type to generate charts.") @@ -259,37 +233,26 @@ def reset_selection(): hide_index=True, column_config={"sample": "Pattern"}, ) - tag_col1, tag_col2, tag_col3 = st.columns(3) + tag_col1, tag_col2 = st.columns(2) with tag_col1: st.metric( - "✅ Matched Tags", - f"{total_matched}", - help="Tags that were correctly identified by the pattern catalog and were also created as final annotations. 
This represents the successful overlap between the two processes.", + "✅ Actual Annotations", + f"{total_actual}", + help="A list of all unique tags that have been successfully created. This is our ground truth.", ) st.dataframe( - pd.DataFrame(sorted(list(matched_tags_set)), columns=["Tag"]), + pd.DataFrame(sorted(list(actual_annotations_set)), columns=["Tag"]), use_container_width=True, hide_index=True, ) with tag_col2: st.metric( - "❓ Unmatched by Annotation", - f"{total_unmatched}", - help="Tags that were found by the pattern catalog but do not exist as final annotations. This can help identify if patterns are too broad (false positives) or if the standard annotation process missed them.", - ) - st.dataframe( - pd.DataFrame(sorted(list(unmatched_by_annotation_set)), columns=["Tag"]), - use_container_width=True, - hide_index=True, - ) - with tag_col3: - st.metric( - "❗️ Missed by Pattern", - f"{total_missed}", - help="Created annotations that were not found by the pattern catalog. This can help us measure the reliability of pattern mode as a denominator.", + "💡 Potential New Annotations", + f"{total_potential}", + help="A list of all unique tags found by the pattern-mode job that do not yet exist as actual annotations. 
This is now a clean 'to-do list' of tags that could be promoted or used to create new patterns.", ) st.dataframe( - pd.DataFrame(sorted(list(missed_by_pattern_set)), columns=["Tag"]), + pd.DataFrame(sorted(list(potential_new_annotations_set)), columns=["Tag"]), use_container_width=True, hide_index=True, ) @@ -312,7 +275,7 @@ def reset_selection(): else: df_annotations_file = pd.concat([df_tags_file, df_docs_file], ignore_index=True) df_patterns_agg_file = ( - df_patterns_file.groupby("startNode")["text"].apply(set).reset_index(name="potentialTags") + df_patterns_file.groupby("startNode")["startNodeText"].apply(set).reset_index(name="potentialTags") ) df_annotations_agg_file = ( df_annotations_file.groupby("startNode")["startNodeText"].apply(set).reset_index(name="actualAnnotations") @@ -332,25 +295,21 @@ def calculate_metrics(row): norm_potential = {normalize(p) for p in potential} norm_actual = {normalize(a) for a in actual} - matched = len(norm_potential.intersection(norm_actual)) - unmatched = len(norm_potential - norm_actual) - missed = len(norm_actual - norm_potential) - return matched, unmatched, missed + total_actual_pf = len(norm_actual) + total_potential_pf = len(norm_potential - norm_actual) + + return total_actual_pf, total_potential_pf metrics = df_quality_file.apply(calculate_metrics, axis=1, result_type="expand") - df_quality_file[["matchedTags", "unmatchedByAnnotation", "missedByPattern"]] = metrics + df_quality_file[["actualAnnotationsCount", "potentialNewAnnotationsCount"]] = metrics df_quality_file["coverageRate"] = ( ( - df_quality_file["matchedTags"] - / (df_quality_file["matchedTags"] + df_quality_file["unmatchedByAnnotation"]) + df_quality_file["actualAnnotationsCount"] + / (df_quality_file["actualAnnotationsCount"] + df_quality_file["potentialNewAnnotationsCount"]) ) * 100 ).fillna(0) - df_quality_file["completenessRate"] = ( - (df_quality_file["matchedTags"] / (df_quality_file["matchedTags"] + df_quality_file["missedByPattern"])) - * 100 - 
).fillna(0) df_file_meta = fetch_annotation_states(annotation_state_view, file_view) df_display_unfiltered = ( @@ -365,11 +324,9 @@ def calculate_metrics(row): "startNode", "potentialTags", "actualAnnotations", - "matchedTags", - "unmatchedByAnnotation", - "missedByPattern", + "actualAnnotationsCount", + "potentialNewAnnotationsCount", "coverageRate", - "completenessRate", "externalId", "space", "annotatedPageCount", @@ -414,14 +371,6 @@ def calculate_metrics(row): coverage_range = st.slider( "Filter by Annotation Coverage (%)", 0, 100, (0, 100), on_change=reset_selection, key="coverage_slider" ) - completeness_range = st.slider( - "Filter by Pattern Completeness (%)", - 0, - 100, - (0, 100), - on_change=reset_selection, - key="completeness_slider", - ) df_display = df_display_unfiltered.copy() if selected_column != "None" and selected_values: @@ -429,10 +378,6 @@ def calculate_metrics(row): df_display = df_display[ (df_display["coverageRate"] >= coverage_range[0]) & (df_display["coverageRate"] <= coverage_range[1]) ] - df_display = df_display[ - (df_display["completenessRate"] >= completeness_range[0]) - & (df_display["completenessRate"] <= completeness_range[1]) - ] df_display = df_display.reset_index(drop=True) df_display.insert(0, "Select", False) @@ -442,7 +387,6 @@ def calculate_metrics(row): "fileSourceid", "fileMimetype", "coverageRate", - "completenessRate", "annotationMessage", "patternModeMessage", "lastUpdatedTime", @@ -475,14 +419,7 @@ def calculate_metrics(row): "fileExternalId": "File External ID", "coverageRate": st.column_config.ProgressColumn( "Annotation Coverage ℹ️", - help="How many potential tags were found? (Matched / Potential)", - format="%.2f%%", - min_value=0, - max_value=100, - ), - "completenessRate": st.column_config.ProgressColumn( - "Pattern Completeness ℹ️", - help="How many final annotations did patterns find? 
(Matched / Actual)", + help="The percentage of all unique tags (both actual and potential) that have been successfully annotated. Formula: Total Actual Annotations / (Total Actual Annotations + Total Potential New Annotations)", format="%.2f%%", min_value=0, max_value=100, @@ -519,53 +456,44 @@ def calculate_metrics(row): file_space = file_space_series.iloc[0] file_node_id = NodeId(space=file_space, external_id=selected_file) df_potential_tags_details = df_patterns_file[df_patterns_file["startNode"] == selected_file][ - ["text", "resourceType", "regions"] + ["startNodeText", "endNodeResourceType"] ] df_actual_annotations_details = ( df_annotations_file[df_annotations_file["startNode"] == selected_file][ ["startNodeText", "endNodeResourceType"] - ].rename(columns={"startNodeText": "text", "endNodeResourceType": "resourceType"}) + ] if not df_annotations_file.empty - else pd.DataFrame(columns=["text", "resourceType"]) + else pd.DataFrame(columns=["startNodeText", "endNodeResourceType"]) ) # Use normalized comparison for per-file detail view - potential_set = set(df_potential_tags_details["text"]) - actual_set = set(df_actual_annotations_details["text"]) + potential_set = set(df_potential_tags_details["startNodeText"]) + actual_set = set(df_actual_annotations_details["startNodeText"]) norm_potential = {normalize(p) for p in potential_set} norm_actual = {normalize(a) for a in actual_set} # We need a map from normalized text back to original for accurate filtering potential_map = {normalize(text): text for text in potential_set} - actual_map = {normalize(text): text for text in actual_set} - norm_matched = norm_potential.intersection(norm_actual) norm_unmatched = norm_potential - norm_actual - norm_missed = norm_actual - norm_potential - - matched_set = {potential_map[t] for t in norm_matched} - unmatched_set = {potential_map[t] for t in norm_unmatched} - missed_set = {actual_map[t] for t in norm_missed} - - matched_df = df_potential_tags_details[ - 
df_potential_tags_details["text"].isin(matched_set) - ].drop_duplicates(subset=["text", "resourceType"]) - unmatched_df = df_potential_tags_details[ - df_potential_tags_details["text"].isin(unmatched_set) - ].drop_duplicates(subset=["text", "resourceType"]) - missed_df = df_actual_annotations_details[ - df_actual_annotations_details["text"].isin(missed_set) - ].drop_duplicates() + + actual_df = df_actual_annotations_details.drop_duplicates() + potential_df = df_potential_tags_details[ + df_potential_tags_details["startNodeText"].isin({potential_map[t] for t in norm_unmatched}) + ].drop_duplicates(subset=["startNodeText", "endNodeResourceType"]) if st.button("Create in Canvas", key=f"canvas_btn_{selected_file}"): with st.spinner("Generating Industrial Canvas with bounding boxes..."): _, _, file_view_config = fetch_extraction_pipeline_config(selected_pipeline) - unmatched_tags_for_canvas = unmatched_df[["text", "regions"]].to_dict("records") + # The 'regions' column is no longer available in the RAW table. + # You will need to adjust the canvas generation logic to handle this. + # For now, we will pass an empty list. + potential_tags_for_canvas = [] canvas_url = generate_file_canvas( file_id=file_node_id, file_view=file_view_config, ep_config=ep_config, - unmatched_tags_with_regions=unmatched_tags_for_canvas, + unmatched_tags_with_regions=potential_tags_for_canvas, ) if canvas_url: st.session_state["generated_canvas_url"] = canvas_url @@ -579,47 +507,32 @@ def calculate_metrics(row): ) st.divider() - col1, col2, col3 = st.columns(3) + col1, col2 = st.columns(2) with col1: st.metric( - "✅ Matched Tags", - len(matched_df), - help="Tags that were correctly identified by the pattern catalog and were also created as final annotations. 
This represents the successful overlap between the two processes.", + "✅ Actual Annotations in this File", + len(actual_df), ) st.dataframe( - matched_df[["text", "resourceType"]], - column_config={"text": "Tag", "resourceType": "Resource Type"}, + actual_df, + column_config={"startNodeText": "Tag", "endNodeResourceType": "Resource Type"}, use_container_width=True, hide_index=True, ) with col2: st.metric( - "❓ Unmatched by Annotation", - len(unmatched_df), - help="Tags that were found by the pattern catalog but do not exist as final annotations. This can help identify if patterns are too broad (false positives) or if the standard annotation process missed them.", - ) - st.dataframe( - unmatched_df[["text", "resourceType"]], - column_config={"text": "Tag", "resourceType": "Resource Type"}, - use_container_width=True, - hide_index=True, - ) - with col3: - st.metric( - "❗️ Missed by Pattern", - len(missed_df), - help="Created annotations that were not found by the pattern catalog. This can help us measure the reliability of pattern mode as a denominator.", + "💡 Potential New Annotations in this File", + len(potential_df), ) st.dataframe( - missed_df, - column_config={"text": "Tag", "resourceType": "Resource Type"}, + potential_df, + column_config={"startNodeText": "Tag", "endNodeResourceType": "Resource Type"}, use_container_width=True, hide_index=True, ) else: st.info("✔️ Select a file in the table above to see a detailed breakdown of its tags.") - # ========================================== # PATTERN MANAGEMENT TAB # ========================================== From 99d2597db916c8ee35a0b97c0c52b820a803ba81 Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Tue, 7 Oct 2025 17:34:07 -0500 Subject: [PATCH 066/128] fixed pattern generation --- .../services/CacheService.py | 83 ++++++++----------- 1 file changed, 35 insertions(+), 48 deletions(-) diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/CacheService.py 
b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/CacheService.py index f44fd9ac..e04ed942 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/CacheService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/CacheService.py @@ -209,7 +209,7 @@ def _convert_instances_to_entities( for instance in file_instances: instance_properties = instance.properties.get(self.file_view.as_view_id()) - if file_resource_type: + if target_entities_resource_type: resource_type: str = instance_properties[file_resource_type] else: resource_type: str = self.file_view.external_id @@ -227,22 +227,14 @@ def _convert_instances_to_entities( def _generate_tag_samples_from_entities(self, entities: list[dict]) -> list[dict]: """ - Generates pattern samples from entity search property by converting them into generalized templates. - This version analyzes the internal structure of each segment: - - Numbers are generalized to '0'. - - Letters are grouped into bracketed alternatives, even when mixed with numbers. - - Example: '629P' and '629X' will merge to create a pattern piece '000[P|X]'. + MODIFIED: Generates pattern samples using Implementation 1's logic + while adding the 'annotation_type' from Implementation 2. """ - # Structure: { resource_type: { 'templates': { template_key: list_of_variable_parts }, 'annotation_type': str } } - pattern_builders = defaultdict(lambda: {"templates": defaultdict(list), "annotation_type": None}) + # Structure: { resource_type: {"patterns": { template_key: [...] 
}, "annotation_type": "..."} } + pattern_builders = defaultdict(lambda: {"patterns": {}, "annotation_type": None}) self.logger.info(f"Generating pattern samples from {len(entities)} entities.") def _parse_alias(alias: str, resource_type_key: str) -> tuple[str, list[list[str]]]: - """ - Parses an alias into a structural template key and its variable letter components. - A segment '629P' yields a template '000A' and a variable part ['P']. - """ - alias_parts = re.split(r"([ -])", alias) full_template_key_parts: list[str] = [] all_variable_parts: list[list[str]] = [] @@ -267,38 +259,33 @@ def _parse_alias(alias: str, resource_type_key: str) -> tuple[str, list[list[str return "".join(full_template_key_parts), all_variable_parts for entity in entities: - resource_type = entity["resource_type"] - annotation_type = entity.get("annotation_type") - aliases = entity.get("search_property", []) - - if pattern_builders[resource_type]["annotation_type"] is None: - pattern_builders[resource_type]["annotation_type"] = annotation_type + key = entity["resource_type"] + if pattern_builders[key]["annotation_type"] is None: + pattern_builders[key]["annotation_type"] = entity.get("annotation_type") + aliases = entity.get("search_property", []) for alias in aliases: if not alias: continue - template_key, variable_parts_from_alias = _parse_alias(alias, resource_type) - - if not pattern_builders[resource_type]["templates"][template_key]: - new_variable_sets = [] - for part_group in variable_parts_from_alias: - new_variable_sets.append([set([lg]) for lg in part_group]) - pattern_builders[resource_type]["templates"][template_key] = new_variable_sets - else: - existing_variable_sets = pattern_builders[resource_type]["templates"][template_key] + template_key, variable_parts_from_alias = _parse_alias(alias, key) + resource_patterns = pattern_builders[key]["patterns"] + if template_key in resource_patterns: + existing_variable_sets = resource_patterns[template_key] for i, part_group in 
enumerate(variable_parts_from_alias): for j, letter_group in enumerate(part_group): - while i >= len(existing_variable_sets): - existing_variable_sets.append([]) - while j >= len(existing_variable_sets[i]): - existing_variable_sets[i].append(set()) existing_variable_sets[i][j].add(letter_group) + else: + new_variable_sets = [] + for part_group in variable_parts_from_alias: + new_variable_sets.append([set([lg]) for lg in part_group]) + resource_patterns[template_key] = new_variable_sets result = [] for resource_type, data in pattern_builders.items(): final_samples = [] + templates = data["patterns"] annotation_type = data["annotation_type"] - for template_key, collected_vars in data["templates"].items(): + for template_key, collected_vars in templates.items(): var_iter: Iterator[list[set[str]]] = iter(collected_vars) def build_segment(segment_template: str) -> str: @@ -310,7 +297,7 @@ def build_segment(segment_template: str) -> str: def replace_A(match): alternatives = sorted(list(next(letter_group_iter))) - return f"[{'|'.join(alternatives)}]" if len(alternatives) > 1 else alternatives[0] + return f"[{'|'.join(alternatives)}]" return re.sub(r"A+", replace_A, segment_template) except StopIteration: @@ -332,7 +319,7 @@ def replace_A(match): return result def _get_manual_patterns(self, primary_scope: str, secondary_scope: str | None) -> list[dict]: - """Fetches and combines manual patterns from GLOBAL, primary, and secondary scopes.""" + """BUG FIX: Fetches manual patterns with correct error handling from Implementation 2.""" keys_to_fetch = ["GLOBAL"] if primary_scope: keys_to_fetch.append(primary_scope) @@ -350,35 +337,35 @@ def _get_manual_patterns(self, primary_scope: str, secondary_scope: str | None) patterns = (row.columns or {}).get("patterns", []) all_manual_patterns.extend(patterns) except CogniteNotFoundError: - self.logger.info(f"No manual patterns found for keys: {keys_to_fetch}. 
This may be expected.") + self.logger.info(f"No manual patterns found for key: {key}. This may be expected.") except Exception as e: - self.logger.error(f"Failed to retrieve manual patterns: {e}") + self.logger.error(f"Failed to retrieve manual patterns for key {key}: {e}") + return all_manual_patterns def _merge_patterns(self, auto_patterns: list[dict], manual_patterns: list[dict]) -> list[dict]: - """Merges auto-generated and manual patterns, de-duplicating samples.""" + """MODIFIED: Merges patterns while correctly handling the new 'annotation_type' field.""" merged = defaultdict(lambda: {"samples": set(), "annotation_type": None}) # Process auto-generated patterns for item in auto_patterns: resource_type = item.get("resource_type") - samples = item.get("sample", []) - annotation_type = item.get("annotation_type") if resource_type: - merged[resource_type]["samples"].update(samples) + merged[resource_type]["samples"].update(item.get("sample", [])) + # Set annotation_type if not already set if not merged[resource_type]["annotation_type"]: - merged[resource_type]["annotation_type"] = annotation_type + merged[resource_type]["annotation_type"] = item.get("annotation_type") # Process manual patterns for item in manual_patterns: resource_type = item.get("resource_type") - sample = item.get("sample") - annotation_type = item.get("annotation_type") - if resource_type and sample: - merged[resource_type]["samples"].add(sample) + if resource_type and item.get("sample"): + merged[resource_type]["samples"].add(item["sample"]) + # Set annotation_type if not already set (auto-patterns take precedence) if not merged[resource_type]["annotation_type"]: - merged[resource_type]["annotation_type"] = annotation_type + merged[resource_type]["annotation_type"] = item.get("annotation_type") + # Convert the merged dictionary back to the required list format final_list = [ { "resource_type": resource_type, @@ -388,5 +375,5 @@ def _merge_patterns(self, auto_patterns: list[dict], 
manual_patterns: list[dict] for resource_type, data in merged.items() ] - self.logger.info(f"Merged auto-generated and manual patterns into {len(final_list)} resource types.") + self.logger.info(f"Merged auto and manual patterns into {len(final_list)} resource types.") return final_list From c510e3ef071f1f80d226fbcb9296267298b03a56 Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Tue, 7 Oct 2025 19:05:21 -0500 Subject: [PATCH 067/128] added default annotation type for manual patterns in case it's null --- .../fn_file_annotation_launch/services/CacheService.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/CacheService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/CacheService.py index e04ed942..bada7bf3 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/CacheService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/CacheService.py @@ -363,7 +363,8 @@ def _merge_patterns(self, auto_patterns: list[dict], manual_patterns: list[dict] merged[resource_type]["samples"].add(item["sample"]) # Set annotation_type if not already set (auto-patterns take precedence) if not merged[resource_type]["annotation_type"]: - merged[resource_type]["annotation_type"] = item.get("annotation_type") + # NOTE: UI that creates manual patterns will need to also have the annotation type as a required entry + merged[resource_type]["annotation_type"] = item.get("annotation_type", "diagrams.AssetLink") # Convert the merged dictionary back to the required list format final_list = [ From 41c7a7b8198d2e16f86bdbfae7abe5b5e229aeb3 Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Wed, 8 Oct 2025 12:51:29 -0500 Subject: [PATCH 068/128] resolve duplicate external ids for overlapping annotation edges from pattern mode results --- 
.../services/ApplyService.py | 102 +++++++++--------- 1 file changed, 49 insertions(+), 53 deletions(-) diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ApplyService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ApplyService.py index b0cc9d43..49409415 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ApplyService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ApplyService.py @@ -196,59 +196,55 @@ def _process_pattern_results( if stable_hash in existing_hashes: continue # Skip creating a pattern edge if a regular one already exists for this detection - for entity in detect_annotation.get("entities", []): - external_id = self._create_pattern_annotation_id(file_id, detect_annotation) - now = datetime.now(timezone.utc).replace(microsecond=0) - annotation_type = entity.get( - "annotation_type", self.config.data_model_views.target_entities_view.annotation_type - ) - annotation_properties = { - "name": file_id.external_id, - "confidence": detect_annotation.get("confidence", 0.0), - "status": DiagramAnnotationStatus.SUGGESTED.value, - "tags": [], - "startNodePageNumber": detect_annotation.get("region", {}).get("page"), - "startNodeXMin": min( - v.get("x", 0) for v in detect_annotation.get("region", {}).get("vertices", []) - ), - "startNodeYMin": min( - v.get("y", 0) for v in detect_annotation.get("region", {}).get("vertices", []) - ), - "startNodeXMax": max( - v.get("x", 0) for v in detect_annotation.get("region", {}).get("vertices", []) - ), - "startNodeYMax": max( - v.get("y", 0) for v in detect_annotation.get("region", {}).get("vertices", []) - ), - "startNodeText": detect_annotation.get("text"), - "sourceCreatedUser": self.FUNCTION_ID, - "sourceUpdatedUser": self.FUNCTION_ID, - "sourceCreatedTime": now.isoformat(), - "sourceUpdatedTime": 
now.isoformat(), - } - edge_apply = EdgeApply( - space=self.sink_node_ref.space, - external_id=external_id, - type=DirectRelationReference(space=self.core_annotation_view_id.space, external_id=annotation_type), - start_node=DirectRelationReference(space=file_id.space, external_id=file_id.external_id), - end_node=self.sink_node_ref, - sources=[NodeOrEdgeData(source=self.core_annotation_view_id, properties=annotation_properties)], - ) - edge_applies.append(edge_apply) - row_columns = { - "externalId": external_id, - "startSourceId": source_id, - "startNode": file_id.external_id, - "startNodeSpace": file_id.space, - "endNode": self.sink_node_ref.external_id, - "endNodeSpace": self.sink_node_ref.space, - "endNodeResourceType": entity.get("resource_type", "Unknown"), - "viewId": self.core_annotation_view_id.external_id, - "viewSpace": self.core_annotation_view_id.space, - "viewVersion": self.core_annotation_view_id.version, - **annotation_properties, - } - doc_patterns.append(RowWrite(key=external_id, columns=row_columns)) + entities = detect_annotation.get("entities", []) + if not entities: + continue + entity = entities[0] + + external_id = self._create_pattern_annotation_id(file_id, detect_annotation) + now = datetime.now(timezone.utc).replace(microsecond=0) + annotation_type = entity.get( + "annotation_type", self.config.data_model_views.target_entities_view.annotation_type + ) + annotation_properties = { + "name": file_id.external_id, + "confidence": detect_annotation.get("confidence", 0.0), + "status": DiagramAnnotationStatus.SUGGESTED.value, + "tags": [], + "startNodePageNumber": detect_annotation.get("region", {}).get("page"), + "startNodeXMin": min(v.get("x", 0) for v in detect_annotation.get("region", {}).get("vertices", [])), + "startNodeYMin": min(v.get("y", 0) for v in detect_annotation.get("region", {}).get("vertices", [])), + "startNodeXMax": max(v.get("x", 0) for v in detect_annotation.get("region", {}).get("vertices", [])), + "startNodeYMax": 
max(v.get("y", 0) for v in detect_annotation.get("region", {}).get("vertices", [])), + "startNodeText": detect_annotation.get("text"), + "sourceCreatedUser": self.FUNCTION_ID, + "sourceUpdatedUser": self.FUNCTION_ID, + "sourceCreatedTime": now.isoformat(), + "sourceUpdatedTime": now.isoformat(), + } + edge_apply = EdgeApply( + space=self.sink_node_ref.space, + external_id=external_id, + type=DirectRelationReference(space=self.core_annotation_view_id.space, external_id=annotation_type), + start_node=DirectRelationReference(space=file_id.space, external_id=file_id.external_id), + end_node=self.sink_node_ref, + sources=[NodeOrEdgeData(source=self.core_annotation_view_id, properties=annotation_properties)], + ) + edge_applies.append(edge_apply) + row_columns = { + "externalId": external_id, + "startSourceId": source_id, + "startNode": file_id.external_id, + "startNodeSpace": file_id.space, + "endNode": self.sink_node_ref.external_id, + "endNodeSpace": self.sink_node_ref.space, + "endNodeResourceType": entity.get("resource_type", "Unknown"), + "viewId": self.core_annotation_view_id.external_id, + "viewSpace": self.core_annotation_view_id.space, + "viewVersion": self.core_annotation_view_id.version, + **annotation_properties, + } + doc_patterns.append(RowWrite(key=external_id, columns=row_columns)) return edge_applies, doc_patterns def _detect_annotation_to_edge_applies( From 8140e39544e488a4f0f77e03388e0d60c51863c4 Mon Sep 17 00:00:00 2001 From: lucasguimaraes-rdx Date: Wed, 8 Oct 2025 15:46:51 -0300 Subject: [PATCH 069/128] Including more views in fetch extraction pipeline config method --- .../file_annotation_dashboard/Pipeline_Health.py | 6 +++++- .../file_annotation_dashboard/helper.py | 16 +++++++++++++++- .../pages/Annotation_Quality.py | 12 +++++++++--- 3 files changed, 29 insertions(+), 5 deletions(-) diff --git a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/Pipeline_Health.py 
b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/Pipeline_Health.py index 5f0ebc7e..826d7b0c 100644 --- a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/Pipeline_Health.py +++ b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/Pipeline_Health.py @@ -57,7 +57,11 @@ def reset_table_selection(): st.error(f"Could not fetch configuration for pipeline: {selected_pipeline}") st.stop() -ep_config, annotation_state_view, file_view = config_result +ep_config, view_config = config_result + +annotation_state_view = view_config["annotation_state"] +file_view = view_config["file"] + df_annotation_states = fetch_annotation_states(annotation_state_view, file_view) pipeline_runs = fetch_pipeline_run_history(selected_pipeline) diff --git a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/helper.py b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/helper.py index 5f790b22..d2a78f60 100644 --- a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/helper.py +++ b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/helper.py @@ -152,7 +152,21 @@ def fetch_extraction_pipeline_config(pipeline_ext_id: str) -> tuple[dict, ViewPr local_file_view.get("instanceSpace"), ) - return (config_dict, annotation_state_view, file_view) + local_target_entities_view = config_dict["dataModelViews"]["targetEntitiesView"] + target_entities_view = ViewPropertyConfig( + local_target_entities_view["schemaSpace"], + local_target_entities_view["externalId"], + local_target_entities_view["version"], + local_target_entities_view.get("instanceSpace"), + ) + + views_dict = { + "annotation_state": annotation_state_view, + "file": file_view, + "target_entities": target_entities_view, + } + + return (config_dict, views_dict) @st.cache_data(ttl=3600) diff --git 
a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/pages/Annotation_Quality.py b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/pages/Annotation_Quality.py index 09ae8114..ae5d9d9c 100644 --- a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/pages/Annotation_Quality.py +++ b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/pages/Annotation_Quality.py @@ -48,7 +48,12 @@ def reset_selection(): st.error(f"Could not fetch configuration for pipeline: {selected_pipeline}") st.stop() -ep_config, annotation_state_view, file_view = config_result +ep_config, view_config = config_result + +annotation_state_view = view_config["annotation_state"] +file_view = view_config["file"] +target_entities_view = view_config["target_entities"] + report_config = ep_config.get("finalizeFunction", {}).get("reportService", {}) cache_config = ep_config.get("launchFunction", {}).get("cacheService", {}) db_name = report_config.get("rawDb") @@ -57,7 +62,8 @@ def reset_selection(): doc_table = report_config.get("rawTableDocDoc") cache_table = cache_config.get("rawTableCache") manual_patterns_table = cache_config.get("rawManualPatternsCatalog") - +file_resource_property = ep_config.get("launchFunction", {}).get("fileResourceProperty", "") +target_entities_resource_property = ep_config.get("launchFunction", {}).get("targetEntitiesResourceProperty", "") if not all([db_name, pattern_table, tag_table, doc_table, cache_table, manual_patterns_table]): st.error("Could not find all required RAW table names in the pipeline configuration.") @@ -484,7 +490,7 @@ def calculate_metrics(row): if st.button("Create in Canvas", key=f"canvas_btn_{selected_file}"): with st.spinner("Generating Industrial Canvas with bounding boxes..."): - _, _, file_view_config = fetch_extraction_pipeline_config(selected_pipeline) + _, file_view_config = 
fetch_extraction_pipeline_config(selected_pipeline) # The 'regions' column is no longer available in the RAW table. # You will need to adjust the canvas generation logic to handle this. # For now, we will pass an empty list. From 98c288de0faa8c64f4b249df8107bed37f3cd8a4 Mon Sep 17 00:00:00 2001 From: lucasguimaraes-rdx Date: Wed, 8 Oct 2025 15:48:38 -0300 Subject: [PATCH 070/128] Including annotation_type field for patterns --- .../streamlit/file_annotation_dashboard/helper.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/helper.py b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/helper.py index d2a78f60..e3db85c3 100644 --- a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/helper.py +++ b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/helper.py @@ -343,6 +343,7 @@ def fetch_manual_patterns(db_name: str, table_name: str) -> pd.DataFrame: { "key": key, "scope_level": scope_level, + "annotation_type": p.get("annotation_type"), "primary_scope": primary_scope, "secondary_scope": secondary_scope, "sample": p.get("sample"), @@ -360,6 +361,7 @@ def fetch_manual_patterns(db_name: str, table_name: str) -> pd.DataFrame: columns=[ "key", "scope_level", + "annotation_type", "primary_scope", "secondary_scope", "sample", @@ -373,7 +375,7 @@ def fetch_manual_patterns(db_name: str, table_name: str) -> pd.DataFrame: if "NotFoundError" not in str(type(e)): st.error(f"Failed to fetch manual patterns: {e}") return pd.DataFrame( - columns=["key", "scope_level", "primary_scope", "secondary_scope", "sample", "resource_type", "created_by"] + columns=["key", "scope_level", "annotation_type", "primary_scope", "secondary_scope", "sample", "resource_type", "created_by"] ) @@ -395,7 +397,7 @@ def create_key(row): df["key"] = df.apply(create_key, axis=1) df.dropna(subset=["key"], 
inplace=True) rows_to_write = [ - RowWrite(key=key, columns={"patterns": group[["sample", "resource_type", "created_by"]].to_dict("records")}) + RowWrite(key=key, columns={"patterns": group[["sample", "resource_type", "annotation_type", "created_by"]].to_dict("records")}) for key, group in df.groupby("key") ] From 6451d736e20325fd6892de5e7c8995f136453953 Mon Sep 17 00:00:00 2001 From: lucasguimaraes-rdx Date: Wed, 8 Oct 2025 15:49:45 -0300 Subject: [PATCH 071/128] Refactoring canvas to fetch existing canvas and create canvas annotations for pattern matches --- .../file_annotation_dashboard/canvas.py | 62 ++++++++++++++++--- 1 file changed, 53 insertions(+), 9 deletions(-) diff --git a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/canvas.py b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/canvas.py index 337415a0..bf5a7e30 100644 --- a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/canvas.py +++ b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/canvas.py @@ -1,5 +1,6 @@ from cognite.client import CogniteClient -from cognite.client.data_classes.data_modeling import NodeOrEdgeData, NodeApply, EdgeApply, ContainerId, ViewId, Node +from cognite.client.data_classes.data_modeling import NodeOrEdgeData, NodeApply, EdgeApply, ContainerId, ViewId, NodeId, EdgeId, Node, Edge +from cognite.client.data_classes.filters import Equals, And, Not import datetime import uuid import streamlit as st @@ -48,9 +49,9 @@ def generate_properties(file_node: Node, file_view_id: ViewId, node_id: str, off } -def create_canvas(name: str, client: CogniteClient): +def create_canvas(name: str, file_node: Node, client: CogniteClient): """Creates the main canvas node.""" - canvas_id = generate_id() + canvas_id = f"file_annotation_canvas_{file_node.external_id}" file_annotation_label = {"externalId": "file_annotations_solution_tag", "space": 
"SolutionTagsInstanceSpace"} canvas = NodeApply( space=CANVAS_SPACE_INSTANCE, @@ -69,12 +70,20 @@ def create_canvas(name: str, client: CogniteClient): ) ], ) - return canvas, canvas_id + return canvas + + +def fetch_existing_canvas(name: str, file_node: Node, client: CogniteClient): + existing_canvas = client.data_modeling.instances.retrieve( + nodes=NodeId(space=CANVAS_SPACE_INSTANCE, external_id=f"file_annotation_canvas_{file_node.external_id}") + ) + + return existing_canvas.nodes[0] if existing_canvas.nodes else None def create_objects(canvas_id: str, file_node: Node, file_view_id: ViewId): """Creates the node and edge for the file container, returning its ID.""" - file_container_id = generate_id() + file_container_id = f"file_annotation_file_container_{file_node.external_id}" properties = generate_properties(file_node, file_view_id, file_container_id) node_apply = NodeApply( @@ -169,10 +178,18 @@ def dm_generate( name: str, file_node: Node, file_view_id: ViewId, client: CogniteClient, unmatched_tags_with_regions: list = [] ): """Orchestrates the creation of the canvas, its objects, and bounding box annotations.""" - canvas, canvas_id = create_canvas(name=name, client=client) - nodes, edges, file_container_id = create_objects( - canvas_id=canvas_id, file_node=file_node, file_view_id=file_view_id - ) + canvas = fetch_existing_canvas(name, file_node, client) + + if canvas: + file_container_id = f"file_annotation_file_container_{file_node.external_id}" + reset_canvas_annotations(canvas.external_id, client) + nodes = [] + edges = [] + else: + canvas = create_canvas(name, file_node, client) + nodes, edges, file_container_id = create_objects(canvas.external_id, file_node, file_view_id) + + canvas_id = canvas.external_id if unmatched_tags_with_regions: annotation_nodes, annotation_edges = create_bounding_box_annotations( @@ -184,3 +201,30 @@ def dm_generate( client.data_modeling.instances.apply(nodes=[canvas] + nodes, edges=edges) st.session_state["canvas_id"] = 
def reset_canvas_annotations(canvas_id: str, client: CogniteClient):
    """Delete every annotation attached to a canvas (edges and their nodes).

    Lists all 'referencesCanvasAnnotation' edges starting at the canvas node,
    then removes those edges together with the annotation nodes they point to.
    """
    annotation_edge_filter = And(
        Equals(
            property=['edge', 'type'],
            value={'space': CANVAS_SPACE_CANVAS, 'externalId': 'referencesCanvasAnnotation'},
        ),
        Equals(
            property=['edge', 'startNode'],
            value={'space': CANVAS_SPACE_INSTANCE, 'externalId': canvas_id},
        ),
    )

    annotation_edges = client.data_modeling.instances.list(
        instance_type="edge",
        filter=annotation_edge_filter,
        limit=-1,
    )

    # Collect ids of the edges and of the annotation nodes they reference.
    edge_ids = []
    node_ids = []
    for edge in annotation_edges:
        edge_ids.append(EdgeId(space=edge.space, external_id=edge.external_id))
        node_ids.append(NodeId(space=edge.end_node.space, external_id=edge.end_node.external_id))

    if edge_ids:
        client.data_modeling.instances.delete(edges=edge_ids)

    if node_ids:
        client.data_modeling.instances.delete(nodes=node_ids)
Canvas with bounding boxes..."): - _, file_view_config = fetch_extraction_pipeline_config(selected_pipeline) # The 'regions' column is no longer available in the RAW table. # You will need to adjust the canvas generation logic to handle this. # For now, we will pass an empty list. potential_tags_for_canvas = [] canvas_url = generate_file_canvas( file_id=file_node_id, - file_view=file_view_config, + file_view=file_view, ep_config=ep_config, unmatched_tags_with_regions=potential_tags_for_canvas, ) From 39ed2c79722e3af66bf4e0c84ea65ea794dd6b7e Mon Sep 17 00:00:00 2001 From: lucasguimaraes-rdx Date: Wed, 8 Oct 2025 15:53:18 -0300 Subject: [PATCH 073/128] Creating annotation_type selectbox when fetching manual patterns --- .../pages/Annotation_Quality.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/pages/Annotation_Quality.py b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/pages/Annotation_Quality.py index 55bf998c..a987b3ca 100644 --- a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/pages/Annotation_Quality.py +++ b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/pages/Annotation_Quality.py @@ -558,6 +558,10 @@ def calculate_metrics(row): column_config={ "key": st.column_config.TextColumn("Scope Key", disabled=True), "sample": st.column_config.TextColumn("Pattern String", required=True), + "annotation_type": st.column_config.SelectboxColumn( + "Annotation Type", + options=["diagrams.FileLink", "diagrams.AssetLink"], + required=True), "resource_type": st.column_config.TextColumn("Resource Type", required=True), "scope_level": st.column_config.SelectboxColumn( "Scope Level", @@ -590,6 +594,11 @@ def calculate_metrics(row): with st.form(key="new_pattern_form", clear_on_submit=True): st.write("2. 
Enter Pattern Details") new_pattern = st.text_input("Pattern String", placeholder="e.g., [PI]-00000") + new_annotation_type = st.selectbox( + "Annotation Type", + ["diagrams.FileLink", "diagrams.AssetLink"], + key="new_annotation_type_selector" + ) new_resource_type = st.text_input("Resource Type", placeholder="e.g., Asset") primary_scope_value = "" From 3a31037c9509cb916d7569b9a1c9397f34d6b178 Mon Sep 17 00:00:00 2001 From: lucasguimaraes-rdx Date: Wed, 8 Oct 2025 15:54:05 -0300 Subject: [PATCH 074/128] Creating component to connect tags with entities through patterns --- .../file_annotation_dashboard/helper.py | 259 ++++++++++++++++++ .../pages/Annotation_Quality.py | 162 ++++++++++- 2 files changed, 415 insertions(+), 6 deletions(-) diff --git a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/helper.py b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/helper.py index e3db85c3..95835ba9 100644 --- a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/helper.py +++ b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/helper.py @@ -516,3 +516,262 @@ def fetch_potential_annotations(db_name: str, table_name: str, file_external_id: except Exception as e: st.error(f"Failed to fetch potential annotations: {e}") return pd.DataFrame() + + +@st.cache_data(ttl=3600) +def fetch_entities(entity_view: ViewPropertyConfig, resource_property: str) -> pd.DataFrame: + """ + Fetches entity instances from the specified data model view and returns a tidy DataFrame. 
+ """ + instances = client.data_modeling.instances.list( + instance_type="node", + space=entity_view.instance_space, + sources=entity_view.as_view_id(), + limit=-1 + ) + if not instances: + return pd.DataFrame() + data = [] + for instance in instances: + props = instance.properties.get(entity_view.as_view_id(), {}) + + data.append( + { + "name": props.get("name"), + "externalId": instance.external_id, + "resourceType": props.get(resource_property), + "sysUnit": props.get("sysUnit"), + "space": instance.space, + } + ) + return pd.DataFrame(data) + + +def show_connect_unmatched_ui( + tag_text, + file_view, + target_entities_view, + file_resource_property, + target_entities_resource_property, + associated_files, + tab, + db_name, + pattern_table +): + """ + Displays the UI to connect a single unmatched tag to either an Asset or a File. + """ + st.markdown(f"### Tag to Connect: `{tag_text}`") + st.markdown(f"Associated Files: `{associated_files}`") + col1, col2 = st.columns(2) + entity_type = None + + with col1: + if st.button("Retrieve Assets", key=f"btn_retrieve_assets_{tab}"): + st.session_state.selected_entity_type_to_connect = "asset" + st.session_state.selected_entity_to_connect_index = None + with col2: + if st.button("Retrieve Files", key=f"btn_retrieve_files_{tab}"): + st.session_state.selected_entity_type_to_connect = "file" + st.session_state.selected_entity_to_connect_index = None + + entity_type = st.session_state.selected_entity_type_to_connect + + if not entity_type: + return + + + st.write() + + if entity_type == "file": + entity_view = file_view + resource_property = file_resource_property + annotation_type = "diagrams.FileLink" + else: + entity_view = target_entities_view + resource_property = target_entities_resource_property + annotation_type = "diagrams.AssetLink" + + df_entities = fetch_entities(entity_view, resource_property) + + if df_entities.empty: + st.warning(f"No {entity_type}s found.") + return + + df_entities_display = df_entities.copy() 
+ df_entities_display.insert(0, "Select", False) + + if st.session_state.selected_entity_to_connect_index is not None: + idx = st.session_state.selected_entity_to_connect_index + + if idx in df_entities_display.index: + df_entities_display.loc[:, "Select"] = False + df_entities_display.at[idx, "Select"] = True + + filterable_columns = ["sysUnit", "resourceType"] + + for filterable_column in filterable_columns: + unique_values = sorted(df_entities_display[filterable_column].dropna().unique().tolist()) + + selected_value = st.selectbox( + f"Filter by {filterable_column}", + key=f"sb_filterable_column_{filterable_column}_{tab}", + options=[None] + unique_values, + index=0 + ) + + if selected_value: + df_entities_display = df_entities_display[df_entities_display[filterable_column] == selected_value] + + entity_editor_key = f"{entity_type}_editor_{tag_text}_{tab}" + edited_entities = st.data_editor( + df_entities_display, + key=entity_editor_key, + column_config={ + "Select": st.column_config.CheckboxColumn(required=True), + "name": "Name", + "externalId": "External ID", + "resourceType": "Resource Type", + "sysUnit": "Sys Unit" + }, + use_container_width=True, + hide_index=True, + disabled=df_entities_display.columns.difference(["Select"]), + ) + + selected_indices = edited_entities[edited_entities.Select].index.tolist() + + if len(selected_indices) > 1: + new_selection = [idx for idx in selected_indices if idx != st.session_state.selected_entity_to_connect_index] + st.session_state.selected_entity_to_connect_index = new_selection[0] if new_selection else None + st.rerun() + elif len(selected_indices) == 1: + st.session_state.selected_entity_to_connect_index = selected_indices[0] + elif len(selected_indices) == 0 and st.session_state.selected_entity_to_connect_index is not None: + st.session_state.selected_entity_to_connect_index = None + st.rerun() + + if st.session_state.selected_entity_to_connect_index is not None: + selected_entity = 
df_entities.loc[st.session_state.selected_entity_to_connect_index] + if st.button( + f"Connect '{tag_text}' to '{selected_entity['name']}' in {str(len(associated_files)) + ' files' if len(associated_files) > 1 else str(len(associated_files)) + ' file'}", + key=f"btn_connect_tag_to_entities_{tab}" + ): + success, count, error = create_tag_connection( + client, + db_name, + pattern_table, + tag_text, + associated_files, + selected_entity, + annotation_type + ) + + if success: + st.toast( + f"{count} annotation{'s' if count > 1 else ''} created from tag '{tag_text}' to {entity_type} '{selected_entity['name']}' " + f"in {len(associated_files)} file{'s' if len(associated_files) > 1 else ''}!", + icon=":material/check_small:" + ) + st.cache_data.clear() + else: + st.toast( + body=f"Failed to connect tag '{tag_text}': {error}", + icon=":material/error:" + ) + + +def create_tag_connection( + client: CogniteClient, + db_name: str, + table_name: str, + tag_text: str, + associated_files: list[str], + selected_entity: pd.Series, + annotation_type: str +): + updated_rows = [] + updated_edges = [] + + try: + rows = client.raw.rows.list( + db_name=db_name, + table_name=table_name, + limit=-1 + ) + + for row in rows: + row_data = row.columns + + if row_data.get("startNodeText") == tag_text and row_data.get("startNode") in associated_files: + edge_external_id = row.key + file_id = row_data.get("startNode") + + row_data["endNode"] = selected_entity["externalId"] + row_data["endNodeSpace"] = selected_entity["space"] + row_data["endNodeResourceType"] = selected_entity["resourceType"] + row_data["status"] = "approved" + + updated_rows.append( + RowWrite( + key=edge_external_id, + columns=row_data + ) + ) + + updated_edges.append( + EdgeApply( + space=row_data.get("space"), + external_id=edge_external_id, + type=DirectRelationReference(space=row_data.get("viewSpace"), external_id=annotation_type), + start_node=DirectRelationReference(space=row_data.get("startNodeSpace"), 
external_id=file_id), + end_node=DirectRelationReference(space=selected_entity.get("space"), external_id=selected_entity["externalId"]) + ) + ) + + if updated_rows: + st.write(len(updated_rows)) + # client.raw.rows.insert( + # db_name=db_name, + # table_name=table_name, + # row=updated_rows, + # ensure_parent=True + # ) + + if updated_edges: + st.write(len(updated_edges)) + # client.data_modeling.instances.apply(edges=updated_edges) + + return True, len(updated_rows), None + except Exception as e: + return False, 0, str(e) + + +def build_unmatched_tags_with_regions( + df: pd.DataFrame, + file_id: str, + potential_new_annotations: list[str] +): + df_filtered = df[ + (df["startNode"] == file_id) & + (df["startNodeText"].isin(potential_new_annotations)) + ] + + unmatched_tags_with_regions = [] + + for _, row in df_filtered.iterrows(): + region = { + "vertices": [ + {"x": row["startNodeXMin"], "y": row["startNodeYMin"]}, + {"x": row["startNodeXMax"], "y": row["startNodeYMin"]}, + {"x": row["startNodeXMax"], "y": row["startNodeYMax"]}, + {"x": row["startNodeXMin"], "y": row["startNodeYMax"]}, + ] + } + + unmatched_tags_with_regions.append({ + "text": row["startNodeText"], + "regions": [region] + }) + + return unmatched_tags_with_regions \ No newline at end of file diff --git a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/pages/Annotation_Quality.py b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/pages/Annotation_Quality.py index a987b3ca..7e29b9aa 100644 --- a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/pages/Annotation_Quality.py +++ b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/pages/Annotation_Quality.py @@ -12,6 +12,8 @@ fetch_annotation_states, save_manual_patterns, normalize, + show_connect_unmatched_ui, + build_unmatched_tags_with_regions, ) from cognite.client.data_classes.data_modeling import NodeId @@ -31,6 +33,14 
@@ def reset_selection(): # --- Initialize Session State --- if "selected_row_index" not in st.session_state: st.session_state.selected_row_index = None +if "selected_unmatched_per_file_index" not in st.session_state: + st.session_state.selected_unmatched_per_file_index = None +if "selected_unmatched_overall_index" not in st.session_state: + st.session_state.selected_unmatched_overall_index = None +if "selected_entity_to_connect_index" not in st.session_state: + st.session_state.selected_entity_to_connect_index = None +if "selected_entity_type_to_connect" not in st.session_state: + st.session_state.selected_entity_type_to_connect = None # --- Sidebar for Pipeline Selection --- st.sidebar.title("Pipeline Selection") @@ -257,12 +267,91 @@ def reset_selection(): f"{total_potential}", help="A list of all unique tags found by the pattern-mode job that do not yet exist as actual annotations. This is now a clean 'to-do list' of tags that could be promoted or used to create new patterns.", ) - st.dataframe( - pd.DataFrame(sorted(list(potential_new_annotations_set)), columns=["Tag"]), + + unmatched_display = pd.DataFrame(sorted(list(potential_new_annotations_set)), columns=["text"]) + unmatched_display.insert(0, "Select", False) + + if st.session_state.selected_unmatched_overall_index is not None: + idx = st.session_state.selected_unmatched_overall_index + + if idx in unmatched_display.index: + unmatched_display.loc[:, "Select"] = False + unmatched_display.at[idx, "Select"] = True + + unmatched_tags_list = list(potential_new_annotations_set) + df_unmatched_filtered = df_metrics_input[df_metrics_input["startNodeText"].isin(unmatched_tags_list)] + + tag_to_files_unmatched = ( + df_unmatched_filtered.groupby("startNodeText")["startNode"] + .unique() + .apply(list) + .to_dict() + ) + + tag_occurrences = ( + df_unmatched_filtered.groupby("startNodeText")["startNode"] + .count() + .reset_index() + .rename(columns={"startNode": "occurrenceCount"}) + ) + + tag_file_counts = ( + 
df_unmatched_filtered.groupby("startNodeText")["startNode"] + .nunique() + .reset_index() + .rename(columns={"startNode": "fileCount"}) + ) + + tag_stats = tag_file_counts.merge(tag_occurrences, on="startNodeText", how="outer") + + unmatched_display = unmatched_display.merge(tag_stats, left_on="text", right_on="startNodeText", how="left") + unmatched_display.drop(columns=["startNodeText"], inplace=True) + + unmatched_editor_key = "overall_unmatched_tags_editor" + unmatched_data_editor = st.data_editor( + unmatched_display, + key=unmatched_editor_key, + column_config={ + "Select": st.column_config.CheckboxColumn(required=True), + "text": "Tag", + "fileCount": "Associated Files", + "occurrenceCount": "Occurrences" + }, use_container_width=True, hide_index=True, + disabled=unmatched_display.columns.difference(["Select"]), ) + selected_indices = unmatched_data_editor[unmatched_data_editor.Select].index.tolist() + + if len(selected_indices) > 1: + new_selection = [idx for idx in selected_indices if idx != st.session_state.selected_unmatched_overall_index] + st.session_state.selected_unmatched_overall_index = new_selection[0] if new_selection else None + st.rerun() + elif len(selected_indices) == 1: + st.session_state.selected_unmatched_overall_index = selected_indices[0] + elif len(selected_indices) == 0 and st.session_state.selected_unmatched_overall_index is not None: + st.session_state.selected_unmatched_overall_index = None + st.rerun() + + if st.session_state.selected_unmatched_overall_index is not None: + selected_tag_row = unmatched_display.loc[st.session_state.selected_unmatched_overall_index] + selected_tag_text = selected_tag_row["text"] + + show_connect_unmatched_ui( + selected_tag_text, + file_view, + target_entities_view, + file_resource_property, + target_entities_resource_property, + associated_files=tag_to_files_unmatched.get(selected_tag_text, []), + tab="overall", + db_name=db_name, + pattern_table=pattern_table + ) + + + # 
========================================== # PER-FILE ANALYSIS TAB # ========================================== @@ -483,6 +572,8 @@ def calculate_metrics(row): norm_unmatched = norm_potential - norm_actual + potential_new_annotations_set = {potential_map[t] for t in norm_unmatched if t in potential_map} + actual_df = df_actual_annotations_details.drop_duplicates() potential_df = df_potential_tags_details[ df_potential_tags_details["startNodeText"].isin({potential_map[t] for t in norm_unmatched}) @@ -493,7 +584,11 @@ def calculate_metrics(row): # The 'regions' column is no longer available in the RAW table. # You will need to adjust the canvas generation logic to handle this. # For now, we will pass an empty list. - potential_tags_for_canvas = [] + potential_tags_for_canvas = build_unmatched_tags_with_regions( + df=df_metrics_input, + file_id=selected_file, + potential_new_annotations=potential_new_annotations_set + ) canvas_url = generate_file_canvas( file_id=file_node_id, file_view=file_view, @@ -529,12 +624,67 @@ def calculate_metrics(row): "💡 Potential New Annotations in this File", len(potential_df), ) - st.dataframe( - potential_df, - column_config={"startNodeText": "Tag", "endNodeResourceType": "Resource Type"}, + + unmatched_display = potential_df[["startNodeText", "endNodeResourceType"]].copy() + unmatched_display.insert(0, "Select", False) + + occurrences = ( + df_patterns_file[df_patterns_file["startNode"] == selected_file].groupby("startNodeText") + .size() + .reset_index(name="occurrenceCount") + ) + + unmatched_display = unmatched_display.merge(occurrences, on="startNodeText", how="left") + + if st.session_state.selected_unmatched_per_file_index is not None: + idx = st.session_state.selected_unmatched_per_file_index + + if idx in unmatched_display.index: + unmatched_display.loc[:, "Select"] = False + unmatched_display.at[idx, "Select"] = True + + unmatched_editor_key = "unmatched_tags_editor" + unmatched_data_editor = st.data_editor( + 
unmatched_display, + key=unmatched_editor_key, + column_config={ + "Select": st.column_config.CheckboxColumn(required=True), + "startNodeText": "Tag", + "endNodeResourceType": "Resource Type", + "occurrenceCount": "Occurrences" + }, use_container_width=True, hide_index=True, + disabled=unmatched_display.columns.difference(["Select"]), + ) + + selected_indices = unmatched_data_editor[unmatched_data_editor.Select].index.tolist() + + if len(selected_indices) > 1: + new_selection = [idx for idx in selected_indices if idx != st.session_state.selected_unmatched_per_file_index] + st.session_state.selected_unmatched_per_file_index = new_selection[0] if new_selection else None + st.rerun() + elif len(selected_indices) == 1: + st.session_state.selected_unmatched_per_file_index = selected_indices[0] + elif len(selected_indices) == 0 and st.session_state.selected_unmatched_per_file_index is not None: + st.session_state.selected_unmatched_per_file_index = None + st.rerun() + + if st.session_state.selected_unmatched_per_file_index is not None: + selected_tag_row = unmatched_display.loc[st.session_state.selected_unmatched_per_file_index] + selected_tag_text = selected_tag_row["startNodeText"] + show_connect_unmatched_ui( + selected_tag_text, + file_view, + target_entities_view, + file_resource_property, + target_entities_resource_property, + associated_files=[selected_file], + tab="per_file", + db_name=db_name, + pattern_table=pattern_table ) + else: st.info("✔️ Select a file in the table above to see a detailed breakdown of its tags.") From a8cfe60cf530928d4001c4d809de8d5743f29aca Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Fri, 10 Oct 2025 11:04:07 -0500 Subject: [PATCH 075/128] refactored the list annotations function in apply service --- .../services/ApplyService.py | 42 +++++++++---------- 1 file changed, 20 insertions(+), 22 deletions(-) diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ApplyService.py 
b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ApplyService.py index 49409415..22fa6395 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ApplyService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ApplyService.py @@ -145,7 +145,9 @@ def _delete_annotations_for_file(self, file_id: NodeId) -> dict[str, int]: """Deletes all standard and pattern edges and their corresponding RAW rows for a file.""" counts = {"doc": 0, "tag": 0, "pattern": 0} - std_edges = self._list_annotations_for_file(file_id, self.sink_node_ref, negate=True) + std_edges = self._list_annotations_for_file( + file_id, file_id.space + ) # NOTE: Annotations produced from regular diagram detect are stored in the same instance space as the file node if std_edges: edge_ids, doc_keys, tag_keys = [], [], [] for edge in std_edges: @@ -170,7 +172,9 @@ def _delete_annotations_for_file(self, file_id: NodeId) -> dict[str, int]: ) counts["doc"], counts["tag"] = len(doc_keys), len(tag_keys) - pattern_edges = self._list_annotations_for_file(file_id, self.sink_node_ref, negate=False) + pattern_edges = self._list_annotations_for_file( + file_id, self.sink_node_ref.space + ) # NOTE: Annotations produced from pattern mode are stored in the same instance space as the sink node if pattern_edges: edge_ids = [edge.as_id() for edge in pattern_edges] row_keys = [edge.external_id for edge in pattern_edges] @@ -185,6 +189,20 @@ def _delete_annotations_for_file(self, file_id: NodeId) -> dict[str, int]: counts["pattern"] = len(row_keys) return counts + def _list_annotations_for_file(self, node_id: NodeId, edge_instance_space: str): + """ + List all annotation edges for a file node given the instance space of where the edges are stored. 
+ """ + start_node_filter = Equals(["edge", "startNode"], {"space": node_id.space, "externalId": node_id.external_id}) + + return self.client.data_modeling.instances.list( + instance_type="edge", + sources=[self.core_annotation_view_id], + space=edge_instance_space, + filter=start_node_filter, + limit=-1, + ) + def _process_pattern_results( self, result_item: dict, file_node: Node, existing_hashes: set ) -> tuple[list[EdgeApply], list[RowWrite]]: @@ -346,26 +364,6 @@ def _create_pattern_annotation_id(self, file_id: NodeId, raw_annotation: dict[st prefix = prefix[: self.EXTERNAL_ID_LIMIT - 11] return f"{prefix}:{hash_}" - def _list_annotations_for_file(self, node_id: NodeId, end_node: DirectRelationReference, negate: bool = False): - """ - List all annotation edges for a file node, optionally filtering by the end node. - """ - start_node_filter = Equals(["edge", "startNode"], {"space": node_id.space, "externalId": node_id.external_id}) - end_node_filter = Equals(["edge", "endNode"], {"space": end_node.space, "externalId": end_node.external_id}) - if negate: - final_filter = And(start_node_filter, dm.filters.Not(end_node_filter)) - space = node_id.space - else: - space = self.sink_node_ref.space - final_filter = And(start_node_filter, end_node_filter) - return self.client.data_modeling.instances.list( - instance_type="edge", - sources=[self.core_annotation_view_id], - space=space, - filter=final_filter, - limit=-1, - ) - def _get_edge_apply_unique_key(self, edge_apply_instance: EdgeApply) -> tuple: start_node = edge_apply_instance.start_node end_node = edge_apply_instance.end_node From 2b5624da877651bf577f597d2ff58b0afc32220f Mon Sep 17 00:00:00 2001 From: lucasguimaraes-rdx Date: Fri, 10 Oct 2025 17:16:35 -0300 Subject: [PATCH 076/128] Including annotation type reference when saving manual patterns --- .../file_annotation_dashboard/pages/Annotation_Quality.py | 1 + 1 file changed, 1 insertion(+) diff --git 
a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/pages/Annotation_Quality.py b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/pages/Annotation_Quality.py index 7e29b9aa..fecd44a3 100644 --- a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/pages/Annotation_Quality.py +++ b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/pages/Annotation_Quality.py @@ -773,6 +773,7 @@ def calculate_metrics(row): "sample": new_pattern, "resource_type": new_resource_type, "scope_level": scope_level, + "annotation_type": new_annotation_type, "primary_scope": primary_scope_value, "secondary_scope": secondary_scope_value, "created_by": "streamlit", From 162c2835602a8bf1d467d71ababa4431a9449c80 Mon Sep 17 00:00:00 2001 From: lucasguimaraes-rdx Date: Fri, 10 Oct 2025 17:26:27 -0300 Subject: [PATCH 077/128] Implementing "Customize Table Columns" multiselect box when fetching target entities --- .../file_annotation_dashboard/helper.py | 1560 +++++++++-------- 1 file changed, 784 insertions(+), 776 deletions(-) diff --git a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/helper.py b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/helper.py index 95835ba9..aac81e78 100644 --- a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/helper.py +++ b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/helper.py @@ -1,777 +1,785 @@ -import os -import re -import yaml -import streamlit as st -import pandas as pd -from datetime import datetime, timedelta -from cognite.client import CogniteClient -from cognite.client.data_classes import RowWrite, Asset, AssetFilter -from cognite.client.data_classes.data_modeling import ( - ViewId, - NodeId, - Node, - filters, - EdgeApply, - NodeOrEdgeData, - DirectRelationReference, -) -from 
cognite.client.data_classes.functions import FunctionCallLog -from data_structures import ViewPropertyConfig -from canvas import dm_generate - -client = CogniteClient() - - -@st.cache_data(ttl=3600) -def get_file_node(file_id: NodeId, file_view: ViewPropertyConfig) -> Node | None: - """Fetches a single file node from CDF.""" - try: - node = client.data_modeling.instances.retrieve_nodes(nodes=file_id, sources=file_view.as_view_id()) - return node - except Exception as e: - st.error(f"Failed to retrieve file node {file_id}: {e}") - return None - - -def generate_file_canvas( - file_id: NodeId, file_view: ViewPropertyConfig, ep_config: dict, unmatched_tags_with_regions: list = [] -): - """ - Generates an Industrial Canvas, including bounding boxes for unmatched tags, - and returns the canvas URL. - """ - file_node = get_file_node(file_id, file_view) - if not file_node: - st.error("Could not generate canvas because the file node could not be retrieved.") - return None - - canvas_name = f"Annotation Quality Analysis - {file_node.external_id}" - - try: - domain = os.getenv("COGNITE_ORGANIZATION") - project = client.config.project - cluster = client.config.cdf_cluster - - canvas_id = dm_generate( - name=canvas_name, - file_node=file_node, - file_view_id=file_view.as_view_id(), - client=client, - unmatched_tags_with_regions=unmatched_tags_with_regions, - ) - st.success(f"Successfully generated canvas: {canvas_name}") - - canvas_url = f"https://{domain}.fusion.cognite.com/{project}/industrial-canvas/canvas?canvasId={canvas_id}&cluster={cluster}.cognitedata.com&env={cluster}&workspace=industrial-tools" - return canvas_url - - except Exception as e: - st.error(f"Failed to generate canvas: {e}") - return None - - -@st.cache_data(ttl=600) -def find_pipelines(name_filter: str = "file_annotation") -> list[str]: - """ - Finds the external IDs of all extraction pipelines in the project, - filtered by a substring in their external ID. 
- """ - try: - all_pipelines = client.extraction_pipelines.list(limit=-1) - if not all_pipelines: - st.warning(f"No extraction pipelines found in the project.") - return [] - - filtered_ids = [p.external_id for p in all_pipelines if name_filter in p.external_id] - - if not filtered_ids: - st.warning(f"No pipelines matching the filter '*{name_filter}*' found in the project.") - return [] - - return sorted(filtered_ids) - except Exception as e: - st.error(f"An error occurred while searching for extraction pipelines: {e}") - return [] - - -@st.cache_data(ttl=3600) -def fetch_raw_table_data(db_name: str, table_name: str) -> pd.DataFrame: - """Fetches all rows from a specified RAW table and returns as a DataFrame.""" - try: - rows = client.raw.rows.list(db_name=db_name, table_name=table_name, limit=-1) - if not rows: - return pd.DataFrame() - data = [row.columns for row in rows] - return pd.DataFrame(data) - except Exception as e: - st.error(f"Failed to fetch data from RAW table '{table_name}': {e}") - return pd.DataFrame() - - -def parse_run_message(message: str) -> dict: - """Parses the structured run message and returns a dictionary of its components.""" - if not message: - return {} - - pattern = re.compile( - r"\(caller:(?P\w+), function_id:(?P[\w\.-]+), call_id:(?P[\w\.-]+)\) - " - r"total files processed: (?P\d+) - " - r"successful files: (?P\d+) - " - r"failed files: (?P\d+)" - ) - match = pattern.search(message) - if match: - data = match.groupdict() - for key in ["total", "success", "failed"]: - if key in data: - data[key] = int(data[key]) - return data - return {} - - -@st.cache_data(ttl=3600) -def fetch_extraction_pipeline_config(pipeline_ext_id: str) -> tuple[dict, ViewPropertyConfig, ViewPropertyConfig]: - """ - Fetch configurations from the latest extraction - """ - ep_configuration = client.extraction_pipelines.config.retrieve(external_id=pipeline_ext_id) - config_dict = yaml.safe_load(ep_configuration.config) - - local_annotation_state_view = 
config_dict["dataModelViews"]["annotationStateView"] - annotation_state_view = ViewPropertyConfig( - local_annotation_state_view["schemaSpace"], - local_annotation_state_view["externalId"], - local_annotation_state_view["version"], - local_annotation_state_view["instanceSpace"], - ) - - local_file_view = config_dict["dataModelViews"]["fileView"] - file_view = ViewPropertyConfig( - local_file_view["schemaSpace"], - local_file_view["externalId"], - local_file_view["version"], - local_file_view.get("instanceSpace"), - ) - - local_target_entities_view = config_dict["dataModelViews"]["targetEntitiesView"] - target_entities_view = ViewPropertyConfig( - local_target_entities_view["schemaSpace"], - local_target_entities_view["externalId"], - local_target_entities_view["version"], - local_target_entities_view.get("instanceSpace"), - ) - - views_dict = { - "annotation_state": annotation_state_view, - "file": file_view, - "target_entities": target_entities_view, - } - - return (config_dict, views_dict) - - -@st.cache_data(ttl=3600) -def fetch_annotation_states(annotation_state_view: ViewPropertyConfig, file_view: ViewPropertyConfig): - """ - Fetches annotation state instances from the specified data model view - and joins them with their corresponding file instances. 
- """ - annotation_instances = client.data_modeling.instances.list( - instance_type="node", - space=annotation_state_view.instance_space, - sources=annotation_state_view.as_view_id(), - limit=-1, - ) - if not annotation_instances: - return pd.DataFrame() - - annotation_data = [] - nodes_to_fetch = [] - for instance in annotation_instances: - node_data = { - "externalId": instance.external_id, - "space": instance.space, - "createdTime": pd.to_datetime(instance.created_time, unit="ms"), - "lastUpdatedTime": pd.to_datetime(instance.last_updated_time, unit="ms"), - } - for prop_key, prop_value in instance.properties[annotation_state_view.as_view_id()].items(): - if prop_key == "linkedFile" and prop_value: - file_external_id = prop_value.get("externalId") - file_space = prop_value.get("space") - node_data["fileExternalId"] = file_external_id - node_data["fileSpace"] = file_space - if file_external_id and file_space: - nodes_to_fetch.append(NodeId(space=file_space, external_id=file_external_id)) - node_data[prop_key] = prop_value - annotation_data.append(node_data) - - df_annotations = pd.DataFrame(annotation_data) - if df_annotations.empty or not nodes_to_fetch: - return df_annotations - - unique_nodes_to_fetch = list(set(nodes_to_fetch)) - file_instances = client.data_modeling.instances.retrieve_nodes( - nodes=unique_nodes_to_fetch, sources=file_view.as_view_id() - ) - - file_data = [] - for instance in file_instances: - node_data = {"fileExternalId": instance.external_id, "fileSpace": instance.space} - properties = instance.properties[file_view.as_view_id()] - - for prop_key, prop_value in properties.items(): - node_data[f"file{prop_key.capitalize()}"] = ( - ", ".join(map(str, prop_value)) if isinstance(prop_value, list) else prop_value - ) - file_data.append(node_data) - - if not file_data: - return df_annotations - - df_files = pd.DataFrame(file_data) - df_merged = pd.merge(df_annotations, df_files, on=["fileExternalId", "fileSpace"], how="left") - - for col in 
["createdTime", "lastUpdatedTime"]: - if col in df_merged.columns: - df_merged[col] = df_merged[col].dt.tz_localize("UTC") - - df_merged.rename(columns={"annotationStatus": "status", "attemptCount": "retries"}, inplace=True) - - return df_merged - - -@st.cache_data(ttl=3600) -def fetch_pipeline_run_history(pipeline_ext_id: str): - """Fetches the full run history for a given extraction pipeline.""" - return client.extraction_pipelines.runs.list(external_id=pipeline_ext_id, limit=-1) - - -@st.cache_data(ttl=3600) -def fetch_function_logs(function_id: int, call_id: int): - """Fetches the logs for a specific function call.""" - try: - log: FunctionCallLog = client.functions.calls.get_logs(call_id, function_id) - return log.to_text(with_timestamps=False) - except Exception as e: - return [f"Could not retrieve logs: {e}"] - - -def process_runs_for_graphing(runs): - """Transforms pipeline run data into a DataFrame for graphing.""" - launch_data, finalize_runs_to_agg = [], [] - for run in runs: - if run.status != "success": - continue - parsed = parse_run_message(run.message) - if not parsed: - continue - timestamp, count, caller = ( - pd.to_datetime(run.created_time, unit="ms").tz_localize("UTC"), - parsed.get("total", 0), - parsed.get("caller"), - ) - if caller == "Launch": - launch_data.append({"timestamp": timestamp, "count": count, "type": "Launch"}) - elif caller == "Finalize": - finalize_runs_to_agg.append({"timestamp": timestamp, "count": count}) - - aggregated_finalize_data = [] - if finalize_runs_to_agg: - finalize_runs_to_agg.sort(key=lambda x: x["timestamp"]) - current_group_start_time, current_group_count = finalize_runs_to_agg[0]["timestamp"], 0 - for run in finalize_runs_to_agg: - if run["timestamp"] < current_group_start_time + timedelta(minutes=10): - current_group_count += run["count"] - else: - aggregated_finalize_data.append( - {"timestamp": current_group_start_time, "count": current_group_count, "type": "Finalize"} - ) - current_group_start_time, 
current_group_count = run["timestamp"], run["count"] - if current_group_count > 0: - aggregated_finalize_data.append( - {"timestamp": current_group_start_time, "count": current_group_count, "type": "Finalize"} - ) - - return pd.concat([pd.DataFrame(launch_data), pd.DataFrame(aggregated_finalize_data)], ignore_index=True) - - -@st.cache_data(ttl=3600) -def fetch_pattern_catalog(db_name: str, table_name: str) -> pd.DataFrame: - """ - Fetches the entity cache and explodes it to create a complete - catalog of all generated patterns, indexed by resourceType. - """ - try: - rows = client.raw.rows.list(db_name=db_name, table_name=table_name, limit=-1) - if not rows: - return pd.DataFrame() - all_patterns = [] - for row in pd.DataFrame([row.columns for row in rows]).itertuples(): - for sample_list in ["AssetPatternSamples", "FilePatternSamples"]: - if hasattr(row, sample_list) and isinstance(getattr(row, sample_list), list): - for item in getattr(row, sample_list): - if item.get("sample") and item.get("resource_type"): - all_patterns.extend( - [ - {"resourceType": item["resource_type"], "pattern": pattern} - for pattern in item["sample"] - ] - ) - return pd.DataFrame(all_patterns) - except Exception as e: - st.error(f"Failed to fetch pattern catalog from '{table_name}': {e}") - return pd.DataFrame() - - -def fetch_manual_patterns(db_name: str, table_name: str) -> pd.DataFrame: - """ - Fetches all manual patterns from the RAW table and explodes them - into a tidy DataFrame for display and editing. 
- """ - all_patterns = [] - try: - for row in client.raw.rows.list(db_name=db_name, table_name=table_name, limit=-1): - key, patterns_list = row.key, row.columns.get("patterns", []) - scope_level, primary_scope, secondary_scope = "Global", "", "" - if key != "GLOBAL": - parts = key.split("_") - if len(parts) == 2: - scope_level, primary_scope, secondary_scope = "Secondary Scope", parts[0], parts[1] - else: - scope_level, primary_scope = "Primary Scope", key - all_patterns.extend( - [ - { - "key": key, - "scope_level": scope_level, - "annotation_type": p.get("annotation_type"), - "primary_scope": primary_scope, - "secondary_scope": secondary_scope, - "sample": p.get("sample"), - "resource_type": p.get("resource_type"), - "created_by": p.get("created_by"), - } - for p in patterns_list - ] - ) - - df = ( - pd.DataFrame(all_patterns) - if all_patterns - else pd.DataFrame( - columns=[ - "key", - "scope_level", - "annotation_type", - "primary_scope", - "secondary_scope", - "sample", - "resource_type", - "created_by", - ] - ) - ) - return df.fillna("").astype(str) - except Exception as e: - if "NotFoundError" not in str(type(e)): - st.error(f"Failed to fetch manual patterns: {e}") - return pd.DataFrame( - columns=["key", "scope_level", "annotation_type", "primary_scope", "secondary_scope", "sample", "resource_type", "created_by"] - ) - - -def save_manual_patterns(df: pd.DataFrame, db_name: str, table_name: str): - """ - Takes a tidy DataFrame of patterns, groups them by scope key, - and writes them back to the RAW table. 
- """ - - def create_key(row): - if row["scope_level"] == "Global": - return "GLOBAL" - if row["scope_level"] == "Primary Scope" and row["primary_scope"]: - return row["primary_scope"] - if row["scope_level"] == "Secondary Scope" and row["primary_scope"] and row["secondary_scope"]: - return f"{row['primary_scope']}_{row['secondary_scope']}" - return None - - df["key"] = df.apply(create_key, axis=1) - df.dropna(subset=["key"], inplace=True) - rows_to_write = [ - RowWrite(key=key, columns={"patterns": group[["sample", "resource_type", "annotation_type", "created_by"]].to_dict("records")}) - for key, group in df.groupby("key") - ] - - existing_keys = {r.key for r in client.raw.rows.list(db_name, table_name, limit=-1)} - keys_to_delete = list(existing_keys - {r.key for r in rows_to_write}) - if keys_to_delete: - client.raw.rows.delete(db_name=db_name, table_name=table_name, key=keys_to_delete) - if rows_to_write: - client.raw.rows.insert(db_name=db_name, table_name=table_name, row=rows_to_write, ensure_parent=True) - - -@st.cache_data(ttl=600) -def get_files_by_call_id(call_id: int, annotation_state_view: ViewPropertyConfig) -> pd.DataFrame: - """ - Finds all files associated with a specific function call ID by querying - the AnnotationState data model. 
- """ - if not call_id: - return pd.DataFrame() - try: - call_id_filter = filters.Or( - filters.Equals(annotation_state_view.as_property_ref("launchFunctionCallId"), call_id), - filters.Equals(annotation_state_view.as_property_ref("finalizeFunctionCallId"), call_id), - ) - instances = client.data_modeling.instances.list( - instance_type="node", sources=annotation_state_view.as_view_id(), filter=call_id_filter, limit=-1 - ) - if not instances: - return pd.DataFrame() - - view_id_tuple = annotation_state_view.as_view_id() - file_ids = [ - instance.properties.get(view_id_tuple, {}).get("linkedFile", {}).get("externalId") - for instance in instances - if instance.properties.get(view_id_tuple, {}).get("linkedFile", {}).get("externalId") - ] - return pd.DataFrame(file_ids, columns=["File External ID"]) - except Exception as e: - st.error(f"Failed to query files by call ID: {e}") - return pd.DataFrame() - - -def calculate_overview_kpis(df: pd.DataFrame) -> dict: - """Calculates high-level KPIs from the AnnotationState dataframe.""" - kpis = {"awaiting_processing": 0, "processed_total": 0, "failed_total": 0, "failure_rate_total": 0} - if df.empty: - return kpis - kpis["awaiting_processing"] = len(df[df["status"].isin(["New", "Retry", "Processing", "Finalizing"])]) - finalized_all_time = df[df["status"].isin(["Annotated", "Failed"])] - kpis["processed_total"] = len(finalized_all_time) - kpis["failed_total"] = len(finalized_all_time[finalized_all_time["status"] == "Failed"]) - if kpis["processed_total"] > 0: - kpis["failure_rate_total"] = (kpis["failed_total"] / kpis["processed_total"]) * 100 - return kpis - - -def filter_log_lines(log_text: str, search_string: str) -> str: - """ - Takes a block of log text and a search string, returning a new string - containing the lines that include the search string, plus the subsequent - indented lines that provide context. 
- """ - if not log_text or not isinstance(log_text, str): - return "Log content is not available or in an invalid format." - relevant_blocks, lines = [], log_text.splitlines() - for i, line in enumerate(lines): - if search_string in line: - current_block = [line] - next_line_index = i + 1 - while next_line_index < len(lines): - next_line = lines[next_line_index] - if next_line.strip().startswith("-") or "\t" in next_line or " " in next_line: - current_block.append(next_line) - next_line_index += 1 - else: - break - relevant_blocks.append("\n".join(current_block)) - return "\n\n".join(relevant_blocks) - - -# --- Remove all non-alphanumeric characters, convert to lowercase, and strip leading zeros from numbers --- -def normalize(s): - """ - Normalizes a string by: - 1. Ensuring it's a string. - 2. Removing all non-alphanumeric characters. - 3. Converting to lowercase. - 4. Removing leading zeros from any sequence of digits found within the string. - """ - if not isinstance(s, str): - return "" - - # Step 1: Basic cleaning (e.g., "V-0912" -> "v0912") - s = re.sub(r"[^a-zA-Z0-9]", "", s).lower() - - # Step 2: Define a replacer function that converts any matched number to an int and back to a string - def strip_leading_zeros(match): - # match.group(0) is the matched string (e.g., "0912") - return str(int(match.group(0))) - - # Step 3: Apply the replacer function to all sequences of digits (\d+) in the string - # This turns "v0912" into "v912" - return re.sub(r"\d+", strip_leading_zeros, s) - - -@st.cache_data(ttl=600) -def fetch_potential_annotations(db_name: str, table_name: str, file_external_id: str) -> pd.DataFrame: - """Fetches potential annotations for a specific file from the patterns RAW table.""" - try: - rows = client.raw.rows.list( - db_name=db_name, table_name=table_name, limit=-1, filter={"startNode": file_external_id} - ) - if not rows: - return pd.DataFrame() - return pd.DataFrame([row.columns for row in rows]) - except Exception as e: - st.error(f"Failed 
to fetch potential annotations: {e}") - return pd.DataFrame() - - -@st.cache_data(ttl=3600) -def fetch_entities(entity_view: ViewPropertyConfig, resource_property: str) -> pd.DataFrame: - """ - Fetches entity instances from the specified data model view and returns a tidy DataFrame. - """ - instances = client.data_modeling.instances.list( - instance_type="node", - space=entity_view.instance_space, - sources=entity_view.as_view_id(), - limit=-1 - ) - if not instances: - return pd.DataFrame() - data = [] - for instance in instances: - props = instance.properties.get(entity_view.as_view_id(), {}) - - data.append( - { - "name": props.get("name"), - "externalId": instance.external_id, - "resourceType": props.get(resource_property), - "sysUnit": props.get("sysUnit"), - "space": instance.space, - } - ) - return pd.DataFrame(data) - - -def show_connect_unmatched_ui( - tag_text, - file_view, - target_entities_view, - file_resource_property, - target_entities_resource_property, - associated_files, - tab, - db_name, - pattern_table -): - """ - Displays the UI to connect a single unmatched tag to either an Asset or a File. 
- """ - st.markdown(f"### Tag to Connect: `{tag_text}`") - st.markdown(f"Associated Files: `{associated_files}`") - col1, col2 = st.columns(2) - entity_type = None - - with col1: - if st.button("Retrieve Assets", key=f"btn_retrieve_assets_{tab}"): - st.session_state.selected_entity_type_to_connect = "asset" - st.session_state.selected_entity_to_connect_index = None - with col2: - if st.button("Retrieve Files", key=f"btn_retrieve_files_{tab}"): - st.session_state.selected_entity_type_to_connect = "file" - st.session_state.selected_entity_to_connect_index = None - - entity_type = st.session_state.selected_entity_type_to_connect - - if not entity_type: - return - - - st.write() - - if entity_type == "file": - entity_view = file_view - resource_property = file_resource_property - annotation_type = "diagrams.FileLink" - else: - entity_view = target_entities_view - resource_property = target_entities_resource_property - annotation_type = "diagrams.AssetLink" - - df_entities = fetch_entities(entity_view, resource_property) - - if df_entities.empty: - st.warning(f"No {entity_type}s found.") - return - - df_entities_display = df_entities.copy() - df_entities_display.insert(0, "Select", False) - - if st.session_state.selected_entity_to_connect_index is not None: - idx = st.session_state.selected_entity_to_connect_index - - if idx in df_entities_display.index: - df_entities_display.loc[:, "Select"] = False - df_entities_display.at[idx, "Select"] = True - - filterable_columns = ["sysUnit", "resourceType"] - - for filterable_column in filterable_columns: - unique_values = sorted(df_entities_display[filterable_column].dropna().unique().tolist()) - - selected_value = st.selectbox( - f"Filter by {filterable_column}", - key=f"sb_filterable_column_{filterable_column}_{tab}", - options=[None] + unique_values, - index=0 - ) - - if selected_value: - df_entities_display = df_entities_display[df_entities_display[filterable_column] == selected_value] - - entity_editor_key = 
f"{entity_type}_editor_{tag_text}_{tab}" - edited_entities = st.data_editor( - df_entities_display, - key=entity_editor_key, - column_config={ - "Select": st.column_config.CheckboxColumn(required=True), - "name": "Name", - "externalId": "External ID", - "resourceType": "Resource Type", - "sysUnit": "Sys Unit" - }, - use_container_width=True, - hide_index=True, - disabled=df_entities_display.columns.difference(["Select"]), - ) - - selected_indices = edited_entities[edited_entities.Select].index.tolist() - - if len(selected_indices) > 1: - new_selection = [idx for idx in selected_indices if idx != st.session_state.selected_entity_to_connect_index] - st.session_state.selected_entity_to_connect_index = new_selection[0] if new_selection else None - st.rerun() - elif len(selected_indices) == 1: - st.session_state.selected_entity_to_connect_index = selected_indices[0] - elif len(selected_indices) == 0 and st.session_state.selected_entity_to_connect_index is not None: - st.session_state.selected_entity_to_connect_index = None - st.rerun() - - if st.session_state.selected_entity_to_connect_index is not None: - selected_entity = df_entities.loc[st.session_state.selected_entity_to_connect_index] - if st.button( - f"Connect '{tag_text}' to '{selected_entity['name']}' in {str(len(associated_files)) + ' files' if len(associated_files) > 1 else str(len(associated_files)) + ' file'}", - key=f"btn_connect_tag_to_entities_{tab}" - ): - success, count, error = create_tag_connection( - client, - db_name, - pattern_table, - tag_text, - associated_files, - selected_entity, - annotation_type - ) - - if success: - st.toast( - f"{count} annotation{'s' if count > 1 else ''} created from tag '{tag_text}' to {entity_type} '{selected_entity['name']}' " - f"in {len(associated_files)} file{'s' if len(associated_files) > 1 else ''}!", - icon=":material/check_small:" - ) - st.cache_data.clear() - else: - st.toast( - body=f"Failed to connect tag '{tag_text}': {error}", - icon=":material/error:" - ) 
- - -def create_tag_connection( - client: CogniteClient, - db_name: str, - table_name: str, - tag_text: str, - associated_files: list[str], - selected_entity: pd.Series, - annotation_type: str -): - updated_rows = [] - updated_edges = [] - - try: - rows = client.raw.rows.list( - db_name=db_name, - table_name=table_name, - limit=-1 - ) - - for row in rows: - row_data = row.columns - - if row_data.get("startNodeText") == tag_text and row_data.get("startNode") in associated_files: - edge_external_id = row.key - file_id = row_data.get("startNode") - - row_data["endNode"] = selected_entity["externalId"] - row_data["endNodeSpace"] = selected_entity["space"] - row_data["endNodeResourceType"] = selected_entity["resourceType"] - row_data["status"] = "approved" - - updated_rows.append( - RowWrite( - key=edge_external_id, - columns=row_data - ) - ) - - updated_edges.append( - EdgeApply( - space=row_data.get("space"), - external_id=edge_external_id, - type=DirectRelationReference(space=row_data.get("viewSpace"), external_id=annotation_type), - start_node=DirectRelationReference(space=row_data.get("startNodeSpace"), external_id=file_id), - end_node=DirectRelationReference(space=selected_entity.get("space"), external_id=selected_entity["externalId"]) - ) - ) - - if updated_rows: - st.write(len(updated_rows)) - # client.raw.rows.insert( - # db_name=db_name, - # table_name=table_name, - # row=updated_rows, - # ensure_parent=True - # ) - - if updated_edges: - st.write(len(updated_edges)) - # client.data_modeling.instances.apply(edges=updated_edges) - - return True, len(updated_rows), None - except Exception as e: - return False, 0, str(e) - - -def build_unmatched_tags_with_regions( - df: pd.DataFrame, - file_id: str, - potential_new_annotations: list[str] -): - df_filtered = df[ - (df["startNode"] == file_id) & - (df["startNodeText"].isin(potential_new_annotations)) - ] - - unmatched_tags_with_regions = [] - - for _, row in df_filtered.iterrows(): - region = { - "vertices": [ - 
{"x": row["startNodeXMin"], "y": row["startNodeYMin"]}, - {"x": row["startNodeXMax"], "y": row["startNodeYMin"]}, - {"x": row["startNodeXMax"], "y": row["startNodeYMax"]}, - {"x": row["startNodeXMin"], "y": row["startNodeYMax"]}, - ] - } - - unmatched_tags_with_regions.append({ - "text": row["startNodeText"], - "regions": [region] - }) - +import os +import re +import yaml +import streamlit as st +import pandas as pd +from datetime import datetime, timedelta +from cognite.client import CogniteClient +from cognite.client.data_classes import RowWrite, Asset, AssetFilter +from cognite.client.data_classes.data_modeling import ( + ViewId, + NodeId, + Node, + filters, + EdgeApply, + NodeOrEdgeData, + DirectRelationReference, +) +from cognite.client.data_classes.functions import FunctionCallLog +from data_structures import ViewPropertyConfig +from canvas import dm_generate + +client = CogniteClient() + + +@st.cache_data(ttl=3600) +def get_file_node(file_id: NodeId, file_view: ViewPropertyConfig) -> Node | None: + """Fetches a single file node from CDF.""" + try: + node = client.data_modeling.instances.retrieve_nodes(nodes=file_id, sources=file_view.as_view_id()) + return node + except Exception as e: + st.error(f"Failed to retrieve file node {file_id}: {e}") + return None + + +def generate_file_canvas( + file_id: NodeId, file_view: ViewPropertyConfig, ep_config: dict, unmatched_tags_with_regions: list = [] +): + """ + Generates an Industrial Canvas, including bounding boxes for unmatched tags, + and returns the canvas URL. 
+ """ + file_node = get_file_node(file_id, file_view) + if not file_node: + st.error("Could not generate canvas because the file node could not be retrieved.") + return None + + canvas_name = f"Annotation Quality Analysis - {file_node.external_id}" + + try: + domain = os.getenv("COGNITE_ORGANIZATION") + project = client.config.project + cluster = client.config.cdf_cluster + + canvas_id = dm_generate( + name=canvas_name, + file_node=file_node, + file_view_id=file_view.as_view_id(), + client=client, + unmatched_tags_with_regions=unmatched_tags_with_regions, + ) + st.success(f"Successfully generated canvas: {canvas_name}") + + canvas_url = f"https://{domain}.fusion.cognite.com/{project}/industrial-canvas/canvas?canvasId={canvas_id}&cluster={cluster}.cognitedata.com&env={cluster}&workspace=industrial-tools" + return canvas_url + + except Exception as e: + st.error(f"Failed to generate canvas: {e}") + return None + + +@st.cache_data(ttl=600) +def find_pipelines(name_filter: str = "file_annotation") -> list[str]: + """ + Finds the external IDs of all extraction pipelines in the project, + filtered by a substring in their external ID. 
+ """ + try: + all_pipelines = client.extraction_pipelines.list(limit=-1) + if not all_pipelines: + st.warning(f"No extraction pipelines found in the project.") + return [] + + filtered_ids = [p.external_id for p in all_pipelines if name_filter in p.external_id] + + if not filtered_ids: + st.warning(f"No pipelines matching the filter '*{name_filter}*' found in the project.") + return [] + + return sorted(filtered_ids) + except Exception as e: + st.error(f"An error occurred while searching for extraction pipelines: {e}") + return [] + + +@st.cache_data(ttl=3600) +def fetch_raw_table_data(db_name: str, table_name: str) -> pd.DataFrame: + """Fetches all rows from a specified RAW table and returns as a DataFrame.""" + try: + rows = client.raw.rows.list(db_name=db_name, table_name=table_name, limit=-1) + if not rows: + return pd.DataFrame() + data = [row.columns for row in rows] + return pd.DataFrame(data) + except Exception as e: + st.error(f"Failed to fetch data from RAW table '{table_name}': {e}") + return pd.DataFrame() + + +def parse_run_message(message: str) -> dict: + """Parses the structured run message and returns a dictionary of its components.""" + if not message: + return {} + + pattern = re.compile( + r"\(caller:(?P\w+), function_id:(?P[\w\.-]+), call_id:(?P[\w\.-]+)\) - " + r"total files processed: (?P\d+) - " + r"successful files: (?P\d+) - " + r"failed files: (?P\d+)" + ) + match = pattern.search(message) + if match: + data = match.groupdict() + for key in ["total", "success", "failed"]: + if key in data: + data[key] = int(data[key]) + return data + return {} + + +@st.cache_data(ttl=3600) +def fetch_extraction_pipeline_config(pipeline_ext_id: str) -> tuple[dict, ViewPropertyConfig, ViewPropertyConfig]: + """ + Fetch configurations from the latest extraction + """ + ep_configuration = client.extraction_pipelines.config.retrieve(external_id=pipeline_ext_id) + config_dict = yaml.safe_load(ep_configuration.config) + + local_annotation_state_view = 
config_dict["dataModelViews"]["annotationStateView"] + annotation_state_view = ViewPropertyConfig( + local_annotation_state_view["schemaSpace"], + local_annotation_state_view["externalId"], + local_annotation_state_view["version"], + local_annotation_state_view["instanceSpace"], + ) + + local_file_view = config_dict["dataModelViews"]["fileView"] + file_view = ViewPropertyConfig( + local_file_view["schemaSpace"], + local_file_view["externalId"], + local_file_view["version"], + local_file_view.get("instanceSpace"), + ) + + local_target_entities_view = config_dict["dataModelViews"]["targetEntitiesView"] + target_entities_view = ViewPropertyConfig( + local_target_entities_view["schemaSpace"], + local_target_entities_view["externalId"], + local_target_entities_view["version"], + local_target_entities_view.get("instanceSpace"), + ) + + views_dict = { + "annotation_state": annotation_state_view, + "file": file_view, + "target_entities": target_entities_view, + } + + return (config_dict, views_dict) + + +@st.cache_data(ttl=3600) +def fetch_annotation_states(annotation_state_view: ViewPropertyConfig, file_view: ViewPropertyConfig): + """ + Fetches annotation state instances from the specified data model view + and joins them with their corresponding file instances. 
+ """ + annotation_instances = client.data_modeling.instances.list( + instance_type="node", + space=annotation_state_view.instance_space, + sources=annotation_state_view.as_view_id(), + limit=-1, + ) + if not annotation_instances: + return pd.DataFrame() + + annotation_data = [] + nodes_to_fetch = [] + for instance in annotation_instances: + node_data = { + "externalId": instance.external_id, + "space": instance.space, + "createdTime": pd.to_datetime(instance.created_time, unit="ms"), + "lastUpdatedTime": pd.to_datetime(instance.last_updated_time, unit="ms"), + } + for prop_key, prop_value in instance.properties[annotation_state_view.as_view_id()].items(): + if prop_key == "linkedFile" and prop_value: + file_external_id = prop_value.get("externalId") + file_space = prop_value.get("space") + node_data["fileExternalId"] = file_external_id + node_data["fileSpace"] = file_space + if file_external_id and file_space: + nodes_to_fetch.append(NodeId(space=file_space, external_id=file_external_id)) + node_data[prop_key] = prop_value + annotation_data.append(node_data) + + df_annotations = pd.DataFrame(annotation_data) + if df_annotations.empty or not nodes_to_fetch: + return df_annotations + + unique_nodes_to_fetch = list(set(nodes_to_fetch)) + file_instances = client.data_modeling.instances.retrieve_nodes( + nodes=unique_nodes_to_fetch, sources=file_view.as_view_id() + ) + + file_data = [] + for instance in file_instances: + node_data = {"fileExternalId": instance.external_id, "fileSpace": instance.space} + properties = instance.properties[file_view.as_view_id()] + + for prop_key, prop_value in properties.items(): + node_data[f"file{prop_key.capitalize()}"] = ( + ", ".join(map(str, prop_value)) if isinstance(prop_value, list) else prop_value + ) + file_data.append(node_data) + + if not file_data: + return df_annotations + + df_files = pd.DataFrame(file_data) + df_merged = pd.merge(df_annotations, df_files, on=["fileExternalId", "fileSpace"], how="left") + + for col in 
["createdTime", "lastUpdatedTime"]: + if col in df_merged.columns: + df_merged[col] = df_merged[col].dt.tz_localize("UTC") + + df_merged.rename(columns={"annotationStatus": "status", "attemptCount": "retries"}, inplace=True) + + return df_merged + + +@st.cache_data(ttl=3600) +def fetch_pipeline_run_history(pipeline_ext_id: str): + """Fetches the full run history for a given extraction pipeline.""" + return client.extraction_pipelines.runs.list(external_id=pipeline_ext_id, limit=-1) + + +@st.cache_data(ttl=3600) +def fetch_function_logs(function_id: int, call_id: int): + """Fetches the logs for a specific function call.""" + try: + log: FunctionCallLog = client.functions.calls.get_logs(call_id, function_id) + return log.to_text(with_timestamps=False) + except Exception as e: + return [f"Could not retrieve logs: {e}"] + + +def process_runs_for_graphing(runs): + """Transforms pipeline run data into a DataFrame for graphing.""" + launch_data, finalize_runs_to_agg = [], [] + for run in runs: + if run.status != "success": + continue + parsed = parse_run_message(run.message) + if not parsed: + continue + timestamp, count, caller = ( + pd.to_datetime(run.created_time, unit="ms").tz_localize("UTC"), + parsed.get("total", 0), + parsed.get("caller"), + ) + if caller == "Launch": + launch_data.append({"timestamp": timestamp, "count": count, "type": "Launch"}) + elif caller == "Finalize": + finalize_runs_to_agg.append({"timestamp": timestamp, "count": count}) + + aggregated_finalize_data = [] + if finalize_runs_to_agg: + finalize_runs_to_agg.sort(key=lambda x: x["timestamp"]) + current_group_start_time, current_group_count = finalize_runs_to_agg[0]["timestamp"], 0 + for run in finalize_runs_to_agg: + if run["timestamp"] < current_group_start_time + timedelta(minutes=10): + current_group_count += run["count"] + else: + aggregated_finalize_data.append( + {"timestamp": current_group_start_time, "count": current_group_count, "type": "Finalize"} + ) + current_group_start_time, 
current_group_count = run["timestamp"], run["count"] + if current_group_count > 0: + aggregated_finalize_data.append( + {"timestamp": current_group_start_time, "count": current_group_count, "type": "Finalize"} + ) + + return pd.concat([pd.DataFrame(launch_data), pd.DataFrame(aggregated_finalize_data)], ignore_index=True) + + +@st.cache_data(ttl=3600) +def fetch_pattern_catalog(db_name: str, table_name: str) -> pd.DataFrame: + """ + Fetches the entity cache and explodes it to create a complete + catalog of all generated patterns, indexed by resourceType. + """ + try: + rows = client.raw.rows.list(db_name=db_name, table_name=table_name, limit=-1) + if not rows: + return pd.DataFrame() + all_patterns = [] + for row in pd.DataFrame([row.columns for row in rows]).itertuples(): + for sample_list in ["AssetPatternSamples", "FilePatternSamples"]: + if hasattr(row, sample_list) and isinstance(getattr(row, sample_list), list): + for item in getattr(row, sample_list): + if item.get("sample") and item.get("resource_type"): + all_patterns.extend( + [ + {"resourceType": item["resource_type"], "pattern": pattern} + for pattern in item["sample"] + ] + ) + return pd.DataFrame(all_patterns) + except Exception as e: + st.error(f"Failed to fetch pattern catalog from '{table_name}': {e}") + return pd.DataFrame() + + +def fetch_manual_patterns(db_name: str, table_name: str) -> pd.DataFrame: + """ + Fetches all manual patterns from the RAW table and explodes them + into a tidy DataFrame for display and editing. 
+ """ + all_patterns = [] + try: + for row in client.raw.rows.list(db_name=db_name, table_name=table_name, limit=-1): + key, patterns_list = row.key, row.columns.get("patterns", []) + scope_level, primary_scope, secondary_scope = "Global", "", "" + if key != "GLOBAL": + parts = key.split("_") + if len(parts) == 2: + scope_level, primary_scope, secondary_scope = "Secondary Scope", parts[0], parts[1] + else: + scope_level, primary_scope = "Primary Scope", key + all_patterns.extend( + [ + { + "key": key, + "scope_level": scope_level, + "annotation_type": p.get("annotation_type"), + "primary_scope": primary_scope, + "secondary_scope": secondary_scope, + "sample": p.get("sample"), + "resource_type": p.get("resource_type"), + "created_by": p.get("created_by"), + } + for p in patterns_list + ] + ) + + df = ( + pd.DataFrame(all_patterns) + if all_patterns + else pd.DataFrame( + columns=[ + "key", + "scope_level", + "annotation_type", + "primary_scope", + "secondary_scope", + "sample", + "resource_type", + "created_by", + ] + ) + ) + return df.fillna("").astype(str) + except Exception as e: + if "NotFoundError" not in str(type(e)): + st.error(f"Failed to fetch manual patterns: {e}") + return pd.DataFrame( + columns=["key", "scope_level", "annotation_type", "primary_scope", "secondary_scope", "sample", "resource_type", "created_by"] + ) + + +def save_manual_patterns(df: pd.DataFrame, db_name: str, table_name: str): + """ + Takes a tidy DataFrame of patterns, groups them by scope key, + and writes them back to the RAW table. 
+ """ + + def create_key(row): + if row["scope_level"] == "Global": + return "GLOBAL" + if row["scope_level"] == "Primary Scope" and row["primary_scope"]: + return row["primary_scope"] + if row["scope_level"] == "Secondary Scope" and row["primary_scope"] and row["secondary_scope"]: + return f"{row['primary_scope']}_{row['secondary_scope']}" + return None + + df["key"] = df.apply(create_key, axis=1) + df.dropna(subset=["key"], inplace=True) + rows_to_write = [ + RowWrite(key=key, columns={"patterns": group[["sample", "resource_type", "annotation_type", "created_by"]].to_dict("records")}) + for key, group in df.groupby("key") + ] + + existing_keys = {r.key for r in client.raw.rows.list(db_name, table_name, limit=-1)} + keys_to_delete = list(existing_keys - {r.key for r in rows_to_write}) + if keys_to_delete: + client.raw.rows.delete(db_name=db_name, table_name=table_name, key=keys_to_delete) + if rows_to_write: + client.raw.rows.insert(db_name=db_name, table_name=table_name, row=rows_to_write, ensure_parent=True) + + +@st.cache_data(ttl=600) +def get_files_by_call_id(call_id: int, annotation_state_view: ViewPropertyConfig) -> pd.DataFrame: + """ + Finds all files associated with a specific function call ID by querying + the AnnotationState data model. 
+ """ + if not call_id: + return pd.DataFrame() + try: + call_id_filter = filters.Or( + filters.Equals(annotation_state_view.as_property_ref("launchFunctionCallId"), call_id), + filters.Equals(annotation_state_view.as_property_ref("finalizeFunctionCallId"), call_id), + ) + instances = client.data_modeling.instances.list( + instance_type="node", sources=annotation_state_view.as_view_id(), filter=call_id_filter, limit=-1 + ) + if not instances: + return pd.DataFrame() + + view_id_tuple = annotation_state_view.as_view_id() + file_ids = [ + instance.properties.get(view_id_tuple, {}).get("linkedFile", {}).get("externalId") + for instance in instances + if instance.properties.get(view_id_tuple, {}).get("linkedFile", {}).get("externalId") + ] + return pd.DataFrame(file_ids, columns=["File External ID"]) + except Exception as e: + st.error(f"Failed to query files by call ID: {e}") + return pd.DataFrame() + + +def calculate_overview_kpis(df: pd.DataFrame) -> dict: + """Calculates high-level KPIs from the AnnotationState dataframe.""" + kpis = {"awaiting_processing": 0, "processed_total": 0, "failed_total": 0, "failure_rate_total": 0} + if df.empty: + return kpis + kpis["awaiting_processing"] = len(df[df["status"].isin(["New", "Retry", "Processing", "Finalizing"])]) + finalized_all_time = df[df["status"].isin(["Annotated", "Failed"])] + kpis["processed_total"] = len(finalized_all_time) + kpis["failed_total"] = len(finalized_all_time[finalized_all_time["status"] == "Failed"]) + if kpis["processed_total"] > 0: + kpis["failure_rate_total"] = (kpis["failed_total"] / kpis["processed_total"]) * 100 + return kpis + + +def filter_log_lines(log_text: str, search_string: str) -> str: + """ + Takes a block of log text and a search string, returning a new string + containing the lines that include the search string, plus the subsequent + indented lines that provide context. 
+ """ + if not log_text or not isinstance(log_text, str): + return "Log content is not available or in an invalid format." + relevant_blocks, lines = [], log_text.splitlines() + for i, line in enumerate(lines): + if search_string in line: + current_block = [line] + next_line_index = i + 1 + while next_line_index < len(lines): + next_line = lines[next_line_index] + if next_line.strip().startswith("-") or "\t" in next_line or " " in next_line: + current_block.append(next_line) + next_line_index += 1 + else: + break + relevant_blocks.append("\n".join(current_block)) + return "\n\n".join(relevant_blocks) + + +# --- Remove all non-alphanumeric characters, convert to lowercase, and strip leading zeros from numbers --- +def normalize(s): + """ + Normalizes a string by: + 1. Ensuring it's a string. + 2. Removing all non-alphanumeric characters. + 3. Converting to lowercase. + 4. Removing leading zeros from any sequence of digits found within the string. + """ + if not isinstance(s, str): + return "" + + # Step 1: Basic cleaning (e.g., "V-0912" -> "v0912") + s = re.sub(r"[^a-zA-Z0-9]", "", s).lower() + + # Step 2: Define a replacer function that converts any matched number to an int and back to a string + def strip_leading_zeros(match): + # match.group(0) is the matched string (e.g., "0912") + return str(int(match.group(0))) + + # Step 3: Apply the replacer function to all sequences of digits (\d+) in the string + # This turns "v0912" into "v912" + return re.sub(r"\d+", strip_leading_zeros, s) + + +@st.cache_data(ttl=600) +def fetch_potential_annotations(db_name: str, table_name: str, file_external_id: str) -> pd.DataFrame: + """Fetches potential annotations for a specific file from the patterns RAW table.""" + try: + rows = client.raw.rows.list( + db_name=db_name, table_name=table_name, limit=-1, filter={"startNode": file_external_id} + ) + if not rows: + return pd.DataFrame() + return pd.DataFrame([row.columns for row in rows]) + except Exception as e: + st.error(f"Failed 
to fetch potential annotations: {e}") + return pd.DataFrame() + + +@st.cache_data(ttl=3600) +def fetch_entities(entity_view: ViewPropertyConfig, resource_property: str) -> pd.DataFrame: + """ + Fetches entity instances from the specified data model view and returns a tidy DataFrame. + """ + instances = client.data_modeling.instances.list( + instance_type="node", + space=entity_view.instance_space, + sources=entity_view.as_view_id(), + limit=-1 + ) + if not instances: + return pd.DataFrame() + data = [] + for instance in instances: + props = instance.properties.get(entity_view.as_view_id(), {}) + + data.append( + { + "name": props.get("name"), + "externalId": instance.external_id, + "resourceType": props.get(resource_property), + "sysUnit": props.get("sysUnit"), + "space": instance.space, + } + ) + return pd.DataFrame(data) + + +def show_connect_unmatched_ui( + tag_text, + file_view, + target_entities_view, + file_resource_property, + target_entities_resource_property, + associated_files, + tab, + db_name, + pattern_table +): + """ + Displays the UI to connect a single unmatched tag to either an Asset or a File. 
+ """ + st.markdown(f"### Tag to Connect: `{tag_text}`") + st.markdown(f"Associated Files: `{associated_files}`") + col1, col2 = st.columns(2) + entity_type = None + + with col1: + if st.button("Retrieve Assets", key=f"btn_retrieve_assets_{tab}"): + st.session_state.selected_entity_type_to_connect = "asset" + st.session_state.selected_entity_to_connect_index = None + with col2: + if st.button("Retrieve Files", key=f"btn_retrieve_files_{tab}"): + st.session_state.selected_entity_type_to_connect = "file" + st.session_state.selected_entity_to_connect_index = None + + entity_type = st.session_state.selected_entity_type_to_connect + + if not entity_type: + return + + if entity_type == "file": + entity_view = file_view + resource_property = file_resource_property + annotation_type = "diagrams.FileLink" + else: + entity_view = target_entities_view + resource_property = target_entities_resource_property + annotation_type = "diagrams.AssetLink" + + df_entities = fetch_entities(entity_view, resource_property) + + if df_entities.empty: + st.warning(f"No {entity_type}s found.") + return + + df_entities_display = df_entities.copy() + df_entities_display.insert(0, "Select", False) + + if st.session_state.selected_entity_to_connect_index is not None: + idx = st.session_state.selected_entity_to_connect_index + + if idx in df_entities_display.index: + df_entities_display.loc[:, "Select"] = False + df_entities_display.at[idx, "Select"] = True + + filterable_columns = [col for col in ["sysUnit", "resourceType"] if col in df_entities_display.columns] + + for filterable_column in filterable_columns: + unique_values = sorted(df_entities_display[filterable_column].dropna().unique().tolist()) + + selected_value = st.selectbox( + f"Filter by {filterable_column}", + key=f"sb_filterable_column_{filterable_column}_{tab}", + options=[None] + unique_values, + index=0 + ) + + if selected_value: + df_entities_display = df_entities_display[df_entities_display[filterable_column] == selected_value] 
+ + all_columns = df_entities_display.columns.tolist() + default_columns = ["Select", "name", "resourceType", "sysUnit", "externalId"] + + with st.popover("Customize Table Columns"): + selected_columns = st.multiselect( + f"Select columns to display ({entity_type}s)", + options=all_columns, + default=[col for col in default_columns if col in all_columns], + key=f"ms_selected_columns_{tab}_{entity_type}" + ) + + entity_editor_key = f"{entity_type}_editor_{tag_text}_{tab}" + edited_entities = st.data_editor( + df_entities_display[selected_columns], + key=entity_editor_key, + column_config={ + "Select": st.column_config.CheckboxColumn(required=True), + "name": "Name", + "externalId": "External ID", + "resourceType": "Resource Type", + "sysUnit": "Sys Unit" + }, + use_container_width=True, + hide_index=True, + disabled=df_entities_display.columns.difference(["Select"]), + ) + + selected_indices = edited_entities[edited_entities.Select].index.tolist() + + if len(selected_indices) > 1: + new_selection = [idx for idx in selected_indices if idx != st.session_state.selected_entity_to_connect_index] + st.session_state.selected_entity_to_connect_index = new_selection[0] if new_selection else None + st.rerun() + elif len(selected_indices) == 1: + st.session_state.selected_entity_to_connect_index = selected_indices[0] + elif len(selected_indices) == 0 and st.session_state.selected_entity_to_connect_index is not None: + st.session_state.selected_entity_to_connect_index = None + st.rerun() + + if st.session_state.selected_entity_to_connect_index is not None: + selected_entity = df_entities.loc[st.session_state.selected_entity_to_connect_index] + if st.button( + f"Connect '{tag_text}' to '{selected_entity['name']}' in {str(len(associated_files)) + ' files' if len(associated_files) > 1 else str(len(associated_files)) + ' file'}", + key=f"btn_connect_tag_to_entities_{tab}" + ): + success, count, error = create_tag_connection( + client, + db_name, + pattern_table, + tag_text, + 
associated_files, + selected_entity, + annotation_type + ) + + if success: + st.toast( + f"{count} annotation{'s' if count > 1 else ''} created from tag '{tag_text}' to {entity_type} '{selected_entity['name']}' " + f"in {len(associated_files)} file{'s' if len(associated_files) > 1 else ''}!", + icon=":material/check_small:" + ) + st.cache_data.clear() + else: + st.toast( + body=f"Failed to connect tag '{tag_text}': {error}", + icon=":material/error:" + ) + + +def create_tag_connection( + client: CogniteClient, + db_name: str, + table_name: str, + tag_text: str, + associated_files: list[str], + selected_entity: pd.Series, + annotation_type: str +): + updated_rows = [] + updated_edges = [] + + try: + rows = client.raw.rows.list( + db_name=db_name, + table_name=table_name, + limit=-1 + ) + + for row in rows: + row_data = row.columns + + if row_data.get("startNodeText") == tag_text and row_data.get("startNode") in associated_files: + edge_external_id = row.key + file_id = row_data.get("startNode") + + row_data["endNode"] = selected_entity["externalId"] + row_data["endNodeSpace"] = selected_entity["space"] + row_data["endNodeResourceType"] = selected_entity["resourceType"] + row_data["status"] = "approved" + + updated_rows.append( + RowWrite( + key=edge_external_id, + columns=row_data + ) + ) + + updated_edges.append( + EdgeApply( + space=row_data.get("space"), + external_id=edge_external_id, + type=DirectRelationReference(space=row_data.get("viewSpace"), external_id=annotation_type), + start_node=DirectRelationReference(space=row_data.get("startNodeSpace"), external_id=file_id), + end_node=DirectRelationReference(space=selected_entity.get("space"), external_id=selected_entity["externalId"]) + ) + ) + + if updated_rows: + st.write(len(updated_rows)) + # client.raw.rows.insert( + # db_name=db_name, + # table_name=table_name, + # row=updated_rows, + # ensure_parent=True + # ) + + if updated_edges: + st.write(len(updated_edges)) + # 
client.data_modeling.instances.apply(edges=updated_edges) + + return True, len(updated_rows), None + except Exception as e: + return False, 0, str(e) + + +def build_unmatched_tags_with_regions( + df: pd.DataFrame, + file_id: str, + potential_new_annotations: list[str] +): + df_filtered = df[ + (df["startNode"] == file_id) & + (df["startNodeText"].isin(potential_new_annotations)) + ] + + unmatched_tags_with_regions = [] + + for _, row in df_filtered.iterrows(): + region = { + "vertices": [ + {"x": row["startNodeXMin"], "y": row["startNodeYMin"]}, + {"x": row["startNodeXMax"], "y": row["startNodeYMin"]}, + {"x": row["startNodeXMax"], "y": row["startNodeYMax"]}, + {"x": row["startNodeXMin"], "y": row["startNodeYMax"]}, + ] + } + + unmatched_tags_with_regions.append({ + "text": row["startNodeText"], + "regions": [region] + }) + return unmatched_tags_with_regions \ No newline at end of file From 88b702eca2e411d4c2770cca26c0810a7092ca65 Mon Sep 17 00:00:00 2001 From: lucasguimaraes-rdx Date: Fri, 10 Oct 2025 17:27:24 -0300 Subject: [PATCH 078/128] Ensuring sink node space for created edges and fetching all target entity columns --- .../file_annotation_dashboard/canvas.py | 2 +- .../file_annotation_dashboard/helper.py | 74 ++++++++++--------- .../pages/Annotation_Quality.py | 7 +- 3 files changed, 47 insertions(+), 36 deletions(-) diff --git a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/canvas.py b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/canvas.py index bf5a7e30..6ffed692 100644 --- a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/canvas.py +++ b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/canvas.py @@ -227,4 +227,4 @@ def reset_canvas_annotations(canvas_id: str, client: CogniteClient): if nodes_to_delete_ids: client.data_modeling.instances.delete( nodes=nodes_to_delete_ids - ) \ No newline at end of file + ) \ No newline 
at end of file diff --git a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/helper.py b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/helper.py index aac81e78..f7cc9644 100644 --- a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/helper.py +++ b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/helper.py @@ -529,21 +529,26 @@ def fetch_entities(entity_view: ViewPropertyConfig, resource_property: str) -> p sources=entity_view.as_view_id(), limit=-1 ) + if not instances: return pd.DataFrame() + data = [] + for instance in instances: - props = instance.properties.get(entity_view.as_view_id(), {}) + props = instance.properties.get(entity_view.as_view_id(), {}) or {} + row = {"externalId": instance.external_id, "space": instance.space} + + row["name"] = props.get("name") + row["resourceType"] = props.get(resource_property) + row["sysUnit"] = props.get("sysUnit") - data.append( - { - "name": props.get("name"), - "externalId": instance.external_id, - "resourceType": props.get(resource_property), - "sysUnit": props.get("sysUnit"), - "space": instance.space, - } - ) + for k, v in props.items(): + if k not in row: + row[k] = v + + data.append(row) + return pd.DataFrame(data) @@ -556,7 +561,8 @@ def show_connect_unmatched_ui( associated_files, tab, db_name, - pattern_table + pattern_table, + apply_config ): """ Displays the UI to connect a single unmatched tag to either an Asset or a File. 
@@ -672,7 +678,8 @@ def show_connect_unmatched_ui( tag_text, associated_files, selected_entity, - annotation_type + annotation_type, + apply_config, ) if success: @@ -696,7 +703,8 @@ def create_tag_connection( tag_text: str, associated_files: list[str], selected_entity: pd.Series, - annotation_type: str + annotation_type: str, + apply_config: dict, ): updated_rows = [] updated_edges = [] @@ -708,6 +716,8 @@ def create_tag_connection( limit=-1 ) + sink_node_space = apply_config["sinkNode"]["space"] + for row in rows: row_data = row.columns @@ -715,10 +725,20 @@ def create_tag_connection( edge_external_id = row.key file_id = row_data.get("startNode") + updated_edges.append( + EdgeApply( + space=sink_node_space, + external_id=edge_external_id, + type=DirectRelationReference(space=row_data.get("viewSpace"), external_id=annotation_type), + start_node=DirectRelationReference(space=row_data.get("startNodeSpace"), external_id=file_id), + end_node=DirectRelationReference(space=selected_entity.get("space"), external_id=selected_entity.get("externalId")) + ) + ) + row_data["endNode"] = selected_entity["externalId"] row_data["endNodeSpace"] = selected_entity["space"] row_data["endNodeResourceType"] = selected_entity["resourceType"] - row_data["status"] = "approved" + row_data["status"] = "Approved" updated_rows.append( RowWrite( @@ -727,28 +747,16 @@ def create_tag_connection( ) ) - updated_edges.append( - EdgeApply( - space=row_data.get("space"), - external_id=edge_external_id, - type=DirectRelationReference(space=row_data.get("viewSpace"), external_id=annotation_type), - start_node=DirectRelationReference(space=row_data.get("startNodeSpace"), external_id=file_id), - end_node=DirectRelationReference(space=selected_entity.get("space"), external_id=selected_entity["externalId"]) - ) - ) - if updated_rows: - st.write(len(updated_rows)) - # client.raw.rows.insert( - # db_name=db_name, - # table_name=table_name, - # row=updated_rows, - # ensure_parent=True - # ) + 
client.raw.rows.insert( + db_name=db_name, + table_name=table_name, + row=updated_rows, + ensure_parent=True + ) if updated_edges: - st.write(len(updated_edges)) - # client.data_modeling.instances.apply(edges=updated_edges) + client.data_modeling.instances.apply(edges=updated_edges, replace=False) return True, len(updated_rows), None except Exception as e: diff --git a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/pages/Annotation_Quality.py b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/pages/Annotation_Quality.py index fecd44a3..eb775175 100644 --- a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/pages/Annotation_Quality.py +++ b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/pages/Annotation_Quality.py @@ -74,6 +74,7 @@ def reset_selection(): manual_patterns_table = cache_config.get("rawManualPatternsCatalog") file_resource_property = ep_config.get("launchFunction", {}).get("fileResourceProperty", "") target_entities_resource_property = ep_config.get("launchFunction", {}).get("targetEntitiesResourceProperty", "") +apply_config = ep_config.get("finalizeFunction", {}).get("applyService", {}) if not all([db_name, pattern_table, tag_table, doc_table, cache_table, manual_patterns_table]): st.error("Could not find all required RAW table names in the pipeline configuration.") @@ -347,7 +348,8 @@ def reset_selection(): associated_files=tag_to_files_unmatched.get(selected_tag_text, []), tab="overall", db_name=db_name, - pattern_table=pattern_table + pattern_table=pattern_table, + apply_config=apply_config ) @@ -682,7 +684,8 @@ def calculate_metrics(row): associated_files=[selected_file], tab="per_file", db_name=db_name, - pattern_table=pattern_table + pattern_table=pattern_table, + apply_config=apply_config ) else: From 25fbfd090dfa1e02803a8530c58ef8413a08dbc9 Mon Sep 17 00:00:00 2001 From: lucasguimaraes-rdx Date: Tue, 14 Oct 
2025 13:20:06 -0300 Subject: [PATCH 079/128] Adding fallback for resourceType using entity view when promoting edges --- .../streamlit/file_annotation_dashboard/helper.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/helper.py b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/helper.py index f7cc9644..b38a3b7a 100644 --- a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/helper.py +++ b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/helper.py @@ -680,6 +680,7 @@ def show_connect_unmatched_ui( selected_entity, annotation_type, apply_config, + entity_view, ) if success: @@ -705,6 +706,7 @@ def create_tag_connection( selected_entity: pd.Series, annotation_type: str, apply_config: dict, + entity_view: ViewPropertyConfig, ): updated_rows = [] updated_edges = [] @@ -737,7 +739,10 @@ def create_tag_connection( row_data["endNode"] = selected_entity["externalId"] row_data["endNodeSpace"] = selected_entity["space"] - row_data["endNodeResourceType"] = selected_entity["resourceType"] + + resource_type = selected_entity["resourceType"] if selected_entity["resourceType"] else entity_view.external_id + + row_data["endNodeResourceType"] = resource_type row_data["status"] = "Approved" updated_rows.append( From 4c8c8edd65a878bb20b17fea77362651fcfa8089 Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Tue, 14 Oct 2025 12:33:05 -0500 Subject: [PATCH 080/128] added random delay and corrected retrieval of oldest job first --- .../functions/fn_file_annotation_finalize/handler.py | 5 +++++ .../fn_file_annotation_finalize/services/RetrieveService.py | 4 ++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/handler.py 
b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/handler.py index 63f70303..de68d7c7 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/handler.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/handler.py @@ -1,5 +1,7 @@ import sys import threading +import time +import random from datetime import datetime, timezone, timedelta from cognite.client import CogniteClient @@ -47,6 +49,9 @@ def handle(data: dict, function_call_info: dict, client: CogniteClient) -> dict: ) run_status: str = "success" + # NOTE: a random delay to stagger API requests. Used to prevent API load shedding that can return empty results under high concurrency. + delay = random.uniform(0.1, 1.0) + time.sleep(delay) try: while datetime.now(timezone.utc) - start_time < timedelta(minutes=7): if finalize_instance.run() == "Done": diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/RetrieveService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/RetrieveService.py index c98f9dc1..ed6ae39e 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/RetrieveService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/RetrieveService.py @@ -86,7 +86,7 @@ def get_job_id(self) -> tuple[int, int | None, dict[NodeId, Node]] | tuple[None, sort_by_time.append( instances.InstanceSort( property=self.annotation_state_view.as_property_ref("sourceUpdatedTime"), - direction="descending", + direction="ascending", ) ) @@ -94,7 +94,7 @@ def get_job_id(self) -> tuple[int, int | None, dict[NodeId, Node]] | tuple[None, instance_type="node", sources=self.annotation_state_view.as_view_id(), space=self.annotation_state_view.instance_space, - limit=-1, + limit=1, filter=self.filter_jobs, sort=sort_by_time, ) 
From 59c830d11481494033a866efdc516e75f8d6bec1 Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Tue, 14 Oct 2025 12:38:38 -0500 Subject: [PATCH 081/128] renamed file containing nodes to deploy --- .../data_models/{canvas.node.yaml => hdm.node.yaml} | 0 .../cdf_file_annotation/data_models/hdm.space.yaml | 1 + 2 files changed, 1 insertion(+) rename modules/contextualization/cdf_file_annotation/data_models/{canvas.node.yaml => hdm.node.yaml} (100%) diff --git a/modules/contextualization/cdf_file_annotation/data_models/canvas.node.yaml b/modules/contextualization/cdf_file_annotation/data_models/hdm.node.yaml similarity index 100% rename from modules/contextualization/cdf_file_annotation/data_models/canvas.node.yaml rename to modules/contextualization/cdf_file_annotation/data_models/hdm.node.yaml diff --git a/modules/contextualization/cdf_file_annotation/data_models/hdm.space.yaml b/modules/contextualization/cdf_file_annotation/data_models/hdm.space.yaml index 51e2d2c1..fe392ce7 100644 --- a/modules/contextualization/cdf_file_annotation/data_models/hdm.space.yaml +++ b/modules/contextualization/cdf_file_annotation/data_models/hdm.space.yaml @@ -1,6 +1,7 @@ - description: Helper data model space name: {{ annotationStateSchemaSpace }} space: {{ annotationStateSchemaSpace }} + - description: Pattern mode results instance space name: {{ patternModeInstanceSpace }} space: {{ patternModeInstanceSpace }} \ No newline at end of file From 95c0b6e5e12b2846cd88e167de02005464de1cdb Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Tue, 14 Oct 2025 12:58:27 -0500 Subject: [PATCH 082/128] removed the report service section in the extraction pipeline config --- .../ep_file_annotation.config.yaml | 7 +-- .../services/ApplyService.py | 20 +++---- .../services/ConfigService.py | 7 +-- .../services/ConfigService.py | 7 +-- .../pages/Annotation_Quality.py | 57 +++++++++---------- 5 files changed, 42 insertions(+), 56 deletions(-) diff --git 
a/modules/contextualization/cdf_file_annotation/extraction_pipelines/ep_file_annotation.config.yaml b/modules/contextualization/cdf_file_annotation/extraction_pipelines/ep_file_annotation.config.yaml index 17e43f98..5316ef9e 100644 --- a/modules/contextualization/cdf_file_annotation/extraction_pipelines/ep_file_annotation.config.yaml +++ b/modules/contextualization/cdf_file_annotation/extraction_pipelines/ep_file_annotation.config.yaml @@ -120,14 +120,13 @@ config: operator: Exists targetProperty: diagramDetectJobId applyService: + autoApprovalThreshold: 1.0 + autoSuggestThreshold: 1.0 sinkNode: space: {{ patternModeInstanceSpace }} externalId: {{patternDetectSink}} - autoApprovalThreshold: 1.0 - autoSuggestThreshold: 1.0 - reportService: rawDb: {{ rawDb }} rawTableDocTag: {{ rawTableDocTag }} rawTableDocDoc: {{ rawTableDocDoc }} rawTableDocPattern: {{ rawTableDocPattern }} - rawBatchSize: 1000 + diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ApplyService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ApplyService.py index 22fa6395..f895e142 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ApplyService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ApplyService.py @@ -110,25 +110,25 @@ def process_and_apply_annotations_for_file( # Step 4: Apply all data model and RAW changes self.update_instances(list_node_apply=node_apply, list_edge_apply=regular_edges + pattern_edges) - db_name = self.config.finalize_function.report_service.raw_db + db_name = self.config.finalize_function.apply_service.raw_db if doc_rows: self.client.raw.rows.insert( db_name=db_name, - table_name=self.config.finalize_function.report_service.raw_table_doc_doc, + table_name=self.config.finalize_function.apply_service.raw_table_doc_doc, row=doc_rows, ensure_parent=True, ) if 
tag_rows: self.client.raw.rows.insert( db_name=db_name, - table_name=self.config.finalize_function.report_service.raw_table_doc_tag, + table_name=self.config.finalize_function.apply_service.raw_table_doc_tag, row=tag_rows, ensure_parent=True, ) if pattern_rows: self.client.raw.rows.insert( db_name=db_name, - table_name=self.config.finalize_function.report_service.raw_table_doc_pattern, + table_name=self.config.finalize_function.apply_service.raw_table_doc_pattern, row=pattern_rows, ensure_parent=True, ) @@ -160,14 +160,14 @@ def _delete_annotations_for_file(self, file_id: NodeId) -> dict[str, int]: self.client.data_modeling.instances.delete(edges=edge_ids) if doc_keys: self.client.raw.rows.delete( - db_name=self.config.finalize_function.report_service.raw_db, - table_name=self.config.finalize_function.report_service.raw_table_doc_doc, + db_name=self.config.finalize_function.apply_service.raw_db, + table_name=self.config.finalize_function.apply_service.raw_table_doc_doc, key=doc_keys, ) if tag_keys: self.client.raw.rows.delete( - db_name=self.config.finalize_function.report_service.raw_db, - table_name=self.config.finalize_function.report_service.raw_table_doc_tag, + db_name=self.config.finalize_function.apply_service.raw_db, + table_name=self.config.finalize_function.apply_service.raw_table_doc_tag, key=tag_keys, ) counts["doc"], counts["tag"] = len(doc_keys), len(tag_keys) @@ -182,8 +182,8 @@ def _delete_annotations_for_file(self, file_id: NodeId) -> dict[str, int]: self.client.data_modeling.instances.delete(edges=edge_ids) if row_keys: self.client.raw.rows.delete( - db_name=self.config.finalize_function.report_service.raw_db, - table_name=self.config.finalize_function.report_service.raw_table_doc_pattern, + db_name=self.config.finalize_function.apply_service.raw_db, + table_name=self.config.finalize_function.apply_service.raw_table_doc_pattern, key=row_keys, ) counts["pattern"] = len(row_keys) diff --git 
a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ConfigService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ConfigService.py index 15f71ede..8b5bd257 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ConfigService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ConfigService.py @@ -204,17 +204,13 @@ class RetrieveServiceConfig(BaseModel, alias_generator=to_camel): class ApplyServiceConfig(BaseModel, alias_generator=to_camel): - sink_node: NodeId auto_approval_threshold: float = Field(gt=0.0, le=1.0) auto_suggest_threshold: float = Field(gt=0.0, le=1.0) - - -class ReportServiceConfig(BaseModel, alias_generator=to_camel): + sink_node: NodeId raw_db: str raw_table_doc_tag: str raw_table_doc_doc: str raw_table_doc_pattern: str - raw_batch_size: int class FinalizeFunction(BaseModel, alias_generator=to_camel): @@ -222,7 +218,6 @@ class FinalizeFunction(BaseModel, alias_generator=to_camel): max_retry_attempts: int retrieve_service: RetrieveServiceConfig apply_service: ApplyServiceConfig - report_service: ReportServiceConfig class DataModelViews(BaseModel, alias_generator=to_camel): diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/ConfigService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/ConfigService.py index 15f71ede..8b5bd257 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/ConfigService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/ConfigService.py @@ -204,17 +204,13 @@ class RetrieveServiceConfig(BaseModel, alias_generator=to_camel): class ApplyServiceConfig(BaseModel, alias_generator=to_camel): - sink_node: NodeId auto_approval_threshold: float = 
Field(gt=0.0, le=1.0) auto_suggest_threshold: float = Field(gt=0.0, le=1.0) - - -class ReportServiceConfig(BaseModel, alias_generator=to_camel): + sink_node: NodeId raw_db: str raw_table_doc_tag: str raw_table_doc_doc: str raw_table_doc_pattern: str - raw_batch_size: int class FinalizeFunction(BaseModel, alias_generator=to_camel): @@ -222,7 +218,6 @@ class FinalizeFunction(BaseModel, alias_generator=to_camel): max_retry_attempts: int retrieve_service: RetrieveServiceConfig apply_service: ApplyServiceConfig - report_service: ReportServiceConfig class DataModelViews(BaseModel, alias_generator=to_camel): diff --git a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/pages/Annotation_Quality.py b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/pages/Annotation_Quality.py index eb775175..a9757f56 100644 --- a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/pages/Annotation_Quality.py +++ b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/pages/Annotation_Quality.py @@ -64,17 +64,16 @@ def reset_selection(): file_view = view_config["file"] target_entities_view = view_config["target_entities"] -report_config = ep_config.get("finalizeFunction", {}).get("reportService", {}) +apply_config = ep_config.get("finalizeFunction", {}).get("applyService", {}) cache_config = ep_config.get("launchFunction", {}).get("cacheService", {}) -db_name = report_config.get("rawDb") -pattern_table = report_config.get("rawTableDocPattern") -tag_table = report_config.get("rawTableDocTag") -doc_table = report_config.get("rawTableDocDoc") +db_name = apply_config.get("rawDb") +pattern_table = apply_config.get("rawTableDocPattern") +tag_table = apply_config.get("rawTableDocTag") +doc_table = apply_config.get("rawTableDocDoc") cache_table = cache_config.get("rawTableCache") manual_patterns_table = cache_config.get("rawManualPatternsCatalog") file_resource_property = 
ep_config.get("launchFunction", {}).get("fileResourceProperty", "") target_entities_resource_property = ep_config.get("launchFunction", {}).get("targetEntitiesResourceProperty", "") -apply_config = ep_config.get("finalizeFunction", {}).get("applyService", {}) if not all([db_name, pattern_table, tag_table, doc_table, cache_table, manual_patterns_table]): st.error("Could not find all required RAW table names in the pipeline configuration.") @@ -283,10 +282,7 @@ def reset_selection(): df_unmatched_filtered = df_metrics_input[df_metrics_input["startNodeText"].isin(unmatched_tags_list)] tag_to_files_unmatched = ( - df_unmatched_filtered.groupby("startNodeText")["startNode"] - .unique() - .apply(list) - .to_dict() + df_unmatched_filtered.groupby("startNodeText")["startNode"].unique().apply(list).to_dict() ) tag_occurrences = ( @@ -316,7 +312,7 @@ def reset_selection(): "Select": st.column_config.CheckboxColumn(required=True), "text": "Tag", "fileCount": "Associated Files", - "occurrenceCount": "Occurrences" + "occurrenceCount": "Occurrences", }, use_container_width=True, hide_index=True, @@ -326,7 +322,9 @@ def reset_selection(): selected_indices = unmatched_data_editor[unmatched_data_editor.Select].index.tolist() if len(selected_indices) > 1: - new_selection = [idx for idx in selected_indices if idx != st.session_state.selected_unmatched_overall_index] + new_selection = [ + idx for idx in selected_indices if idx != st.session_state.selected_unmatched_overall_index + ] st.session_state.selected_unmatched_overall_index = new_selection[0] if new_selection else None st.rerun() elif len(selected_indices) == 1: @@ -338,7 +336,7 @@ def reset_selection(): if st.session_state.selected_unmatched_overall_index is not None: selected_tag_row = unmatched_display.loc[st.session_state.selected_unmatched_overall_index] selected_tag_text = selected_tag_row["text"] - + show_connect_unmatched_ui( selected_tag_text, file_view, @@ -349,11 +347,10 @@ def reset_selection(): tab="overall", 
db_name=db_name, pattern_table=pattern_table, - apply_config=apply_config + apply_config=apply_config, ) - # ========================================== # PER-FILE ANALYSIS TAB # ========================================== @@ -589,7 +586,7 @@ def calculate_metrics(row): potential_tags_for_canvas = build_unmatched_tags_with_regions( df=df_metrics_input, file_id=selected_file, - potential_new_annotations=potential_new_annotations_set + potential_new_annotations=potential_new_annotations_set, ) canvas_url = generate_file_canvas( file_id=file_node_id, @@ -626,12 +623,13 @@ def calculate_metrics(row): "💡 Potential New Annotations in this File", len(potential_df), ) - + unmatched_display = potential_df[["startNodeText", "endNodeResourceType"]].copy() unmatched_display.insert(0, "Select", False) - + occurrences = ( - df_patterns_file[df_patterns_file["startNode"] == selected_file].groupby("startNodeText") + df_patterns_file[df_patterns_file["startNode"] == selected_file] + .groupby("startNodeText") .size() .reset_index(name="occurrenceCount") ) @@ -653,7 +651,7 @@ def calculate_metrics(row): "Select": st.column_config.CheckboxColumn(required=True), "startNodeText": "Tag", "endNodeResourceType": "Resource Type", - "occurrenceCount": "Occurrences" + "occurrenceCount": "Occurrences", }, use_container_width=True, hide_index=True, @@ -661,9 +659,11 @@ def calculate_metrics(row): ) selected_indices = unmatched_data_editor[unmatched_data_editor.Select].index.tolist() - + if len(selected_indices) > 1: - new_selection = [idx for idx in selected_indices if idx != st.session_state.selected_unmatched_per_file_index] + new_selection = [ + idx for idx in selected_indices if idx != st.session_state.selected_unmatched_per_file_index + ] st.session_state.selected_unmatched_per_file_index = new_selection[0] if new_selection else None st.rerun() elif len(selected_indices) == 1: @@ -685,9 +685,9 @@ def calculate_metrics(row): tab="per_file", db_name=db_name, pattern_table=pattern_table, - 
apply_config=apply_config + apply_config=apply_config, ) - + else: st.info("✔️ Select a file in the table above to see a detailed breakdown of its tags.") @@ -712,9 +712,8 @@ def calculate_metrics(row): "key": st.column_config.TextColumn("Scope Key", disabled=True), "sample": st.column_config.TextColumn("Pattern String", required=True), "annotation_type": st.column_config.SelectboxColumn( - "Annotation Type", - options=["diagrams.FileLink", "diagrams.AssetLink"], - required=True), + "Annotation Type", options=["diagrams.FileLink", "diagrams.AssetLink"], required=True + ), "resource_type": st.column_config.TextColumn("Resource Type", required=True), "scope_level": st.column_config.SelectboxColumn( "Scope Level", @@ -748,9 +747,7 @@ def calculate_metrics(row): st.write("2. Enter Pattern Details") new_pattern = st.text_input("Pattern String", placeholder="e.g., [PI]-00000") new_annotation_type = st.selectbox( - "Annotation Type", - ["diagrams.FileLink", "diagrams.AssetLink"], - key="new_annotation_type_selector" + "Annotation Type", ["diagrams.FileLink", "diagrams.AssetLink"], key="new_annotation_type_selector" ) new_resource_type = st.text_input("Resource Type", placeholder="e.g., Asset") From 8c811e3d346ef86fc104d10ae825a9c1ef6c4e75 Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Tue, 14 Oct 2025 14:03:04 -0500 Subject: [PATCH 083/128] added doc strings to each method in google format --- .../services/ApplyService.py | 290 +++++++++++++++--- .../services/FinalizeService.py | 197 +++++++++--- .../services/LoggerService.py | 96 +++++- .../services/PipelineService.py | 16 +- .../services/RetrieveService.py | 119 +++++-- .../services/AnnotationService.py | 58 +++- .../services/CacheService.py | 220 ++++++++++--- .../services/DataModelService.py | 230 +++++++++++--- .../services/LaunchService.py | 235 ++++++++++---- .../services/LoggerService.py | 96 +++++- .../services/PipelineService.py | 16 +- 11 files changed, 1306 insertions(+), 267 deletions(-) diff --git 
a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ApplyService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ApplyService.py index f895e142..12b30f7f 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ApplyService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ApplyService.py @@ -32,7 +32,11 @@ class IApplyService(abc.ABC): @abc.abstractmethod def process_and_apply_annotations_for_file( - self, file_node: Node, regular_item: dict | None, pattern_item: dict | None, clean_old: bool + self, + file_node: Node, + regular_item: dict | None, + pattern_item: dict | None, + clean_old: bool, ) -> tuple[str, str]: pass @@ -53,28 +57,57 @@ class GeneralApplyService(IApplyService): EXTERNAL_ID_LIMIT = 256 FUNCTION_ID = "fn_file_annotation_finalize" - def __init__(self, client: CogniteClient, config: Config, logger: CogniteFunctionLogger): + def __init__( + self, client: CogniteClient, config: Config, logger: CogniteFunctionLogger + ): self.client: CogniteClient = client self.config: Config = config self.logger: CogniteFunctionLogger = logger - self.core_annotation_view_id: ViewId = config.data_model_views.core_annotation_view.as_view_id() + self.core_annotation_view_id: ViewId = ( + config.data_model_views.core_annotation_view.as_view_id() + ) self.file_view_id: ViewId = config.data_model_views.file_view.as_view_id() self.file_annotation_type = config.data_model_views.file_view.annotation_type - self.approve_threshold = config.finalize_function.apply_service.auto_approval_threshold - self.suggest_threshold = config.finalize_function.apply_service.auto_suggest_threshold + self.approve_threshold = ( + config.finalize_function.apply_service.auto_approval_threshold + ) + self.suggest_threshold = ( + config.finalize_function.apply_service.auto_suggest_threshold + ) 
self.sink_node_ref = DirectRelationReference( space=config.finalize_function.apply_service.sink_node.space, external_id=config.finalize_function.apply_service.sink_node.external_id, ) def process_and_apply_annotations_for_file( - self, file_node: Node, regular_item: dict | None, pattern_item: dict | None, clean_old: bool + self, + file_node: Node, + regular_item: dict | None, + pattern_item: dict | None, + clean_old: bool, ) -> tuple[str, str]: """ - Performs the entire annotation transaction for a single file. + Performs the complete annotation workflow for a single file. + + Processes diagram detection results (regular and pattern mode), removes old annotations if needed, + creates annotation edges in the data model, writes annotation data to RAW tables, + and updates the file node's tag status. + + Args: + file_node: The file node instance to annotate. + regular_item: Dictionary containing regular diagram detect results. + pattern_item: Dictionary containing pattern mode diagram detect results. + clean_old: Whether to delete existing annotations before applying new ones. 
+ + Returns: + A tuple containing: + - Summary message of regular annotations applied + - Summary message of pattern annotations created """ file_id = file_node.as_id() - source_id = cast(str, file_node.properties.get(self.file_view_id, {}).get("sourceId")) + source_id = cast( + str, file_node.properties.get(self.file_view_id, {}).get("sourceId") + ) if clean_old: deleted_counts = self._delete_annotations_for_file(file_id) @@ -89,13 +122,17 @@ def process_and_apply_annotations_for_file( for annotation in regular_item["annotations"]: stable_hash = self._create_stable_hash(annotation) processed_hashes.add(stable_hash) - edges = self._detect_annotation_to_edge_applies(file_id, source_id, doc_rows, tag_rows, annotation) + edges = self._detect_annotation_to_edge_applies( + file_id, source_id, doc_rows, tag_rows, annotation + ) regular_edges.extend(edges.values()) # Step 2: Process pattern annotations, skipping any that were already processed pattern_edges, pattern_rows = [], [] if pattern_item and pattern_item.get("annotations"): - pattern_edges, pattern_rows = self._process_pattern_results(pattern_item, file_node, processed_hashes) + pattern_edges, pattern_rows = self._process_pattern_results( + pattern_item, file_node, processed_hashes + ) # Step 3: Update the file node tag node_apply = file_node.as_write() @@ -109,7 +146,9 @@ def process_and_apply_annotations_for_file( ) # Step 4: Apply all data model and RAW changes - self.update_instances(list_node_apply=node_apply, list_edge_apply=regular_edges + pattern_edges) + self.update_instances( + list_node_apply=node_apply, list_edge_apply=regular_edges + pattern_edges + ) db_name = self.config.finalize_function.apply_service.raw_db if doc_rows: self.client.raw.rows.insert( @@ -138,11 +177,36 @@ def process_and_apply_annotations_for_file( f"Created {len(pattern_rows)} new pattern detections.", ) - def update_instances(self, list_node_apply=None, list_edge_apply=None) -> InstancesApplyResult: - return 
self.client.data_modeling.instances.apply(nodes=list_node_apply, edges=list_edge_apply, replace=False) + def update_instances( + self, list_node_apply=None, list_edge_apply=None + ) -> InstancesApplyResult: + """ + Applies node and/or edge updates to the data model. + + Args: + list_node_apply: Optional NodeApply or list of NodeApply objects to update. + list_edge_apply: Optional EdgeApply or list of EdgeApply objects to update. + + Returns: + InstancesApplyResult containing the results of the apply operation. + """ + return self.client.data_modeling.instances.apply( + nodes=list_node_apply, edges=list_edge_apply, replace=False + ) def _delete_annotations_for_file(self, file_id: NodeId) -> dict[str, int]: - """Deletes all standard and pattern edges and their corresponding RAW rows for a file.""" + """ + Removes all existing annotations for a file from both data model and RAW tables. + + Deletes annotation edges (doc-to-doc, doc-to-tag, and pattern annotations) and their + corresponding RAW table entries to prepare for fresh annotations. + + Args: + file_id: NodeId of the file whose annotations should be deleted. + + Returns: + Dictionary with counts of deleted annotations: {"doc": int, "tag": int, "pattern": int}. + """ counts = {"doc": 0, "tag": 0, "pattern": 0} std_edges = self._list_annotations_for_file( @@ -191,9 +255,19 @@ def _delete_annotations_for_file(self, file_id: NodeId) -> dict[str, int]: def _list_annotations_for_file(self, node_id: NodeId, edge_instance_space: str): """ - List all annotation edges for a file node given the instance space of where the edges are stored. + Retrieves all annotation edges for a specific file from a given instance space. + + Args: + node_id: NodeId of the file to query annotations for. + edge_instance_space: Instance space where the annotation edges are stored. + + Returns: + EdgeList of all annotation edges connected to the file node. 
""" - start_node_filter = Equals(["edge", "startNode"], {"space": node_id.space, "externalId": node_id.external_id}) + start_node_filter = Equals( + ["edge", "startNode"], + {"space": node_id.space, "externalId": node_id.external_id}, + ) return self.client.data_modeling.instances.list( instance_type="edge", @@ -206,8 +280,27 @@ def _list_annotations_for_file(self, node_id: NodeId, edge_instance_space: str): def _process_pattern_results( self, result_item: dict, file_node: Node, existing_hashes: set ) -> tuple[list[EdgeApply], list[RowWrite]]: + """ + Processes pattern mode detection results into annotation edges and RAW rows. + + Creates pattern-based annotations that link to a sink node rather than specific entities, + allowing review and approval of pattern-detected annotations before linking to actual entities. + Skips patterns already covered by regular detection results. + + Args: + result_item: Dictionary containing pattern mode detection results. + file_node: The file node being annotated. + existing_hashes: Set of annotation hashes from regular detection to avoid duplicates. 
+ + Returns: + A tuple containing: + - List of EdgeApply objects for pattern annotations + - List of RowWrite objects for RAW table entries + """ file_id = file_node.as_id() - source_id = cast(str, file_node.properties.get(self.file_view_id, {}).get("sourceId")) + source_id = cast( + str, file_node.properties.get(self.file_view_id, {}).get("sourceId") + ) doc_patterns, edge_applies = [], [] for detect_annotation in result_item.get("annotations", []): stable_hash = self._create_stable_hash(detect_annotation) @@ -222,7 +315,8 @@ def _process_pattern_results( external_id = self._create_pattern_annotation_id(file_id, detect_annotation) now = datetime.now(timezone.utc).replace(microsecond=0) annotation_type = entity.get( - "annotation_type", self.config.data_model_views.target_entities_view.annotation_type + "annotation_type", + self.config.data_model_views.target_entities_view.annotation_type, ) annotation_properties = { "name": file_id.external_id, @@ -230,10 +324,22 @@ def _process_pattern_results( "status": DiagramAnnotationStatus.SUGGESTED.value, "tags": [], "startNodePageNumber": detect_annotation.get("region", {}).get("page"), - "startNodeXMin": min(v.get("x", 0) for v in detect_annotation.get("region", {}).get("vertices", [])), - "startNodeYMin": min(v.get("y", 0) for v in detect_annotation.get("region", {}).get("vertices", [])), - "startNodeXMax": max(v.get("x", 0) for v in detect_annotation.get("region", {}).get("vertices", [])), - "startNodeYMax": max(v.get("y", 0) for v in detect_annotation.get("region", {}).get("vertices", [])), + "startNodeXMin": min( + v.get("x", 0) + for v in detect_annotation.get("region", {}).get("vertices", []) + ), + "startNodeYMin": min( + v.get("y", 0) + for v in detect_annotation.get("region", {}).get("vertices", []) + ), + "startNodeXMax": max( + v.get("x", 0) + for v in detect_annotation.get("region", {}).get("vertices", []) + ), + "startNodeYMax": max( + v.get("y", 0) + for v in detect_annotation.get("region", 
{}).get("vertices", []) + ), "startNodeText": detect_annotation.get("text"), "sourceCreatedUser": self.FUNCTION_ID, "sourceUpdatedUser": self.FUNCTION_ID, @@ -243,10 +349,20 @@ def _process_pattern_results( edge_apply = EdgeApply( space=self.sink_node_ref.space, external_id=external_id, - type=DirectRelationReference(space=self.core_annotation_view_id.space, external_id=annotation_type), - start_node=DirectRelationReference(space=file_id.space, external_id=file_id.external_id), + type=DirectRelationReference( + space=self.core_annotation_view_id.space, + external_id=annotation_type, + ), + start_node=DirectRelationReference( + space=file_id.space, external_id=file_id.external_id + ), end_node=self.sink_node_ref, - sources=[NodeOrEdgeData(source=self.core_annotation_view_id, properties=annotation_properties)], + sources=[ + NodeOrEdgeData( + source=self.core_annotation_view_id, + properties=annotation_properties, + ) + ], ) edge_applies.append(edge_apply) row_columns = { @@ -273,6 +389,22 @@ def _detect_annotation_to_edge_applies( doc_tag: list[RowWrite], detect_annotation: dict[str, Any], ) -> dict[tuple, EdgeApply]: + """ + Converts a single detection annotation into edge applies and RAW row writes. + + Creates annotation edges linking the file to detected entities, applying confidence thresholds + to determine approval/suggestion status. Also creates corresponding RAW table entries. + + Args: + file_instance_id: NodeId of the file being annotated. + source_id: Source ID of the file for RAW table logging. + doc_doc: List to append doc-to-doc annotation RAW rows to. + doc_tag: List to append doc-to-tag annotation RAW rows to. + detect_annotation: Dictionary containing a single detection result. + + Returns: + Dictionary mapping edge keys to EdgeApply objects (deduplicated by start/end/type). 
+ """ diagram_annotations = {} for entity in detect_annotation.get("entities", []): if detect_annotation.get("confidence", 0.0) >= self.approve_threshold: @@ -282,17 +414,31 @@ def _detect_annotation_to_edge_applies( else: continue - external_id = self._create_annotation_id(file_instance_id, entity, detect_annotation) + external_id = self._create_annotation_id( + file_instance_id, entity, detect_annotation + ) now = datetime.now(timezone.utc).replace(microsecond=0) annotation_properties = { "name": file_instance_id.external_id, "confidence": detect_annotation.get("confidence"), "status": status, "startNodePageNumber": detect_annotation.get("region", {}).get("page"), - "startNodeXMin": min(v.get("x", 0) for v in detect_annotation.get("region", {}).get("vertices", [])), - "startNodeYMin": min(v.get("y", 0) for v in detect_annotation.get("region", {}).get("vertices", [])), - "startNodeXMax": max(v.get("x", 0) for v in detect_annotation.get("region", {}).get("vertices", [])), - "startNodeYMax": max(v.get("y", 0) for v in detect_annotation.get("region", {}).get("vertices", [])), + "startNodeXMin": min( + v.get("x", 0) + for v in detect_annotation.get("region", {}).get("vertices", []) + ), + "startNodeYMin": min( + v.get("y", 0) + for v in detect_annotation.get("region", {}).get("vertices", []) + ), + "startNodeXMax": max( + v.get("x", 0) + for v in detect_annotation.get("region", {}).get("vertices", []) + ), + "startNodeYMax": max( + v.get("y", 0) + for v in detect_annotation.get("region", {}).get("vertices", []) + ), "startNodeText": detect_annotation.get("text"), "sourceCreatedUser": self.FUNCTION_ID, "sourceUpdatedUser": self.FUNCTION_ID, @@ -303,13 +449,22 @@ def _detect_annotation_to_edge_applies( space=file_instance_id.space, external_id=external_id, type=DirectRelationReference( - space=self.core_annotation_view_id.space, external_id=entity.get("annotation_type") + space=self.core_annotation_view_id.space, + external_id=entity.get("annotation_type"), ), 
start_node=DirectRelationReference( - space=file_instance_id.space, external_id=file_instance_id.external_id + space=file_instance_id.space, + external_id=file_instance_id.external_id, ), - end_node=DirectRelationReference(space=entity.get("space"), external_id=entity.get("external_id")), - sources=[NodeOrEdgeData(source=self.core_annotation_view_id, properties=annotation_properties)], + end_node=DirectRelationReference( + space=entity.get("space"), external_id=entity.get("external_id") + ), + sources=[ + NodeOrEdgeData( + source=self.core_annotation_view_id, + properties=annotation_properties, + ) + ], ) key = self._get_edge_apply_unique_key(edge) if key not in diagram_annotations: @@ -336,16 +491,47 @@ def _detect_annotation_to_edge_applies( def _create_stable_hash(self, raw_annotation: dict[str, Any]) -> str: """ - Creates a hash based off items of a potential annotation. This is used such that we don't create duplicate annotations for pattern mode and regular results. + Generates a stable hash for an annotation to enable deduplication. + + Creates a deterministic hash based on annotation text, page, and bounding box vertices, + ensuring that identical detections from regular and pattern mode are recognized as duplicates. + + Args: + raw_annotation: Dictionary containing annotation detection data. + + Returns: + 10-character hash string representing the annotation. 
""" text = raw_annotation.get("text", "") region = raw_annotation.get("region", {}) vertices = region.get("vertices", []) sorted_vertices = sorted(vertices, key=lambda v: (v.get("x", 0), v.get("y", 0))) - stable_representation = {"text": text, "page": region.get("page"), "vertices": sorted_vertices} - return sha256(json.dumps(stable_representation, sort_keys=True).encode()).hexdigest()[:10] + stable_representation = { + "text": text, + "page": region.get("page"), + "vertices": sorted_vertices, + } + return sha256( + json.dumps(stable_representation, sort_keys=True).encode() + ).hexdigest()[:10] + + def _create_annotation_id( + self, file_id: NodeId, entity: dict[str, Any], raw_annotation: dict[str, Any] + ) -> str: + """ + Creates a unique external ID for a regular annotation edge. + + Combines file ID, entity ID, detected text, and hash to create a human-readable + yet unique identifier, truncating if necessary to stay within CDF's 256 character limit. - def _create_annotation_id(self, file_id: NodeId, entity: dict[str, Any], raw_annotation: dict[str, Any]) -> str: + Args: + file_id: NodeId of the file being annotated. + entity: Dictionary containing the detected entity information. + raw_annotation: Dictionary containing annotation detection data. + + Returns: + Unique external ID string for the annotation edge. + """ hash_ = self._create_stable_hash(raw_annotation) text = raw_annotation.get("text", "") naive = f"{file_id.external_id}:{entity.get('external_id')}:{text}:{hash_}" @@ -356,7 +542,22 @@ def _create_annotation_id(self, file_id: NodeId, entity: dict[str, Any], raw_ann prefix = prefix[: self.EXTERNAL_ID_LIMIT - 11] return f"{prefix}:{hash_}" - def _create_pattern_annotation_id(self, file_id: NodeId, raw_annotation: dict[str, Any]) -> str: + def _create_pattern_annotation_id( + self, file_id: NodeId, raw_annotation: dict[str, Any] + ) -> str: + """ + Creates a unique external ID for a pattern annotation edge. 
+ + Similar to regular annotations but prefixed with "pattern:" to distinguish pattern-detected + annotations that link to sink nodes rather than specific entities. + + Args: + file_id: NodeId of the file being annotated. + raw_annotation: Dictionary containing annotation detection data. + + Returns: + Unique external ID string for the pattern annotation edge. + """ hash_ = self._create_stable_hash(raw_annotation) text = raw_annotation.get("text", "") prefix = f"pattern:{file_id.external_id}:{text}" @@ -365,6 +566,17 @@ def _create_pattern_annotation_id(self, file_id: NodeId, raw_annotation: dict[st return f"{prefix}:{hash_}" def _get_edge_apply_unique_key(self, edge_apply_instance: EdgeApply) -> tuple: + """ + Generates a unique key for an edge based on its start node, end node, and type. + + Used for deduplication to prevent creating multiple edges with identical connections. + + Args: + edge_apply_instance: EdgeApply object to generate key for. + + Returns: + Tuple of (start_node_tuple, end_node_tuple, type_tuple) for deduplication. 
+ """ start_node = edge_apply_instance.start_node end_node = edge_apply_instance.end_node type_ = edge_apply_instance.type diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/FinalizeService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/FinalizeService.py index 15bb077b..42304ebb 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/FinalizeService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/FinalizeService.py @@ -75,24 +75,52 @@ def __init__( apply_service, ) - self.annotation_state_view: ViewPropertyConfig = config.data_model_views.annotation_state_view + self.annotation_state_view: ViewPropertyConfig = ( + config.data_model_views.annotation_state_view + ) self.file_view: ViewPropertyConfig = config.data_model_views.file_view self.page_range: int = config.launch_function.annotation_service.page_range self.max_retries: int = config.finalize_function.max_retry_attempts - self.clean_old_annotations: bool = config.finalize_function.clean_old_annotations + self.clean_old_annotations: bool = ( + config.finalize_function.clean_old_annotations + ) self.function_id: int | None = function_call_info.get("function_id") self.call_id: int | None = function_call_info.get("call_id") def run(self) -> Literal["Done"] | None: + """ + Main execution loop for finalizing diagram detection jobs. + + Retrieves completed jobs, fetches their results, processes annotations for each file, + and updates annotation state instances. Handles multi-page files by tracking progress + and requeueing files with remaining pages. + + Args: + None + + Returns: + "Done" if no jobs available, None if processing should continue. + + Raises: + CogniteAPIError: Various API errors are handled gracefully (version conflicts, + timeouts, etc.). 
+ """ self.logger.info("Starting Finalize Function", section="START") try: - job_id, pattern_mode_job_id, file_to_state_map = self.retrieve_service.get_job_id() + job_id, pattern_mode_job_id, file_to_state_map = ( + self.retrieve_service.get_job_id() + ) if not job_id or not file_to_state_map: self.logger.info("No diagram detect jobs found", section="END") return "Done" - self.logger.info(f"Retrieved job id ({job_id}) and claimed {len(file_to_state_map.values())} files") + self.logger.info( + f"Retrieved job id ({job_id}) and claimed {len(file_to_state_map.values())} files" + ) except CogniteAPIError as e: - if e.code == 400 and e.message == "A version conflict caused the ingest to fail.": + if ( + e.code == 400 + and e.message == "A version conflict caused the ingest to fail." + ): self.logger.info( message=f"Retrieved job id that has already been claimed. Grabbing another job.", section="END", @@ -100,9 +128,12 @@ def run(self) -> Literal["Done"] | None: return elif ( e.code == 408 - and e.message == "Graph query timed out. Reduce load or contention, or optimise your query." + and e.message + == "Graph query timed out. Reduce load or contention, or optimise your query." 
): - self.logger.error(message=f"Ran into the following error:\n{str(e)}", section="END") + self.logger.error( + message=f"Ran into the following error:\n{str(e)}", section="END" + ) return else: raise e @@ -112,7 +143,11 @@ def run(self) -> Literal["Done"] | None: try: job_results = self.retrieve_service.get_diagram_detect_job_result(job_id) if pattern_mode_job_id: - pattern_mode_job_results = self.retrieve_service.get_diagram_detect_job_result(pattern_mode_job_id) + pattern_mode_job_results = ( + self.retrieve_service.get_diagram_detect_job_result( + pattern_mode_job_id + ) + ) except Exception as e: self.logger.info( message=f"Unfinalizing {len(file_to_state_map.keys())} files - job id ({job_id}) is a bad gateway", @@ -146,16 +181,22 @@ def run(self) -> Literal["Done"] | None: return self.logger.info( - f"Both jobs ({job_id}, {pattern_mode_job_id}) complete. Applying all annotations.", section="END" + f"Both jobs ({job_id}, {pattern_mode_job_id}) complete. Applying all annotations.", + section="END", ) merged_results = { - (item["fileInstanceId"]["space"], item["fileInstanceId"]["externalId"]): {"regular": item} + (item["fileInstanceId"]["space"], item["fileInstanceId"]["externalId"]): { + "regular": item + } for item in job_results["items"] } if pattern_mode_job_results: for item in pattern_mode_job_results["items"]: - key = (item["fileInstanceId"]["space"], item["fileInstanceId"]["externalId"]) + key = ( + item["fileInstanceId"]["space"], + item["fileInstanceId"]["externalId"], + ) if key in merged_results: merged_results[key]["pattern"] = item else: @@ -174,27 +215,34 @@ def run(self) -> Literal["Done"] | None: annotation_state_node = file_to_state_map[file_id] current_attempt = cast( - int, annotation_state_node.properties[self.annotation_state_view.as_view_id()]["attemptCount"] + int, + annotation_state_node.properties[ + self.annotation_state_view.as_view_id() + ]["attemptCount"], ) next_attempt = current_attempt + 1 try: self.logger.info(f"Processing file 
{file_id}:") - annotation_msg, pattern_msg = self.apply_service.process_and_apply_annotations_for_file( - file_node, - results.get("regular"), - results.get("pattern"), - self.clean_old_annotations - and annotation_state_node.properties[self.annotation_state_view.as_view_id()].get( - "annotatedPageCount" + annotation_msg, pattern_msg = ( + self.apply_service.process_and_apply_annotations_for_file( + file_node, + results.get("regular"), + results.get("pattern"), + self.clean_old_annotations + and annotation_state_node.properties[ + self.annotation_state_view.as_view_id() + ].get("annotatedPageCount") + is None, ) - is None, ) self.logger.info(f"\t- {annotation_msg}\n\t- {pattern_msg}") # Logic to handle multi-page files page_count = results.get("regular", {}).get("pageCount", 1) - annotated_pages = self._check_all_pages_annotated(annotation_state_node, page_count) + annotated_pages = self._check_all_pages_annotated( + annotation_state_node, page_count + ) if annotated_pages == page_count: job_node_to_update = self._process_annotation_state( @@ -220,7 +268,9 @@ def run(self) -> Literal["Done"] | None: count_success += 1 # Still a success for this batch except Exception as e: - self.logger.error(f"Failed to process annotations for file {file_id}: {e}") + self.logger.error( + f"Failed to process annotations for file {file_id}: {e}" + ) if next_attempt >= self.max_retries: job_node_to_update = self._process_annotation_state( annotation_state_node, @@ -245,17 +295,25 @@ def run(self) -> Literal["Done"] | None: # Batch update the state nodes at the end if annotation_state_node_applies: self.logger.info( - f"Updating {len(annotation_state_node_applies)} annotation state instances", section="START" + f"Updating {len(annotation_state_node_applies)} annotation state instances", + section="START", ) try: - self.apply_service.update_instances(list_node_apply=annotation_state_node_applies) + self.apply_service.update_instances( + list_node_apply=annotation_state_node_applies + ) 
self.logger.info( f"\t- {count_success} set to Annotated/New\n\t- {count_retry} set to Retry\n\t- {count_failed} set to Failed" ) except Exception as e: - self.logger.error(f"Error during batch update of annotation states: {e}", section="END") + self.logger.error( + f"Error during batch update of annotation states: {e}", + section="END", + ) - self.tracker.add_files(success=count_success, failed=(count_failed + count_retry)) + self.tracker.add_files( + success=count_success, failed=(count_failed + count_retry) + ) return None def _process_annotation_state( @@ -269,7 +327,25 @@ def _process_annotation_state( pattern_mode_message: str | None = None, ) -> NodeApply: """ - Create a node apply from the node passed into the function. + Creates a NodeApply to update an annotation state instance with processing results. + + Updates status, attempt count, timestamps, and page tracking for multi-page files. + The annotatedPageCount and pageCount properties are updated based on progress through + the file's pages. + + Args: + node: The annotation state node to update. + status: New annotation status (ANNOTATED, FAILED, NEW, RETRY). + attempt_count: Current attempt count for this file. + annotated_page_count: Number of pages successfully annotated so far. + page_count: Total number of pages in the file. + annotation_message: Message describing regular annotation results. + pattern_mode_message: Message describing pattern mode results. + + Returns: + NodeApply object ready to be applied to update the annotation state. + + NOTE: Create a node apply from the node passed into the function. The annotatedPageCount and pageCount properties won't be set if this is the first time the job has been run for the specific node. Thus, we set it here and include logic to handle the scneario where it is set. 
NOTE: Always want to use the latest page count from the diagram detect results @@ -287,7 +363,9 @@ def _process_annotation_state( """ update_properties = { "annotationStatus": status, - "sourceUpdatedTime": datetime.now(timezone.utc).replace(microsecond=0).isoformat(), + "sourceUpdatedTime": datetime.now(timezone.utc) + .replace(microsecond=0) + .isoformat(), "annotationMessage": annotation_message, "patternModeMessage": pattern_mode_message, "attemptCount": attempt_count, @@ -314,7 +392,19 @@ def _process_annotation_state( def _check_all_pages_annotated(self, node: Node, page_count: int) -> int: """ - The annotatedPageCount and pageCount properties won't be set if this is the first time the job has been run for the specific node. + Calculates how many pages have been annotated after this batch completes. + + Handles progressive annotation of multi-page files by tracking which pages have been + processed based on the configured page_range batch size. + + Args: + node: The annotation state node being processed. + page_count: Total number of pages in the file from diagram detect results. + + Returns: + Number of pages annotated after this batch (includes previous batches). + + NOTE: The annotatedPageCount and pageCount properties won't be set if this is the first time the job has been run for the specific node. 
- if annotated_page_count is not set (first run): - if page_range >= to the page count: - annotated_page_count = page_count b/c all of the pages were passed into the FileReference during LaunchService @@ -328,7 +418,9 @@ def _check_all_pages_annotated(self, node: Node, page_count: int) -> int: """ annotated_page_count: int | None = cast( int, - node.properties[self.annotation_state_view.as_view_id()].get("annotatedPageCount"), + node.properties[self.annotation_state_view.as_view_id()].get( + "annotatedPageCount" + ), ) if not annotated_page_count: @@ -336,14 +428,18 @@ def _check_all_pages_annotated(self, node: Node, page_count: int) -> int: annotated_page_count = page_count else: annotated_page_count = self.page_range - self.logger.info(f"Annotated pages 1-to-{annotated_page_count} out of {page_count} total pages") + self.logger.info( + f"Annotated pages 1-to-{annotated_page_count} out of {page_count} total pages" + ) else: start_page = annotated_page_count + 1 if (annotated_page_count + self.page_range) >= page_count: annotated_page_count = page_count else: annotated_page_count += self.page_range - self.logger.info(f"Annotated pages {start_page}-to-{annotated_page_count} out of {page_count} total pages") + self.logger.info( + f"Annotated pages {start_page}-to-{annotated_page_count} out of {page_count} total pages" + ) return annotated_page_count @@ -354,16 +450,31 @@ def _update_batch_state( failed: bool = False, ): """ - Updates the properties of FileAnnnotationState + Updates annotation state instances in bulk, typically for error scenarios. + + Used when jobs are incomplete or failed to reset job IDs and update status for + retry or re-queuing. + + Args: + batch: BatchOfNodes containing annotation state nodes to update. + status: New annotation status to set for all nodes. + failed: Whether this is a failure scenario (clears job IDs if True). 
+ + Returns: + None """ if len(batch.nodes) == 0: return - self.logger.info(message=f"Updating {len(batch.nodes)} annotation state instances") + self.logger.info( + message=f"Updating {len(batch.nodes)} annotation state instances" + ) if failed: update_properties = { "annotationStatus": status, - "sourceUpdatedTime": datetime.now(timezone.utc).replace(microsecond=0).isoformat(), + "sourceUpdatedTime": datetime.now(timezone.utc) + .replace(microsecond=0) + .isoformat(), "diagramDetectJobId": None, "patternModeJobId": None, } @@ -373,7 +484,9 @@ def _update_batch_state( ) else: if status == AnnotationStatus.PROCESSING: - claimed_time = batch.nodes[0].properties[self.annotation_state_view.as_view_id()]["sourceUpdatedTime"] + claimed_time = batch.nodes[0].properties[ + self.annotation_state_view.as_view_id() + ]["sourceUpdatedTime"] update_properties = { "annotationStatus": status, "sourceUpdatedTime": claimed_time, @@ -381,14 +494,18 @@ def _update_batch_state( else: update_properties = { "annotationStatus": status, - "sourceUpdatedTime": datetime.now(timezone.utc).replace(microsecond=0).isoformat(), + "sourceUpdatedTime": datetime.now(timezone.utc) + .replace(microsecond=0) + .isoformat(), } batch.update_node_properties( new_properties=update_properties, view_id=self.annotation_state_view.as_view_id(), ) try: - update_results = self.apply_service.update_instances(list_node_apply=batch.apply) + update_results = self.apply_service.update_instances( + list_node_apply=batch.apply + ) self.logger.info(f"- set annotation status to {status}") except Exception as e: self.logger.error( @@ -396,5 +513,7 @@ def _update_batch_state( section="END", ) time.sleep(30) - update_results = self.apply_service.update_instances(list_node_apply=batch.apply) + update_results = self.apply_service.update_instances( + list_node_apply=batch.apply + ) self.logger.info(f"- set annotation status to {status}") diff --git 
a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/LoggerService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/LoggerService.py index 17f24d6b..c9191137 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/LoggerService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/LoggerService.py @@ -21,10 +21,22 @@ def __init__( os.makedirs(dir_name, exist_ok=True) self.file_handler = open(self.filepath, "a", encoding="utf-8") except Exception as e: - print(f"[LOGGER_SETUP_ERROR] Could not open log file {self.filepath}: {e}") + print( + f"[LOGGER_SETUP_ERROR] Could not open log file {self.filepath}: {e}" + ) self.write = False def _format_message_lines(self, prefix: str, message: str) -> list[str]: + """ + Formats multi-line messages with consistent indentation. + + Args: + prefix: The log level prefix (e.g., "[INFO]", "[ERROR]"). + message: The message to format. + + Returns: + List of formatted message lines with proper indentation. + """ formatted_lines = [] if "\n" not in message: formatted_lines.append(f"{prefix} {message}") @@ -37,6 +49,16 @@ def _format_message_lines(self, prefix: str, message: str) -> list[str]: return formatted_lines def _print(self, prefix: str, message: str) -> None: + """ + Prints formatted log messages to console and optionally to file. + + Args: + prefix: The log level prefix to prepend to the message. + message: The message to log. 
+ + Returns: + None + """ lines_to_log = self._format_message_lines(prefix, message) if self.write and self.file_handler: try: @@ -50,7 +72,19 @@ def _print(self, prefix: str, message: str) -> None: for line in lines_to_log: print(line) - def debug(self, message: str, section: Literal["START", "END", "BOTH"] | None = None) -> None: + def debug( + self, message: str, section: Literal["START", "END", "BOTH"] | None = None + ) -> None: + """ + Logs a debug-level message. + + Args: + message: The debug message to log. + section: Optional section separator position (START, END, or BOTH). + + Returns: + None + """ if section == "START" or section == "BOTH": self._section() if self.log_level == "DEBUG": @@ -58,7 +92,19 @@ def debug(self, message: str, section: Literal["START", "END", "BOTH"] | None = if section == "END" or section == "BOTH": self._section() - def info(self, message: str, section: Literal["START", "END", "BOTH"] | None = None) -> None: + def info( + self, message: str, section: Literal["START", "END", "BOTH"] | None = None + ) -> None: + """ + Logs an info-level message. + + Args: + message: The informational message to log. + section: Optional section separator position (START, END, or BOTH). + + Returns: + None + """ if section == "START" or section == "BOTH": self._section() if self.log_level in ("DEBUG", "INFO"): @@ -66,7 +112,19 @@ def info(self, message: str, section: Literal["START", "END", "BOTH"] | None = N if section == "END" or section == "BOTH": self._section() - def warning(self, message: str, section: Literal["START", "END", "BOTH"] | None = None) -> None: + def warning( + self, message: str, section: Literal["START", "END", "BOTH"] | None = None + ) -> None: + """ + Logs a warning-level message. + + Args: + message: The warning message to log. + section: Optional section separator position (START, END, or BOTH). 
+ + Returns: + None + """ if section == "START" or section == "BOTH": self._section() if self.log_level in ("DEBUG", "INFO", "WARNING"): @@ -74,7 +132,19 @@ def warning(self, message: str, section: Literal["START", "END", "BOTH"] | None if section == "END" or section == "BOTH": self._section() - def error(self, message: str, section: Literal["START", "END", "BOTH"] | None = None) -> None: + def error( + self, message: str, section: Literal["START", "END", "BOTH"] | None = None + ) -> None: + """ + Logs an error-level message. + + Args: + message: The error message to log. + section: Optional section separator position (START, END, or BOTH). + + Returns: + None + """ if section == "START" or section == "BOTH": self._section() self._print("[ERROR]", message) @@ -82,13 +152,27 @@ def error(self, message: str, section: Literal["START", "END", "BOTH"] | None = self._section() def _section(self) -> None: + """ + Prints a visual separator line for log sections. + + Returns: + None + """ if self.write and self.file_handler: self.file_handler.write( "--------------------------------------------------------------------------------\n" ) - print("--------------------------------------------------------------------------------") + print( + "--------------------------------------------------------------------------------" + ) def close(self) -> None: + """ + Closes the file handler if file logging is enabled. 
+ + Returns: + None + """ if self.file_handler: try: self.file_handler.close() diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/PipelineService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/PipelineService.py index 5dd95bc7..7cf5d885 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/PipelineService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/PipelineService.py @@ -36,7 +36,13 @@ def __init__(self, pipeline_ext_id: str, client: CogniteClient): def update_extraction_pipeline(self, msg: str) -> None: """ - Update the message log for the extraction pipeline + Appends a message to the extraction pipeline run log. + + Args: + msg: The message to append to the pipeline log. + + Returns: + None """ if not self.ep_write.message: self.ep_write.message = msg @@ -48,7 +54,13 @@ def upload_extraction_pipeline( status: Literal["success", "failure", "seen"], ) -> None: """ - Upload the extraction pipeline run so that status and message logs are captured + Creates an extraction pipeline run with accumulated status and messages. + + Args: + status: The run status to report (success, failure, or seen). 
+ + Returns: + None """ self.ep_write.status = status self.client.extraction_pipelines.runs.create(self.ep_write) diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/RetrieveService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/RetrieveService.py index ed6ae39e..2fd7521e 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/RetrieveService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/RetrieveService.py @@ -34,7 +34,9 @@ def get_diagram_detect_job_result(self, job_id: int) -> dict | None: pass @abc.abstractmethod - def get_job_id(self) -> tuple[int, int | None, dict[NodeId, Node]] | tuple[None, None, None]: + def get_job_id( + self, + ) -> tuple[int, int | None, dict[NodeId, Node]] | tuple[None, None, None]: pass @@ -43,18 +45,38 @@ class GeneralRetrieveService(IRetrieveService): Interface for retrieving diagram detect jobs """ - def __init__(self, client: CogniteClient, config: Config, logger: CogniteFunctionLogger): + def __init__( + self, client: CogniteClient, config: Config, logger: CogniteFunctionLogger + ): self.client = client self.config = config self.logger: CogniteFunctionLogger = logger - self.annotation_state_view: ViewPropertyConfig = self.config.data_model_views.annotation_state_view + self.annotation_state_view: ViewPropertyConfig = ( + self.config.data_model_views.annotation_state_view + ) self.file_view: ViewPropertyConfig = self.config.data_model_views.file_view - self.filter_jobs: Filter = build_filter_from_query(config.finalize_function.retrieve_service.get_job_id_query) - self.job_api: str = f"/api/v1/projects/{self.client.config.project}/context/diagram/detect" + self.filter_jobs: Filter = build_filter_from_query( + config.finalize_function.retrieve_service.get_job_id_query + ) + self.job_api: str = ( + 
f"/api/v1/projects/{self.client.config.project}/context/diagram/detect" + ) def get_diagram_detect_job_result(self, job_id: int) -> dict | None: + """ + Retrieves the results of a diagram detection job by job ID. + + Polls the diagram detect API to check if a job has completed and returns the results + if available. + + Args: + job_id: The diagram detection job ID to retrieve results for. + + Returns: + Dictionary containing job results if completed, None if still processing or failed. + """ url = f"{self.job_api}/{job_id}" result = None response = self.client.get(url) @@ -66,12 +88,35 @@ def get_diagram_detect_job_result(self, job_id: int) -> dict | None: else: self.logger.debug(f"{job_id} - Job not complete") else: - self.logger.debug(f"{job_id} - Request to get job result failed with {response.status_code} code") + self.logger.debug( + f"{job_id} - Request to get job result failed with {response.status_code} code" + ) return - def get_job_id(self) -> tuple[int, int | None, dict[NodeId, Node]] | tuple[None, None, None]: + def get_job_id( + self, + ) -> tuple[int, int | None, dict[NodeId, Node]] | tuple[None, None, None]: """ - To ensure threads are protected, we do the following... + Retrieves and claims an available diagram detection job for processing. + + Implements optimistic locking to ensure thread-safe job claiming across parallel + function executions. Queries for jobs ready to finalize and attempts to claim them + by updating their status to "Finalizing". + + Args: + None + + Returns: + A tuple containing: + - Regular diagram detection job ID + - Optional pattern mode job ID + - Dictionary mapping file NodeIds to their annotation state nodes + Returns (None, None, None) if no jobs are available. + + Raises: + CogniteAPIError: If another thread has already claimed the job (version conflict). + + NOTE: To ensure threads are protected, we do the following... 1. Query for an available job id 2. Find all annotation state nodes with that job id 3. 
Claim those nodes by providing the existing version in the node apply request @@ -85,7 +130,9 @@ def get_job_id(self) -> tuple[int, int | None, dict[NodeId, Node]] | tuple[None, sort_by_time = [] sort_by_time.append( instances.InstanceSort( - property=self.annotation_state_view.as_property_ref("sourceUpdatedTime"), + property=self.annotation_state_view.as_property_ref( + "sourceUpdatedTime" + ), direction="ascending", ) ) @@ -105,11 +152,13 @@ def get_job_id(self) -> tuple[int, int | None, dict[NodeId, Node]] | tuple[None, job_node: Node = annotation_state_instance.pop(-1) job_id: int = cast( int, - job_node.properties[self.annotation_state_view.as_view_id()]["diagramDetectJobId"], - ) - pattern_mode_job_id: int | None = job_node.properties[self.annotation_state_view.as_view_id()].get( - "patternModeJobId" + job_node.properties[self.annotation_state_view.as_view_id()][ + "diagramDetectJobId" + ], ) + pattern_mode_job_id: int | None = job_node.properties[ + self.annotation_state_view.as_view_id() + ].get("patternModeJobId") filter_job_id = Equals( property=self.annotation_state_view.as_property_ref("diagramDetectJobId"), @@ -131,15 +180,35 @@ def get_job_id(self) -> tuple[int, int | None, dict[NodeId, Node]] | tuple[None, # NOTE: could bundle this with the attempt to claim loop. Chose not to since the run time gains is negligible and improves readability. 
file_to_state_map: dict[NodeId, Node] = {} for node in list_job_nodes: - file_reference = node.properties.get(self.annotation_state_view.as_view_id()).get("linkedFile") - file_node_id = NodeId(space=file_reference["space"], external_id=file_reference["externalId"]) + file_reference = node.properties.get( + self.annotation_state_view.as_view_id() + ).get("linkedFile") + file_node_id = NodeId( + space=file_reference["space"], external_id=file_reference["externalId"] + ) file_to_state_map[file_node_id] = node return job_id, pattern_mode_job_id, file_to_state_map def _attempt_to_claim(self, list_job_nodes_to_claim: NodeApplyList) -> None: """ - (Optimistic locking based off the node version) + Attempts to claim annotation state nodes using optimistic locking. + + Updates node status from "Processing" to "Finalizing" while preserving existing version + for conflict detection. Includes client-side validation to handle read-after-write + consistency edge cases. + + Args: + list_job_nodes_to_claim: NodeApplyList of annotation state nodes to claim. + + Returns: + None + + Raises: + CogniteAPIError: If another thread has claimed the job (version conflict) or if + client-side lock bypass detection triggers. + + NOTE: (Optimistic locking based off the node version) Attempt to 'claim' the annotation state nodes by updating the annotation status property. This relies on how the API applies changes to nodes. Specifically... if an existing version is provided in the nodes that are used for the .apply() endpoint, a version conflict will occur if another thread has already claimed the job. @@ -162,12 +231,22 @@ def _attempt_to_claim(self, list_job_nodes_to_claim: NodeApplyList) -> None: must manually raise an error to prevent the duplicate claim. 
""" for node_apply in list_job_nodes_to_claim: - if node_apply.sources[0].properties["annotationStatus"] == AnnotationStatus.PROCESSING: + if ( + node_apply.sources[0].properties["annotationStatus"] + == AnnotationStatus.PROCESSING + ): node_apply.sources[0].properties["annotationStatus"] = AnnotationStatus.FINALIZING # type: ignore - elif node_apply.sources[0].properties["annotationStatus"] == AnnotationStatus.FINALIZING: + elif ( + node_apply.sources[0].properties["annotationStatus"] + == AnnotationStatus.FINALIZING + ): self.logger.debug("Lock bypassed. Caught on the client-side.") - raise CogniteAPIError(message="A version conflict caused the ingest to fail.", code=400) + raise CogniteAPIError( + message="A version conflict caused the ingest to fail.", code=400 + ) - update_results = self.client.data_modeling.instances.apply(nodes=list_job_nodes_to_claim) + update_results = self.client.data_modeling.instances.apply( + nodes=list_job_nodes_to_claim + ) return diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/AnnotationService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/AnnotationService.py index 2969461b..8ae8fe7d 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/AnnotationService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/AnnotationService.py @@ -19,11 +19,15 @@ class IAnnotationService(abc.ABC): """ @abc.abstractmethod - def run_diagram_detect(self, files: list[FileReference], entities: list[dict[str, Any]]) -> int: + def run_diagram_detect( + self, files: list[FileReference], entities: list[dict[str, Any]] + ) -> int: pass @abc.abstractmethod - def run_pattern_mode_detect(self, files: list[FileReference], pattern_samples: list[dict[str, Any]]) -> int: + def run_pattern_mode_detect( + self, files: list[FileReference], pattern_samples: 
list[dict[str, Any]] + ) -> int: pass @@ -33,7 +37,9 @@ class GeneralAnnotationService(IAnnotationService): Build a queue of files that are in the annotation process and return the jobId """ - def __init__(self, config: Config, client: CogniteClient, logger: CogniteFunctionLogger): + def __init__( + self, config: Config, client: CogniteClient, logger: CogniteFunctionLogger + ): self.client: CogniteClient = client self.config: Config = config self.logger: CogniteFunctionLogger = logger @@ -41,7 +47,9 @@ def __init__(self, config: Config, client: CogniteClient, logger: CogniteFunctio self.annotation_config = config.launch_function.annotation_service self.diagram_detect_config: DiagramDetectConfig | None = None if config.launch_function.annotation_service.diagram_detect_config: - self.diagram_detect_config = config.launch_function.annotation_service.diagram_detect_config.as_config() + self.diagram_detect_config = ( + config.launch_function.annotation_service.diagram_detect_config.as_config() + ) # NOTE: Remove Leading Zeros has a weird interaction with pattern mode so will always turn off if config.launch_function.pattern_mode: # NOTE: Shallow copy that still references Mutable objects in self.diagram_detect_config. @@ -49,7 +57,22 @@ def __init__(self, config: Config, client: CogniteClient, logger: CogniteFunctio self.pattern_detect_config = copy.copy(self.diagram_detect_config) self.pattern_detect_config.remove_leading_zeros = False - def run_diagram_detect(self, files: list[FileReference], entities: list[dict[str, Any]]) -> int: + def run_diagram_detect( + self, files: list[FileReference], entities: list[dict[str, Any]] + ) -> int: + """ + Initiates a diagram detection job using CDF's diagram detect API. + + Args: + files: List of file references to process for annotation. + entities: List of entity dictionaries containing searchable properties for annotation matching. + + Returns: + The job ID of the initiated diagram detection job. 
+ + Raises: + Exception: If the API call does not return a valid job ID. + """ detect_job: DiagramDetectResults = self.client.diagrams.detect( file_references=files, entities=entities, @@ -63,8 +86,25 @@ def run_diagram_detect(self, files: list[FileReference], entities: list[dict[str else: raise Exception(f"API call to diagram/detect did not return a job ID") - def run_pattern_mode_detect(self, files: list[FileReference], pattern_samples: list[dict[str, Any]]) -> int: - """Generates patterns and runs the diagram detection job in pattern mode.""" + def run_pattern_mode_detect( + self, files: list[FileReference], pattern_samples: list[dict[str, Any]] + ) -> int: + """ + Initiates a diagram detection job in pattern mode using generated pattern samples. + + Pattern mode enables detection of entities based on regex-like patterns rather than exact matches, + useful for finding variations of asset tags and identifiers. + + Args: + files: List of file references to process for annotation. + pattern_samples: List of pattern sample dictionaries containing regex-like patterns for matching. + + Returns: + The job ID of the initiated pattern mode diagram detection job. + + Raises: + Exception: If the API call does not return a valid job ID. 
+ """ detect_job: DiagramDetectResults = self.client.diagrams.detect( file_references=files, entities=pattern_samples, @@ -77,4 +117,6 @@ def run_pattern_mode_detect(self, files: list[FileReference], pattern_samples: l if detect_job.job_id: return detect_job.job_id else: - raise Exception("API call to diagram/detect in pattern mode did not return a job ID") + raise Exception( + "API call to diagram/detect in pattern mode did not return a job ID" + ) diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/CacheService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/CacheService.py index bada7bf3..34b10073 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/CacheService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/CacheService.py @@ -52,18 +52,26 @@ class GeneralCacheService(ICacheService): that share the same operational context. 
""" - def __init__(self, config: Config, client: CogniteClient, logger: CogniteFunctionLogger): + def __init__( + self, config: Config, client: CogniteClient, logger: CogniteFunctionLogger + ): self.client = client self.config = config self.logger = logger self.db_name: str = config.launch_function.cache_service.raw_db self.tbl_name: str = config.launch_function.cache_service.raw_table_cache - self.manual_patterns_tbl_name: str = config.launch_function.cache_service.raw_manual_patterns_catalog - self.cache_time_limit: int = config.launch_function.cache_service.cache_time_limit # in hours + self.manual_patterns_tbl_name: str = ( + config.launch_function.cache_service.raw_manual_patterns_catalog + ) + self.cache_time_limit: int = ( + config.launch_function.cache_service.cache_time_limit + ) # in hours self.file_view: ViewPropertyConfig = config.data_model_views.file_view - self.target_entities_view: ViewPropertyConfig = config.data_model_views.target_entities_view + self.target_entities_view: ViewPropertyConfig = ( + config.data_model_views.target_entities_view + ) def get_entities( self, @@ -72,9 +80,21 @@ def get_entities( secondary_scope_value: str | None, ) -> tuple[list[dict], list[dict]]: """ - Returns file and asset entities for use in diagram detect job. - Ensures that the cache is up to date and valid. This method orchestrates - the fetching of data and the updating of the cache. + Retrieves or generates entities and pattern samples for diagram detection. + + This method orchestrates the cache lifecycle: checking validity, fetching fresh data if needed, + generating pattern samples, and updating the cache. The cache is scoped by primary and secondary + scope values to ensure relevant entities are used for each file context. + + Args: + data_model_service: Service instance for querying data model instances. + primary_scope_value: Primary scope identifier (e.g., site, facility). + secondary_scope_value: Optional secondary scope identifier (e.g., unit, area). 
+ + Returns: + A tuple containing: + - Combined list of entity dictionaries (assets + files) for diagram detection. + - Combined list of pattern sample dictionaries for pattern mode detection. """ entities: list[dict] = [] if secondary_scope_value: @@ -83,19 +103,31 @@ def get_entities( key = f"{primary_scope_value}" try: - row: Row | None = self.client.raw.rows.retrieve(db_name=self.db_name, table_name=self.tbl_name, key=key) + row: Row | None = self.client.raw.rows.retrieve( + db_name=self.db_name, table_name=self.tbl_name, key=key + ) except: row = None # Attempt to retrieve from the cache - if row and row.columns and self._validate_cache(row.columns["LastUpdateTimeUtcIso"]): - self.logger.debug(f"Cache valid for key: {key}. Retrieving entities and patterns.") + if ( + row + and row.columns + and self._validate_cache(row.columns["LastUpdateTimeUtcIso"]) + ): + self.logger.debug( + f"Cache valid for key: {key}. Retrieving entities and patterns." + ) asset_entities: list[dict] = row.columns.get("AssetEntities", []) file_entities: list[dict] = row.columns.get("FileEntities", []) - combined_pattern_samples: list[dict] = row.columns.get("CombinedPatternSamples", []) + combined_pattern_samples: list[dict] = row.columns.get( + "CombinedPatternSamples", [] + ) return (asset_entities + file_entities), combined_pattern_samples - self.logger.info(f"Refreshing RAW entities cache and patterns cache for key: {key}") + self.logger.info( + f"Refreshing RAW entities cache and patterns cache for key: {key}" + ) # Fetch data asset_instances, file_instances = data_model_service.get_instances_entities( @@ -103,7 +135,9 @@ def get_entities( ) # Convert to entities for diagram detect job - asset_entities, file_entities = self._convert_instances_to_entities(asset_instances, file_instances) + asset_entities, file_entities = self._convert_instances_to_entities( + asset_instances, file_instances + ) entities = asset_entities + file_entities # Generate pattern samples from the same 
entities @@ -112,10 +146,14 @@ def get_entities( auto_pattern_samples = asset_pattern_samples + file_pattern_samples # Grab the manual pattern samples - manual_pattern_samples = self._get_manual_patterns(primary_scope_value, secondary_scope_value) + manual_pattern_samples = self._get_manual_patterns( + primary_scope_value, secondary_scope_value + ) # Merge the auto and manual patterns - combined_pattern_samples = self._merge_patterns(auto_pattern_samples, manual_pattern_samples) + combined_pattern_samples = self._merge_patterns( + auto_pattern_samples, manual_pattern_samples + ) # Update cache new_row = RowWrite( @@ -135,8 +173,16 @@ def get_entities( def _update_cache(self, row_to_write: RowWrite) -> None: """ - Writes a single, fully-formed RowWrite object to the RAW cache table. - This method's only responsibility is the database insertion. + Writes a cache entry to the RAW database table. + + This method's only responsibility is the database insertion. All data preparation + and formatting should be done before calling this method. + + Args: + row_to_write: Fully-formed RowWrite object containing cache data to persist. + + Returns: + None """ self.client.raw.rows.insert( db_name=self.db_name, @@ -149,8 +195,16 @@ def _update_cache(self, row_to_write: RowWrite) -> None: def _validate_cache(self, last_update_datetime_str: str) -> bool: """ - Checks if the retrieved cache is still valid by comparing its creation - timestamp with the 'cacheTimeLimit' from the configuration. + Validates whether cached data is still fresh based on time elapsed since last update. + + Compares the cache's last update timestamp against the configured cache time limit + to determine if a refresh is needed. + + Args: + last_update_datetime_str: ISO-formatted datetime string of the cache's last update. + + Returns: + True if the cache is still valid (within time limit), False if expired. 
""" last_update_datetime_utc = datetime.fromisoformat(last_update_datetime_str) current_datetime_utc = datetime.now(timezone.utc) @@ -169,14 +223,32 @@ def _convert_instances_to_entities( self, asset_instances: NodeList, file_instances: NodeList ) -> tuple[list[dict], list[dict]]: """ - Convert the asset and file nodes into an entity + Transforms data model node instances into entity dictionaries for diagram detection. + + Extracts relevant properties from asset and file nodes and formats them as entity + dictionaries compatible with the diagram detect API. + + Args: + asset_instances: NodeList of asset instances from the data model. + file_instances: NodeList of file instances from the data model. + + Returns: + A tuple containing: + - List of target entity dictionaries (typically assets). + - List of file entity dictionaries. """ - target_entities_resource_type: str | None = self.config.launch_function.target_entities_resource_property - target_entities_search_property: str = self.config.launch_function.target_entities_search_property + target_entities_resource_type: str | None = ( + self.config.launch_function.target_entities_resource_property + ) + target_entities_search_property: str = ( + self.config.launch_function.target_entities_search_property + ) target_entities: list[dict] = [] for instance in asset_instances: - instance_properties = instance.properties.get(self.target_entities_view.as_view_id()) + instance_properties = instance.properties.get( + self.target_entities_view.as_view_id() + ) if target_entities_resource_type: resource_type: str = instance_properties[target_entities_resource_type] else: @@ -188,7 +260,9 @@ def _convert_instances_to_entities( space=instance.space, annotation_type=self.target_entities_view.annotation_type, resource_type=resource_type, - search_property=instance_properties.get(target_entities_search_property), + search_property=instance_properties.get( + target_entities_search_property + ), ) 
target_entities.append(asset_entity.to_dict()) else: @@ -203,7 +277,9 @@ def _convert_instances_to_entities( ) target_entities.append(asset_entity.to_dict()) - file_resource_type: str | None = self.config.launch_function.file_resource_property + file_resource_type: str | None = ( + self.config.launch_function.file_resource_property + ) file_search_property: str = self.config.launch_function.file_search_property file_entities: list[dict] = [] @@ -227,14 +303,29 @@ def _convert_instances_to_entities( def _generate_tag_samples_from_entities(self, entities: list[dict]) -> list[dict]: """ - MODIFIED: Generates pattern samples using Implementation 1's logic - while adding the 'annotation_type' from Implementation 2. + Generates regex-like pattern samples from entity search properties for pattern mode detection. + + Analyzes entity aliases to extract common patterns and variations, creating consolidated + pattern samples that can match multiple similar tags (e.g., "FT-[1|2|3]00[1|2]A"). + + Args: + entities: List of entity dictionaries containing search properties (aliases). + + Returns: + List of pattern sample dictionaries, each containing: + - sample: List of pattern strings + - resource_type: Entity resource type + - annotation_type: Annotation type for the entity """ # Structure: { resource_type: {"patterns": { template_key: [...] 
}, "annotation_type": "..."} } - pattern_builders = defaultdict(lambda: {"patterns": {}, "annotation_type": None}) + pattern_builders = defaultdict( + lambda: {"patterns": {}, "annotation_type": None} + ) self.logger.info(f"Generating pattern samples from {len(entities)} entities.") - def _parse_alias(alias: str, resource_type_key: str) -> tuple[str, list[list[str]]]: + def _parse_alias( + alias: str, resource_type_key: str + ) -> tuple[str, list[list[str]]]: alias_parts = re.split(r"([ -])", alias) full_template_key_parts: list[str] = [] all_variable_parts: list[list[str]] = [] @@ -246,7 +337,9 @@ def _parse_alias(alias: str, resource_type_key: str) -> tuple[str, list[list[str full_template_key_parts.append(part) continue left_ok = (i == 0) or (alias_parts[i - 1] in [" ", "-"]) - right_ok = (i == len(alias_parts) - 1) or (alias_parts[i + 1] in [" ", "-"]) + right_ok = (i == len(alias_parts) - 1) or ( + alias_parts[i + 1] in [" ", "-"] + ) if left_ok and right_ok and part == resource_type_key: full_template_key_parts.append(f"[{part}]") continue @@ -293,7 +386,9 @@ def build_segment(segment_template: str) -> str: return segment_template try: letter_groups_for_segment = next(var_iter) - letter_group_iter: Iterator[set[str]] = iter(letter_groups_for_segment) + letter_group_iter: Iterator[set[str]] = iter( + letter_groups_for_segment + ) def replace_A(match): alternatives = sorted(list(next(letter_group_iter))) @@ -304,7 +399,8 @@ def replace_A(match): return segment_template final_pattern_parts = [ - build_segment(p) if p not in " -" else p for p in re.split(r"([ -])", template_key) + build_segment(p) if p not in " -" else p + for p in re.split(r"([ -])", template_key) ] final_samples.append("".join(final_pattern_parts)) @@ -318,8 +414,22 @@ def replace_A(match): ) return result - def _get_manual_patterns(self, primary_scope: str, secondary_scope: str | None) -> list[dict]: - """BUG FIX: Fetches manual patterns with correct error handling from Implementation 2.""" + 
def _get_manual_patterns( + self, primary_scope: str, secondary_scope: str | None + ) -> list[dict]: + """ + Retrieves manually defined pattern samples from the RAW catalog. + + Fetches patterns at three levels of specificity: global, primary scope, and combined scope, + allowing for hierarchical pattern definitions with increasing specificity. + + Args: + primary_scope: Primary scope identifier for fetching scope-specific patterns. + secondary_scope: Optional secondary scope identifier for fetching more specific patterns. + + Returns: + List of manually defined pattern dictionaries from all applicable scope levels. + """ keys_to_fetch = ["GLOBAL"] if primary_scope: keys_to_fetch.append(primary_scope) @@ -331,20 +441,40 @@ def _get_manual_patterns(self, primary_scope: str, secondary_scope: str | None) for key in keys_to_fetch: try: row: Row | None = self.client.raw.rows.retrieve( - db_name=self.db_name, table_name=self.manual_patterns_tbl_name, key=key + db_name=self.db_name, + table_name=self.manual_patterns_tbl_name, + key=key, ) if row: patterns = (row.columns or {}).get("patterns", []) all_manual_patterns.extend(patterns) except CogniteNotFoundError: - self.logger.info(f"No manual patterns found for key: {key}. This may be expected.") + self.logger.info( + f"No manual patterns found for key: {key}. This may be expected." + ) except Exception as e: - self.logger.error(f"Failed to retrieve manual patterns for key {key}: {e}") + self.logger.error( + f"Failed to retrieve manual patterns for key {key}: {e}" + ) return all_manual_patterns - def _merge_patterns(self, auto_patterns: list[dict], manual_patterns: list[dict]) -> list[dict]: - """MODIFIED: Merges patterns while correctly handling the new 'annotation_type' field.""" + def _merge_patterns( + self, auto_patterns: list[dict], manual_patterns: list[dict] + ) -> list[dict]: + """ + Combines automatically generated and manually defined patterns by resource type. 
+ + Merges pattern samples from both sources, ensuring no duplicates while preserving + all unique patterns for each resource type. Auto-pattern annotation types take precedence. + + Args: + auto_patterns: List of automatically generated pattern dictionaries. + manual_patterns: List of manually defined pattern dictionaries. + + Returns: + List of merged pattern dictionaries, deduplicated and organized by resource type. + """ merged = defaultdict(lambda: {"samples": set(), "annotation_type": None}) # Process auto-generated patterns @@ -354,7 +484,9 @@ def _merge_patterns(self, auto_patterns: list[dict], manual_patterns: list[dict] merged[resource_type]["samples"].update(item.get("sample", [])) # Set annotation_type if not already set if not merged[resource_type]["annotation_type"]: - merged[resource_type]["annotation_type"] = item.get("annotation_type") + merged[resource_type]["annotation_type"] = item.get( + "annotation_type" + ) # Process manual patterns for item in manual_patterns: @@ -364,7 +496,9 @@ def _merge_patterns(self, auto_patterns: list[dict], manual_patterns: list[dict] # Set annotation_type if not already set (auto-patterns take precedence) if not merged[resource_type]["annotation_type"]: # NOTE: UI that creates manual patterns will need to also have the annotation type as a required entry - merged[resource_type]["annotation_type"] = item.get("annotation_type", "diagrams.AssetLink") + merged[resource_type]["annotation_type"] = item.get( + "annotation_type", "diagrams.AssetLink" + ) # Convert the merged dictionary back to the required list format final_list = [ @@ -376,5 +510,7 @@ def _merge_patterns(self, auto_patterns: list[dict], manual_patterns: list[dict] for resource_type, data in merged.items() ] - self.logger.info(f"Merged auto and manual patterns into {len(final_list)} resource types.") + self.logger.info( + f"Merged auto and manual patterns into {len(final_list)} resource types." 
+ ) return final_list diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/DataModelService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/DataModelService.py index 695a539b..babe1666 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/DataModelService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/DataModelService.py @@ -73,14 +73,20 @@ class GeneralDataModelService(IDataModelService): Implementation used for real runs """ - def __init__(self, config: Config, client: CogniteClient, logger: CogniteFunctionLogger): + def __init__( + self, config: Config, client: CogniteClient, logger: CogniteFunctionLogger + ): self.client: CogniteClient = client self.config: Config = config self.logger: CogniteFunctionLogger = logger - self.annotation_state_view: ViewPropertyConfig = config.data_model_views.annotation_state_view + self.annotation_state_view: ViewPropertyConfig = ( + config.data_model_views.annotation_state_view + ) self.file_view: ViewPropertyConfig = config.data_model_views.file_view - self.target_entities_view: ViewPropertyConfig = config.data_model_views.target_entities_view + self.target_entities_view: ViewPropertyConfig = ( + config.data_model_views.target_entities_view + ) self.get_files_to_annotate_retrieve_limit: int | None = get_limit_from_query( config.prepare_function.get_files_to_annotate_query @@ -104,7 +110,14 @@ def __init__(self, config: Config, client: CogniteClient, logger: CogniteFunctio def get_files_for_annotation_reset(self) -> NodeList | None: """ - Query for files based on the getFilesForAnnotationReset config parameters + Retrieves files that need their annotation status reset based on configuration. + + Args: + None + + Returns: + NodeList of file instances to reset, or None if no reset query is configured. 
+ NOTE: Not building the filter in the object instantiation because the filter will only ever be used once throughout all runs of prepare Furthermore, there is an implicit guarantee that a filter will be returned b/c launch checks if the query exists. """ @@ -125,8 +138,16 @@ def get_files_for_annotation_reset(self) -> NodeList | None: def get_files_to_annotate(self) -> NodeList | None: """ - Query for files that are marked "ToAnnotate" in tags and don't have 'AnnotataionInProcess' and 'Annotated' in tags. - More specific details of the query come from the getFilesToAnnotate config parameter. + Retrieves files ready for annotation processing based on their tag status. + + Queries for files marked "ToAnnotate" that don't have 'AnnotationInProcess' or 'Annotated' tags. + The specific query filters are defined in the getFilesToAnnotate config parameter. + + Args: + None + + Returns: + NodeList of file instances ready for annotation, or None if no files found. """ result: NodeList | None = self.client.data_modeling.instances.list( instance_type="node", @@ -142,9 +163,19 @@ def get_files_to_process( self, ) -> tuple[NodeList, dict[NodeId, Node]] | tuple[None, None]: """ - Query for FileAnnotationStateInstances based on the getFilesToProcess config parameter. - Extract the NodeIds of the file that is referenced in mpcAnnotationState. - Retrieve the files with the NodeIds. + Retrieves files with annotation state instances that are ready for diagram detection. + + Queries for FileAnnotationStateInstances based on the getFilesToProcess config parameter, + extracts the linked file NodeIds, and retrieves the corresponding file nodes. + + Args: + None + + Returns: + A tuple containing: + - NodeList of file instances to process + - Dictionary mapping file NodeIds to their annotation state Node instances + Returns (None, None) if no files are found. 
""" annotation_state_filter = self._get_annotation_state_filter() annotation_state_instances: NodeList = self.client.data_modeling.instances.list( @@ -162,8 +193,13 @@ def get_files_to_process( list_file_node_ids: list[NodeId] = [] for node in annotation_state_instances: - file_reference = node.properties.get(self.annotation_state_view.as_view_id()).get("linkedFile") - if self.file_view.instance_space is None or self.file_view.instance_space == file_reference["space"]: + file_reference = node.properties.get( + self.annotation_state_view.as_view_id() + ).get("linkedFile") + if ( + self.file_view.instance_space is None + or self.file_view.instance_space == file_reference["space"] + ): file_node_id = NodeId( space=file_reference["space"], external_id=file_reference["externalId"], @@ -181,17 +217,36 @@ def get_files_to_process( def _get_annotation_state_filter(self) -> Filter: """ - filter = (getFilesToProcess filter || (annotationStatus == Processing && now() - lastUpdatedTime) > 1440 minutes) + Builds a filter for annotation state instances, including automatic retry logic for stuck jobs. + + Combines the configured filter with a fallback filter that catches annotation state instances + stuck in Processing/Finalizing status for more than 12 hours. + + Args: + None + + Returns: + Combined Filter for querying annotation state instances. + + NOTE: filter = (getFilesToProcess filter || (annotationStatus == Processing && now() - lastUpdatedTime) > 1440 minutes) - getFilesToProcess filter comes from extraction pipeline - (annotationStatus == Processing | Finalizing && now() - lastUpdatedTime) > 720 minutes/12 hours -> hardcoded -> reprocesses any file that's stuck - Edge case that occurs very rarely but can happen. NOTE: Implementation of a more complex query that can't be handled in config should come from an implementation of the interface. 
""" - annotation_status_property = self.annotation_state_view.as_property_ref("annotationStatus") - annotation_last_updated_property = self.annotation_state_view.as_property_ref("sourceUpdatedTime") + annotation_status_property = self.annotation_state_view.as_property_ref( + "annotationStatus" + ) + annotation_last_updated_property = self.annotation_state_view.as_property_ref( + "sourceUpdatedTime" + ) # NOTE: While this number is hard coded, I believe it doesn't need to be configured. Number comes from my experience with the pipeline. Feel free to change if your experience leads to a different number - latest_permissible_time_utc = datetime.now(timezone.utc) - timedelta(minutes=720) - latest_permissible_time_utc = latest_permissible_time_utc.isoformat(timespec="milliseconds") + latest_permissible_time_utc = datetime.now(timezone.utc) - timedelta( + minutes=720 + ) + latest_permissible_time_utc = latest_permissible_time_utc.isoformat( + timespec="milliseconds" + ) filter_stuck = In( annotation_status_property, [AnnotationStatus.PROCESSING, AnnotationStatus.FINALIZING], @@ -200,24 +255,44 @@ def _get_annotation_state_filter(self) -> Filter: filter = self.filter_files_to_process | filter_stuck # | == OR return filter - def update_annotation_state(self, list_node_apply: list[NodeApply]) -> NodeApplyResultList: + def update_annotation_state( + self, list_node_apply: list[NodeApply] + ) -> NodeApplyResultList: """ - Updates annotation state nodes from the node applies passed into the function + Updates existing annotation state nodes with new property values. + + Args: + list_node_apply: List of NodeApply objects containing updated properties. + + Returns: + NodeApplyResultList containing the results of the update operation. 
""" - update_results: InstancesApplyResult = self.client.data_modeling.instances.apply( - nodes=list_node_apply, - replace=False, # ensures we don't delete other properties in the view + update_results: InstancesApplyResult = ( + self.client.data_modeling.instances.apply( + nodes=list_node_apply, + replace=False, # ensures we don't delete other properties in the view + ) ) return update_results.nodes - def create_annotation_state(self, list_node_apply: list[NodeApply]) -> NodeApplyResultList: + def create_annotation_state( + self, list_node_apply: list[NodeApply] + ) -> NodeApplyResultList: """ - Creates annotation state nodes from the node applies passed into the function + Creates new annotation state nodes, replacing any existing nodes with the same IDs. + + Args: + list_node_apply: List of NodeApply objects to create as new annotation state instances. + + Returns: + NodeApplyResultList containing the results of the creation operation. """ - update_results: InstancesApplyResult = self.client.data_modeling.instances.apply( - nodes=list_node_apply, - auto_create_direct_relations=True, - replace=True, # ensures we reset the properties of the node + update_results: InstancesApplyResult = ( + self.client.data_modeling.instances.apply( + nodes=list_node_apply, + auto_create_direct_relations=True, + replace=True, # ensures we reset the properties of the node + ) ) return update_results.nodes @@ -225,12 +300,29 @@ def get_instances_entities( self, primary_scope_value: str, secondary_scope_value: str | None ) -> tuple[NodeList, NodeList]: """ - Return the entities that can be used in diagram detect - 1. grab assets that meet the filter requirement - 2. grab files that meet the filter requirement + Retrieves target entities and file entities for use in diagram detection. + + Queries the data model for entities (assets) and files that match the configured filters + and scope values, which will be used to create the entity cache for diagram detection. 
+ + Args: + primary_scope_value: Primary scope identifier (e.g., site, facility). + secondary_scope_value: Optional secondary scope identifier (e.g., unit, area). + + Returns: + A tuple containing: + - NodeList of target entity instances (typically assets) + - NodeList of file entity instances + + NOTE: 1. grab assets that meet the filter requirement + NOTE: 2. grab files that meet the filter requirement """ - target_filter: Filter = self._get_target_entities_filter(primary_scope_value, secondary_scope_value) - file_filter: Filter = self._get_file_entities_filter(primary_scope_value, secondary_scope_value) + target_filter: Filter = self._get_target_entities_filter( + primary_scope_value, secondary_scope_value + ) + file_filter: Filter = self._get_file_entities_filter( + primary_scope_value, secondary_scope_value + ) target_entities: NodeList = self.client.data_modeling.instances.list( instance_type="node", @@ -248,15 +340,30 @@ def get_instances_entities( ) return target_entities, file_entities - def _get_target_entities_filter(self, primary_scope_value: str, secondary_scope_value: str | None) -> Filter: + def _get_target_entities_filter( + self, primary_scope_value: str, secondary_scope_value: str | None + ) -> Filter: """ - Create a filter that... + Builds a filter for target entities (assets) based on scope and configuration. + + Creates a filter combining scope-specific filtering with global 'ScopeWideDetect' entities. + + Args: + primary_scope_value: Primary scope identifier for filtering entities. + secondary_scope_value: Optional secondary scope identifier for more specific filtering. + + Returns: + Combined Filter for querying target entities. + + NOTE: Create a filter that... 
- grabs assets in the primary_scope_value and secondary_scope_value provided with detectInDiagram in the tags property or - grabs assets in the primary_scope_value with ScopeWideDetect in the tags property (hard coded) -> provides an option to include entities outside of the secondary_scope_value """ filter_primary_scope: Filter = Equals( - property=self.target_entities_view.as_property_ref(self.config.launch_function.primary_scope_property), + property=self.target_entities_view.as_property_ref( + self.config.launch_function.primary_scope_property + ), value=primary_scope_value, ) filter_entities: Filter = self.filter_target_entities @@ -274,27 +381,47 @@ def _get_target_entities_filter(self, primary_scope_value: str, secondary_scope_ ), value=secondary_scope_value, ) - target_filter = (filter_primary_scope & filter_secondary_scope & filter_entities) | ( + target_filter = ( + filter_primary_scope & filter_secondary_scope & filter_entities + ) | (filter_primary_scope & filter_scope_wide) + else: + target_filter = (filter_primary_scope & filter_entities) | ( filter_primary_scope & filter_scope_wide ) - else: - target_filter = (filter_primary_scope & filter_entities) | (filter_primary_scope & filter_scope_wide) return target_filter - def _get_file_entities_filter(self, primary_scope_value: str, secondary_scope_value: str | None) -> Filter: + def _get_file_entities_filter( + self, primary_scope_value: str, secondary_scope_value: str | None + ) -> Filter: """ - Create a filter that... + Builds a filter for file entities based on scope and configuration. + + Creates a filter combining scope-specific filtering with global 'ScopeWideDetect' files, + ensuring file entities have the required search properties. + + Args: + primary_scope_value: Primary scope identifier for filtering file entities. + secondary_scope_value: Optional secondary scope identifier for more specific filtering. + + Returns: + Combined Filter for querying file entities. + + NOTE: Create a filter that... 
- grabs assets in the primary_scope_value and secondary_scope_value provided with DetectInDiagram in the tags property or - grabs assets in the primary_scope_value with ScopeWideDetect in the tags property (hard coded) -> provides an option to include entities outside of the secondary_scope_value """ filter_primary_scope: Filter = Equals( - property=self.file_view.as_property_ref(self.config.launch_function.primary_scope_property), + property=self.file_view.as_property_ref( + self.config.launch_function.primary_scope_property + ), value=primary_scope_value, ) filter_entities: Filter = self.filter_file_entities filter_search_property_exists: Filter = Exists( - property=self.file_view.as_property_ref(self.config.launch_function.file_search_property), + property=self.file_view.as_property_ref( + self.config.launch_function.file_search_property + ), ) # NOTE: ScopeWideDetect is an optional string that allows annotating across scopes filter_scope_wide: Filter = In( @@ -302,18 +429,25 @@ def _get_file_entities_filter(self, primary_scope_value: str, secondary_scope_va values=["ScopeWideDetect"], ) if not primary_scope_value: - file_filter = (filter_entities & filter_search_property_exists) | (filter_scope_wide) + file_filter = (filter_entities & filter_search_property_exists) | ( + filter_scope_wide + ) elif secondary_scope_value: filter_secondary_scope: Filter = Equals( - property=self.file_view.as_property_ref(self.config.launch_function.secondary_scope_property), + property=self.file_view.as_property_ref( + self.config.launch_function.secondary_scope_property + ), value=secondary_scope_value, ) file_filter = ( - filter_primary_scope & filter_entities & filter_secondary_scope & filter_search_property_exists + filter_primary_scope + & filter_entities + & filter_secondary_scope + & filter_search_property_exists ) | (filter_primary_scope & filter_scope_wide) else: - file_filter = (filter_primary_scope & filter_entities & filter_search_property_exists) | ( - 
filter_primary_scope & filter_scope_wide - ) + file_filter = ( + filter_primary_scope & filter_entities & filter_search_property_exists + ) | (filter_primary_scope & filter_scope_wide) return file_filter diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/LaunchService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/LaunchService.py index 420d07de..68fbc05f 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/LaunchService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/LaunchService.py @@ -88,7 +88,9 @@ def __init__( self.max_batch_size: int = config.launch_function.batch_size self.page_range: int = config.launch_function.annotation_service.page_range - self.annotation_state_view: ViewPropertyConfig = config.data_model_views.annotation_state_view + self.annotation_state_view: ViewPropertyConfig = ( + config.data_model_views.annotation_state_view + ) self.file_view: ViewPropertyConfig = config.data_model_views.file_view self.in_memory_cache: list[dict] = [] @@ -96,8 +98,12 @@ def __init__( self._cached_primary_scope: str | None = None self._cached_secondary_scope: str | None = None - self.primary_scope_property: str = self.config.launch_function.primary_scope_property - self.secondary_scope_property: str | None = self.config.launch_function.secondary_scope_property + self.primary_scope_property: str = ( + self.config.launch_function.primary_scope_property + ) + self.secondary_scope_property: str | None = ( + self.config.launch_function.secondary_scope_property + ) self.function_id: int | None = function_call_info.get("function_id") self.call_id: int | None = function_call_info.get("call_id") @@ -109,7 +115,20 @@ def __init__( # NOTE: I believe this code should be encapsulated as a separate CDF function named prepFunction. 
Due to the amount of cdf functions we can spin up, we're coupling this within the launchFunction. def prepare(self) -> Literal["Done"] | None: """ - Retrieves files marked "ToAnnotate" in the tags property and creates a 1-to-1 ratio of FileAnnotationState instances to files + Prepares files for annotation by creating annotation state instances. + + Retrieves files marked "ToAnnotate", creates corresponding FileAnnotationState instances, + and updates file tags to indicate processing has started. Can also reset files if configured. + + Args: + None + + Returns: + "Done" if no more files need preparation, None if processing should continue. + + Raises: + CogniteAPIError: If query timeout or other API errors occur (408 errors are handled gracefully). + ValueError: If annotation state view instance space is not configured. """ self.logger.info( message=f"Starting Prepare Function", @@ -117,7 +136,9 @@ def prepare(self) -> Literal["Done"] | None: ) try: if self.reset_files: - file_nodes_to_reset: NodeList | None = self.data_model_service.get_files_for_annotation_reset() + file_nodes_to_reset: NodeList | None = ( + self.data_model_service.get_files_for_annotation_reset() + ) if not file_nodes_to_reset: self.logger.info( "No files found with the getFilesForAnnotationReset query provided in the config file" @@ -127,7 +148,9 @@ def prepare(self) -> Literal["Done"] | None: reset_node_apply: list[NodeApply] = [] for file_node in file_nodes_to_reset: file_node_apply: NodeApply = file_node.as_write() - tags_property: list[str] = cast(list[str], file_node_apply.sources[0].properties["tags"]) + tags_property: list[str] = cast( + list[str], file_node_apply.sources[0].properties["tags"] + ) if "AnnotationInProcess" in tags_property: tags_property.remove("AnnotationInProcess") if "Annotated" in tags_property: @@ -136,7 +159,9 @@ def prepare(self) -> Literal["Done"] | None: tags_property.remove("AnnotationFailed") reset_node_apply.append(file_node_apply) - update_results = 
self.data_model_service.update_annotation_state(reset_node_apply) + update_results = self.data_model_service.update_annotation_state( + reset_node_apply + ) self.logger.info( f"Removed the AnnotationInProcess/Annotated/AnnotationFailed tag of {len(update_results)} files" ) @@ -145,7 +170,8 @@ def prepare(self) -> Literal["Done"] | None: # NOTE: Reliant on the CogniteAPI message to stay the same across new releases. If unexpected changes were to occur please refer to this section of the code and check if error message is now different. if ( e.code == 408 - and e.message == "Graph query timed out. Reduce load or contention, or optimise your query." + and e.message + == "Graph query timed out. Reduce load or contention, or optimise your query." ): # NOTE: 408 indicates a timeout error. Keep retrying the query if a timeout occurs. self.logger.error(message=f"Ran into the following error:\n{str(e)}") @@ -154,7 +180,9 @@ def prepare(self) -> Literal["Done"] | None: raise e try: - file_nodes: NodeList | None = self.data_model_service.get_files_to_annotate() + file_nodes: NodeList | None = ( + self.data_model_service.get_files_to_annotate() + ) if not file_nodes: self.logger.info( message=f"No files found to prepare", @@ -166,7 +194,8 @@ def prepare(self) -> Literal["Done"] | None: # NOTE: Reliant on the CogniteAPI message to stay the same across new releases. If unexpected changes were to occur please refer to this section of the code and check if error message is now different. if ( e.code == 408 - and e.message == "Graph query timed out. Reduce load or contention, or optimise your query." + and e.message + == "Graph query timed out. Reduce load or contention, or optimise your query." ): # NOTE: 408 indicates a timeout error. Keep retrying the query if a timeout occurs. 
self.logger.error(message=f"Ran into the following error:\n{str(e)}") @@ -183,9 +212,7 @@ def prepare(self) -> Literal["Done"] | None: linkedFile=node_id, ) if not self.annotation_state_view.instance_space: - msg = ( - "Need an instance space in DataModelViews/AnnotationStateView config to store the annotation state" - ) + msg = "Need an instance space in DataModelViews/AnnotationStateView config to store the annotation state" self.logger.error(msg) raise ValueError(msg) annotation_instance_space: str = self.annotation_state_view.instance_space @@ -197,21 +224,31 @@ def prepare(self) -> Literal["Done"] | None: annotation_state_instances.append(annotation_node_apply) file_node_apply: NodeApply = file_node.as_write() - tags_property: list[str] = cast(list[str], file_node_apply.sources[0].properties["tags"]) + tags_property: list[str] = cast( + list[str], file_node_apply.sources[0].properties["tags"] + ) if "AnnotationInProcess" not in tags_property: tags_property.append("AnnotationInProcess") file_apply_instances.append(file_node_apply) try: - create_results = self.data_model_service.create_annotation_state(annotation_state_instances) - self.logger.info(message=f"Created {len(create_results)} annotation state instances") - update_results = self.data_model_service.update_annotation_state(file_apply_instances) + create_results = self.data_model_service.create_annotation_state( + annotation_state_instances + ) + self.logger.info( + message=f"Created {len(create_results)} annotation state instances" + ) + update_results = self.data_model_service.update_annotation_state( + file_apply_instances + ) self.logger.info( message=f"Added 'AnnotationInProcess' to the tag property for {len(update_results)} files", section="END", ) except Exception as e: - self.logger.error(message=f"Ran into the following error:\n{str(e)}", section="END") + self.logger.error( + message=f"Ran into the following error:\n{str(e)}", section="END" + ) raise 
self.tracker.add_files(success=len(file_nodes)) @@ -219,24 +256,40 @@ def prepare(self) -> Literal["Done"] | None: def run(self) -> Literal["Done"] | None: """ - The main entry point for the launch service. It prepares the files and then - processes them in organized, context-aware batches. + Main execution loop for launching diagram detection jobs. + + Retrieves files ready for processing, organizes them into context-aware batches based on scope, + ensures appropriate entity caches are loaded, and initiates diagram detection jobs for each batch. + + Args: + None + + Returns: + "Done" if no more files to process or max jobs reached, None if processing should continue. + + Raises: + CogniteAPIError: If query timeout (408) or max jobs reached (429), handled gracefully. """ self.logger.info( message=f"Starting Launch Function", section="START", ) try: - file_nodes, file_to_state_map = self.data_model_service.get_files_to_process() + file_nodes, file_to_state_map = ( + self.data_model_service.get_files_to_process() + ) if not file_nodes or not file_to_state_map: self.logger.info(message=f"No files found to launch") return "Done" - self.logger.info(message=f"Launching {len(file_nodes)} files", section="END") + self.logger.info( + message=f"Launching {len(file_nodes)} files", section="END" + ) except CogniteAPIError as e: # NOTE: Reliant on the CogniteAPI message to stay the same across new releases. If unexpected changes were to occur please refer to this section of the code and check if error message is now different. if ( e.code == 408 - and e.message == "Graph query timed out. Reduce load or contention, or optimise your query." + and e.message + == "Graph query timed out. Reduce load or contention, or optimise your query." ): # NOTE: 408 indicates a timeout error. Keep retrying the query if a timeout occurs. 
self.logger.error(message=f"Ran into the following error:\n{str(e)}") @@ -244,7 +297,9 @@ def run(self) -> Literal["Done"] | None: else: raise e - processing_batches: list[FileProcessingBatch] = self._organize_files_for_processing(file_nodes) + processing_batches: list[FileProcessingBatch] = ( + self._organize_files_for_processing(file_nodes) + ) total_files_processed = 0 try: @@ -254,7 +309,9 @@ def run(self) -> Literal["Done"] | None: msg = f"{self.primary_scope_property}: {primary_scope_value}" if secondary_scope_value: msg += f", {self.secondary_scope_property}: {secondary_scope_value}" - self.logger.info(message=f"Processing {len(batch.files)} files in {msg}") + self.logger.info( + message=f"Processing {len(batch.files)} files in {msg}" + ) self._ensure_cache_for_batch(primary_scope_value, secondary_scope_value) current_batch = BatchOfPairedNodes(file_to_state_map=file_to_state_map) @@ -267,12 +324,18 @@ def run(self) -> Literal["Done"] | None: current_batch.add_pair(file_node, file_reference) total_files_processed += 1 if current_batch.size() == self.max_batch_size: - self.logger.info(message=f"Processing batch - Max batch size ({self.max_batch_size}) reached") + self.logger.info( + message=f"Processing batch - Max batch size ({self.max_batch_size}) reached" + ) self._process_batch(current_batch) if not current_batch.is_empty(): - self.logger.info(message=f"Processing remaining {current_batch.size()} files in batch") + self.logger.info( + message=f"Processing remaining {current_batch.size()} files in batch" + ) self._process_batch(current_batch) - self.logger.info(message=f"Finished processing for {msg}", section="END") + self.logger.info( + message=f"Finished processing for {msg}", section="END" + ) except CogniteAPIError as e: if e.code == 429: self.logger.debug(f"{str(e)}") @@ -288,14 +351,25 @@ def run(self) -> Literal["Done"] | None: return - def _organize_files_for_processing(self, list_files: NodeList) -> list[FileProcessingBatch]: + def 
_organize_files_for_processing( + self, list_files: NodeList + ) -> list[FileProcessingBatch]: """ - Groups files based on the 'primary_scope_property' and 'secondary_scope_property' - defined in the configuration. This strategy allows us to load a relevant entity cache - once for a group of files that share the same operational context, significantly - reducing redundant CDF queries. + Organizes files into batches grouped by scope for efficient processing. + + Groups files based on primary and secondary scope properties defined in configuration. + This strategy enables loading a relevant entity cache once per group, significantly + reducing redundant CDF queries for files sharing the same operational context. + + Args: + list_files: NodeList of file instances to organize into batches. + + Returns: + List of FileProcessingBatch objects, each containing files from the same scope. """ - organized_data: dict[str, dict[str, list[Node]]] = defaultdict(lambda: defaultdict(list)) + organized_data: dict[str, dict[str, list[Node]]] = defaultdict( + lambda: defaultdict(list) + ) for file_node in list_files: node_props = file_node.properties[self.file_view.as_view_id()] @@ -327,10 +401,24 @@ def _organize_files_for_processing(self, list_files: NodeList) -> list[FileProce ) return final_processing_batches - def _ensure_cache_for_batch(self, primary_scope_value: str, secondary_scope_value: str | None): + def _ensure_cache_for_batch( + self, primary_scope_value: str, secondary_scope_value: str | None + ): """ - Ensure self.in_memory_cache is populated for the given site and unit. - Checks if there's a mismatch in site, unit, or if the in_memory_cache is empty + Ensures the in-memory entity cache is loaded and current for the given scope. + + Checks if cache needs refreshing (scope mismatch or empty cache) and fetches fresh + entities and patterns from the cache service if needed. + + Args: + primary_scope_value: Primary scope identifier for the batch being processed. 
+ secondary_scope_value: Optional secondary scope identifier for the batch. + + Returns: + None + + Raises: + CogniteAPIError: If query timeout (408) occurs, handled gracefully by returning early. """ if ( self._cached_primary_scope != primary_scope_value @@ -339,8 +427,12 @@ def _ensure_cache_for_batch(self, primary_scope_value: str, secondary_scope_valu ): self.logger.info(f"Refreshing in memory cache") try: - self.in_memory_cache, self.in_memory_patterns = self.cache_service.get_entities( - self.data_model_service, primary_scope_value, secondary_scope_value + self.in_memory_cache, self.in_memory_patterns = ( + self.cache_service.get_entities( + self.data_model_service, + primary_scope_value, + secondary_scope_value, + ) ) self._cached_primary_scope = primary_scope_value self._cached_secondary_scope = secondary_scope_value @@ -348,19 +440,32 @@ def _ensure_cache_for_batch(self, primary_scope_value: str, secondary_scope_valu # NOTE: Reliant on the CogniteAPI message to stay the same across new releases. If unexpected changes were to occur please refer to this section of the code and check if error message is now different. if ( e.code == 408 - and e.message == "Graph query timed out. Reduce load or contention, or optimise your query." + and e.message + == "Graph query timed out. Reduce load or contention, or optimise your query." ): # NOTE: 408 indicates a timeout error. Keep retrying the query if a timeout occurs. - self.logger.error(message=f"Ran into the following error:\n{str(e)}") + self.logger.error( + message=f"Ran into the following error:\n{str(e)}" + ) return else: raise e def _process_batch(self, batch: BatchOfPairedNodes): """ - Processes a single batch of files. For each file, it starts a diagram - detection job and then updates the corresponding 'AnnotationState' node - with the job ID and a 'Processing' status. + Processes a batch of files by initiating diagram detection jobs and updating state. 
+ + Runs both regular and pattern mode diagram detection (if enabled) for all files in the batch, + then updates annotation state instances with job IDs and processing status. + + Args: + batch: BatchOfPairedNodes containing file references and their annotation state nodes. + + Returns: + None + + Raises: + CogniteAPIError: If max concurrent jobs reached (429), handled gracefully. """ if batch.is_empty(): return @@ -375,7 +480,9 @@ def _process_batch(self, batch: BatchOfPairedNodes): ) update_properties = { "annotationStatus": AnnotationStatus.PROCESSING, - "sourceUpdatedTime": datetime.now(timezone.utc).replace(microsecond=0).isoformat(), + "sourceUpdatedTime": datetime.now(timezone.utc) + .replace(microsecond=0) + .isoformat(), "diagramDetectJobId": job_id, "launchFunctionId": self.function_id, "launchFunctionCallId": self.call_id, @@ -386,9 +493,11 @@ def _process_batch(self, batch: BatchOfPairedNodes): if self.config.launch_function.pattern_mode: total_patterns = 0 if self.in_memory_patterns and len(self.in_memory_patterns) >= 2: - total_patterns = len(self.in_memory_patterns[0].get('sample', [])) + len(self.in_memory_patterns[1].get('sample', [])) + total_patterns = len( + self.in_memory_patterns[0].get("sample", []) + ) + len(self.in_memory_patterns[1].get("sample", [])) elif self.in_memory_patterns and len(self.in_memory_patterns) >= 1: - total_patterns = len(self.in_memory_patterns[0].get('sample', [])) + total_patterns = len(self.in_memory_patterns[0].get("sample", [])) self.logger.info( f"Running pattern mode diagram detect on {batch.size()} files with {total_patterns} sample patterns" ) @@ -412,15 +521,27 @@ def _process_batch(self, batch: BatchOfPairedNodes): class LocalLaunchService(GeneralLaunchService): """ - A Launch service that uses a custom, local process for handling batches, - while inheriting all other functionality from GeneralLaunchService. + Launch service variant for local development and debugging. 
+ + Extends GeneralLaunchService with custom error handling for local runs, including + sleep/retry logic for API rate limiting rather than immediate termination. """ def _process_batch(self, batch: BatchOfPairedNodes): """ - This method overrides the original _process_batch. - Instead of calling the annotation service, it could, for example, - process the files locally. + Processes a batch with local-specific error handling. + + Extends the base _process_batch with additional error handling suitable for local runs, + including automatic retry with sleep on rate limit errors (429) rather than terminating. + + Args: + batch: BatchOfPairedNodes containing file references and their annotation state nodes. + + Returns: + None + + Raises: + Exception: If non-rate-limit errors occur. """ if batch.is_empty(): return @@ -435,7 +556,9 @@ def _process_batch(self, batch: BatchOfPairedNodes): ) update_properties = { "annotationStatus": AnnotationStatus.PROCESSING, - "sourceUpdatedTime": datetime.now(timezone.utc).replace(microsecond=0).isoformat(), + "sourceUpdatedTime": datetime.now(timezone.utc) + .replace(microsecond=0) + .isoformat(), "diagramDetectJobId": job_id, "launchFunctionId": self.function_id, "launchFunctionCallId": self.call_id, @@ -446,9 +569,11 @@ def _process_batch(self, batch: BatchOfPairedNodes): if self.config.launch_function.pattern_mode: total_patterns = 0 if self.in_memory_patterns and len(self.in_memory_patterns) >= 2: - total_patterns = len(self.in_memory_patterns[0].get('sample', [])) + len(self.in_memory_patterns[1].get('sample', [])) + total_patterns = len( + self.in_memory_patterns[0].get("sample", []) + ) + len(self.in_memory_patterns[1].get("sample", [])) elif self.in_memory_patterns and len(self.in_memory_patterns) >= 1: - total_patterns = len(self.in_memory_patterns[0].get('sample', [])) + total_patterns = len(self.in_memory_patterns[0].get("sample", [])) self.logger.info( f"Running pattern mode diagram detect on {batch.size()} files with 
{total_patterns} sample patterns" ) diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/LoggerService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/LoggerService.py index 17f24d6b..c9191137 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/LoggerService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/LoggerService.py @@ -21,10 +21,22 @@ def __init__( os.makedirs(dir_name, exist_ok=True) self.file_handler = open(self.filepath, "a", encoding="utf-8") except Exception as e: - print(f"[LOGGER_SETUP_ERROR] Could not open log file {self.filepath}: {e}") + print( + f"[LOGGER_SETUP_ERROR] Could not open log file {self.filepath}: {e}" + ) self.write = False def _format_message_lines(self, prefix: str, message: str) -> list[str]: + """ + Formats multi-line messages with consistent indentation. + + Args: + prefix: The log level prefix (e.g., "[INFO]", "[ERROR]"). + message: The message to format. + + Returns: + List of formatted message lines with proper indentation. + """ formatted_lines = [] if "\n" not in message: formatted_lines.append(f"{prefix} {message}") @@ -37,6 +49,16 @@ def _format_message_lines(self, prefix: str, message: str) -> list[str]: return formatted_lines def _print(self, prefix: str, message: str) -> None: + """ + Prints formatted log messages to console and optionally to file. + + Args: + prefix: The log level prefix to prepend to the message. + message: The message to log. 
+ + Returns: + None + """ lines_to_log = self._format_message_lines(prefix, message) if self.write and self.file_handler: try: @@ -50,7 +72,19 @@ def _print(self, prefix: str, message: str) -> None: for line in lines_to_log: print(line) - def debug(self, message: str, section: Literal["START", "END", "BOTH"] | None = None) -> None: + def debug( + self, message: str, section: Literal["START", "END", "BOTH"] | None = None + ) -> None: + """ + Logs a debug-level message. + + Args: + message: The debug message to log. + section: Optional section separator position (START, END, or BOTH). + + Returns: + None + """ if section == "START" or section == "BOTH": self._section() if self.log_level == "DEBUG": @@ -58,7 +92,19 @@ def debug(self, message: str, section: Literal["START", "END", "BOTH"] | None = if section == "END" or section == "BOTH": self._section() - def info(self, message: str, section: Literal["START", "END", "BOTH"] | None = None) -> None: + def info( + self, message: str, section: Literal["START", "END", "BOTH"] | None = None + ) -> None: + """ + Logs an info-level message. + + Args: + message: The informational message to log. + section: Optional section separator position (START, END, or BOTH). + + Returns: + None + """ if section == "START" or section == "BOTH": self._section() if self.log_level in ("DEBUG", "INFO"): @@ -66,7 +112,19 @@ def info(self, message: str, section: Literal["START", "END", "BOTH"] | None = N if section == "END" or section == "BOTH": self._section() - def warning(self, message: str, section: Literal["START", "END", "BOTH"] | None = None) -> None: + def warning( + self, message: str, section: Literal["START", "END", "BOTH"] | None = None + ) -> None: + """ + Logs a warning-level message. + + Args: + message: The warning message to log. + section: Optional section separator position (START, END, or BOTH). 
+ + Returns: + None + """ if section == "START" or section == "BOTH": self._section() if self.log_level in ("DEBUG", "INFO", "WARNING"): @@ -74,7 +132,19 @@ def warning(self, message: str, section: Literal["START", "END", "BOTH"] | None if section == "END" or section == "BOTH": self._section() - def error(self, message: str, section: Literal["START", "END", "BOTH"] | None = None) -> None: + def error( + self, message: str, section: Literal["START", "END", "BOTH"] | None = None + ) -> None: + """ + Logs an error-level message. + + Args: + message: The error message to log. + section: Optional section separator position (START, END, or BOTH). + + Returns: + None + """ if section == "START" or section == "BOTH": self._section() self._print("[ERROR]", message) @@ -82,13 +152,27 @@ def error(self, message: str, section: Literal["START", "END", "BOTH"] | None = self._section() def _section(self) -> None: + """ + Prints a visual separator line for log sections. + + Returns: + None + """ if self.write and self.file_handler: self.file_handler.write( "--------------------------------------------------------------------------------\n" ) - print("--------------------------------------------------------------------------------") + print( + "--------------------------------------------------------------------------------" + ) def close(self) -> None: + """ + Closes the file handler if file logging is enabled. 
+ + Returns: + None + """ if self.file_handler: try: self.file_handler.close() diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/PipelineService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/PipelineService.py index 5dd95bc7..7cf5d885 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/PipelineService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/PipelineService.py @@ -36,7 +36,13 @@ def __init__(self, pipeline_ext_id: str, client: CogniteClient): def update_extraction_pipeline(self, msg: str) -> None: """ - Update the message log for the extraction pipeline + Appends a message to the extraction pipeline run log. + + Args: + msg: The message to append to the pipeline log. + + Returns: + None """ if not self.ep_write.message: self.ep_write.message = msg @@ -48,7 +54,13 @@ def upload_extraction_pipeline( status: Literal["success", "failure", "seen"], ) -> None: """ - Upload the extraction pipeline run so that status and message logs are captured + Creates an extraction pipeline run with accumulated status and messages. + + Args: + status: The run status to report (success, failure, or seen). 
+ + Returns: + None """ self.ep_write.status = status self.client.extraction_pipelines.runs.create(self.ep_write) From 9db710d4bc1141f6f407002894a990bcbedeaa28 Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Thu, 11 Sep 2025 11:30:24 -0500 Subject: [PATCH 084/128] broke out the README file into separate more manageable readme files --- .../cdf_file_annotation/README.md | 292 +----------------- .../readme_architecture.md | 85 +++++ .../cdf_file_annotation/readme_deployment.md | 184 +++++++++++ 3 files changed, 277 insertions(+), 284 deletions(-) create mode 100644 modules/contextualization/cdf_file_annotation/readme_architecture.md create mode 100644 modules/contextualization/cdf_file_annotation/readme_deployment.md diff --git a/modules/contextualization/cdf_file_annotation/README.md b/modules/contextualization/cdf_file_annotation/README.md index ba2cb6a1..86061f70 100644 --- a/modules/contextualization/cdf_file_annotation/README.md +++ b/modules/contextualization/cdf_file_annotation/README.md @@ -1,4 +1,4 @@ -# Cognite Data Model-Based Annotation Function +# Cognite Data Model-Based Annotation Module ## Overview @@ -6,291 +6,15 @@ The Annotation template is a framework designed to automate the process of annot ## Key Features -- **Configuration-Driven Workflow:** The entire process is controlled by a single config.yaml file, allowing adaptation to different data models and operational parameters without code changes. -- **Dual Annotation Modes**: Simultaneously runs standard entity matching and a new pattern-based detection mode to create a comprehensive indexed reference catalog. -- **Large Document Support (\>50 Pages):** Automatically handles files with more than 50 pages by breaking them into manageable chunks, processing them iteratively, and tracking the overall progress. 
-- **Parallel Execution Ready:** Designed for concurrent execution with a robust optimistic locking mechanism to prevent race conditions when multiple finalize function instances run in parallel. -- **Detailed Reporting:** Local logs and processed annotation details stored in CDF RAW tables, fucntion logs, and extraction pipeline runs for auditing and analysis. -- **Local Running and Debugging:** Both the launch and finalize handler can be ran locally and have default setups in the 'Run & Debug' tab in vscode. Requires a .env file to be placed in the directory. - - - -## Getting Started - -Deploying this annotation module into a new Cognite Data Fusion (CDF) project is a streamlined process. Since all necessary resources (Data Sets, Extraction Pipelines, Functions, etc.) are bundled into a single module, you only need to configure one file to get started. - -### Prerequisites - -- Python 3.11+ -- An active Cognite Data Fusion (CDF) project. -- The required Python packages are listed in the `cdf_file_annotation/functions/fn_file_annotation_launch/requirements.txt` and `cdf_file_annotation/functions/fn_file_annotation_finalize/requirements.txt` files. -- Alias and tag generation is abstracted out of the annotation function. Thus, you'll need to create a transformation that populates the `aliases` and `tags` property of your file and target entity view. - - The `aliases` property is used to match files with entities and should contain a list of alternative names or identifiers that can be found in the files image. - - The `tags` property serves multiple purposes and consists of the following... - - (`DetectInDiagrams`) Identifies files and assets to include as entities filtered by primary scope and secondary scope (if provided). - - (`ScopeWideDetect`) Identifies files and asset to include as entities filtered by a primary scope. - - (`ToAnnotate`) Identifies files that need to be annotated. 
- - (`AnnotationInProcess`) Identifies files that are in the process of being annotated. - - (`Annotated`) Identifies files that have been annotated. - - (`AnnotationFailed`) Identifies files that have failed the annotation process. Either by erroring out or by receiving 0 possible matches. - - Don't worry if these concepts don't immediately make sense. Aliases and tags are explained in greater detail in the detailed_guides/ documentation. The template also includes a jupyter notebook that prepare the files and assets for annotation if using the toolkit's quickstart module. - -### Deployment Steps - -_**NOTE:** I'm constantly improving this template, thus some parts of the video walkthroughs are from an older version. The video tutorials below are still **relevant**. Any breaking changes will receive a new video tutorial._ - -_(if videos fail to load, try loading page in incognito or re-sign into github) ~ Hope y'all enjoy :)_ - -1. **Create a CDF Project through Toolkit** - - Follow the guide [here](https://docs.cognite.com/cdf/deploy/cdf_toolkit/) - - (optional) Initialize the quickstart package using toolkit CLI - -```bash -poetry init -poetry add cognite-toolkit -poetry run cdf modules init -``` - - - - - -2. **Integrate the Module** - - Move the `local_setup/` folder to the root and unpack .vscode/ and .env.tmpl - - Update the default.config.yaml file with project-specific configurations - - Add the module name to the list of selected modules in your config.{env}.yaml file - - Make sure to create a .env file with credentials pointing to your CDF project - - - - - -3. 
**Build and Deploy the Module** - - - (optional) Build and deploy the quickstart template modules - - Build and deploy this module - -```bash -poetry run cdf build --env dev -poetry run cdf deploy --dry-run -poetry run cdf deploy -``` - -```yaml -# config..yaml used in examples below -environment: - name: dev - project: - validation-type: dev - selected: - - modules/ - -variables: - modules: - # stuff from quickstart package... - organization: tx - - # ... - - cdf_ingestion: - workflow: ingestion - groupSourceId: - ingestionClientId: ${IDP_CLIENT_ID} # Changed from ${INGESTION_CLIENT_ID} - ingestionClientSecret: ${IDP_CLIENT_SECRET} # Changed from ${INGESTION_CLIENT_SECRET} - pandidContextualizationFunction: contextualization_p_and_id_annotater - contextualization_connection_writer: contextualization_connection_writer - schemaSpace: sp_enterprise_process_industry - schemaSpace2: cdf_cdm - schemaSpace3: cdf_idm - instanceSpaces: - - springfield_instances - - cdf_cdm_units - runWorkflowUserIds: - - - - contextualization: - cdf_file_annotation: - # used in /data_sets, /data_models, /functions, /extraction_pipelines, and /workflows - annotationDatasetExternalId: ds_file_annotation - - # used in /data_models and /extraction_pipelines - annotationStateExternalId: FileAnnotationState - annotationStateInstanceSpace: sp_dat_cdf_annotation_states - annotationStateSchemaSpace: sp_hdm #NOTE: stands for space helper data model - annotationStateVersion: v1.0.1 - fileSchemaSpace: sp_enterprise_process_industry - fileExternalId: txFile - fileVersion: v1 - - # used in /raw and /extraction_pipelines - rawDb: db_file_annotation - rawTableDocTag: annotation_documents_tags - rawTableDocDoc: annotation_documents_docs - rawTableCache: annotation_entities_cache - - # used in /extraction_pipelines - extractionPipelineExternalId: ep_file_annotation - targetEntitySchemaSpace: sp_enterprise_process_industry - targetEntityExternalId: txEquipment - targetEntityVersion: v1 - - # used in 
/functions and /workflows - launchFunctionExternalId: fn_file_annotation_launch #NOTE: if this is changed, then the folder holding the launch function must be named the same as the new external ID - launchFunctionVersion: v1.0.0 - finalizeFunctionExternalId: fn_file_annotation_finalize #NOTE: if this is changed, then the folder holding the finalize function must be named the same as the new external ID - finalizeFunctionVersion: v1.0.0 - functionClientId: ${IDP_CLIENT_ID} - functionClientSecret: ${IDP_CLIENT_SECRET} - - # used in /workflows - workflowSchedule: "*/10 * * * *" - workflowExternalId: wf_file_annotation - workflowVersion: v1 - - # used in /auth - groupSourceId: # source ID from Azure AD for the corresponding groups - - - # ... -``` - - - - - -4. **Run the Workflow** - - After deployment, the annotation process is managed by a workflow that orchestrates the `Launch` and `Finalize` functions. The workflow is automatically triggered based on the schedule defined in the configuration. You can monitor the progress and logs of the functions in the CDF UI. - - - (optional) Run the ingestion workflow from the quickstart package to create instances of File, Asset, etc - - (optional) Checkout the instantiated files that have been annotated using the annotation function from the quickstart package - - (optional) Run the local_setup.ipynb to setup the files for annotation - - Run the File Annotation Workflow - - - - - - +- **Configuration-Driven**: The entire process is controlled by a single `config.yaml` file, allowing adaptation without code changes. - - -### Local Development and Debugging - -This template is configured for easy local execution and debugging directly within Visual Studio Code. - -1. **Create Environment File**: Before running locally, you must create a `.env` file in the root directory. This file will hold the necessary credentials and configuration for connecting to your CDF project. 
Populate it with the required environment variables for `IDP_CLIENT_ID`, `CDF_CLUSTER`, etc. In the `local_runs/` folder you'll find a .env template. - -2. **Use the VS Code Debugger**: The repository includes a pre-configured `local_runs/.vscode/launch.json` file. Please move the .vscode/ folder to the top level of your repo. - - - Navigate to the "Run and Debug" view in the VS Code sidebar. - - You will see dropdown options for launching the different functions (e.g., `Launch Function`, `Finalize Function`). - - Select the function you wish to run and click the green "Start Debugging" arrow. This will start the function on your local machine, with the debugger attached, allowing you to set breakpoints and inspect variables. - - Feel free to change/adjust the arguments passed into the function call to point to a test_extraction_pipeline and/or change the log level. - - - -## How It Works - -The template operates in three main phases, orchestrated by CDF Workflows. Since the prepare phase is relatively small, it is bundled in with the launch phase. However, conceptually it should be treated as a separate process. - -### Prepare Phase - -- **Goal**: Identify files that need to be annotated or have their status reset. -- **Process**: - 1. It queries for files that are marked for re-annotation and resets their status. - 2. It then queries for new files tagged for annotation (e.g., with a "ToAnnotate" tag). - 3. For each new file, it creates a corresponding `AnnotationState` instance in the data model, marking it with a "New" status. - -### Launch Phase - -![LaunchService](https://github.com/user-attachments/assets/3e5ba403-50bb-4f6a-a723-be8947c65ebc) - -- **Goal**: Launch the annotation jobs for files that are ready. -- **Process**: - 1. It queries for `AnnotationState` instances with a "New" or "Retry" status. - 2. It groups these files by a primary scope to provide context. - 3. 
For each group, it fetches the relevant file and target entity information, using a cache to avoid redundant lookups. - 4. It calls the Cognite Diagram Detect API to initiate two async jobs: - - A `standard annotation` job to find and link known entities. - - A `pattern mode` job to detect all potential tags and build an indexed reference catalog. - 5. It updates the `AnnotationState` instance with both the `diagramDetectJobId` and `patternModeJobId` and sets the overall `annotationStatus` to "Processing". - -### Finalize Phase - -![FinalizeService](https://github.com/user-attachments/assets/152d9eaf-afdb-46fe-9125-11430ff10bc9) - -- **Goal**: Retrieve, process, and store the results of completed annotation jobs. -- **Process**: - 1. It queries for `AnnotationState` instances with a "Processing" status. - 2. It waits until both the standard and pattern modejobs for a given file are complete. - 3. It then retrieves and merges the results from both jobs. - 4. It will optionally clean old annotations first and then: - - Applies the standard annotations by creating edges in the data model, writing the results to a dedicated RAW table. - - Processes the pattern mode results, writing them to a dedicated RAW table to populate the reference catalog. - 5. It updates the `AnnotationState` status to "Annotated" or "Failed" and tags the file accordingly. - -## Configuration - -The templates behavior is entirely controlled by the `ep_file_annotation.config.yaml` file. This YAML file is parsed by Pydantic models in the code, ensuring a strongly typed and validated configuration. - -Key configuration sections include: - -- `dataModelViews`: Defines the data model views for files, annotation states, and target entities. -- `prepareFunction`: Configures the queries to find files to annotate. -- `launchFunction`: Sets parameters for the annotation job, such as batch size, entity matching properties, and a new `patternMode: true` flag to enable the pattern detection feature. 
-- `finalizeFunction`: Defines how to process and apply the final annotations. - -This file allows for deep customization. For example, you can use a list of query configurations to combine them with `OR` logic, or you can set `primaryScopeProperty` to `None` to process files that are not tied to a specific scope. - -## Detailed Guides - -This README provides a high-level overview of the template's purpose and architecture. To gain a deeper understanding of how to configure and extend the template, I highly recommend exploring the detailed guides located in the `cdf_file_annotation/detailed_guides/` directory: - -- **`CONFIG.md`**: A document outlining the `ep_file_annotation.config.yaml` file to control the behavior of the Annotation Function. -- **`CONFIG_PATTERNS.md`**: A guide with recipes for common operational tasks, such as processing specific subsets of data, reprocessing files for debugging, and tuning performance by adjusting the configuration. -- **`DEVELOPING.md`**: A guide for developers who wish to extend the template's functionality. It details the interface-based architecture and provides a step-by-step walkthrough on how to create and integrate your own custom service implementations for specialized logic. - -## Design Philosophy - -There were two principles I kept in mind when designing this template. - -- **Evolving Needs:** Project requirements evolve. A simple, plug-and-play tool is great to start with, but it can hit limitations when faced with demands for scale, performance, or specialized logic—as was the case with previous annotation templates when applied to projects with tens of thousands of complex files. My belief is that a modern template must be built to be extended. - -- **The Balance Between Configuration and Code:** This template is architected to provide two primary modes of adaptation, striking a crucial balance: - - 1. 
**Quick Start (via Configuration):** For the majority of use cases, a user should only need to edit the `config.yaml` file. By defining their data model views and tuning process parameters, they can get the template running quickly and effectively. - 2. **Scaling (via Interfaces):** When a project demands unique optimizations—like a non-standard batching strategy or a complex query to fetch entities—the interface-based design provides the necessary "escape hatch." A developer can write a custom Python class to implement their specialized logic, ensuring the template can meet any future requirement. - -## Architecture & Optimizations - -This section explains some of the core design choices made to ensure the template is robust and scalable. - -### Stateful Processing with Data Models - -Instead of using a simpler store like a RAW table to track the status of each file, this module uses a dedicated `AnnotationState` Data Model. There is a 1-to-1 relationship between a file being annotated and its corresponding `AnnotationState` instance. This architectural choice is deliberate and crucial for reliability: - -- **Concurrency:** Data Model instances have built-in optimistic locking via the `existing_version` field. When multiple parallel functions attempt to "claim" a job, only the first one can succeed in updating the `AnnotationState` instance. All others will receive a version conflict error. This database-level locking is far more reliable and simpler to manage than building a custom locking mechanism on top of RAW. -- **Query Performance:** Finding all files that need processing (e.g., status is "New" or "Retry") is a fast, indexed query against the Data Model. Performing equivalent filtering on potentially millions of rows in a RAW table would be significantly slower and less efficient. 
-- **Schema Enforcement and Data Integrity:** The `AnnotationState` view enforces a strict schema for state information (`status`, `attemptCount`, `annotatedPageCount`, etc.), ensuring data consistency across the entire process. RAW tables offer no schema guarantees. -- **Discoverability and Governance:** The state of the annotation pipeline is exposed as a first-class entity in the CDF data catalog. This makes it easy to monitor progress, build dashboards, and govern the data lifecycle, which is much more difficult with state hidden away in RAW rows. - -### Optimized Batch Processing & Caching - -When processing tens of thousands of files, naively fetching context for each file is inefficient. This module implements a significant optimization based on experiences with large-scale projects. - -- **Rationale:** For many projects, the entities relevant to a given file are often co-located within the same site or operational unit. By grouping files based on these properties before processing, we can create a highly effective cache. -- **Implementation:** The `launchFunction` configuration allows specifying a `primary_scope_property` and an optional `secondary_scope_property`. The `LaunchService` uses these properties to organize all files into ordered batches. The cache for entities is then loaded once for each context, drastically reducing the number of queries to CDF and improving overall throughput. - -### Interface-Based Extensibility - -The template is designed around a core set of abstract interfaces (e.g., `IDataModelService`, `ILaunchService`). This is a foundational architectural choice that enables scalability and long-term viability. +- **Dual Annotation Modes**: Simultaneously runs standard entity matching and a new pattern-based detection mode to create a comprehensive indexed reference catalog. +- **Large Document Support**: Automatically handles files with more than 50 pages by processing them in chunks and tracking overall progress. -- **Contract vs. 
Implementation:** An interface defines a stable "contract" of _what_ a service should do. The provided `General...Service` classes offer a powerful default implementation that is driven by the configuration file. -- **Enabling Customization:** When a project's needs exceed the capabilities of the default implementation or configuration, developers can write their own concrete class that implements the interface with bespoke logic. This custom class can then be "plugged in" via the dependency injection system, without needing to modify the rest of the template's code. +- **Parallel Execution Ready**: Designed for concurrent execution with a robust optimistic locking mechanism to prevent race conditions. -## About Me +- **Comprehensive Reporting**: Includes a multi-page Streamlit dashboard for monitoring pipeline health, analyzing annotation quality, and managing patterns. -Hey everyone\! I'm Jack Zhao, the creator of this template. I want to give a huge shoutout to Thomas Molbach, Noah Karsky, and Darren Downtain for providing invaluable input from a solution architect's point of view. I also want to thank Khaled Shaheen and Gayatri Babel for their help in building this. +--- -This code is my attempt to create a standard template that 'breaks' the cycle where projects build simple tools, outgrow them, and are then forced to build a new and often hard-to-reuse solution. My current belief is that it's impossible for a template to have long-term success if it's not built on the fundamental premise of being extended. Customer needs will evolve, and new product features will create new opportunities for optimization. 
+The template's behavior is entirely controlled by the `ep_file_annotation.config.yaml` file. This YAML file is parsed by Pydantic models in the code, ensuring a strongly typed and validated configuration.
It groups these files by a configured scope (e.g., `site`) to create a relevant entity cache, avoiding redundant lookups. It then calls the Cognite Diagram Detect API to start two asynchronous jobs: a standard entity matching job and a pattern mode job. Finally, it updates the `AnnotationState` instance with the job IDs and sets the status to "Processing". + +### Finalize Phase + +- **Goal**: Retrieve, process, and store the results of completed annotation jobs. +- **Process**: The `FinalizeService` queries for `AnnotationState` instances with a "Processing" status. Once both the standard and pattern jobs for a file are complete, it retrieves the results. It applies the standard annotations by creating edges in the data model and logs the results to RAW tables. The pattern mode results are also logged to a dedicated RAW table (`annotation_documents_patterns`). Finally, it updates the `AnnotationState` status to "Annotated" or "Failed". + +## Design Philosophy + +There were two principles I kept in mind when designing this template. + +- **Evolving Needs:** Project requirements evolve. A simple, plug-and-play tool is great to start with, but it can hit limitations when faced with demands for scale, performance, or specialized logic—as was the case with previous annotation templates when applied to projects with tens of thousands of complex files. My belief is that a modern template must be built to be extended. + +- **The Balance Between Configuration and Code:** This template is architected to provide two primary modes of adaptation, striking a crucial balance: + + 1. **Quick Start (via Configuration):** For the majority of use cases, a user should only need to edit the `config.yaml` file. By defining their data model views and tuning process parameters, they can get the template running quickly and effectively. + 2. 
**Scaling (via Interfaces):** When a project demands unique optimizations—like a non-standard batching strategy or a complex query to fetch entities—the interface-based design provides the necessary "escape hatch." A developer can write a custom Python class to implement their specialized logic, ensuring the template can meet any future requirement. + +## Architecture & Optimizations + +### Stateful Processing with Data Models + +Instead of using RAW tables to track file status, this module uses a dedicated `AnnotationState` Data Model. This is a crucial architectural choice for several reasons: + +- **Concurrency**: Data model instances provide built-in optimistic locking via the `existing_version` field. When multiple `Finalize` functions try to claim a job, only the first one succeeds, preventing race conditions. +- **Performance**: Finding files ready for processing is a fast, indexed query against the data model, which is far more efficient than filtering millions of rows in a RAW table. +- **Data Integrity & Governance**: The `AnnotationState` view enforces a strict schema for all status information, ensuring consistency and making the pipeline's state a first-class, governable entity in CDF. + +### Interface-Based Extensibility + +The template is designed around a core set of abstract interfaces (e.g., `AbstractLaunchService`, `IDataModelService`). This enables scalability and long-term viability. + +- **Contract vs. Implementation**: An interface defines a stable contract of _what_ a service should do. The provided `General...Service` classes offer a powerful default implementation driven by the configuration file. +- **Enabling Customization**: When a project's needs exceed the configuration options, developers can write their own class that implements the interface with custom logic. This custom class can then be "plugged in" via the dependency injection system without modifying the rest of the template's code. 
+ +For more details on configuration and extending the template, see `detailed_guides/CONFIG.md` and `detailed_guides/DEVELOPING.md`. + +### Known Limitation: Scalable Deletion of Pattern Results + +A key architectural challenge remains regarding the `cleanOldAnnotations` feature for pattern mode results. + +- **The Challenge**: Pattern results are stored in the `annotation_documents_patterns` RAW table with a key format of `f"{tag_text}:{file_id.space}:{file_id.external_id}"`. To delete these rows when reprocessing a file, one would need to know all possible values of `{tag_text}` beforehand. +- **The Impact**: The current implementation cannot scalably delete old pattern results for a specific file because listing all rows to find the relevant keys is not feasible for large tables. This can lead to stale data on the Annotation Quality dashboard if files are frequently re-processed. This is a known issue targeted for future enhancement. +- **Temporary Solution**: Delete the `annotation_documents_patterns` table before re-annotating all files to ensure fresh data. + +## Detailed Guides + +This README provides a high-level overview of the template's purpose and architecture. To gain a deeper understanding of how to configure and extend the template, I highly recommend exploring the detailed guides located in the `cdf_file_annotation/detailed_guides/` directory: + +- **`CONFIG.md`**: A document outlining the `ep_file_annotation.config.yaml` file to control the behavior of the Annotation Function. +- **`CONFIG_PATTERNS.md`**: A guide with recipes for common operational tasks, such as processing specific subsets of data, reprocessing files for debugging, and tuning performance by adjusting the configuration. +- **`DEVELOPING.md`**: A guide for developers who wish to extend the template's functionality. 
It details the interface-based architecture and provides a step-by-step walkthrough on how to create and integrate your own custom service implementations for specialized logic. + +## About Me + +Hey everyone\! I'm Jack Zhao, the creator of this template. I want to give a huge shoutout to Thomas Molbach, Noah Karsky, and Darren Downtain for providing invaluable input from a solution architect's point of view. I also want to thank Khaled Shaheen and Gayatri Babel for their help in building this. + +This code is my attempt to create a standard template that 'breaks' the cycle where projects build simple tools, outgrow them, and are then forced to build a new and often hard-to-reuse solution. My current belief is that it's impossible for a template to have long-term success if it's not built on the fundamental premise of being extended. Customer needs will evolve, and new product features will create new opportunities for optimization. diff --git a/modules/contextualization/cdf_file_annotation/readme_deployment.md b/modules/contextualization/cdf_file_annotation/readme_deployment.md new file mode 100644 index 00000000..8a10acdd --- /dev/null +++ b/modules/contextualization/cdf_file_annotation/readme_deployment.md @@ -0,0 +1,184 @@ +# Deployment Guide + +## Getting Started + +Deploying this annotation module into a new Cognite Data Fusion (CDF) project is a streamlined process. Since all necessary resources (Data Sets, Extraction Pipelines, Functions, etc.) are bundled into a single module, you only need to configure one file to get started. + +--- + +### Prerequisites + +- Python 3.11+ +- An active Cognite Data Fusion (CDF) project. +- The required Python packages are listed in the `cdf_file_annotation/functions/fn_file_annotation_launch/requirements.txt` and `cdf_file_annotation/functions/fn_file_annotation_finalize/requirements.txt` files. +- Alias and tag generation is abstracted out of the annotation function. 
Thus, you'll need to create a transformation that populates the `aliases` and `tags` properties of your file and target entity views.
+  - The `aliases` property is used to match files with entities and should contain a list of alternative names or identifiers that can be found in the file's image.
+  - The `tags` property serves multiple purposes and consists of the following...
+    - (`DetectInDiagrams`) Identifies files and assets to include as entities filtered by primary scope and secondary scope (if provided).
+    - (`ScopeWideDetect`) Identifies files and assets to include as entities filtered by a primary scope.
+    - (`ToAnnotate`) Identifies files that need to be annotated.
+    - (`AnnotationInProcess`) Identifies files that are in the process of being annotated.
+    - (`Annotated`) Identifies files that have been annotated.
+    - (`AnnotationFailed`) Identifies files that have failed the annotation process, either by erroring out or by receiving 0 possible matches.
+  - Don't worry if these concepts don't immediately make sense. Aliases and tags are explained in greater detail in the detailed_guides/ documentation. The template also includes a Jupyter notebook that prepares the files and assets for annotation if using the toolkit's quickstart module.
+
+---
+
+### Deployment Steps with Quickstart Module
+
+The video and deployment steps are with regard to getting things set up on the quickstart module. If you have an existing project, the important step is to insert the correct information in the config.env.yaml file.
**Integrate the Module**
+   - Move the `local_setup/` folder to the root and unpack .vscode/ and .env.tmpl
+   - Update the default.config.yaml file with project-specific configurations
+   - Add the module name to the list of selected modules in your config.{env}.yaml file
+   - Make sure to create a .env file with credentials pointing to your CDF project
+
+
+
+
+
+3. **Build and Deploy the Module**
+
+   - (optional) Build and deploy the quickstart template modules
+   - Build and deploy this module
+
+```bash
+poetry run cdf build --env dev
+poetry run cdf deploy --dry-run
+poetry run cdf deploy
+```
+
+```yaml
+# config.{env}.yaml used in examples below
+environment:
+  name: dev
+  project:
+  validation-type: dev
+  selected:
+    - modules/
+
+variables:
+  modules:
+    # stuff from quickstart package...
+    organization:
+
+    # ...
+
+    cdf_ingestion:
+      workflow: ingestion
+      groupSourceId:
+      ingestionClientId: ${IDP_CLIENT_ID} # Changed from ${INGESTION_CLIENT_ID}
+      ingestionClientSecret: ${IDP_CLIENT_SECRET} # Changed from ${INGESTION_CLIENT_SECRET}
+      pandidContextualizationFunction: contextualization_p_and_id_annotater
+      contextualization_connection_writer: contextualization_connection_writer
+      schemaSpace: sp_enterprise_process_industry
+      schemaSpace2: cdf_cdm
+      schemaSpace3: cdf_idm
+      instanceSpaces:
+        - springfield_instances
+        - cdf_cdm_units
+      runWorkflowUserIds:
+        -
+
+    contextualization:
+      cdf_file_annotation:
+        # used in /data_sets, /data_models, /functions, /extraction_pipelines, and /workflows
+        annotationDatasetExternalId: ds_file_annotation
+
+        # used in /data_models and /extraction_pipelines
+        annotationStateExternalId: FileAnnotationState
+        annotationStateInstanceSpace: sp_dat_cdf_annotation_states # NOTE: can set to fileInstanceSpace if scoping is required - refer to detailed_guides/config_patterns.md
+        annotationStateSchemaSpace: sp_hdm #NOTE: stands for space helper data model
+        annotationStateVersion: v1.0.0
+        fileSchemaSpace: sp_enterprise_process_industry
fileExternalId: File + fileInstanceSpace: # Optional - used for scoping - refer to detailed_guides/config_patterns.md + fileVersion: v1 + + # used in /raw and /extraction_pipelines + rawDb: db_file_annotation + rawTableDocTag: annotation_documents_tags + rawTableDocDoc: annotation_documents_docs + rawTableCache: annotation_entities_cache + + # used in /extraction_pipelines + extractionPipelineExternalId: ep_file_annotation + targetEntitySchemaSpace: sp_enterprise_process_industry + targetEntityExternalId: Equipment + targetEntityInstanceSpace: # Optional - used for scoping - refer to detailed_guides/config_patterns.md + targetEntityVersion: v1 + + # used in /functions and /workflows + launchFunctionExternalId: fn_file_annotation_launch #NOTE: if this is changed, then the folder holding the launch function must be named the same as the new external ID + launchFunctionVersion: v1.0.0 + finalizeFunctionExternalId: fn_file_annotation_finalize #NOTE: if this is changed, then the folder holding the finalize function must be named the same as the new external ID + finalizeFunctionVersion: v1.0.0 + functionClientId: ${IDP_CLIENT_ID} + functionClientSecret: ${IDP_CLIENT_SECRET} + + # used in /workflows + workflowSchedule: "*/10 * * * *" + workflowExternalId: wf_file_annotation + workflowVersion: v1 + + # used in /auth + groupSourceId: # source ID from Azure AD for the corresponding groups + + + # ... +``` + + + + + +4. **Run the Workflow** + + After deployment, the annotation process is managed by a workflow that orchestrates the `Launch` and `Finalize` functions. The workflow is automatically triggered based on the schedule defined in the configuration. You can monitor the progress and logs of the functions in the CDF UI. 
+ + - (optional) Run the ingestion workflow from the quickstart package to create instances of File, Asset, etc + - (optional) Checkout the instantiated files that have been annotated using the annotation function from the quickstart package + - (optional) Run the local_setup.ipynb to setup the files for annotation + - Run the File Annotation Workflow + + + + + + + + + +--- + +### Local Development and Debugging + +This template is configured for easy local execution and debugging directly within Visual Studio Code. + +1. **Create Environment File**: Before running locally, you must create a `.env` file in the root directory. This file will hold the necessary credentials and configuration for connecting to your CDF project. Populate it with the required environment variables for `IDP_CLIENT_ID`, `CDF_CLUSTER`, etc. In the `local_runs/` folder you'll find a .env template. + +2. **Use the VS Code Debugger**: The repository includes a pre-configured `local_runs/.vscode/launch.json` file. Please move the .vscode/ folder to the top level of your repo. + + - Navigate to the "Run and Debug" view in the VS Code sidebar. + - You will see dropdown options for launching the different functions (e.g., `Launch Function`, `Finalize Function`). + - Select the function you wish to run and click the green "Start Debugging" arrow. This will start the function on your local machine, with the debugger attached, allowing you to set breakpoints and inspect variables. + - Feel free to change/adjust the arguments passed into the function call to point to a test_extraction_pipeline and/or change the log level. 
+ + From 88ec5cb0f9086f25937f69cc9b960c364895c52a Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Tue, 14 Oct 2025 14:55:06 -0500 Subject: [PATCH 085/128] updated readme and added supporting markdown files --- .../cdf_file_annotation/CONTRIBUTING.md | 197 +++++++++++++++ .../cdf_file_annotation/DEPLOYMENT.md | 229 ++++++++++++++++++ .../cdf_file_annotation/README.md | 168 ++++++++++++- .../readme_architecture.md | 85 ------- .../cdf_file_annotation/readme_deployment.md | 184 -------------- 5 files changed, 587 insertions(+), 276 deletions(-) create mode 100644 modules/contextualization/cdf_file_annotation/CONTRIBUTING.md create mode 100644 modules/contextualization/cdf_file_annotation/DEPLOYMENT.md delete mode 100644 modules/contextualization/cdf_file_annotation/readme_architecture.md delete mode 100644 modules/contextualization/cdf_file_annotation/readme_deployment.md diff --git a/modules/contextualization/cdf_file_annotation/CONTRIBUTING.md b/modules/contextualization/cdf_file_annotation/CONTRIBUTING.md new file mode 100644 index 00000000..946a72c3 --- /dev/null +++ b/modules/contextualization/cdf_file_annotation/CONTRIBUTING.md @@ -0,0 +1,197 @@ +# Contributing to CDF File Annotation Module + +Thank you for your interest in contributing to the CDF File Annotation Module! This document outlines the process for contributing to this project. + +## Contribution Workflow + +All contributions to this project must follow this workflow: + +### 1. Create a GitHub Issue + +Before making any changes, please create a GitHub issue to discuss: + +- **Bug Reports**: Describe the bug, steps to reproduce, expected vs. 
actual behavior, and your environment +- **Feature Requests**: Describe the feature, its use case, and how it would benefit the project +- **Documentation Improvements**: Describe what documentation is missing or needs clarification +- **Code Improvements**: Describe the refactoring or optimization you'd like to make + +**Why create an issue first?** + +- Ensures alignment on the problem and proposed solution +- Prevents duplicate work +- Allows for discussion before investing time in implementation +- Provides context for the eventual pull request + +### 2. Create a Pull Request + +Once the issue has been discussed and you're ready to contribute: + +1. **Fork the repository** to your GitHub account +2. **Create a feature branch** from `main`: + + ```bash + git checkout -b feature/issue-123-short-description + ``` + + or + + ```bash + git checkout -b fix/issue-456-short-description + ``` + +3. **Make your changes** following the code standards below + +4. **Commit your changes** with clear, descriptive commit messages: + + ```bash + git commit -m "Fix: Resolve cache invalidation issue (#123) + + - Updated cache validation logic to handle edge cases + - Added unit tests for cache service + - Updated documentation" + ``` + +5. **Push to your fork**: + + ```bash + git push origin feature/issue-123-short-description + ``` + +6. **Create a Pull Request** on GitHub: + - Reference the related issue in the PR description (e.g., "Closes #123" or "Fixes #456") + - Provide a clear description of what changed and why + - Include any relevant testing details or screenshots + - Add `@jack-cognite` as a reviewer (or the current maintainer) + +### 3. 
Code Review and Approval + +- **All PRs require approval** from the project maintainer (@jack-cognite or designated reviewer) before merging +- The maintainer will review your code for: + + - Code quality and adherence to project standards + - Test coverage + - Documentation updates + - Breaking changes or backward compatibility + - Performance implications + +- Address any feedback or requested changes +- Once approved, the maintainer will merge your PR + +**Note**: PRs will not be merged without maintainer approval, even if all automated checks pass. + +## Code Standards + +### Python Code Style + +- Follow [PEP 8](https://pep8.org/) style guidelines +- Use type hints for all function parameters and return values +- Maximum line length: 120 characters (as configured in the project) +- Use meaningful variable and function names + +### Documentation + +- **All functions must include Google-style docstrings** with: + - Brief description + - `Args`: Parameter descriptions + - `Returns`: Return value description + - `Raises`: Exception descriptions (if applicable) +- Update README.md or relevant documentation if your changes affect user-facing behavior +- Add inline comments for complex logic or non-obvious decisions + +### Example Docstring Format + +```python +def process_annotations( + self, + file_node: Node, + regular_item: dict | None, + pattern_item: dict | None +) -> tuple[str, str]: + """ + Processes diagram detection results and applies annotations to a file. + + Handles both regular entity matching and pattern mode results, applying + confidence thresholds and deduplication logic. + + Args: + file_node: The file node instance to annotate. + regular_item: Dictionary containing regular diagram detect results. + pattern_item: Dictionary containing pattern mode results. 
+ + Returns: + A tuple containing: + - Summary message of regular annotations applied + - Summary message of pattern annotations created + + Raises: + CogniteAPIError: If the API calls to apply annotations fail. + ValueError: If the file node is missing required properties. + """ + # Implementation... +``` + +### Testing + +- Add tests for new functionality where applicable +- Ensure existing tests pass before submitting your PR +- Test locally using the VSCode debugger setup (see [DEPLOYMENT.md](DEPLOYMENT.md)) + +### Configuration Changes + +- If you modify the configuration structure (`ep_file_annotation.config.yaml`), ensure: + - Pydantic models are updated accordingly + - Documentation in `detailed_guides/CONFIG.md` is updated + - Backward compatibility is maintained or a migration path is provided + +## What We're Looking For + +Contributions that align with the project's philosophy: + +- **Configuration-driven**: Prefer adding configuration options over hardcoded behavior +- **Interface-based**: Extend functionality through interfaces rather than modifying core logic +- **Well-documented**: Code should be self-explanatory with clear documentation +- **Production-ready**: Code should handle edge cases, errors, and scale considerations +- **Backward compatible**: Avoid breaking changes unless absolutely necessary + +## Types of Contributions We Welcome + +- **Bug fixes**: Resolve issues, fix edge cases, improve error handling +- **Performance improvements**: Optimize queries, caching, or processing logic +- **Documentation**: Improve guides, add examples, clarify confusing sections +- **New configuration options**: Add flexibility through new config parameters +- **New service implementations**: Create alternative implementations of existing interfaces +- **Test coverage**: Add unit tests, integration tests, or test utilities +- **Examples**: Add example configurations or use cases + +## Types of Changes Requiring Extra Discussion + +These types of changes 
require significant discussion in the GitHub issue before proceeding: + +- Breaking changes to the configuration format +- Changes to the core architecture or interfaces +- New external dependencies +- Changes affecting the data model structure +- Performance changes that trade off memory/CPU/network differently + +## Questions or Need Help? + +- Create a GitHub issue with your question +- Tag it with the "question" label +- The maintainer will respond as soon as possible + +## Code of Conduct + +- Be respectful and constructive in all interactions +- Provide thoughtful, actionable feedback during code reviews +- Assume good intentions from all contributors +- Focus on the code and ideas, not the person + +## License + +By contributing to this project, you agree that your contributions will be licensed under the same license as the project (see LICENSE file). + +--- + +Thank you for contributing to making this project better! 🚀 + +Return to [Main README](README.md) diff --git a/modules/contextualization/cdf_file_annotation/DEPLOYMENT.md b/modules/contextualization/cdf_file_annotation/DEPLOYMENT.md new file mode 100644 index 00000000..c859ad13 --- /dev/null +++ b/modules/contextualization/cdf_file_annotation/DEPLOYMENT.md @@ -0,0 +1,229 @@ +# Deployment Guide + +This guide provides step-by-step instructions for deploying the CDF File Annotation Module to your Cognite Data Fusion (CDF) project. + +## Prerequisites + +Before deploying this module, ensure you have the following: + +- **Python 3.11+** installed on your system +- **An active Cognite Data Fusion (CDF) project** +- **CDF Toolkit** installed (see step 1 below) +- **Required Python packages** are listed in: + - `cdf_file_annotation/functions/fn_file_annotation_launch/requirements.txt` + - `cdf_file_annotation/functions/fn_file_annotation_finalize/requirements.txt` + +### Data Preparation Requirements + +Alias and tag generation is abstracted out of the annotation function. 
You'll need to create a transformation that populates the `aliases` and `tags` properties of your file and target entity views: + +#### Aliases Property + +- Used to match files with entities +- Should contain a list of alternative names or identifiers that can be found in the file's image +- Examples: `["FT-101A", "Flow Transmitter 101A", "FT101A"]` + +#### Tags Property + +The `tags` property serves multiple purposes and consists of the following: + +- **`DetectInDiagrams`**: Identifies files and assets to include as entities filtered by primary scope and secondary scope (if provided) +- **`ScopeWideDetect`**: Identifies files and assets to include as entities filtered by a primary scope only +- **`ToAnnotate`**: Identifies files that need to be annotated +- **`AnnotationInProcess`**: Identifies files that are in the process of being annotated +- **`Annotated`**: Identifies files that have been annotated +- **`AnnotationFailed`**: Identifies files that have failed the annotation process (either by erroring out or by receiving 0 possible matches) + +> **Note**: Don't worry if these concepts don't immediately make sense. Aliases and tags are explained in greater detail in the `detailed_guides/` documentation. The template also includes a jupyter notebook that prepares the files and assets for annotation if using the toolkit's quickstart module. + +## Deployment Steps + +_**NOTE:** I'm constantly improving this template, thus some parts of the video walkthroughs are from an older version. The video tutorials below are still **relevant**. Any breaking changes will receive a new video tutorial._ + +_(If videos fail to load, try loading the page in incognito or re-sign into GitHub)_ + +### Step 1: Create a CDF Project through Toolkit + +Follow the [CDF Toolkit guide](https://docs.cognite.com/cdf/deploy/cdf_toolkit/) to set up your project. 
+ +Optionally, initialize the quickstart package using toolkit CLI: + +```bash +poetry init +poetry add cognite-toolkit +poetry run cdf modules init +``` + + + + + +### Step 2: Integrate the Module + +1. Move the `local_setup/` folder to the root and unpack `.vscode/` and `.env.tmpl` +2. Update the `default.config.yaml` file with project-specific configurations +3. Add the module name to the list of selected modules in your `config.{env}.yaml` file +4. Create a `.env` file with credentials pointing to your CDF project + + + + + +### Step 3: Build and Deploy the Module + +1. (Optional) Build and deploy the quickstart template modules +2. Build and deploy this module: + +```bash +poetry run cdf build --env dev +poetry run cdf deploy --dry-run +poetry run cdf deploy +``` + +#### Example Configuration File + +Below is an example `config..yaml` configuration: + +```yaml +# config..yaml used in examples below +environment: + name: dev + project: + validation-type: dev + selected: + - modules/ + +variables: + modules: + # stuff from quickstart package... + organization: tx + + # ... 
+ + cdf_ingestion: + workflow: ingestion + groupSourceId: + ingestionClientId: ${IDP_CLIENT_ID} # Changed from ${INGESTION_CLIENT_ID} + ingestionClientSecret: ${IDP_CLIENT_SECRET} # Changed from ${INGESTION_CLIENT_SECRET} + pandidContextualizationFunction: contextualization_p_and_id_annotater + contextualization_connection_writer: contextualization_connection_writer + schemaSpace: sp_enterprise_process_industry + schemaSpace2: cdf_cdm + schemaSpace3: cdf_idm + instanceSpaces: + - springfield_instances + - cdf_cdm_units + runWorkflowUserIds: + - + + contextualization: + cdf_file_annotation: + # used in /data_sets, /data_models, /functions, /extraction_pipelines, and /workflows + annotationDatasetExternalId: ds_file_annotation + + # used in /data_models and /extraction_pipelines + annotationStateExternalId: FileAnnotationState + annotationStateInstanceSpace: sp_dat_cdf_annotation_states + annotationStateSchemaSpace: sp_hdm #NOTE: stands for space helper data model + annotationStateVersion: v1.0.1 + fileSchemaSpace: sp_enterprise_process_industry + fileExternalId: txFile + fileVersion: v1 + + # used in /raw and /extraction_pipelines + rawDb: db_file_annotation + rawTableDocTag: annotation_documents_tags + rawTableDocDoc: annotation_documents_docs + rawTableCache: annotation_entities_cache + + # used in /extraction_pipelines + extractionPipelineExternalId: ep_file_annotation + targetEntitySchemaSpace: sp_enterprise_process_industry + targetEntityExternalId: txEquipment + targetEntityVersion: v1 + + # used in /functions and /workflows + launchFunctionExternalId: fn_file_annotation_launch #NOTE: if this is changed, then the folder holding the launch function must be named the same as the new external ID + launchFunctionVersion: v1.0.0 + finalizeFunctionExternalId: fn_file_annotation_finalize #NOTE: if this is changed, then the folder holding the finalize function must be named the same as the new external ID + finalizeFunctionVersion: v1.0.0 + functionClientId: 
${IDP_CLIENT_ID} + functionClientSecret: ${IDP_CLIENT_SECRET} + + # used in /workflows + workflowSchedule: "*/10 * * * *" + workflowExternalId: wf_file_annotation + workflowVersion: v1 + + # used in /auth + groupSourceId: # source ID from Azure AD for the corresponding groups + + + # ... +``` + + + + + +### Step 4: Run the Workflow + +After deployment, the annotation process is managed by a workflow that orchestrates the `Launch` and `Finalize` functions. The workflow is automatically triggered based on the schedule defined in the configuration. You can monitor the progress and logs of the functions in the CDF UI. + +**Optional preparatory steps:** + +1. Run the ingestion workflow from the quickstart package to create instances of `File`, `Asset`, etc. +2. Check out the instantiated files that have been annotated using the annotation function from the quickstart package +3. Run the `local_setup.ipynb` notebook to set up the files for annotation + +**Run the File Annotation Workflow** in the CDF UI and monitor its progress. + + + + + + + + + +## Local Development and Debugging + +This template is configured for easy local execution and debugging directly within Visual Studio Code. + +### Setup Instructions + +1. **Create Environment File**: Before running locally, you must create a `.env` file in the root directory. This file will hold the necessary credentials and configuration for connecting to your CDF project. Populate it with the required environment variables for `IDP_CLIENT_ID`, `CDF_CLUSTER`, etc. In the `local_runs/` folder you'll find a `.env` template. + +2. **Use the VS Code Debugger**: The repository includes a pre-configured `local_runs/.vscode/launch.json` file. Move the `.vscode/` folder to the top level of your repo. 
+ + - Navigate to the "Run and Debug" view in the VS Code sidebar + - You will see dropdown options for launching the different functions (e.g., `Launch Function`, `Finalize Function`) + - Select the function you wish to run and click the green "Start Debugging" arrow + - This will start the function on your local machine, with the debugger attached, allowing you to set breakpoints and inspect variables + - Feel free to change/adjust the arguments passed into the function call to point to a test extraction pipeline and/or change the log level + + + +## Troubleshooting + +### Common Issues + +- **Authentication Errors**: Ensure your `.env` file contains valid credentials and that your service principal has the necessary permissions +- **Module Not Found**: Verify that the module is listed in your `config.{env}.yaml` file under `selected` +- **Function Deployment Fails**: Check that the function folder names match the external IDs defined in your configuration +- **Workflow Not Triggering**: Verify the workflow schedule is valid cron syntax and that the workflow has been deployed successfully + +For additional help, please refer to the [detailed guides](detailed_guides/) or [open an issue](../../issues) on GitHub. + +## Next Steps + +After successful deployment: + +1. Review the [Configuration Guide](detailed_guides/CONFIG.md) to understand all available options +2. Check the [Configuration Patterns Guide](detailed_guides/CONFIG_PATTERNS.md) for common use cases +3. Explore the [Development Guide](detailed_guides/DEVELOPING.md) if you need to extend functionality +4. 
Monitor your workflows and extraction pipelines in the CDF UI + +--- + +Return to [Main README](README.md) diff --git a/modules/contextualization/cdf_file_annotation/README.md b/modules/contextualization/cdf_file_annotation/README.md index 86061f70..626519e6 100644 --- a/modules/contextualization/cdf_file_annotation/README.md +++ b/modules/contextualization/cdf_file_annotation/README.md @@ -6,15 +6,169 @@ The Annotation template is a framework designed to automate the process of annot ## Key Features -- **Configuration-Driven**: The entire process is controlled by a single `config.yaml` file, allowing adaptation without code changes. +- **Configuration-Driven Workflow:** The entire process is controlled by a single config.yaml file, allowing adaptation to different data models and operational parameters without code changes. +- **Dual Annotation Modes**: Simultaneously runs standard entity matching and pattern-based detection mode: + - **Standard Mode**: Links files to known entities in your data model with confidence-based approval thresholds. + - **Pattern Mode**: Automatically generates regex-like patterns from entity aliases and detects all matching text in files, creating a comprehensive searchable catalog of potential entities for review and approval. +- **Intelligent Pattern Generation:** Automatically analyzes entity aliases to generate pattern samples, with support for manual pattern overrides at global, site, or unit levels. +- **Large Document Support (\>50 Pages):** Automatically handles files with more than 50 pages by breaking them into manageable chunks, processing them iteratively, and tracking the overall progress. +- **Parallel Execution Ready:** Designed for concurrent execution with a robust optimistic locking mechanism to prevent race conditions when multiple finalize function instances run in parallel. 
+- **Comprehensive Reporting:** Annotations stored in three dedicated RAW tables (doc-to-doc links, doc-to-tag links, and pattern detections) plus extraction pipeline logs for full traceability. +- **Local Running and Debugging:** Both the launch and finalize handlers can be run locally and have default setups in the 'Run & Debug' tab in VSCode. Requires a .env file to be placed in the root directory. -- **Dual Annotation Modes**: Simultaneously runs standard entity matching and a new pattern-based detection mode to create a comprehensive indexed reference catalog. -- **Large Document Support**: Automatically handles files with more than 50 pages by processing them in chunks and tracking overall progress. +## Getting Started -- **Parallel Execution Ready**: Designed for concurrent execution with a robust optimistic locking mechanism to prevent race conditions. +Ready to deploy? Check out the **[Deployment Guide](DEPLOYMENT.md)** for step-by-step instructions on: -- **Comprehensive Reporting**: Includes a multi-page Streamlit dashboard for monitoring pipeline health, analyzing annotation quality, and managing patterns. +- Prerequisites and data preparation requirements +- CDF Toolkit setup +- Module integration and configuration +- Local development and debugging ---- +For a quick overview, deploying this annotation module into a new Cognite Data Fusion (CDF) project is a streamlined process. Since all necessary resources (Data Sets, Extraction Pipelines, Functions, etc.) are bundled into a single module, you only need to configure one file to get started. - +## How It Works + +The template operates in three main phases, orchestrated by CDF Workflows. Since the prepare phase is relatively small, it is bundled in with the launch phase. However, conceptually it should be treated as a separate process. + +### Prepare Phase + +- **Goal**: Identify files that need to be annotated or have their status reset. +- **Process**: + 1. 
It queries for files that are marked for re-annotation and resets their status. + 2. It then queries for new files tagged for annotation (e.g., with a "ToAnnotate" tag). + 3. For each new file, it creates a corresponding `AnnotationState` instance in the data model, marking it with a "New" status. + +### Launch Phase + +![LaunchService](https://github.com/user-attachments/assets/3e5ba403-50bb-4f6a-a723-be8947c65ebc) + +- **Goal**: Launch the annotation jobs for files that are ready. +- **Process**: + 1. It queries for `AnnotationState` instances with a "New" or "Retry" status. + 2. It groups these files by a primary scope (e.g., site, unit) to provide operational context. + 3. For each group, it fetches the relevant file and target entity information using an intelligent caching system: + - Checks if a valid cache exists in RAW (based on scope and time limit). + - If cache is stale or missing, queries the data model for entities within scope. + - Automatically generates pattern samples from entity aliases (e.g., "FT-101A" → "[FT]-000[A]"). + - Retrieves manual pattern overrides from RAW catalog (GLOBAL, site-level, or unit-level). + - Merges and deduplicates auto-generated and manual patterns. + - Stores the combined entity list and pattern samples in RAW cache for reuse. + 4. It calls the Cognite Diagram Detect API to initiate two async jobs: + - A `standard annotation` job to find and link known entities with confidence scoring. + - A `pattern mode` job (if enabled) to detect all text matching the pattern samples, creating a searchable reference catalog. + 5. It updates the `AnnotationState` instance with both the `diagramDetectJobId` and `patternModeJobId` (if applicable) and sets the overall `annotationStatus` to "Processing". + +### Finalize Phase + +![FinalizeService](https://github.com/user-attachments/assets/152d9eaf-afdb-46fe-9125-11430ff10bc9) + +- **Goal**: Retrieve, process, and store the results of completed annotation jobs. +- **Process**: + 1. 
It queries for `AnnotationState` instances with a "Processing" or "Finalizing" status (using optimistic locking to claim jobs). + 2. It waits until both the standard and pattern mode jobs for a given file are complete. + 3. It retrieves and processes the results from both jobs: + - Creates a stable hash for each detection to enable deduplication between standard and pattern results. + - Filters standard annotations by confidence thresholds (auto-approve vs. suggest). + - Skips pattern detections that duplicate standard annotations. + 4. It optionally cleans old annotations first (on first run for multi-page files), then: + - **Standard annotations**: Creates edges in the data model linking files to specific entities, writes results to RAW tables (`doc_tag` for assets, `doc_doc` for file-to-file links). + - **Pattern annotations**: Creates edges linking files to a configurable "sink node" for review, writes results to a dedicated `doc_pattern` RAW table for the searchable catalog. + 5. Updates the file node tag from "AnnotationInProcess" to "Annotated". + 6. Updates the `AnnotationState` status to "Annotated", "Failed", or back to "New" (if more pages remain), tracking page progress for large files. + +## Configuration + +The template's behavior is entirely controlled by the `ep_file_annotation.config.yaml` file. This YAML file is parsed by Pydantic models in the code, ensuring a strongly typed and validated configuration. + +Key configuration sections include: + +- `dataModelViews`: Defines the data model views for files, annotation states, core annotations, and target entities. +- `prepareFunction`: Configures the queries to find files to annotate and optionally reset. +- `launchFunction`: Sets parameters for the annotation job: + - `batchSize`: Maximum files per diagram detect call (1-50). + - `patternMode`: Boolean flag to enable pattern-based detection alongside standard matching. 
+ - `primaryScopeProperty` / `secondaryScopeProperty`: Properties used for batching and cache scoping (e.g., "site", "unit"). + - `cacheService`: Configuration for entity cache storage and time limits. + - `annotationService`: Diagram detect parameters including `pageRange` for multi-page file processing. +- `finalizeFunction`: Defines how to process and apply the final annotations: + - `autoApprovalThreshold` / `autoSuggestThreshold`: Confidence thresholds for standard annotations. + - `cleanOldAnnotations`: Whether to remove existing annotations before applying new ones. + - `maxRetryAttempts`: Retry limit for failed files. + - `sinkNode`: Target node for pattern mode annotations pending review. + +This file allows for deep customization. For example, you can use a list of query configurations to combine them with `OR` logic, or you can set `primaryScopeProperty` to `None` to process files that are not tied to a specific scope. Manual pattern samples can be added to the RAW catalog at `GLOBAL`, site, or unit levels to override or supplement auto-generated patterns. + +## Documentation + +This README provides a high-level overview of the template's purpose and architecture. For more detailed information: + +### Deployment & Setup + +- **[Deployment Guide](DEPLOYMENT.md)**: Step-by-step instructions for deploying to CDF, including prerequisites, configuration, and local debugging setup. + +### Configuration & Usage + +- **[CONFIG.md](detailed_guides/CONFIG.md)**: Comprehensive guide to the `ep_file_annotation.config.yaml` file and all configuration options. +- **[CONFIG_PATTERNS.md](detailed_guides/CONFIG_PATTERNS.md)**: Recipes for common operational tasks, including processing specific subsets, reprocessing files, and performance tuning. 
+ +### Development & Extension + +- **[DEVELOPING.md](detailed_guides/DEVELOPING.md)**: Guide for developers extending the template's functionality, including the interface-based architecture and how to create custom service implementations. + +### Contributing + +- **[CONTRIBUTING.md](CONTRIBUTING.md)**: Guidelines for contributing to this project, including the issue/PR workflow, code standards, and review process. + +## Design Philosophy + +There were two principles I kept in mind when designing this template. + +- **Evolving Needs:** Project requirements evolve. A simple, plug-and-play tool is great to start with, but it can hit limitations when faced with demands for scale, performance, or specialized logic—as was the case with previous annotation templates when applied to projects with tens of thousands of complex files. My belief is that a modern template must be built to be extended. + +- **The Balance Between Configuration and Code:** This template is architected to provide two primary modes of adaptation, striking a crucial balance: + + 1. **Quick Start (via Configuration):** For the majority of use cases, a user should only need to edit the `config.yaml` file. By defining their data model views and tuning process parameters, they can get the template running quickly and effectively. + 2. **Scaling (via Interfaces):** When a project demands unique optimizations—like a non-standard batching strategy or a complex query to fetch entities—the interface-based design provides the necessary "escape hatch." A developer can write a custom Python class to implement their specialized logic, ensuring the template can meet any future requirement. + +## Architecture & Optimizations + +This section explains some of the core design choices made to ensure the template is robust and scalable. 
+ +### Stateful Processing with Data Models + +Instead of using a simpler store like a RAW table to track the status of each file, this module uses a dedicated `AnnotationState` Data Model. There is a 1-to-1 relationship between a file being annotated and its corresponding `AnnotationState` instance. This architectural choice is deliberate and crucial for reliability: + +- **Concurrency:** Data Model instances have built-in optimistic locking via the `existing_version` field. When multiple parallel functions attempt to "claim" a job, only the first one can succeed in updating the `AnnotationState` instance. All others will receive a version conflict error. This database-level locking is far more reliable and simpler to manage than building a custom locking mechanism on top of RAW. +- **Query Performance:** Finding all files that need processing (e.g., status is "New" or "Retry") is a fast, indexed query against the Data Model. Performing equivalent filtering on potentially millions of rows in a RAW table would be significantly slower and less efficient. +- **Schema Enforcement and Data Integrity:** The `AnnotationState` view enforces a strict schema for state information (`status`, `attemptCount`, `annotatedPageCount`, etc.), ensuring data consistency across the entire process. RAW tables offer no schema guarantees. +- **Discoverability and Governance:** The state of the annotation pipeline is exposed as a first-class entity in the CDF data catalog. This makes it easy to monitor progress, build dashboards, and govern the data lifecycle, which is much more difficult with state hidden away in RAW rows. + +### Optimized Batch Processing & Caching + +When processing tens of thousands of files, naively fetching context for each file is inefficient. This module implements a significant optimization based on experiences with large-scale projects. 
+
+- **Rationale:** For many projects, the entities relevant to a given file are often co-located within the same site or operational unit. By grouping files based on these properties before processing, we can create a highly effective cache.
+- **Implementation:** The `launchFunction` configuration allows specifying a `primaryScopeProperty` and an optional `secondaryScopeProperty`. The `LaunchService` uses these properties to organize all files into ordered batches. For each unique scope combination:
+
+   1. Check if a valid cache exists in RAW (scoped by primary/secondary values and time limit).
+   2. If stale or missing, query the data model for all relevant entities within that scope.
+   3. Transform entities into the format required by diagram detect.
+   4. Automatically generate pattern samples by analyzing entity alias properties.
+   5. Retrieve and merge manual pattern overrides from the RAW catalog.
+   6. Store the complete entity list and pattern samples in RAW for reuse.
+
+   This cache is loaded once per scope and reused for all files in that batch, drastically reducing the number of queries to CDF and improving overall throughput. The pattern generation process extracts common naming conventions from aliases, creating regex-like patterns that can match variations (e.g., detecting "FT-102A" even if only "FT-101A" was in the training data).
+
+### Interface-Based Extensibility
+
+The template is designed around a core set of abstract interfaces (e.g., `IDataModelService`, `ILaunchService`). This is a foundational architectural choice that enables scalability and long-term viability.
+
+- **Contract vs. Implementation:** An interface defines a stable "contract" of _what_ a service should do. The provided `General...Service` classes offer a powerful default implementation that is driven by the configuration file.
+- **Enabling Customization:** When a project's needs exceed the capabilities of the default implementation or configuration, developers can write their own concrete class that implements the interface with bespoke logic. This custom class can then be "plugged in" via the dependency injection system, without needing to modify the rest of the template's code. + +## About Me + +Hey everyone\! I'm Jack Zhao, the creator of this template. I want to give a huge shoutout to Thomas Molbach, Noah Karsky, and Darren Downtain for providing invaluable input from a solution architect's point of view. I also want to thank Lucas Guimaraes, Khaled Shaheen and Gayatri Babel for their help in building this. + +This code is my attempt to create a standard template that 'breaks' the cycle where projects build simple tools, outgrow them, and are then forced to build a new and often hard-to-reuse solution. My current belief is that it's impossible for a template to have long-term success if it's not built on the fundamental premise of being extended. Customer needs will evolve, and new product features will create new opportunities for optimization. diff --git a/modules/contextualization/cdf_file_annotation/readme_architecture.md b/modules/contextualization/cdf_file_annotation/readme_architecture.md deleted file mode 100644 index f1d45dd0..00000000 --- a/modules/contextualization/cdf_file_annotation/readme_architecture.md +++ /dev/null @@ -1,85 +0,0 @@ -# Architecture Guide - -## Configuration Driven - -The templates behavior is entirely controlled by the `ep_file_annotation.config.yaml` file. This YAML file is parsed by Pydantic models in the code, ensuring a strongly typed and validated configuration. - -Key configuration sections include: - -- `dataModelViews`: Defines the data model views for files, annotation states, and target entities. -- `prepareFunction`: Configures the queries to find files to annotate. 
-- `launchFunction`: Sets parameters for the annotation job, such as batch size, entity matching properties, and a new `patternMode: true` flag to enable the pattern detection feature. -- `finalizeFunction`: Defines how to process and apply the final annotations. - -This file allows for deep customization. For example, you can use a list of query configurations to combine them with `OR` logic, or you can set `primaryScopeProperty` to `None` to process files that are not tied to a specific scope. - -## How It Works - -The template operates in three main phases, orchestrated by a CDF Workflow that calls a `Launch` and multiple `Finalize` functions in parallel. - -### Prepare Phase - -- **Goal**: Identify files that need to be annotated. -- **Process**: This initial step, handled within the `LaunchService`, queries for new files tagged for annotation (e.g., with a "ToAnnotate" tag). For each new file, it creates a corresponding `AnnotationState` instance in the data model, marking it with a "New" status. - -### Launch Phase - -- **Goal**: Initiate the annotation jobs for all ready files. -- **Process**: The `LaunchService` queries for `AnnotationState` instances with a "New" or "Retry" status. It groups these files by a configured scope (e.g., `site`) to create a relevant entity cache, avoiding redundant lookups. It then calls the Cognite Diagram Detect API to start two asynchronous jobs: a standard entity matching job and a pattern mode job. Finally, it updates the `AnnotationState` instance with the job IDs and sets the status to "Processing". - -### Finalize Phase - -- **Goal**: Retrieve, process, and store the results of completed annotation jobs. -- **Process**: The `FinalizeService` queries for `AnnotationState` instances with a "Processing" status. Once both the standard and pattern jobs for a file are complete, it retrieves the results. It applies the standard annotations by creating edges in the data model and logs the results to RAW tables. 
The pattern mode results are also logged to a dedicated RAW table (`annotation_documents_patterns`). Finally, it updates the `AnnotationState` status to "Annotated" or "Failed". - -## Design Philosophy - -There were two principles I kept in mind when designing this template. - -- **Evolving Needs:** Project requirements evolve. A simple, plug-and-play tool is great to start with, but it can hit limitations when faced with demands for scale, performance, or specialized logic—as was the case with previous annotation templates when applied to projects with tens of thousands of complex files. My belief is that a modern template must be built to be extended. - -- **The Balance Between Configuration and Code:** This template is architected to provide two primary modes of adaptation, striking a crucial balance: - - 1. **Quick Start (via Configuration):** For the majority of use cases, a user should only need to edit the `config.yaml` file. By defining their data model views and tuning process parameters, they can get the template running quickly and effectively. - 2. **Scaling (via Interfaces):** When a project demands unique optimizations—like a non-standard batching strategy or a complex query to fetch entities—the interface-based design provides the necessary "escape hatch." A developer can write a custom Python class to implement their specialized logic, ensuring the template can meet any future requirement. - -## Architecture & Optimizations - -### Stateful Processing with Data Models - -Instead of using RAW tables to track file status, this module uses a dedicated `AnnotationState` Data Model. This is a crucial architectural choice for several reasons: - -- **Concurrency**: Data model instances provide built-in optimistic locking via the `existing_version` field. When multiple `Finalize` functions try to claim a job, only the first one succeeds, preventing race conditions. 
-- **Performance**: Finding files ready for processing is a fast, indexed query against the data model, which is far more efficient than filtering millions of rows in a RAW table. -- **Data Integrity & Governance**: The `AnnotationState` view enforces a strict schema for all status information, ensuring consistency and making the pipeline's state a first-class, governable entity in CDF. - -### Interface-Based Extensibility - -The template is designed around a core set of abstract interfaces (e.g., `AbstractLaunchService`, `IDataModelService`). This enables scalability and long-term viability. - -- **Contract vs. Implementation**: An interface defines a stable contract of _what_ a service should do. The provided `General...Service` classes offer a powerful default implementation driven by the configuration file. -- **Enabling Customization**: When a project's needs exceed the configuration options, developers can write their own class that implements the interface with custom logic. This custom class can then be "plugged in" via the dependency injection system without modifying the rest of the template's code. - -For more details on configuration and extending the template, see `detailed_guides/CONFIG.md` and `detailed_guides/DEVELOPING.md`. - -### Known Limitation: Scalable Deletion of Pattern Results - -A key architectural challenge remains regarding the `cleanOldAnnotations` feature for pattern mode results. - -- **The Challenge**: Pattern results are stored in the `annotation_documents_patterns` RAW table with a key format of `f"{tag_text}:{file_id.space}:{file_id.external_id}"`. To delete these rows when reprocessing a file, one would need to know all possible values of `{tag_text}` beforehand. -- **The Impact**: The current implementation cannot scalably delete old pattern results for a specific file because listing all rows to find the relevant keys is not feasible for large tables. 
This can lead to stale data on the Annotation Quality dashboard if files are frequently re-processed. This is a known issue targeted for future enhancement. -- **Temporary Solution**: Delete the `annotation_documents_patterns` table before re-annotating all files to ensure fresh data. - -## Detailed Guides - -This README provides a high-level overview of the template's purpose and architecture. To gain a deeper understanding of how to configure and extend the template, I highly recommend exploring the detailed guides located in the `cdf_file_annotation/detailed_guides/` directory: - -- **`CONFIG.md`**: A document outlining the `ep_file_annotation.config.yaml` file to control the behavior of the Annotation Function. -- **`CONFIG_PATTERNS.md`**: A guide with recipes for common operational tasks, such as processing specific subsets of data, reprocessing files for debugging, and tuning performance by adjusting the configuration. -- **`DEVELOPING.md`**: A guide for developers who wish to extend the template's functionality. It details the interface-based architecture and provides a step-by-step walkthrough on how to create and integrate your own custom service implementations for specialized logic. - -## About Me - -Hey everyone\! I'm Jack Zhao, the creator of this template. I want to give a huge shoutout to Thomas Molbach, Noah Karsky, and Darren Downtain for providing invaluable input from a solution architect's point of view. I also want to thank Khaled Shaheen and Gayatri Babel for their help in building this. - -This code is my attempt to create a standard template that 'breaks' the cycle where projects build simple tools, outgrow them, and are then forced to build a new and often hard-to-reuse solution. My current belief is that it's impossible for a template to have long-term success if it's not built on the fundamental premise of being extended. Customer needs will evolve, and new product features will create new opportunities for optimization. 
diff --git a/modules/contextualization/cdf_file_annotation/readme_deployment.md b/modules/contextualization/cdf_file_annotation/readme_deployment.md deleted file mode 100644 index 8a10acdd..00000000 --- a/modules/contextualization/cdf_file_annotation/readme_deployment.md +++ /dev/null @@ -1,184 +0,0 @@ -# Deployment Guide - -## Getting Started - -Deploying this annotation module into a new Cognite Data Fusion (CDF) project is a streamlined process. Since all necessary resources (Data Sets, Extraction Pipelines, Functions, etc.) are bundled into a single module, you only need to configure one file to get started. - ---- - -### Prerequisites - -- Python 3.11+ -- An active Cognite Data Fusion (CDF) project. -- The required Python packages are listed in the `cdf_file_annotation/functions/fn_file_annotation_launch/requirements.txt` and `cdf_file_annotation/functions/fn_file_annotation_finalize/requirements.txt` files. -- Alias and tag generation is abstracted out of the annotation function. Thus, you'll need to create a transformation that populates the `aliases` and `tags` property of your file and target entity view. - - The `aliases` property is used to match files with entities and should contain a list of alternative names or identifiers that can be found in the files image. - - The `tags` property serves multiple purposes and consists of the following... - - (`DetectInDiagrams`) Identifies files and assets to include as entities filtered by primary scope and secondary scope (if provided). - - (`ScopeWideDetect`) Identifies files and asset to include as entities filtered by a primary scope. - - (`ToAnnotate`) Identifies files that need to be annotated. - - (`AnnotationInProcess`) Identifies files that are in the process of being annotated. - - (`Annotated`) Identifies files that have been annotated. - - (`AnnotationFailed`) Identifies files that have failed the annotation process. Either by erroring out or by receiving 0 possible matches. 
- - Don't worry if these concepts don't immediately make sense. Aliases and tags are explained in greater detail in the detailed_guides/ documentation. The template also includes a jupyter notebook that prepare the files and assets for annotation if using the toolkit's quickstart module. - ---- - -### Deployment Steps with Quickstart Module - -The video and deployment steps are with regards to getting things setup on a quickstart module. If you have an existing project, the important step is to insert the correct information in the config.env.yaml file. - -_(if videos fail to load, try loading page in incognito or re-sign into github)_ - -1. **Create a CDF Project through Toolkit** - - Follow the guide [here](https://docs.cognite.com/cdf/deploy/cdf_toolkit/) - - (optional) Initialize the quickstart package using toolkit CLI - -```bash -poetry init -poetry add cognite-toolkit -poetry run cdf modules init -``` - - - - - -2. **Integrate the Module** - - Move the `local_setup/` folder to the root and unpack .vscode/ and .env.tmpl - - Update the default.config.yaml file with project-specific configurations - - Add the module name to the list of selected modules in your config.{env}.yaml file - - Make sure to create a .env file with credentials pointing to your CDF project - - - - - -3. **Build and Deploy the Module** - - - (optional) Build and deploy the quickstart template modules - - Build and deploy this module - -```bash -poetry run cdf build --env dev -poetry run cdf deploy --dry-run -poetry run cdf deploy -``` - -```yaml -# config..yaml used in examples below -environment: - name: dev - project: - validation-type: dev - selected: - - modules/ - -variables: - modules: - # stuff from quickstart package... - organization: - - # ... 
- - cdf_ingestion: - workflow: ingestion - groupSourceId: - ingestionClientId: ${IDP_CLIENT_ID} # Changed from ${INGESTION_CLIENT_ID} - ingestionClientSecret: ${IDP_CLIENT_SECRET} # Changed from ${INGESTION_CLIENT_SECRET} - pandidContextualizationFunction: contextualization_p_and_id_annotater - contextualization_connection_writer: contextualization_connection_writer - schemaSpace: sp_enterprise_process_industry - schemaSpace2: cdf_cdm - schemaSpace3: cdf_idm - instanceSpaces: - - springfield_instances - - cdf_cdm_units - runWorkflowUserIds: - - - - contextualization: - cdf_file_annotation: - # used in /data_sets, /data_models, /functions, /extraction_pipelines, and /workflows - annotationDatasetExternalId: ds_file_annotation - - # used in /data_models and /extraction_pipelines - annotationStateExternalId: FileAnnotationState - annotationStateInstanceSpace: sp_dat_cdf_annotation_states # NOTE: can set to fileInstanceSpace if scoping is required - refer to detailed_guides/config_patterns.md - annotationStateSchemaSpace: sp_hdm #NOTE: stands for space helper data model - annotationStateVersion: v1.0.0 - fileSchemaSpace: sp_enterprise_process_industry - fileExternalId: File - fileInstanceSpace: # Optional - used for scoping - refer to detailed_guides/config_patterns.md - fileVersion: v1 - - # used in /raw and /extraction_pipelines - rawDb: db_file_annotation - rawTableDocTag: annotation_documents_tags - rawTableDocDoc: annotation_documents_docs - rawTableCache: annotation_entities_cache - - # used in /extraction_pipelines - extractionPipelineExternalId: ep_file_annotation - targetEntitySchemaSpace: sp_enterprise_process_industry - targetEntityExternalId: Equipment - targetEntityInstanceSpace: # Optional - used for scoping - refer to detailed_guides/config_patterns.md - targetEntityVersion: v1 - - # used in /functions and /workflows - launchFunctionExternalId: fn_file_annotation_launch #NOTE: if this is changed, then the folder holding the launch function must be named 
the same as the new external ID - launchFunctionVersion: v1.0.0 - finalizeFunctionExternalId: fn_file_annotation_finalize #NOTE: if this is changed, then the folder holding the finalize function must be named the same as the new external ID - finalizeFunctionVersion: v1.0.0 - functionClientId: ${IDP_CLIENT_ID} - functionClientSecret: ${IDP_CLIENT_SECRET} - - # used in /workflows - workflowSchedule: "*/10 * * * *" - workflowExternalId: wf_file_annotation - workflowVersion: v1 - - # used in /auth - groupSourceId: # source ID from Azure AD for the corresponding groups - - - # ... -``` - - - - - -4. **Run the Workflow** - - After deployment, the annotation process is managed by a workflow that orchestrates the `Launch` and `Finalize` functions. The workflow is automatically triggered based on the schedule defined in the configuration. You can monitor the progress and logs of the functions in the CDF UI. - - - (optional) Run the ingestion workflow from the quickstart package to create instances of File, Asset, etc - - (optional) Checkout the instantiated files that have been annotated using the annotation function from the quickstart package - - (optional) Run the local_setup.ipynb to setup the files for annotation - - Run the File Annotation Workflow - - - - - - - - - ---- - -### Local Development and Debugging - -This template is configured for easy local execution and debugging directly within Visual Studio Code. - -1. **Create Environment File**: Before running locally, you must create a `.env` file in the root directory. This file will hold the necessary credentials and configuration for connecting to your CDF project. Populate it with the required environment variables for `IDP_CLIENT_ID`, `CDF_CLUSTER`, etc. In the `local_runs/` folder you'll find a .env template. - -2. **Use the VS Code Debugger**: The repository includes a pre-configured `local_runs/.vscode/launch.json` file. Please move the .vscode/ folder to the top level of your repo. 
- - - Navigate to the "Run and Debug" view in the VS Code sidebar. - - You will see dropdown options for launching the different functions (e.g., `Launch Function`, `Finalize Function`). - - Select the function you wish to run and click the green "Start Debugging" arrow. This will start the function on your local machine, with the debugger attached, allowing you to set breakpoints and inspect variables. - - Feel free to change/adjust the arguments passed into the function call to point to a test_extraction_pipeline and/or change the log level. - - From ff2cb7b234795dc96a53ad40bbc81845cea40be3 Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Tue, 14 Oct 2025 16:03:28 -0500 Subject: [PATCH 086/128] filter overly aggressive numeric patterns and special character handling --- .../services/CacheService.py | 148 ++++++++++++------ 1 file changed, 102 insertions(+), 46 deletions(-) diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/CacheService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/CacheService.py index 34b10073..4ce6c478 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/CacheService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/CacheService.py @@ -1,6 +1,6 @@ import abc import re -from typing import Iterator +from typing import Iterator, Any, Dict, List, Set, cast from collections import defaultdict from datetime import datetime, timezone, timedelta from cognite.client import CogniteClient @@ -249,17 +249,18 @@ def _convert_instances_to_entities( instance_properties = instance.properties.get( self.target_entities_view.as_view_id() ) - if target_entities_resource_type: - resource_type: str = instance_properties[target_entities_resource_type] - else: - resource_type: str = self.target_entities_view.external_id + asset_resource_type: str = ( + 
instance_properties[target_entities_resource_type] + if target_entities_resource_type + else self.target_entities_view.external_id + ) if target_entities_search_property in instance_properties: asset_entity = entity( external_id=instance.external_id, name=instance_properties.get("name"), space=instance.space, annotation_type=self.target_entities_view.annotation_type, - resource_type=resource_type, + resource_type=asset_resource_type, search_property=instance_properties.get( target_entities_search_property ), @@ -272,12 +273,12 @@ def _convert_instances_to_entities( name=instance_properties.get("name"), space=instance.space, annotation_type=self.target_entities_view.annotation_type, - resource_type=resource_type, + resource_type=asset_resource_type, search_property=search_value, ) target_entities.append(asset_entity.to_dict()) - file_resource_type: str | None = ( + file_resource_type_prop: str | None = ( self.config.launch_function.file_resource_property ) file_search_property: str = self.config.launch_function.file_search_property @@ -285,16 +286,17 @@ def _convert_instances_to_entities( for instance in file_instances: instance_properties = instance.properties.get(self.file_view.as_view_id()) - if target_entities_resource_type: - resource_type: str = instance_properties[file_resource_type] - else: - resource_type: str = self.file_view.external_id + file_entity_resource_type: str = ( + instance_properties[file_resource_type_prop] + if target_entities_resource_type + else self.file_view.external_id + ) file_entity = entity( external_id=instance.external_id, name=instance_properties.get("name"), space=instance.space, annotation_type=self.file_view.annotation_type, - resource_type=resource_type, + resource_type=file_entity_resource_type, search_property=instance_properties.get(file_search_property), ) file_entities.append(file_entity.to_dict()) @@ -306,7 +308,7 @@ def _generate_tag_samples_from_entities(self, entities: list[dict]) -> list[dict Generates regex-like 
pattern samples from entity search properties for pattern mode detection. Analyzes entity aliases to extract common patterns and variations, creating consolidated - pattern samples that can match multiple similar tags (e.g., "FT-[1|2|3]00[1|2]A"). + pattern samples that can match multiple similar tags (e.g., "[FT]-000[A|B]"). Args: entities: List of entity dictionaries containing search properties (aliases). @@ -318,7 +320,7 @@ def _generate_tag_samples_from_entities(self, entities: list[dict]) -> list[dict - annotation_type: Annotation type for the entity """ # Structure: { resource_type: {"patterns": { template_key: [...] }, "annotation_type": "..."} } - pattern_builders = defaultdict( + pattern_builders: Dict[str, Dict[str, Any]] = defaultdict( lambda: {"patterns": {}, "annotation_type": None} ) self.logger.info(f"Generating pattern samples from {len(entities)} entities.") @@ -326,29 +328,60 @@ def _generate_tag_samples_from_entities(self, entities: list[dict]) -> list[dict def _parse_alias( alias: str, resource_type_key: str ) -> tuple[str, list[list[str]]]: - alias_parts = re.split(r"([ -])", alias) + """ + Parse an alias into a normalized template string and collect variable letter groups. + + - Treat hyphens '-' and spaces ' ' as literal characters. + - Wrap all other non-alphanumeric characters in brackets to mark them as required literals (e.g., [+], [.]). + - Replace digits with '0' and letters with 'A' in alphanumeric segments. + - If an alphanumeric segment equals the resource type and is token-boundary isolated, wrap it in brackets to mark it constant. 
+ """ + # Tokenize alias into alphanumeric runs and single-character separators + tokens: list[str] = [] + current_alnum: list[str] = [] + for ch in alias: + if ch.isalnum(): + current_alnum.append(ch) + else: + if current_alnum: + tokens.append("".join(current_alnum)) + current_alnum = [] + tokens.append(ch) + if current_alnum: + tokens.append("".join(current_alnum)) + full_template_key_parts: list[str] = [] all_variable_parts: list[list[str]] = [] - for i, part in enumerate(alias_parts): + def is_separator(tok: str) -> bool: + return len(tok) == 1 and not tok.isalnum() + + for i, part in enumerate(tokens): if not part: continue - if part in [" ", "-"]: - full_template_key_parts.append(part) + if is_separator(part): + # Hyphen and space are plain literals; other specials must be wrapped in brackets + if part == "-" or part == " ": + full_template_key_parts.append(part) + else: + full_template_key_parts.append(f"[{part}]") continue - left_ok = (i == 0) or (alias_parts[i - 1] in [" ", "-"]) - right_ok = (i == len(alias_parts) - 1) or ( - alias_parts[i + 1] in [" ", "-"] - ) + + # Alphanumeric segment + left_ok = (i == 0) or is_separator(tokens[i - 1]) + right_ok = (i == len(tokens) - 1) or is_separator(tokens[i + 1]) if left_ok and right_ok and part == resource_type_key: full_template_key_parts.append(f"[{part}]") continue + segment_template = re.sub(r"\d", "0", part) segment_template = re.sub(r"[A-Za-z]", "A", segment_template) full_template_key_parts.append(segment_template) + variable_letters = re.findall(r"[A-Za-z]+", part) if variable_letters: all_variable_parts.append(variable_letters) + return "".join(full_template_key_parts), all_variable_parts for entity in entities: @@ -376,17 +409,17 @@ def _parse_alias( result = [] for resource_type, data in pattern_builders.items(): final_samples = [] - templates = data["patterns"] + templates: Dict[str, List[List[Set[str]]]] = data.get("patterns") or {} annotation_type = data["annotation_type"] for template_key, 
collected_vars in templates.items(): - var_iter: Iterator[list[set[str]]] = iter(collected_vars) + var_iter: Iterator[List[Set[str]]] = iter(collected_vars) def build_segment(segment_template: str) -> str: if "A" not in segment_template: return segment_template try: - letter_groups_for_segment = next(var_iter) - letter_group_iter: Iterator[set[str]] = iter( + letter_groups_for_segment: List[Set[str]] = next(var_iter) + letter_group_iter: Iterator[Set[str]] = iter( letter_groups_for_segment ) @@ -398,12 +431,28 @@ def replace_A(match): except StopIteration: return segment_template + # Split by bracketed constants or any single non-alphanumeric separator to preserve them as tokens + parts = [ + p + for p in re.split(r"(\[[^\]]+\]|[^A-Za-z0-9])", template_key) + if p != "" + ] final_pattern_parts = [ - build_segment(p) if p not in " -" else p - for p in re.split(r"([ -])", template_key) + build_segment(p) if re.search(r"A", p) else p for p in parts ] final_samples.append("".join(final_pattern_parts)) + # Sanity filter: drop overly generic numeric-only patterns (must contain a letter or a character class) + def _has_alpha_or_class(s: str) -> bool: + if re.search(r"[A-Za-z]", s): + return True + # Character class: bracketed alternatives like [A|B] or [1|2] + if re.search(r"\[[^\]]*\|[^\]]*\]", s): + return True + return False + + final_samples = [s for s in final_samples if _has_alpha_or_class(s)] + if final_samples: result.append( { @@ -475,40 +524,47 @@ def _merge_patterns( Returns: List of merged pattern dictionaries, deduplicated and organized by resource type. 
""" - merged = defaultdict(lambda: {"samples": set(), "annotation_type": None}) + merged: Dict[str, Dict[str, Any]] = defaultdict( + lambda: {"samples": set(), "annotation_type": None} + ) # Process auto-generated patterns for item in auto_patterns: resource_type = item.get("resource_type") if resource_type: - merged[resource_type]["samples"].update(item.get("sample", [])) + bucket = merged[resource_type] + samples_set = cast(Set[str], bucket["samples"]) + sample_list = item.get("sample") or [] + samples_set.update(sample_list) # Set annotation_type if not already set - if not merged[resource_type]["annotation_type"]: - merged[resource_type]["annotation_type"] = item.get( - "annotation_type" - ) + if not bucket.get("annotation_type"): + bucket["annotation_type"] = item.get("annotation_type") # Process manual patterns for item in manual_patterns: resource_type = item.get("resource_type") if resource_type and item.get("sample"): - merged[resource_type]["samples"].add(item["sample"]) + bucket = merged[resource_type] + samples_set = cast(Set[str], bucket["samples"]) + samples_set.add(cast(str, item["sample"])) # Set annotation_type if not already set (auto-patterns take precedence) - if not merged[resource_type]["annotation_type"]: + if not bucket.get("annotation_type"): # NOTE: UI that creates manual patterns will need to also have the annotation type as a required entry - merged[resource_type]["annotation_type"] = item.get( + bucket["annotation_type"] = item.get( "annotation_type", "diagrams.AssetLink" ) # Convert the merged dictionary back to the required list format - final_list = [ - { - "resource_type": resource_type, - "sample": sorted(list(data["samples"])), - "annotation_type": data["annotation_type"], - } - for resource_type, data in merged.items() - ] + final_list = [] + for resource_type, data in merged.items(): + samples_safe: Set[str] = cast(Set[str], data.get("samples") or set()) + final_list.append( + { + "resource_type": resource_type, + "sample": 
sorted(list(samples_safe)), + "annotation_type": data.get("annotation_type"), + } + ) self.logger.info( f"Merged auto and manual patterns into {len(final_list)} resource types." From 1f51ed51791a72f6553fc973adacccccc33a3a9a Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Tue, 14 Oct 2025 16:45:39 -0500 Subject: [PATCH 087/128] formatted line length to 120 characters --- .../services/ApplyService.py | 108 ++++---------- .../services/FinalizeService.py | 116 ++++----------- .../services/LoggerService.py | 24 +-- .../services/RetrieveService.py | 60 ++------ .../services/AnnotationService.py | 28 +--- .../services/CacheService.py | 120 ++++----------- .../services/DataModelService.py | 118 +++++---------- .../services/LaunchService.py | 138 ++++++------------ .../services/LoggerService.py | 24 +-- .../file_annotation_dashboard/canvas.py | 28 ++-- .../data_structures.py | 4 +- .../file_annotation_dashboard/helper.py | 94 ++++++------ 12 files changed, 257 insertions(+), 605 deletions(-) diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ApplyService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ApplyService.py index 12b30f7f..1a3dc183 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ApplyService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ApplyService.py @@ -57,23 +57,15 @@ class GeneralApplyService(IApplyService): EXTERNAL_ID_LIMIT = 256 FUNCTION_ID = "fn_file_annotation_finalize" - def __init__( - self, client: CogniteClient, config: Config, logger: CogniteFunctionLogger - ): + def __init__(self, client: CogniteClient, config: Config, logger: CogniteFunctionLogger): self.client: CogniteClient = client self.config: Config = config self.logger: CogniteFunctionLogger = logger - self.core_annotation_view_id: ViewId = ( - 
config.data_model_views.core_annotation_view.as_view_id() - ) + self.core_annotation_view_id: ViewId = config.data_model_views.core_annotation_view.as_view_id() self.file_view_id: ViewId = config.data_model_views.file_view.as_view_id() self.file_annotation_type = config.data_model_views.file_view.annotation_type - self.approve_threshold = ( - config.finalize_function.apply_service.auto_approval_threshold - ) - self.suggest_threshold = ( - config.finalize_function.apply_service.auto_suggest_threshold - ) + self.approve_threshold = config.finalize_function.apply_service.auto_approval_threshold + self.suggest_threshold = config.finalize_function.apply_service.auto_suggest_threshold self.sink_node_ref = DirectRelationReference( space=config.finalize_function.apply_service.sink_node.space, external_id=config.finalize_function.apply_service.sink_node.external_id, @@ -105,9 +97,7 @@ def process_and_apply_annotations_for_file( - Summary message of pattern annotations created """ file_id = file_node.as_id() - source_id = cast( - str, file_node.properties.get(self.file_view_id, {}).get("sourceId") - ) + source_id = cast(str, file_node.properties.get(self.file_view_id, {}).get("sourceId")) if clean_old: deleted_counts = self._delete_annotations_for_file(file_id) @@ -122,17 +112,13 @@ def process_and_apply_annotations_for_file( for annotation in regular_item["annotations"]: stable_hash = self._create_stable_hash(annotation) processed_hashes.add(stable_hash) - edges = self._detect_annotation_to_edge_applies( - file_id, source_id, doc_rows, tag_rows, annotation - ) + edges = self._detect_annotation_to_edge_applies(file_id, source_id, doc_rows, tag_rows, annotation) regular_edges.extend(edges.values()) # Step 2: Process pattern annotations, skipping any that were already processed pattern_edges, pattern_rows = [], [] if pattern_item and pattern_item.get("annotations"): - pattern_edges, pattern_rows = self._process_pattern_results( - pattern_item, file_node, processed_hashes - ) + 
pattern_edges, pattern_rows = self._process_pattern_results(pattern_item, file_node, processed_hashes) # Step 3: Update the file node tag node_apply = file_node.as_write() @@ -146,9 +132,7 @@ def process_and_apply_annotations_for_file( ) # Step 4: Apply all data model and RAW changes - self.update_instances( - list_node_apply=node_apply, list_edge_apply=regular_edges + pattern_edges - ) + self.update_instances(list_node_apply=node_apply, list_edge_apply=regular_edges + pattern_edges) db_name = self.config.finalize_function.apply_service.raw_db if doc_rows: self.client.raw.rows.insert( @@ -177,9 +161,7 @@ def process_and_apply_annotations_for_file( f"Created {len(pattern_rows)} new pattern detections.", ) - def update_instances( - self, list_node_apply=None, list_edge_apply=None - ) -> InstancesApplyResult: + def update_instances(self, list_node_apply=None, list_edge_apply=None) -> InstancesApplyResult: """ Applies node and/or edge updates to the data model. @@ -190,9 +172,7 @@ def update_instances( Returns: InstancesApplyResult containing the results of the apply operation. 
""" - return self.client.data_modeling.instances.apply( - nodes=list_node_apply, edges=list_edge_apply, replace=False - ) + return self.client.data_modeling.instances.apply(nodes=list_node_apply, edges=list_edge_apply, replace=False) def _delete_annotations_for_file(self, file_id: NodeId) -> dict[str, int]: """ @@ -298,9 +278,7 @@ def _process_pattern_results( - List of RowWrite objects for RAW table entries """ file_id = file_node.as_id() - source_id = cast( - str, file_node.properties.get(self.file_view_id, {}).get("sourceId") - ) + source_id = cast(str, file_node.properties.get(self.file_view_id, {}).get("sourceId")) doc_patterns, edge_applies = [], [] for detect_annotation in result_item.get("annotations", []): stable_hash = self._create_stable_hash(detect_annotation) @@ -324,22 +302,10 @@ def _process_pattern_results( "status": DiagramAnnotationStatus.SUGGESTED.value, "tags": [], "startNodePageNumber": detect_annotation.get("region", {}).get("page"), - "startNodeXMin": min( - v.get("x", 0) - for v in detect_annotation.get("region", {}).get("vertices", []) - ), - "startNodeYMin": min( - v.get("y", 0) - for v in detect_annotation.get("region", {}).get("vertices", []) - ), - "startNodeXMax": max( - v.get("x", 0) - for v in detect_annotation.get("region", {}).get("vertices", []) - ), - "startNodeYMax": max( - v.get("y", 0) - for v in detect_annotation.get("region", {}).get("vertices", []) - ), + "startNodeXMin": min(v.get("x", 0) for v in detect_annotation.get("region", {}).get("vertices", [])), + "startNodeYMin": min(v.get("y", 0) for v in detect_annotation.get("region", {}).get("vertices", [])), + "startNodeXMax": max(v.get("x", 0) for v in detect_annotation.get("region", {}).get("vertices", [])), + "startNodeYMax": max(v.get("y", 0) for v in detect_annotation.get("region", {}).get("vertices", [])), "startNodeText": detect_annotation.get("text"), "sourceCreatedUser": self.FUNCTION_ID, "sourceUpdatedUser": self.FUNCTION_ID, @@ -353,9 +319,7 @@ def 
_process_pattern_results( space=self.core_annotation_view_id.space, external_id=annotation_type, ), - start_node=DirectRelationReference( - space=file_id.space, external_id=file_id.external_id - ), + start_node=DirectRelationReference(space=file_id.space, external_id=file_id.external_id), end_node=self.sink_node_ref, sources=[ NodeOrEdgeData( @@ -414,31 +378,17 @@ def _detect_annotation_to_edge_applies( else: continue - external_id = self._create_annotation_id( - file_instance_id, entity, detect_annotation - ) + external_id = self._create_annotation_id(file_instance_id, entity, detect_annotation) now = datetime.now(timezone.utc).replace(microsecond=0) annotation_properties = { "name": file_instance_id.external_id, "confidence": detect_annotation.get("confidence"), "status": status, "startNodePageNumber": detect_annotation.get("region", {}).get("page"), - "startNodeXMin": min( - v.get("x", 0) - for v in detect_annotation.get("region", {}).get("vertices", []) - ), - "startNodeYMin": min( - v.get("y", 0) - for v in detect_annotation.get("region", {}).get("vertices", []) - ), - "startNodeXMax": max( - v.get("x", 0) - for v in detect_annotation.get("region", {}).get("vertices", []) - ), - "startNodeYMax": max( - v.get("y", 0) - for v in detect_annotation.get("region", {}).get("vertices", []) - ), + "startNodeXMin": min(v.get("x", 0) for v in detect_annotation.get("region", {}).get("vertices", [])), + "startNodeYMin": min(v.get("y", 0) for v in detect_annotation.get("region", {}).get("vertices", [])), + "startNodeXMax": max(v.get("x", 0) for v in detect_annotation.get("region", {}).get("vertices", [])), + "startNodeYMax": max(v.get("y", 0) for v in detect_annotation.get("region", {}).get("vertices", [])), "startNodeText": detect_annotation.get("text"), "sourceCreatedUser": self.FUNCTION_ID, "sourceUpdatedUser": self.FUNCTION_ID, @@ -456,9 +406,7 @@ def _detect_annotation_to_edge_applies( space=file_instance_id.space, external_id=file_instance_id.external_id, ), - 
end_node=DirectRelationReference( - space=entity.get("space"), external_id=entity.get("external_id") - ), + end_node=DirectRelationReference(space=entity.get("space"), external_id=entity.get("external_id")), sources=[ NodeOrEdgeData( source=self.core_annotation_view_id, @@ -511,13 +459,9 @@ def _create_stable_hash(self, raw_annotation: dict[str, Any]) -> str: "page": region.get("page"), "vertices": sorted_vertices, } - return sha256( - json.dumps(stable_representation, sort_keys=True).encode() - ).hexdigest()[:10] + return sha256(json.dumps(stable_representation, sort_keys=True).encode()).hexdigest()[:10] - def _create_annotation_id( - self, file_id: NodeId, entity: dict[str, Any], raw_annotation: dict[str, Any] - ) -> str: + def _create_annotation_id(self, file_id: NodeId, entity: dict[str, Any], raw_annotation: dict[str, Any]) -> str: """ Creates a unique external ID for a regular annotation edge. @@ -542,9 +486,7 @@ def _create_annotation_id( prefix = prefix[: self.EXTERNAL_ID_LIMIT - 11] return f"{prefix}:{hash_}" - def _create_pattern_annotation_id( - self, file_id: NodeId, raw_annotation: dict[str, Any] - ) -> str: + def _create_pattern_annotation_id(self, file_id: NodeId, raw_annotation: dict[str, Any]) -> str: """ Creates a unique external ID for a pattern annotation edge. 
diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/FinalizeService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/FinalizeService.py index 42304ebb..506318f3 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/FinalizeService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/FinalizeService.py @@ -75,15 +75,11 @@ def __init__( apply_service, ) - self.annotation_state_view: ViewPropertyConfig = ( - config.data_model_views.annotation_state_view - ) + self.annotation_state_view: ViewPropertyConfig = config.data_model_views.annotation_state_view self.file_view: ViewPropertyConfig = config.data_model_views.file_view self.page_range: int = config.launch_function.annotation_service.page_range self.max_retries: int = config.finalize_function.max_retry_attempts - self.clean_old_annotations: bool = ( - config.finalize_function.clean_old_annotations - ) + self.clean_old_annotations: bool = config.finalize_function.clean_old_annotations self.function_id: int | None = function_call_info.get("function_id") self.call_id: int | None = function_call_info.get("call_id") @@ -107,20 +103,13 @@ def run(self) -> Literal["Done"] | None: """ self.logger.info("Starting Finalize Function", section="START") try: - job_id, pattern_mode_job_id, file_to_state_map = ( - self.retrieve_service.get_job_id() - ) + job_id, pattern_mode_job_id, file_to_state_map = self.retrieve_service.get_job_id() if not job_id or not file_to_state_map: self.logger.info("No diagram detect jobs found", section="END") return "Done" - self.logger.info( - f"Retrieved job id ({job_id}) and claimed {len(file_to_state_map.values())} files" - ) + self.logger.info(f"Retrieved job id ({job_id}) and claimed {len(file_to_state_map.values())} files") except CogniteAPIError as e: - if ( - e.code == 400 - and e.message == 
"A version conflict caused the ingest to fail." - ): + if e.code == 400 and e.message == "A version conflict caused the ingest to fail.": self.logger.info( message=f"Retrieved job id that has already been claimed. Grabbing another job.", section="END", @@ -128,12 +117,9 @@ def run(self) -> Literal["Done"] | None: return elif ( e.code == 408 - and e.message - == "Graph query timed out. Reduce load or contention, or optimise your query." + and e.message == "Graph query timed out. Reduce load or contention, or optimise your query." ): - self.logger.error( - message=f"Ran into the following error:\n{str(e)}", section="END" - ) + self.logger.error(message=f"Ran into the following error:\n{str(e)}", section="END") return else: raise e @@ -143,11 +129,7 @@ def run(self) -> Literal["Done"] | None: try: job_results = self.retrieve_service.get_diagram_detect_job_result(job_id) if pattern_mode_job_id: - pattern_mode_job_results = ( - self.retrieve_service.get_diagram_detect_job_result( - pattern_mode_job_id - ) - ) + pattern_mode_job_results = self.retrieve_service.get_diagram_detect_job_result(pattern_mode_job_id) except Exception as e: self.logger.info( message=f"Unfinalizing {len(file_to_state_map.keys())} files - job id ({job_id}) is a bad gateway", @@ -186,9 +168,7 @@ def run(self) -> Literal["Done"] | None: ) merged_results = { - (item["fileInstanceId"]["space"], item["fileInstanceId"]["externalId"]): { - "regular": item - } + (item["fileInstanceId"]["space"], item["fileInstanceId"]["externalId"]): {"regular": item} for item in job_results["items"] } if pattern_mode_job_results: @@ -216,33 +196,27 @@ def run(self) -> Literal["Done"] | None: annotation_state_node = file_to_state_map[file_id] current_attempt = cast( int, - annotation_state_node.properties[ - self.annotation_state_view.as_view_id() - ]["attemptCount"], + annotation_state_node.properties[self.annotation_state_view.as_view_id()]["attemptCount"], ) next_attempt = current_attempt + 1 try: 
self.logger.info(f"Processing file {file_id}:") - annotation_msg, pattern_msg = ( - self.apply_service.process_and_apply_annotations_for_file( - file_node, - results.get("regular"), - results.get("pattern"), - self.clean_old_annotations - and annotation_state_node.properties[ - self.annotation_state_view.as_view_id() - ].get("annotatedPageCount") - is None, + annotation_msg, pattern_msg = self.apply_service.process_and_apply_annotations_for_file( + file_node, + results.get("regular"), + results.get("pattern"), + self.clean_old_annotations + and annotation_state_node.properties[self.annotation_state_view.as_view_id()].get( + "annotatedPageCount" ) + is None, ) self.logger.info(f"\t- {annotation_msg}\n\t- {pattern_msg}") # Logic to handle multi-page files page_count = results.get("regular", {}).get("pageCount", 1) - annotated_pages = self._check_all_pages_annotated( - annotation_state_node, page_count - ) + annotated_pages = self._check_all_pages_annotated(annotation_state_node, page_count) if annotated_pages == page_count: job_node_to_update = self._process_annotation_state( @@ -268,9 +242,7 @@ def run(self) -> Literal["Done"] | None: count_success += 1 # Still a success for this batch except Exception as e: - self.logger.error( - f"Failed to process annotations for file {file_id}: {e}" - ) + self.logger.error(f"Failed to process annotations for file {file_id}: {e}") if next_attempt >= self.max_retries: job_node_to_update = self._process_annotation_state( annotation_state_node, @@ -299,9 +271,7 @@ def run(self) -> Literal["Done"] | None: section="START", ) try: - self.apply_service.update_instances( - list_node_apply=annotation_state_node_applies - ) + self.apply_service.update_instances(list_node_apply=annotation_state_node_applies) self.logger.info( f"\t- {count_success} set to Annotated/New\n\t- {count_retry} set to Retry\n\t- {count_failed} set to Failed" ) @@ -311,9 +281,7 @@ def run(self) -> Literal["Done"] | None: section="END", ) - self.tracker.add_files( - 
success=count_success, failed=(count_failed + count_retry) - ) + self.tracker.add_files(success=count_success, failed=(count_failed + count_retry)) return None def _process_annotation_state( @@ -363,9 +331,7 @@ def _process_annotation_state( """ update_properties = { "annotationStatus": status, - "sourceUpdatedTime": datetime.now(timezone.utc) - .replace(microsecond=0) - .isoformat(), + "sourceUpdatedTime": datetime.now(timezone.utc).replace(microsecond=0).isoformat(), "annotationMessage": annotation_message, "patternModeMessage": pattern_mode_message, "attemptCount": attempt_count, @@ -418,9 +384,7 @@ def _check_all_pages_annotated(self, node: Node, page_count: int) -> int: """ annotated_page_count: int | None = cast( int, - node.properties[self.annotation_state_view.as_view_id()].get( - "annotatedPageCount" - ), + node.properties[self.annotation_state_view.as_view_id()].get("annotatedPageCount"), ) if not annotated_page_count: @@ -428,18 +392,14 @@ def _check_all_pages_annotated(self, node: Node, page_count: int) -> int: annotated_page_count = page_count else: annotated_page_count = self.page_range - self.logger.info( - f"Annotated pages 1-to-{annotated_page_count} out of {page_count} total pages" - ) + self.logger.info(f"Annotated pages 1-to-{annotated_page_count} out of {page_count} total pages") else: start_page = annotated_page_count + 1 if (annotated_page_count + self.page_range) >= page_count: annotated_page_count = page_count else: annotated_page_count += self.page_range - self.logger.info( - f"Annotated pages {start_page}-to-{annotated_page_count} out of {page_count} total pages" - ) + self.logger.info(f"Annotated pages {start_page}-to-{annotated_page_count} out of {page_count} total pages") return annotated_page_count @@ -466,15 +426,11 @@ def _update_batch_state( if len(batch.nodes) == 0: return - self.logger.info( - message=f"Updating {len(batch.nodes)} annotation state instances" - ) + self.logger.info(message=f"Updating {len(batch.nodes)} annotation 
state instances") if failed: update_properties = { "annotationStatus": status, - "sourceUpdatedTime": datetime.now(timezone.utc) - .replace(microsecond=0) - .isoformat(), + "sourceUpdatedTime": datetime.now(timezone.utc).replace(microsecond=0).isoformat(), "diagramDetectJobId": None, "patternModeJobId": None, } @@ -484,9 +440,7 @@ def _update_batch_state( ) else: if status == AnnotationStatus.PROCESSING: - claimed_time = batch.nodes[0].properties[ - self.annotation_state_view.as_view_id() - ]["sourceUpdatedTime"] + claimed_time = batch.nodes[0].properties[self.annotation_state_view.as_view_id()]["sourceUpdatedTime"] update_properties = { "annotationStatus": status, "sourceUpdatedTime": claimed_time, @@ -494,18 +448,14 @@ def _update_batch_state( else: update_properties = { "annotationStatus": status, - "sourceUpdatedTime": datetime.now(timezone.utc) - .replace(microsecond=0) - .isoformat(), + "sourceUpdatedTime": datetime.now(timezone.utc).replace(microsecond=0).isoformat(), } batch.update_node_properties( new_properties=update_properties, view_id=self.annotation_state_view.as_view_id(), ) try: - update_results = self.apply_service.update_instances( - list_node_apply=batch.apply - ) + update_results = self.apply_service.update_instances(list_node_apply=batch.apply) self.logger.info(f"- set annotation status to {status}") except Exception as e: self.logger.error( @@ -513,7 +463,5 @@ def _update_batch_state( section="END", ) time.sleep(30) - update_results = self.apply_service.update_instances( - list_node_apply=batch.apply - ) + update_results = self.apply_service.update_instances(list_node_apply=batch.apply) self.logger.info(f"- set annotation status to {status}") diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/LoggerService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/LoggerService.py index c9191137..773b7797 100644 --- 
a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/LoggerService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/LoggerService.py @@ -21,9 +21,7 @@ def __init__( os.makedirs(dir_name, exist_ok=True) self.file_handler = open(self.filepath, "a", encoding="utf-8") except Exception as e: - print( - f"[LOGGER_SETUP_ERROR] Could not open log file {self.filepath}: {e}" - ) + print(f"[LOGGER_SETUP_ERROR] Could not open log file {self.filepath}: {e}") self.write = False def _format_message_lines(self, prefix: str, message: str) -> list[str]: @@ -72,9 +70,7 @@ def _print(self, prefix: str, message: str) -> None: for line in lines_to_log: print(line) - def debug( - self, message: str, section: Literal["START", "END", "BOTH"] | None = None - ) -> None: + def debug(self, message: str, section: Literal["START", "END", "BOTH"] | None = None) -> None: """ Logs a debug-level message. @@ -92,9 +88,7 @@ def debug( if section == "END" or section == "BOTH": self._section() - def info( - self, message: str, section: Literal["START", "END", "BOTH"] | None = None - ) -> None: + def info(self, message: str, section: Literal["START", "END", "BOTH"] | None = None) -> None: """ Logs an info-level message. @@ -112,9 +106,7 @@ def info( if section == "END" or section == "BOTH": self._section() - def warning( - self, message: str, section: Literal["START", "END", "BOTH"] | None = None - ) -> None: + def warning(self, message: str, section: Literal["START", "END", "BOTH"] | None = None) -> None: """ Logs a warning-level message. @@ -132,9 +124,7 @@ def warning( if section == "END" or section == "BOTH": self._section() - def error( - self, message: str, section: Literal["START", "END", "BOTH"] | None = None - ) -> None: + def error(self, message: str, section: Literal["START", "END", "BOTH"] | None = None) -> None: """ Logs an error-level message. 
@@ -162,9 +152,7 @@ def _section(self) -> None: self.file_handler.write( "--------------------------------------------------------------------------------\n" ) - print( - "--------------------------------------------------------------------------------" - ) + print("--------------------------------------------------------------------------------") def close(self) -> None: """ diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/RetrieveService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/RetrieveService.py index 2fd7521e..272c0770 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/RetrieveService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/RetrieveService.py @@ -45,24 +45,16 @@ class GeneralRetrieveService(IRetrieveService): Interface for retrieving diagram detect jobs """ - def __init__( - self, client: CogniteClient, config: Config, logger: CogniteFunctionLogger - ): + def __init__(self, client: CogniteClient, config: Config, logger: CogniteFunctionLogger): self.client = client self.config = config self.logger: CogniteFunctionLogger = logger - self.annotation_state_view: ViewPropertyConfig = ( - self.config.data_model_views.annotation_state_view - ) + self.annotation_state_view: ViewPropertyConfig = self.config.data_model_views.annotation_state_view self.file_view: ViewPropertyConfig = self.config.data_model_views.file_view - self.filter_jobs: Filter = build_filter_from_query( - config.finalize_function.retrieve_service.get_job_id_query - ) - self.job_api: str = ( - f"/api/v1/projects/{self.client.config.project}/context/diagram/detect" - ) + self.filter_jobs: Filter = build_filter_from_query(config.finalize_function.retrieve_service.get_job_id_query) + self.job_api: str = f"/api/v1/projects/{self.client.config.project}/context/diagram/detect" def 
get_diagram_detect_job_result(self, job_id: int) -> dict | None: """ @@ -88,9 +80,7 @@ def get_diagram_detect_job_result(self, job_id: int) -> dict | None: else: self.logger.debug(f"{job_id} - Job not complete") else: - self.logger.debug( - f"{job_id} - Request to get job result failed with {response.status_code} code" - ) + self.logger.debug(f"{job_id} - Request to get job result failed with {response.status_code} code") return def get_job_id( @@ -130,9 +120,7 @@ def get_job_id( sort_by_time = [] sort_by_time.append( instances.InstanceSort( - property=self.annotation_state_view.as_property_ref( - "sourceUpdatedTime" - ), + property=self.annotation_state_view.as_property_ref("sourceUpdatedTime"), direction="ascending", ) ) @@ -152,13 +140,11 @@ def get_job_id( job_node: Node = annotation_state_instance.pop(-1) job_id: int = cast( int, - job_node.properties[self.annotation_state_view.as_view_id()][ - "diagramDetectJobId" - ], + job_node.properties[self.annotation_state_view.as_view_id()]["diagramDetectJobId"], + ) + pattern_mode_job_id: int | None = job_node.properties[self.annotation_state_view.as_view_id()].get( + "patternModeJobId" ) - pattern_mode_job_id: int | None = job_node.properties[ - self.annotation_state_view.as_view_id() - ].get("patternModeJobId") filter_job_id = Equals( property=self.annotation_state_view.as_property_ref("diagramDetectJobId"), @@ -180,12 +166,8 @@ def get_job_id( # NOTE: could bundle this with the attempt to claim loop. Chose not to since the run time gains is negligible and improves readability. 
file_to_state_map: dict[NodeId, Node] = {} for node in list_job_nodes: - file_reference = node.properties.get( - self.annotation_state_view.as_view_id() - ).get("linkedFile") - file_node_id = NodeId( - space=file_reference["space"], external_id=file_reference["externalId"] - ) + file_reference = node.properties.get(self.annotation_state_view.as_view_id()).get("linkedFile") + file_node_id = NodeId(space=file_reference["space"], external_id=file_reference["externalId"]) file_to_state_map[file_node_id] = node return job_id, pattern_mode_job_id, file_to_state_map @@ -231,22 +213,12 @@ def _attempt_to_claim(self, list_job_nodes_to_claim: NodeApplyList) -> None: must manually raise an error to prevent the duplicate claim. """ for node_apply in list_job_nodes_to_claim: - if ( - node_apply.sources[0].properties["annotationStatus"] - == AnnotationStatus.PROCESSING - ): + if node_apply.sources[0].properties["annotationStatus"] == AnnotationStatus.PROCESSING: node_apply.sources[0].properties["annotationStatus"] = AnnotationStatus.FINALIZING # type: ignore - elif ( - node_apply.sources[0].properties["annotationStatus"] - == AnnotationStatus.FINALIZING - ): + elif node_apply.sources[0].properties["annotationStatus"] == AnnotationStatus.FINALIZING: self.logger.debug("Lock bypassed. 
Caught on the client-side.") - raise CogniteAPIError( - message="A version conflict caused the ingest to fail.", code=400 - ) + raise CogniteAPIError(message="A version conflict caused the ingest to fail.", code=400) - update_results = self.client.data_modeling.instances.apply( - nodes=list_job_nodes_to_claim - ) + update_results = self.client.data_modeling.instances.apply(nodes=list_job_nodes_to_claim) return diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/AnnotationService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/AnnotationService.py index 8ae8fe7d..c134afd6 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/AnnotationService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/AnnotationService.py @@ -19,15 +19,11 @@ class IAnnotationService(abc.ABC): """ @abc.abstractmethod - def run_diagram_detect( - self, files: list[FileReference], entities: list[dict[str, Any]] - ) -> int: + def run_diagram_detect(self, files: list[FileReference], entities: list[dict[str, Any]]) -> int: pass @abc.abstractmethod - def run_pattern_mode_detect( - self, files: list[FileReference], pattern_samples: list[dict[str, Any]] - ) -> int: + def run_pattern_mode_detect(self, files: list[FileReference], pattern_samples: list[dict[str, Any]]) -> int: pass @@ -37,9 +33,7 @@ class GeneralAnnotationService(IAnnotationService): Build a queue of files that are in the annotation process and return the jobId """ - def __init__( - self, config: Config, client: CogniteClient, logger: CogniteFunctionLogger - ): + def __init__(self, config: Config, client: CogniteClient, logger: CogniteFunctionLogger): self.client: CogniteClient = client self.config: Config = config self.logger: CogniteFunctionLogger = logger @@ -47,9 +41,7 @@ def __init__( self.annotation_config = 
config.launch_function.annotation_service self.diagram_detect_config: DiagramDetectConfig | None = None if config.launch_function.annotation_service.diagram_detect_config: - self.diagram_detect_config = ( - config.launch_function.annotation_service.diagram_detect_config.as_config() - ) + self.diagram_detect_config = config.launch_function.annotation_service.diagram_detect_config.as_config() # NOTE: Remove Leading Zeros has a weird interaction with pattern mode so will always turn off if config.launch_function.pattern_mode: # NOTE: Shallow copy that still references Mutable objects in self.diagram_detect_config. @@ -57,9 +49,7 @@ def __init__( self.pattern_detect_config = copy.copy(self.diagram_detect_config) self.pattern_detect_config.remove_leading_zeros = False - def run_diagram_detect( - self, files: list[FileReference], entities: list[dict[str, Any]] - ) -> int: + def run_diagram_detect(self, files: list[FileReference], entities: list[dict[str, Any]]) -> int: """ Initiates a diagram detection job using CDF's diagram detect API. @@ -86,9 +76,7 @@ def run_diagram_detect( else: raise Exception(f"API call to diagram/detect did not return a job ID") - def run_pattern_mode_detect( - self, files: list[FileReference], pattern_samples: list[dict[str, Any]] - ) -> int: + def run_pattern_mode_detect(self, files: list[FileReference], pattern_samples: list[dict[str, Any]]) -> int: """ Initiates a diagram detection job in pattern mode using generated pattern samples. 
@@ -117,6 +105,4 @@ def run_pattern_mode_detect( if detect_job.job_id: return detect_job.job_id else: - raise Exception( - "API call to diagram/detect in pattern mode did not return a job ID" - ) + raise Exception("API call to diagram/detect in pattern mode did not return a job ID") diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/CacheService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/CacheService.py index 4ce6c478..f122c132 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/CacheService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/CacheService.py @@ -52,26 +52,18 @@ class GeneralCacheService(ICacheService): that share the same operational context. """ - def __init__( - self, config: Config, client: CogniteClient, logger: CogniteFunctionLogger - ): + def __init__(self, config: Config, client: CogniteClient, logger: CogniteFunctionLogger): self.client = client self.config = config self.logger = logger self.db_name: str = config.launch_function.cache_service.raw_db self.tbl_name: str = config.launch_function.cache_service.raw_table_cache - self.manual_patterns_tbl_name: str = ( - config.launch_function.cache_service.raw_manual_patterns_catalog - ) - self.cache_time_limit: int = ( - config.launch_function.cache_service.cache_time_limit - ) # in hours + self.manual_patterns_tbl_name: str = config.launch_function.cache_service.raw_manual_patterns_catalog + self.cache_time_limit: int = config.launch_function.cache_service.cache_time_limit # in hours self.file_view: ViewPropertyConfig = config.data_model_views.file_view - self.target_entities_view: ViewPropertyConfig = ( - config.data_model_views.target_entities_view - ) + self.target_entities_view: ViewPropertyConfig = config.data_model_views.target_entities_view def get_entities( self, @@ -103,31 +95,19 @@ 
def get_entities( key = f"{primary_scope_value}" try: - row: Row | None = self.client.raw.rows.retrieve( - db_name=self.db_name, table_name=self.tbl_name, key=key - ) + row: Row | None = self.client.raw.rows.retrieve(db_name=self.db_name, table_name=self.tbl_name, key=key) except: row = None # Attempt to retrieve from the cache - if ( - row - and row.columns - and self._validate_cache(row.columns["LastUpdateTimeUtcIso"]) - ): - self.logger.debug( - f"Cache valid for key: {key}. Retrieving entities and patterns." - ) + if row and row.columns and self._validate_cache(row.columns["LastUpdateTimeUtcIso"]): + self.logger.debug(f"Cache valid for key: {key}. Retrieving entities and patterns.") asset_entities: list[dict] = row.columns.get("AssetEntities", []) file_entities: list[dict] = row.columns.get("FileEntities", []) - combined_pattern_samples: list[dict] = row.columns.get( - "CombinedPatternSamples", [] - ) + combined_pattern_samples: list[dict] = row.columns.get("CombinedPatternSamples", []) return (asset_entities + file_entities), combined_pattern_samples - self.logger.info( - f"Refreshing RAW entities cache and patterns cache for key: {key}" - ) + self.logger.info(f"Refreshing RAW entities cache and patterns cache for key: {key}") # Fetch data asset_instances, file_instances = data_model_service.get_instances_entities( @@ -135,9 +115,7 @@ def get_entities( ) # Convert to entities for diagram detect job - asset_entities, file_entities = self._convert_instances_to_entities( - asset_instances, file_instances - ) + asset_entities, file_entities = self._convert_instances_to_entities(asset_instances, file_instances) entities = asset_entities + file_entities # Generate pattern samples from the same entities @@ -146,14 +124,10 @@ def get_entities( auto_pattern_samples = asset_pattern_samples + file_pattern_samples # Grab the manual pattern samples - manual_pattern_samples = self._get_manual_patterns( - primary_scope_value, secondary_scope_value - ) + 
manual_pattern_samples = self._get_manual_patterns(primary_scope_value, secondary_scope_value) # Merge the auto and manual patterns - combined_pattern_samples = self._merge_patterns( - auto_pattern_samples, manual_pattern_samples - ) + combined_pattern_samples = self._merge_patterns(auto_pattern_samples, manual_pattern_samples) # Update cache new_row = RowWrite( @@ -237,18 +211,12 @@ def _convert_instances_to_entities( - List of target entity dictionaries (typically assets). - List of file entity dictionaries. """ - target_entities_resource_type: str | None = ( - self.config.launch_function.target_entities_resource_property - ) - target_entities_search_property: str = ( - self.config.launch_function.target_entities_search_property - ) + target_entities_resource_type: str | None = self.config.launch_function.target_entities_resource_property + target_entities_search_property: str = self.config.launch_function.target_entities_search_property target_entities: list[dict] = [] for instance in asset_instances: - instance_properties = instance.properties.get( - self.target_entities_view.as_view_id() - ) + instance_properties = instance.properties.get(self.target_entities_view.as_view_id()) asset_resource_type: str = ( instance_properties[target_entities_resource_type] if target_entities_resource_type @@ -261,9 +229,7 @@ def _convert_instances_to_entities( space=instance.space, annotation_type=self.target_entities_view.annotation_type, resource_type=asset_resource_type, - search_property=instance_properties.get( - target_entities_search_property - ), + search_property=instance_properties.get(target_entities_search_property), ) target_entities.append(asset_entity.to_dict()) else: @@ -278,9 +244,7 @@ def _convert_instances_to_entities( ) target_entities.append(asset_entity.to_dict()) - file_resource_type_prop: str | None = ( - self.config.launch_function.file_resource_property - ) + file_resource_type_prop: str | None = self.config.launch_function.file_resource_property 
file_search_property: str = self.config.launch_function.file_search_property file_entities: list[dict] = [] @@ -320,14 +284,10 @@ def _generate_tag_samples_from_entities(self, entities: list[dict]) -> list[dict - annotation_type: Annotation type for the entity """ # Structure: { resource_type: {"patterns": { template_key: [...] }, "annotation_type": "..."} } - pattern_builders: Dict[str, Dict[str, Any]] = defaultdict( - lambda: {"patterns": {}, "annotation_type": None} - ) + pattern_builders: Dict[str, Dict[str, Any]] = defaultdict(lambda: {"patterns": {}, "annotation_type": None}) self.logger.info(f"Generating pattern samples from {len(entities)} entities.") - def _parse_alias( - alias: str, resource_type_key: str - ) -> tuple[str, list[list[str]]]: + def _parse_alias(alias: str, resource_type_key: str) -> tuple[str, list[list[str]]]: """ Parse an alias into a normalized template string and collect variable letter groups. @@ -419,9 +379,7 @@ def build_segment(segment_template: str) -> str: return segment_template try: letter_groups_for_segment: List[Set[str]] = next(var_iter) - letter_group_iter: Iterator[Set[str]] = iter( - letter_groups_for_segment - ) + letter_group_iter: Iterator[Set[str]] = iter(letter_groups_for_segment) def replace_A(match): alternatives = sorted(list(next(letter_group_iter))) @@ -432,14 +390,8 @@ def replace_A(match): return segment_template # Split by bracketed constants or any single non-alphanumeric separator to preserve them as tokens - parts = [ - p - for p in re.split(r"(\[[^\]]+\]|[^A-Za-z0-9])", template_key) - if p != "" - ] - final_pattern_parts = [ - build_segment(p) if re.search(r"A", p) else p for p in parts - ] + parts = [p for p in re.split(r"(\[[^\]]+\]|[^A-Za-z0-9])", template_key) if p != ""] + final_pattern_parts = [build_segment(p) if re.search(r"A", p) else p for p in parts] final_samples.append("".join(final_pattern_parts)) # Sanity filter: drop overly generic numeric-only patterns (must contain a letter or a 
character class) @@ -463,9 +415,7 @@ def _has_alpha_or_class(s: str) -> bool: ) return result - def _get_manual_patterns( - self, primary_scope: str, secondary_scope: str | None - ) -> list[dict]: + def _get_manual_patterns(self, primary_scope: str, secondary_scope: str | None) -> list[dict]: """ Retrieves manually defined pattern samples from the RAW catalog. @@ -498,19 +448,13 @@ def _get_manual_patterns( patterns = (row.columns or {}).get("patterns", []) all_manual_patterns.extend(patterns) except CogniteNotFoundError: - self.logger.info( - f"No manual patterns found for key: {key}. This may be expected." - ) + self.logger.info(f"No manual patterns found for key: {key}. This may be expected.") except Exception as e: - self.logger.error( - f"Failed to retrieve manual patterns for key {key}: {e}" - ) + self.logger.error(f"Failed to retrieve manual patterns for key {key}: {e}") return all_manual_patterns - def _merge_patterns( - self, auto_patterns: list[dict], manual_patterns: list[dict] - ) -> list[dict]: + def _merge_patterns(self, auto_patterns: list[dict], manual_patterns: list[dict]) -> list[dict]: """ Combines automatically generated and manually defined patterns by resource type. @@ -524,9 +468,7 @@ def _merge_patterns( Returns: List of merged pattern dictionaries, deduplicated and organized by resource type. 
""" - merged: Dict[str, Dict[str, Any]] = defaultdict( - lambda: {"samples": set(), "annotation_type": None} - ) + merged: Dict[str, Dict[str, Any]] = defaultdict(lambda: {"samples": set(), "annotation_type": None}) # Process auto-generated patterns for item in auto_patterns: @@ -550,9 +492,7 @@ def _merge_patterns( # Set annotation_type if not already set (auto-patterns take precedence) if not bucket.get("annotation_type"): # NOTE: UI that creates manual patterns will need to also have the annotation type as a required entry - bucket["annotation_type"] = item.get( - "annotation_type", "diagrams.AssetLink" - ) + bucket["annotation_type"] = item.get("annotation_type", "diagrams.AssetLink") # Convert the merged dictionary back to the required list format final_list = [] @@ -566,7 +506,5 @@ def _merge_patterns( } ) - self.logger.info( - f"Merged auto and manual patterns into {len(final_list)} resource types." - ) + self.logger.info(f"Merged auto and manual patterns into {len(final_list)} resource types.") return final_list diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/DataModelService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/DataModelService.py index babe1666..ee374874 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/DataModelService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/DataModelService.py @@ -73,20 +73,14 @@ class GeneralDataModelService(IDataModelService): Implementation used for real runs """ - def __init__( - self, config: Config, client: CogniteClient, logger: CogniteFunctionLogger - ): + def __init__(self, config: Config, client: CogniteClient, logger: CogniteFunctionLogger): self.client: CogniteClient = client self.config: Config = config self.logger: CogniteFunctionLogger = logger - self.annotation_state_view: ViewPropertyConfig = ( 
- config.data_model_views.annotation_state_view - ) + self.annotation_state_view: ViewPropertyConfig = config.data_model_views.annotation_state_view self.file_view: ViewPropertyConfig = config.data_model_views.file_view - self.target_entities_view: ViewPropertyConfig = ( - config.data_model_views.target_entities_view - ) + self.target_entities_view: ViewPropertyConfig = config.data_model_views.target_entities_view self.get_files_to_annotate_retrieve_limit: int | None = get_limit_from_query( config.prepare_function.get_files_to_annotate_query @@ -193,13 +187,8 @@ def get_files_to_process( list_file_node_ids: list[NodeId] = [] for node in annotation_state_instances: - file_reference = node.properties.get( - self.annotation_state_view.as_view_id() - ).get("linkedFile") - if ( - self.file_view.instance_space is None - or self.file_view.instance_space == file_reference["space"] - ): + file_reference = node.properties.get(self.annotation_state_view.as_view_id()).get("linkedFile") + if self.file_view.instance_space is None or self.file_view.instance_space == file_reference["space"]: file_node_id = NodeId( space=file_reference["space"], external_id=file_reference["externalId"], @@ -234,19 +223,11 @@ def _get_annotation_state_filter(self) -> Filter: - Edge case that occurs very rarely but can happen. NOTE: Implementation of a more complex query that can't be handled in config should come from an implementation of the interface. """ - annotation_status_property = self.annotation_state_view.as_property_ref( - "annotationStatus" - ) - annotation_last_updated_property = self.annotation_state_view.as_property_ref( - "sourceUpdatedTime" - ) + annotation_status_property = self.annotation_state_view.as_property_ref("annotationStatus") + annotation_last_updated_property = self.annotation_state_view.as_property_ref("sourceUpdatedTime") # NOTE: While this number is hard coded, I believe it doesn't need to be configured. Number comes from my experience with the pipeline. 
Feel free to change if your experience leads to a different number - latest_permissible_time_utc = datetime.now(timezone.utc) - timedelta( - minutes=720 - ) - latest_permissible_time_utc = latest_permissible_time_utc.isoformat( - timespec="milliseconds" - ) + latest_permissible_time_utc = datetime.now(timezone.utc) - timedelta(minutes=720) + latest_permissible_time_utc = latest_permissible_time_utc.isoformat(timespec="milliseconds") filter_stuck = In( annotation_status_property, [AnnotationStatus.PROCESSING, AnnotationStatus.FINALIZING], @@ -255,9 +236,7 @@ def _get_annotation_state_filter(self) -> Filter: filter = self.filter_files_to_process | filter_stuck # | == OR return filter - def update_annotation_state( - self, list_node_apply: list[NodeApply] - ) -> NodeApplyResultList: + def update_annotation_state(self, list_node_apply: list[NodeApply]) -> NodeApplyResultList: """ Updates existing annotation state nodes with new property values. @@ -267,17 +246,13 @@ def update_annotation_state( Returns: NodeApplyResultList containing the results of the update operation. """ - update_results: InstancesApplyResult = ( - self.client.data_modeling.instances.apply( - nodes=list_node_apply, - replace=False, # ensures we don't delete other properties in the view - ) + update_results: InstancesApplyResult = self.client.data_modeling.instances.apply( + nodes=list_node_apply, + replace=False, # ensures we don't delete other properties in the view ) return update_results.nodes - def create_annotation_state( - self, list_node_apply: list[NodeApply] - ) -> NodeApplyResultList: + def create_annotation_state(self, list_node_apply: list[NodeApply]) -> NodeApplyResultList: """ Creates new annotation state nodes, replacing any existing nodes with the same IDs. @@ -287,12 +262,10 @@ def create_annotation_state( Returns: NodeApplyResultList containing the results of the creation operation. 
""" - update_results: InstancesApplyResult = ( - self.client.data_modeling.instances.apply( - nodes=list_node_apply, - auto_create_direct_relations=True, - replace=True, # ensures we reset the properties of the node - ) + update_results: InstancesApplyResult = self.client.data_modeling.instances.apply( + nodes=list_node_apply, + auto_create_direct_relations=True, + replace=True, # ensures we reset the properties of the node ) return update_results.nodes @@ -317,12 +290,8 @@ def get_instances_entities( NOTE: 1. grab assets that meet the filter requirement NOTE: 2. grab files that meet the filter requirement """ - target_filter: Filter = self._get_target_entities_filter( - primary_scope_value, secondary_scope_value - ) - file_filter: Filter = self._get_file_entities_filter( - primary_scope_value, secondary_scope_value - ) + target_filter: Filter = self._get_target_entities_filter(primary_scope_value, secondary_scope_value) + file_filter: Filter = self._get_file_entities_filter(primary_scope_value, secondary_scope_value) target_entities: NodeList = self.client.data_modeling.instances.list( instance_type="node", @@ -340,9 +309,7 @@ def get_instances_entities( ) return target_entities, file_entities - def _get_target_entities_filter( - self, primary_scope_value: str, secondary_scope_value: str | None - ) -> Filter: + def _get_target_entities_filter(self, primary_scope_value: str, secondary_scope_value: str | None) -> Filter: """ Builds a filter for target entities (assets) based on scope and configuration. 
@@ -361,9 +328,7 @@ def _get_target_entities_filter( - grabs assets in the primary_scope_value with ScopeWideDetect in the tags property (hard coded) -> provides an option to include entities outside of the secondary_scope_value """ filter_primary_scope: Filter = Equals( - property=self.target_entities_view.as_property_ref( - self.config.launch_function.primary_scope_property - ), + property=self.target_entities_view.as_property_ref(self.config.launch_function.primary_scope_property), value=primary_scope_value, ) filter_entities: Filter = self.filter_target_entities @@ -381,18 +346,14 @@ def _get_target_entities_filter( ), value=secondary_scope_value, ) - target_filter = ( - filter_primary_scope & filter_secondary_scope & filter_entities - ) | (filter_primary_scope & filter_scope_wide) - else: - target_filter = (filter_primary_scope & filter_entities) | ( + target_filter = (filter_primary_scope & filter_secondary_scope & filter_entities) | ( filter_primary_scope & filter_scope_wide ) + else: + target_filter = (filter_primary_scope & filter_entities) | (filter_primary_scope & filter_scope_wide) return target_filter - def _get_file_entities_filter( - self, primary_scope_value: str, secondary_scope_value: str | None - ) -> Filter: + def _get_file_entities_filter(self, primary_scope_value: str, secondary_scope_value: str | None) -> Filter: """ Builds a filter for file entities based on scope and configuration. 
@@ -412,16 +373,12 @@ def _get_file_entities_filter( - grabs assets in the primary_scope_value with ScopeWideDetect in the tags property (hard coded) -> provides an option to include entities outside of the secondary_scope_value """ filter_primary_scope: Filter = Equals( - property=self.file_view.as_property_ref( - self.config.launch_function.primary_scope_property - ), + property=self.file_view.as_property_ref(self.config.launch_function.primary_scope_property), value=primary_scope_value, ) filter_entities: Filter = self.filter_file_entities filter_search_property_exists: Filter = Exists( - property=self.file_view.as_property_ref( - self.config.launch_function.file_search_property - ), + property=self.file_view.as_property_ref(self.config.launch_function.file_search_property), ) # NOTE: ScopeWideDetect is an optional string that allows annotating across scopes filter_scope_wide: Filter = In( @@ -429,25 +386,18 @@ def _get_file_entities_filter( values=["ScopeWideDetect"], ) if not primary_scope_value: - file_filter = (filter_entities & filter_search_property_exists) | ( - filter_scope_wide - ) + file_filter = (filter_entities & filter_search_property_exists) | (filter_scope_wide) elif secondary_scope_value: filter_secondary_scope: Filter = Equals( - property=self.file_view.as_property_ref( - self.config.launch_function.secondary_scope_property - ), + property=self.file_view.as_property_ref(self.config.launch_function.secondary_scope_property), value=secondary_scope_value, ) file_filter = ( - filter_primary_scope - & filter_entities - & filter_secondary_scope - & filter_search_property_exists + filter_primary_scope & filter_entities & filter_secondary_scope & filter_search_property_exists ) | (filter_primary_scope & filter_scope_wide) else: - file_filter = ( - filter_primary_scope & filter_entities & filter_search_property_exists - ) | (filter_primary_scope & filter_scope_wide) + file_filter = (filter_primary_scope & filter_entities & filter_search_property_exists) 
| ( + filter_primary_scope & filter_scope_wide + ) return file_filter diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/LaunchService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/LaunchService.py index 68fbc05f..8323791a 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/LaunchService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/LaunchService.py @@ -88,9 +88,7 @@ def __init__( self.max_batch_size: int = config.launch_function.batch_size self.page_range: int = config.launch_function.annotation_service.page_range - self.annotation_state_view: ViewPropertyConfig = ( - config.data_model_views.annotation_state_view - ) + self.annotation_state_view: ViewPropertyConfig = config.data_model_views.annotation_state_view self.file_view: ViewPropertyConfig = config.data_model_views.file_view self.in_memory_cache: list[dict] = [] @@ -98,12 +96,8 @@ def __init__( self._cached_primary_scope: str | None = None self._cached_secondary_scope: str | None = None - self.primary_scope_property: str = ( - self.config.launch_function.primary_scope_property - ) - self.secondary_scope_property: str | None = ( - self.config.launch_function.secondary_scope_property - ) + self.primary_scope_property: str = self.config.launch_function.primary_scope_property + self.secondary_scope_property: str | None = self.config.launch_function.secondary_scope_property self.function_id: int | None = function_call_info.get("function_id") self.call_id: int | None = function_call_info.get("call_id") @@ -136,9 +130,7 @@ def prepare(self) -> Literal["Done"] | None: ) try: if self.reset_files: - file_nodes_to_reset: NodeList | None = ( - self.data_model_service.get_files_for_annotation_reset() - ) + file_nodes_to_reset: NodeList | None = self.data_model_service.get_files_for_annotation_reset() if not 
file_nodes_to_reset: self.logger.info( "No files found with the getFilesForAnnotationReset query provided in the config file" @@ -148,9 +140,7 @@ def prepare(self) -> Literal["Done"] | None: reset_node_apply: list[NodeApply] = [] for file_node in file_nodes_to_reset: file_node_apply: NodeApply = file_node.as_write() - tags_property: list[str] = cast( - list[str], file_node_apply.sources[0].properties["tags"] - ) + tags_property: list[str] = cast(list[str], file_node_apply.sources[0].properties["tags"]) if "AnnotationInProcess" in tags_property: tags_property.remove("AnnotationInProcess") if "Annotated" in tags_property: @@ -159,9 +149,7 @@ def prepare(self) -> Literal["Done"] | None: tags_property.remove("AnnotationFailed") reset_node_apply.append(file_node_apply) - update_results = self.data_model_service.update_annotation_state( - reset_node_apply - ) + update_results = self.data_model_service.update_annotation_state(reset_node_apply) self.logger.info( f"Removed the AnnotationInProcess/Annotated/AnnotationFailed tag of {len(update_results)} files" ) @@ -170,8 +158,7 @@ def prepare(self) -> Literal["Done"] | None: # NOTE: Reliant on the CogniteAPI message to stay the same across new releases. If unexpected changes were to occur please refer to this section of the code and check if error message is now different. if ( e.code == 408 - and e.message - == "Graph query timed out. Reduce load or contention, or optimise your query." + and e.message == "Graph query timed out. Reduce load or contention, or optimise your query." ): # NOTE: 408 indicates a timeout error. Keep retrying the query if a timeout occurs. 
self.logger.error(message=f"Ran into the following error:\n{str(e)}") @@ -180,9 +167,7 @@ def prepare(self) -> Literal["Done"] | None: raise e try: - file_nodes: NodeList | None = ( - self.data_model_service.get_files_to_annotate() - ) + file_nodes: NodeList | None = self.data_model_service.get_files_to_annotate() if not file_nodes: self.logger.info( message=f"No files found to prepare", @@ -194,8 +179,7 @@ def prepare(self) -> Literal["Done"] | None: # NOTE: Reliant on the CogniteAPI message to stay the same across new releases. If unexpected changes were to occur please refer to this section of the code and check if error message is now different. if ( e.code == 408 - and e.message - == "Graph query timed out. Reduce load or contention, or optimise your query." + and e.message == "Graph query timed out. Reduce load or contention, or optimise your query." ): # NOTE: 408 indicates a timeout error. Keep retrying the query if a timeout occurs. self.logger.error(message=f"Ran into the following error:\n{str(e)}") @@ -212,7 +196,9 @@ def prepare(self) -> Literal["Done"] | None: linkedFile=node_id, ) if not self.annotation_state_view.instance_space: - msg = "Need an instance space in DataModelViews/AnnotationStateView config to store the annotation state" + msg = ( + "Need an instance space in DataModelViews/AnnotationStateView config to store the annotation state" + ) self.logger.error(msg) raise ValueError(msg) annotation_instance_space: str = self.annotation_state_view.instance_space @@ -224,31 +210,21 @@ def prepare(self) -> Literal["Done"] | None: annotation_state_instances.append(annotation_node_apply) file_node_apply: NodeApply = file_node.as_write() - tags_property: list[str] = cast( - list[str], file_node_apply.sources[0].properties["tags"] - ) + tags_property: list[str] = cast(list[str], file_node_apply.sources[0].properties["tags"]) if "AnnotationInProcess" not in tags_property: tags_property.append("AnnotationInProcess") 
file_apply_instances.append(file_node_apply) try: - create_results = self.data_model_service.create_annotation_state( - annotation_state_instances - ) - self.logger.info( - message=f"Created {len(create_results)} annotation state instances" - ) - update_results = self.data_model_service.update_annotation_state( - file_apply_instances - ) + create_results = self.data_model_service.create_annotation_state(annotation_state_instances) + self.logger.info(message=f"Created {len(create_results)} annotation state instances") + update_results = self.data_model_service.update_annotation_state(file_apply_instances) self.logger.info( message=f"Added 'AnnotationInProcess' to the tag property for {len(update_results)} files", section="END", ) except Exception as e: - self.logger.error( - message=f"Ran into the following error:\n{str(e)}", section="END" - ) + self.logger.error(message=f"Ran into the following error:\n{str(e)}", section="END") raise self.tracker.add_files(success=len(file_nodes)) @@ -275,21 +251,16 @@ def run(self) -> Literal["Done"] | None: section="START", ) try: - file_nodes, file_to_state_map = ( - self.data_model_service.get_files_to_process() - ) + file_nodes, file_to_state_map = self.data_model_service.get_files_to_process() if not file_nodes or not file_to_state_map: self.logger.info(message=f"No files found to launch") return "Done" - self.logger.info( - message=f"Launching {len(file_nodes)} files", section="END" - ) + self.logger.info(message=f"Launching {len(file_nodes)} files", section="END") except CogniteAPIError as e: # NOTE: Reliant on the CogniteAPI message to stay the same across new releases. If unexpected changes were to occur please refer to this section of the code and check if error message is now different. if ( e.code == 408 - and e.message - == "Graph query timed out. Reduce load or contention, or optimise your query." + and e.message == "Graph query timed out. Reduce load or contention, or optimise your query." 
): # NOTE: 408 indicates a timeout error. Keep retrying the query if a timeout occurs. self.logger.error(message=f"Ran into the following error:\n{str(e)}") @@ -297,9 +268,7 @@ def run(self) -> Literal["Done"] | None: else: raise e - processing_batches: list[FileProcessingBatch] = ( - self._organize_files_for_processing(file_nodes) - ) + processing_batches: list[FileProcessingBatch] = self._organize_files_for_processing(file_nodes) total_files_processed = 0 try: @@ -309,9 +278,7 @@ def run(self) -> Literal["Done"] | None: msg = f"{self.primary_scope_property}: {primary_scope_value}" if secondary_scope_value: msg += f", {self.secondary_scope_property}: {secondary_scope_value}" - self.logger.info( - message=f"Processing {len(batch.files)} files in {msg}" - ) + self.logger.info(message=f"Processing {len(batch.files)} files in {msg}") self._ensure_cache_for_batch(primary_scope_value, secondary_scope_value) current_batch = BatchOfPairedNodes(file_to_state_map=file_to_state_map) @@ -324,18 +291,12 @@ def run(self) -> Literal["Done"] | None: current_batch.add_pair(file_node, file_reference) total_files_processed += 1 if current_batch.size() == self.max_batch_size: - self.logger.info( - message=f"Processing batch - Max batch size ({self.max_batch_size}) reached" - ) + self.logger.info(message=f"Processing batch - Max batch size ({self.max_batch_size}) reached") self._process_batch(current_batch) if not current_batch.is_empty(): - self.logger.info( - message=f"Processing remaining {current_batch.size()} files in batch" - ) + self.logger.info(message=f"Processing remaining {current_batch.size()} files in batch") self._process_batch(current_batch) - self.logger.info( - message=f"Finished processing for {msg}", section="END" - ) + self.logger.info(message=f"Finished processing for {msg}", section="END") except CogniteAPIError as e: if e.code == 429: self.logger.debug(f"{str(e)}") @@ -351,9 +312,7 @@ def run(self) -> Literal["Done"] | None: return - def 
_organize_files_for_processing( - self, list_files: NodeList - ) -> list[FileProcessingBatch]: + def _organize_files_for_processing(self, list_files: NodeList) -> list[FileProcessingBatch]: """ Organizes files into batches grouped by scope for efficient processing. @@ -367,9 +326,7 @@ def _organize_files_for_processing( Returns: List of FileProcessingBatch objects, each containing files from the same scope. """ - organized_data: dict[str, dict[str, list[Node]]] = defaultdict( - lambda: defaultdict(list) - ) + organized_data: dict[str, dict[str, list[Node]]] = defaultdict(lambda: defaultdict(list)) for file_node in list_files: node_props = file_node.properties[self.file_view.as_view_id()] @@ -401,9 +358,7 @@ def _organize_files_for_processing( ) return final_processing_batches - def _ensure_cache_for_batch( - self, primary_scope_value: str, secondary_scope_value: str | None - ): + def _ensure_cache_for_batch(self, primary_scope_value: str, secondary_scope_value: str | None): """ Ensures the in-memory entity cache is loaded and current for the given scope. @@ -427,12 +382,10 @@ def _ensure_cache_for_batch( ): self.logger.info(f"Refreshing in memory cache") try: - self.in_memory_cache, self.in_memory_patterns = ( - self.cache_service.get_entities( - self.data_model_service, - primary_scope_value, - secondary_scope_value, - ) + self.in_memory_cache, self.in_memory_patterns = self.cache_service.get_entities( + self.data_model_service, + primary_scope_value, + secondary_scope_value, ) self._cached_primary_scope = primary_scope_value self._cached_secondary_scope = secondary_scope_value @@ -440,13 +393,10 @@ def _ensure_cache_for_batch( # NOTE: Reliant on the CogniteAPI message to stay the same across new releases. If unexpected changes were to occur please refer to this section of the code and check if error message is now different. if ( e.code == 408 - and e.message - == "Graph query timed out. Reduce load or contention, or optimise your query." 
+ and e.message == "Graph query timed out. Reduce load or contention, or optimise your query." ): # NOTE: 408 indicates a timeout error. Keep retrying the query if a timeout occurs. - self.logger.error( - message=f"Ran into the following error:\n{str(e)}" - ) + self.logger.error(message=f"Ran into the following error:\n{str(e)}") return else: raise e @@ -480,9 +430,7 @@ def _process_batch(self, batch: BatchOfPairedNodes): ) update_properties = { "annotationStatus": AnnotationStatus.PROCESSING, - "sourceUpdatedTime": datetime.now(timezone.utc) - .replace(microsecond=0) - .isoformat(), + "sourceUpdatedTime": datetime.now(timezone.utc).replace(microsecond=0).isoformat(), "diagramDetectJobId": job_id, "launchFunctionId": self.function_id, "launchFunctionCallId": self.call_id, @@ -493,9 +441,9 @@ def _process_batch(self, batch: BatchOfPairedNodes): if self.config.launch_function.pattern_mode: total_patterns = 0 if self.in_memory_patterns and len(self.in_memory_patterns) >= 2: - total_patterns = len( - self.in_memory_patterns[0].get("sample", []) - ) + len(self.in_memory_patterns[1].get("sample", [])) + total_patterns = len(self.in_memory_patterns[0].get("sample", [])) + len( + self.in_memory_patterns[1].get("sample", []) + ) elif self.in_memory_patterns and len(self.in_memory_patterns) >= 1: total_patterns = len(self.in_memory_patterns[0].get("sample", [])) self.logger.info( @@ -556,9 +504,7 @@ def _process_batch(self, batch: BatchOfPairedNodes): ) update_properties = { "annotationStatus": AnnotationStatus.PROCESSING, - "sourceUpdatedTime": datetime.now(timezone.utc) - .replace(microsecond=0) - .isoformat(), + "sourceUpdatedTime": datetime.now(timezone.utc).replace(microsecond=0).isoformat(), "diagramDetectJobId": job_id, "launchFunctionId": self.function_id, "launchFunctionCallId": self.call_id, @@ -569,9 +515,9 @@ def _process_batch(self, batch: BatchOfPairedNodes): if self.config.launch_function.pattern_mode: total_patterns = 0 if self.in_memory_patterns and 
len(self.in_memory_patterns) >= 2: - total_patterns = len( - self.in_memory_patterns[0].get("sample", []) - ) + len(self.in_memory_patterns[1].get("sample", [])) + total_patterns = len(self.in_memory_patterns[0].get("sample", [])) + len( + self.in_memory_patterns[1].get("sample", []) + ) elif self.in_memory_patterns and len(self.in_memory_patterns) >= 1: total_patterns = len(self.in_memory_patterns[0].get("sample", [])) self.logger.info( diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/LoggerService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/LoggerService.py index c9191137..773b7797 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/LoggerService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/LoggerService.py @@ -21,9 +21,7 @@ def __init__( os.makedirs(dir_name, exist_ok=True) self.file_handler = open(self.filepath, "a", encoding="utf-8") except Exception as e: - print( - f"[LOGGER_SETUP_ERROR] Could not open log file {self.filepath}: {e}" - ) + print(f"[LOGGER_SETUP_ERROR] Could not open log file {self.filepath}: {e}") self.write = False def _format_message_lines(self, prefix: str, message: str) -> list[str]: @@ -72,9 +70,7 @@ def _print(self, prefix: str, message: str) -> None: for line in lines_to_log: print(line) - def debug( - self, message: str, section: Literal["START", "END", "BOTH"] | None = None - ) -> None: + def debug(self, message: str, section: Literal["START", "END", "BOTH"] | None = None) -> None: """ Logs a debug-level message. @@ -92,9 +88,7 @@ def debug( if section == "END" or section == "BOTH": self._section() - def info( - self, message: str, section: Literal["START", "END", "BOTH"] | None = None - ) -> None: + def info(self, message: str, section: Literal["START", "END", "BOTH"] | None = None) -> None: """ Logs an info-level message. 
@@ -112,9 +106,7 @@ def info( if section == "END" or section == "BOTH": self._section() - def warning( - self, message: str, section: Literal["START", "END", "BOTH"] | None = None - ) -> None: + def warning(self, message: str, section: Literal["START", "END", "BOTH"] | None = None) -> None: """ Logs a warning-level message. @@ -132,9 +124,7 @@ def warning( if section == "END" or section == "BOTH": self._section() - def error( - self, message: str, section: Literal["START", "END", "BOTH"] | None = None - ) -> None: + def error(self, message: str, section: Literal["START", "END", "BOTH"] | None = None) -> None: """ Logs an error-level message. @@ -162,9 +152,7 @@ def _section(self) -> None: self.file_handler.write( "--------------------------------------------------------------------------------\n" ) - print( - "--------------------------------------------------------------------------------" - ) + print("--------------------------------------------------------------------------------") def close(self) -> None: """ diff --git a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/canvas.py b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/canvas.py index 6ffed692..af13b0cf 100644 --- a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/canvas.py +++ b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/canvas.py @@ -1,5 +1,15 @@ from cognite.client import CogniteClient -from cognite.client.data_classes.data_modeling import NodeOrEdgeData, NodeApply, EdgeApply, ContainerId, ViewId, NodeId, EdgeId, Node, Edge +from cognite.client.data_classes.data_modeling import ( + NodeOrEdgeData, + NodeApply, + EdgeApply, + ContainerId, + ViewId, + NodeId, + EdgeId, + Node, + Edge, +) from cognite.client.data_classes.filters import Equals, And, Not import datetime import uuid @@ -77,7 +87,7 @@ def fetch_existing_canvas(name: str, file_node: Node, client: 
CogniteClient): existing_canvas = client.data_modeling.instances.retrieve( nodes=NodeId(space=CANVAS_SPACE_INSTANCE, external_id=f"file_annotation_canvas_{file_node.external_id}") ) - + return existing_canvas.nodes[0] if existing_canvas.nodes else None @@ -206,8 +216,10 @@ def dm_generate( def reset_canvas_annotations(canvas_id: str, client: CogniteClient): """Deletes all canvas annotations, which includes nodes and edges""" edge_filter = And( - Equals(property=['edge', 'type'], value={'space': CANVAS_SPACE_CANVAS, 'externalId': 'referencesCanvasAnnotation'}), - Equals(property=['edge', 'startNode'], value={'space': CANVAS_SPACE_INSTANCE, 'externalId': canvas_id}) + Equals( + property=["edge", "type"], value={"space": CANVAS_SPACE_CANVAS, "externalId": "referencesCanvasAnnotation"} + ), + Equals(property=["edge", "startNode"], value={"space": CANVAS_SPACE_INSTANCE, "externalId": canvas_id}), ) edges_to_delete = client.data_modeling.instances.list( @@ -220,11 +232,7 @@ def reset_canvas_annotations(canvas_id: str, client: CogniteClient): nodes_to_delete_ids = [NodeId(space=e.end_node.space, external_id=e.end_node.external_id) for e in edges_to_delete] if edges_to_delete_ids: - client.data_modeling.instances.delete( - edges=edges_to_delete_ids - ) + client.data_modeling.instances.delete(edges=edges_to_delete_ids) if nodes_to_delete_ids: - client.data_modeling.instances.delete( - nodes=nodes_to_delete_ids - ) \ No newline at end of file + client.data_modeling.instances.delete(nodes=nodes_to_delete_ids) diff --git a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/data_structures.py b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/data_structures.py index 7b7cb897..f5fe0f4b 100644 --- a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/data_structures.py +++ b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/data_structures.py @@ -12,9 
+12,7 @@ class ViewPropertyConfig: instance_space: str | None = None def as_view_id(self) -> ViewId: - return ViewId( - space=self.schema_space, external_id=self.external_id, version=self.version - ) + return ViewId(space=self.schema_space, external_id=self.external_id, version=self.version) def as_property_ref(self, property) -> list[str]: return [self.schema_space, f"{self.external_id}/{self.version}", property] diff --git a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/helper.py b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/helper.py index b38a3b7a..794f31dd 100644 --- a/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/helper.py +++ b/modules/contextualization/cdf_file_annotation/streamlit/file_annotation_dashboard/helper.py @@ -375,7 +375,16 @@ def fetch_manual_patterns(db_name: str, table_name: str) -> pd.DataFrame: if "NotFoundError" not in str(type(e)): st.error(f"Failed to fetch manual patterns: {e}") return pd.DataFrame( - columns=["key", "scope_level", "annotation_type", "primary_scope", "secondary_scope", "sample", "resource_type", "created_by"] + columns=[ + "key", + "scope_level", + "annotation_type", + "primary_scope", + "secondary_scope", + "sample", + "resource_type", + "created_by", + ] ) @@ -397,7 +406,12 @@ def create_key(row): df["key"] = df.apply(create_key, axis=1) df.dropna(subset=["key"], inplace=True) rows_to_write = [ - RowWrite(key=key, columns={"patterns": group[["sample", "resource_type", "annotation_type", "created_by"]].to_dict("records")}) + RowWrite( + key=key, + columns={ + "patterns": group[["sample", "resource_type", "annotation_type", "created_by"]].to_dict("records") + }, + ) for key, group in df.groupby("key") ] @@ -524,10 +538,7 @@ def fetch_entities(entity_view: ViewPropertyConfig, resource_property: str) -> p Fetches entity instances from the specified data model view and returns a tidy DataFrame. 
""" instances = client.data_modeling.instances.list( - instance_type="node", - space=entity_view.instance_space, - sources=entity_view.as_view_id(), - limit=-1 + instance_type="node", space=entity_view.instance_space, sources=entity_view.as_view_id(), limit=-1 ) if not instances: @@ -542,7 +553,7 @@ def fetch_entities(entity_view: ViewPropertyConfig, resource_property: str) -> p row["name"] = props.get("name") row["resourceType"] = props.get(resource_property) row["sysUnit"] = props.get("sysUnit") - + for k, v in props.items(): if k not in row: row[k] = v @@ -562,7 +573,7 @@ def show_connect_unmatched_ui( tab, db_name, pattern_table, - apply_config + apply_config, ): """ Displays the UI to connect a single unmatched tag to either an Asset or a File. @@ -611,7 +622,7 @@ def show_connect_unmatched_ui( df_entities_display.loc[:, "Select"] = False df_entities_display.at[idx, "Select"] = True - filterable_columns = [col for col in ["sysUnit", "resourceType"] if col in df_entities_display.columns] + filterable_columns = [col for col in ["sysUnit", "resourceType"] if col in df_entities_display.columns] for filterable_column in filterable_columns: unique_values = sorted(df_entities_display[filterable_column].dropna().unique().tolist()) @@ -620,12 +631,12 @@ def show_connect_unmatched_ui( f"Filter by {filterable_column}", key=f"sb_filterable_column_{filterable_column}_{tab}", options=[None] + unique_values, - index=0 + index=0, ) if selected_value: df_entities_display = df_entities_display[df_entities_display[filterable_column] == selected_value] - + all_columns = df_entities_display.columns.tolist() default_columns = ["Select", "name", "resourceType", "sysUnit", "externalId"] @@ -634,7 +645,7 @@ def show_connect_unmatched_ui( f"Select columns to display ({entity_type}s)", options=all_columns, default=[col for col in default_columns if col in all_columns], - key=f"ms_selected_columns_{tab}_{entity_type}" + key=f"ms_selected_columns_{tab}_{entity_type}", ) entity_editor_key 
= f"{entity_type}_editor_{tag_text}_{tab}" @@ -646,7 +657,7 @@ def show_connect_unmatched_ui( "name": "Name", "externalId": "External ID", "resourceType": "Resource Type", - "sysUnit": "Sys Unit" + "sysUnit": "Sys Unit", }, use_container_width=True, hide_index=True, @@ -669,7 +680,7 @@ def show_connect_unmatched_ui( selected_entity = df_entities.loc[st.session_state.selected_entity_to_connect_index] if st.button( f"Connect '{tag_text}' to '{selected_entity['name']}' in {str(len(associated_files)) + ' files' if len(associated_files) > 1 else str(len(associated_files)) + ' file'}", - key=f"btn_connect_tag_to_entities_{tab}" + key=f"btn_connect_tag_to_entities_{tab}", ): success, count, error = create_tag_connection( client, @@ -687,14 +698,11 @@ def show_connect_unmatched_ui( st.toast( f"{count} annotation{'s' if count > 1 else ''} created from tag '{tag_text}' to {entity_type} '{selected_entity['name']}' " f"in {len(associated_files)} file{'s' if len(associated_files) > 1 else ''}!", - icon=":material/check_small:" + icon=":material/check_small:", ) st.cache_data.clear() else: - st.toast( - body=f"Failed to connect tag '{tag_text}': {error}", - icon=":material/error:" - ) + st.toast(body=f"Failed to connect tag '{tag_text}': {error}", icon=":material/error:") def create_tag_connection( @@ -712,11 +720,7 @@ def create_tag_connection( updated_edges = [] try: - rows = client.raw.rows.list( - db_name=db_name, - table_name=table_name, - limit=-1 - ) + rows = client.raw.rows.list(db_name=db_name, table_name=table_name, limit=-1) sink_node_space = apply_config["sinkNode"]["space"] @@ -733,32 +737,26 @@ def create_tag_connection( external_id=edge_external_id, type=DirectRelationReference(space=row_data.get("viewSpace"), external_id=annotation_type), start_node=DirectRelationReference(space=row_data.get("startNodeSpace"), external_id=file_id), - end_node=DirectRelationReference(space=selected_entity.get("space"), external_id=selected_entity.get("externalId")) + 
end_node=DirectRelationReference( + space=selected_entity.get("space"), external_id=selected_entity.get("externalId") + ), ) ) - + row_data["endNode"] = selected_entity["externalId"] row_data["endNodeSpace"] = selected_entity["space"] - resource_type = selected_entity["resourceType"] if selected_entity["resourceType"] else entity_view.external_id + resource_type = ( + selected_entity["resourceType"] if selected_entity["resourceType"] else entity_view.external_id + ) row_data["endNodeResourceType"] = resource_type row_data["status"] = "Approved" - updated_rows.append( - RowWrite( - key=edge_external_id, - columns=row_data - ) - ) + updated_rows.append(RowWrite(key=edge_external_id, columns=row_data)) if updated_rows: - client.raw.rows.insert( - db_name=db_name, - table_name=table_name, - row=updated_rows, - ensure_parent=True - ) + client.raw.rows.insert(db_name=db_name, table_name=table_name, row=updated_rows, ensure_parent=True) if updated_edges: client.data_modeling.instances.apply(edges=updated_edges, replace=False) @@ -768,15 +766,8 @@ def create_tag_connection( return False, 0, str(e) -def build_unmatched_tags_with_regions( - df: pd.DataFrame, - file_id: str, - potential_new_annotations: list[str] -): - df_filtered = df[ - (df["startNode"] == file_id) & - (df["startNodeText"].isin(potential_new_annotations)) - ] +def build_unmatched_tags_with_regions(df: pd.DataFrame, file_id: str, potential_new_annotations: list[str]): + df_filtered = df[(df["startNode"] == file_id) & (df["startNodeText"].isin(potential_new_annotations))] unmatched_tags_with_regions = [] @@ -790,9 +781,6 @@ def build_unmatched_tags_with_regions( ] } - unmatched_tags_with_regions.append({ - "text": row["startNodeText"], - "regions": [region] - }) + unmatched_tags_with_regions.append({"text": row["startNodeText"], "regions": [region]}) - return unmatched_tags_with_regions \ No newline at end of file + return unmatched_tags_with_regions From 0fea0e0a3ec56ef21fa42400dd09fdf1ab31cf65 Mon Sep 17 
00:00:00 2001 From: Jack Zhao Date: Tue, 14 Oct 2025 16:53:18 -0500 Subject: [PATCH 088/128] misc changes --- .../contextualization/cdf_file_annotation/CONTRIBUTING.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/modules/contextualization/cdf_file_annotation/CONTRIBUTING.md b/modules/contextualization/cdf_file_annotation/CONTRIBUTING.md index 946a72c3..582be646 100644 --- a/modules/contextualization/cdf_file_annotation/CONTRIBUTING.md +++ b/modules/contextualization/cdf_file_annotation/CONTRIBUTING.md @@ -61,11 +61,11 @@ Once the issue has been discussed and you're ready to contribute: - Reference the related issue in the PR description (e.g., "Closes #123" or "Fixes #456") - Provide a clear description of what changed and why - Include any relevant testing details or screenshots - - Add `@jack-cognite` as a reviewer (or the current maintainer) + - Add `@dude-with-a-mug` as a reviewer (or the current maintainer) ### 3. Code Review and Approval -- **All PRs require approval** from the project maintainer (@jack-cognite or designated reviewer) before merging +- **All PRs require approval** from the project maintainer (@dude-with-a-mug or designated reviewer) before merging - The maintainer will review your code for: - Code quality and adherence to project standards @@ -83,7 +83,6 @@ Once the issue has been discussed and you're ready to contribute: ### Python Code Style -- Follow [PEP 8](https://pep8.org/) style guidelines - Use type hints for all function parameters and return values - Maximum line length: 120 characters (as configured in the project) - Use meaningful variable and function names From 677049e78041e9ceaf2f55501396c62a11376432 Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Tue, 14 Oct 2025 17:48:10 -0500 Subject: [PATCH 089/128] updated mermaid diagrams for launch and finalize phase --- .../cdf_file_annotation/README.md | 94 ++++++++++++++++++- 1 file changed, 90 insertions(+), 4 deletions(-) diff --git 
a/modules/contextualization/cdf_file_annotation/README.md b/modules/contextualization/cdf_file_annotation/README.md index 626519e6..4feb4cde 100644 --- a/modules/contextualization/cdf_file_annotation/README.md +++ b/modules/contextualization/cdf_file_annotation/README.md @@ -41,8 +41,6 @@ The template operates in three main phases, orchestrated by CDF Workflows. Since ### Launch Phase -![LaunchService](https://github.com/user-attachments/assets/3e5ba403-50bb-4f6a-a723-be8947c65ebc) - - **Goal**: Launch the annotation jobs for files that are ready. - **Process**: 1. It queries for `AnnotationState` instances with a "New" or "Retry" status. @@ -58,11 +56,48 @@ The template operates in three main phases, orchestrated by CDF Workflows. Since - A `standard annotation` job to find and link known entities with confidence scoring. - A `pattern mode` job (if enabled) to detect all text matching the pattern samples, creating a searchable reference catalog. 5. It updates the `AnnotationState` instance with both the `diagramDetectJobId` and `patternModeJobId` (if applicable) and sets the overall `annotationStatus` to "Processing". +
+Click to view Mermaid flowchart for Launch Phase + +```mermaid +flowchart TD + Start([Start Launch Phase]) --> QueryFiles[Query AnnotationState
for New or Retry status] + QueryFiles --> CheckFiles{Any files
to process?} + CheckFiles -->|No| End([End]) + CheckFiles -->|Yes| GroupFiles[Group files by
primary scope
e.g., site, unit] + + GroupFiles --> NextScope{Next scope
group?} + NextScope -->|Yes| CheckCache{Valid cache
exists in RAW?} + + CheckCache -->|No - Stale/Missing| QueryEntities[Query data model for
entities within scope] + QueryEntities --> GenPatterns[Auto-generate pattern samples
from entity aliases
e.g., FT-101A → #91;FT#93;-000#91;A#93;] + GenPatterns --> GetManual[Retrieve manual pattern
overrides from RAW catalog
GLOBAL, site, or unit level] + GetManual --> MergePatterns[Merge and deduplicate
auto-generated and
manual patterns] + MergePatterns --> StoreCache[Store entity list and
pattern samples in
RAW cache] + StoreCache --> UseCache[Use entities and patterns] + + CheckCache -->|Yes - Valid| LoadCache[Load entities and
patterns from RAW cache] + LoadCache --> UseCache + + UseCache --> ProcessBatch[Process files in batches
up to max batch size] + ProcessBatch --> SubmitJobs[Submit Diagram Detect jobs:
1 Standard annotation
2 Pattern mode if enabled] + SubmitJobs --> UpdateState[Update AnnotationState:
- Set status to Processing
- Store both job IDs] + UpdateState --> NextScope + NextScope -->|No more groups| QueryFiles + + style Start fill:#d4f1d4 + style End fill:#f1d4d4 + style CheckFiles fill:#fff4e6 + style CheckCache fill:#fff4e6 + style NextScope fill:#fff4e6 + style UseCache fill:#e6f3ff + style UpdateState fill:#e6f3ff +``` + +
### Finalize Phase -![FinalizeService](https://github.com/user-attachments/assets/152d9eaf-afdb-46fe-9125-11430ff10bc9) - - **Goal**: Retrieve, process, and store the results of completed annotation jobs. - **Process**: 1. It queries for `AnnotationState` instances with a "Processing" or "Finalizing" status (using optimistic locking to claim jobs). @@ -76,6 +111,57 @@ The template operates in three main phases, orchestrated by CDF Workflows. Since - **Pattern annotations**: Creates edges linking files to a configurable "sink node" for review, writes results to a dedicated `doc_pattern` RAW table for the searchable catalog. 5. Updates the file node tag from "AnnotationInProcess" to "Annotated". 6. Updates the `AnnotationState` status to "Annotated", "Failed", or back to "New" (if more pages remain), tracking page progress for large files. +
+Click to view Mermaid flowchart for Finalize Phase + +```mermaid +flowchart TD + Start([Start Finalize Phase]) --> QueryState[Query for ONE AnnotationState
with Processing status
Use optimistic locking to claim it] + QueryState --> CheckState{Found annotation
state instance?} + CheckState -->|No| End([End]) + CheckState -->|Yes| GetJobId[Extract job ID and
pattern mode job ID] + + GetJobId --> FindFiles[Find ALL files with
the same job ID] + FindFiles --> CheckJobs{Both standard
and pattern jobs
complete?} + CheckJobs -->|No| ResetStatus[Update AnnotationStates
back to Processing
Wait 30 seconds] + ResetStatus --> QueryState + + CheckJobs -->|Yes| RetrieveResults[Retrieve results from
both completed jobs] + RetrieveResults --> MergeResults[Merge regular and pattern
results by file ID
Creates unified result per file] + MergeResults --> LoopFiles[For each file in merged results] + + LoopFiles --> ProcessResults[Process file results:
- Create stable hash for deduplication
- Filter standard by confidence threshold
- Skip pattern duplicates] + + ProcessResults --> CheckClean{First run for
multi-page file?} + CheckClean -->|Yes| CleanOld[Clean old annotations] + CheckClean -->|No| CreateEdges + CleanOld --> CreateEdges[Create edges in data model] + + CreateEdges --> StandardEdges[Standard annotations:
Link file to entities
Write to doc_tag and doc_doc RAW tables] + StandardEdges --> PatternEdges[Pattern annotations:
Link file to sink node
Write to doc_pattern RAW table] + + PatternEdges --> UpdateTag[Update file tag:
AnnotationInProcess → Annotated] + UpdateTag --> PrepareUpdate[Prepare AnnotationState update:
- Annotated if complete
- Failed if error
- New if more pages remain
Track page progress] + + PrepareUpdate --> MoreFiles{More files in
merged results?} + MoreFiles -->|Yes| LoopFiles + MoreFiles -->|No| BatchUpdate[Batch update ALL
AnnotationState instances
for this job] + + BatchUpdate --> QueryState + + style Start fill:#d4f1d4 + style End fill:#f1d4d4 + style CheckState fill:#fff4e6 + style CheckJobs fill:#fff4e6 + style CheckClean fill:#fff4e6 + style MoreFiles fill:#fff4e6 + style MergeResults fill:#e6f3ff + style ProcessResults fill:#e6f3ff + style CreateEdges fill:#e6f3ff + style BatchUpdate fill:#e6f3ff +``` + +
## Configuration From c93976636a79524479f36a84e8085a7a3fbdd3bc Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Tue, 14 Oct 2025 17:56:20 -0500 Subject: [PATCH 090/128] updated detailed_guides --- .../detailed_guides/CONFIG.md | 39 +++++++++------- .../detailed_guides/CONFIG_PATTERNS.md | 44 ++++++++++++++++--- .../detailed_guides/DEVELOPING.md | 15 +++++-- 3 files changed, 72 insertions(+), 26 deletions(-) diff --git a/modules/contextualization/cdf_file_annotation/detailed_guides/CONFIG.md b/modules/contextualization/cdf_file_annotation/detailed_guides/CONFIG.md index 76f9a866..b26ce858 100644 --- a/modules/contextualization/cdf_file_annotation/detailed_guides/CONFIG.md +++ b/modules/contextualization/cdf_file_annotation/detailed_guides/CONFIG.md @@ -63,25 +63,29 @@ Settings for the main annotation job launching process. Parsed by the `LaunchFun - `targetEntitiesSearchProperty` (str): Property on `targetEntitiesView` for matching (e.g., `aliases`). - `primaryScopeProperty` (str, optional): File property for primary grouping/context (e.g., `site`). If set to `None` or omitted, the function processes files without a primary scope grouping. _(Pydantic field: `primary_scope_property`)_ - `secondaryScopeProperty` (str, optional): File property for secondary grouping/context (e.g., `unit`). Defaults to `None`. _(Pydantic field: `secondary_scope_property`)_ + - `patternMode` (bool): Enables pattern-based detection mode alongside standard entity matching. When `True`, automatically generates regex-like patterns from entity aliases and detects all matching text in files. Defaults to `False`. _(Pydantic field: `pattern_mode`)_ + - `fileResourceProperty` (str, optional): Property on `fileView` to use for file-to-file link resource matching. Defaults to `None`. _(Pydantic field: `file_resource_property`)_ + - `targetEntitiesResourceProperty` (str, optional): Property on `targetEntitiesView` to use for resource matching. Defaults to `None`. 
_(Pydantic field: `target_entities_resource_property`)_ - **`dataModelService`** (`DataModelServiceConfig`): **Note:** For the query configurations below, you can provide a single query object or a list of query objects. If a list is provided, the queries are combined with a logical **OR**. - `getFilesToProcessQuery` (`QueryConfig | list[QueryConfig]`): Selects `AnnotationState` nodes ready for launching (e.g., status "New", "Retry"). - - `getTargetEntitiesQuery` (`QueryConfig | list[QueryConfig]`): Queries entities from `targetEntitiesView` for the cache. - - `getFileEntitiesQuery` (`QueryConfig | list[QueryConfig]`): Queries file entities from `fileView` for the cache. + - `getTargetEntitiesQuery` (`QueryConfig | list[QueryConfig]`): Queries entities from `targetEntitiesView` for the cache (e.g., assets tagged "DetectInDiagrams"). + - `getFileEntitiesQuery` (`QueryConfig | list[QueryConfig]`): Queries file entities from `fileView` for the cache, enabling file-to-file linking (e.g., files tagged "DetectInDiagrams"). - **`cacheService`** (`CacheServiceConfig`): - `cacheTimeLimit` (int): Cache validity in hours (e.g., `24`). - `rawDb` (str): RAW database for the entity cache (e.g., `db_file_annotation`). - `rawTableCache` (str): RAW table for the entity cache (e.g., `annotation_entities_cache`). + - `rawManualPatternsCatalog` (str): RAW table for storing manual pattern overrides at GLOBAL, site, or unit levels (e.g., `manual_patterns_catalog`). _(Pydantic field: `raw_manual_patterns_catalog`)_ - **`annotationService`** (`AnnotationServiceConfig`): - - `pageRange` (int): Parameter for creating start and end page for `FileReference`. - - `partialMatch` (bool): Parameter for `client.diagrams.detect()`. - - `minTokens` (int): Parameter for `client.diagrams.detect()`. - - `diagramDetectConfig` (`DiagramDetectConfigModel`, optional): Detailed API configuration. + - `pageRange` (int): Number of pages to process per batch for large documents. 
For files with more than `pageRange` pages, the file is processed iteratively in chunks (e.g., `50`). + - `partialMatch` (bool): Parameter for `client.diagrams.detect()`. Enables partial text matching. + - `minTokens` (int, optional): Parameter for `client.diagrams.detect()`. Minimum number of tokens required for a match. + - `diagramDetectConfig` (`DiagramDetectConfigModel`, optional): Detailed API configuration for diagram detection. - Contains fields like `connectionFlags` (`ConnectionFlagsConfig`), `customizeFuzziness` (`CustomizeFuzzinessConfig`), `readEmbeddedText`, etc. - The Pydantic model's `as_config()` method converts this into an SDK `DiagramDetectConfig` object. @@ -93,23 +97,24 @@ Settings for processing completed annotation jobs. Parsed by the `FinalizeFuncti - **Direct Parameters:** - - `cleanOldAnnotations` (bool): If `True`, deletes existing annotations before applying new ones. - - `maxRetryAttempts` (int): Max retries for a file if processing fails. + - `cleanOldAnnotations` (bool): If `True`, deletes existing annotations before applying new ones (only on the first run for multi-page files). _(Pydantic field: `clean_old_annotations`)_ + - `maxRetryAttempts` (int): Maximum number of retry attempts for a file before marking it as "Failed". _(Pydantic field: `max_retry_attempts`)_ - **`retrieveService`** (`RetrieveServiceConfig`): - - `getJobIdQuery` (`QueryConfig`): Selects `AnnotationState` nodes whose jobs are ready for result retrieval (e.g., status "Processing", `diagramDetectJobId` exists). + - `getJobIdQuery` (`QueryConfig`): Selects `AnnotationState` nodes whose jobs are ready for result retrieval. Uses optimistic locking to claim jobs (e.g., status "Processing", `diagramDetectJobId` exists). _(Pydantic field: `get_job_id_query`)_ - **`applyService`** (`ApplyServiceConfig`): - - `autoApprovalThreshold` (float): Confidence score for "Approved" status. - - `autoSuggestThreshold` (float): Confidence score for "Suggested" status. 
- -- **`reportService`** (`ReportServiceConfig`): - - `rawDb` (str): RAW DB for reports. - - `rawTableDocTag` (str): RAW table for document-tag links. - - `rawTableDocDoc` (str): RAW table for document-document links. - - `rawBatchSize` (int): Rows to batch before writing to RAW. + - `autoApprovalThreshold` (float): Confidence score threshold for automatically approving standard annotations (e.g., `1.0` for exact matches only). _(Pydantic field: `auto_approval_threshold`)_ + - `autoSuggestThreshold` (float): Confidence score threshold for suggesting standard annotations for review (e.g., `1.0`). _(Pydantic field: `auto_suggest_threshold`)_ + - `sinkNode` (`SinkNodeConfig`): Configuration for the target node where pattern mode annotations are linked for review. _(Pydantic field: `sink_node`)_ + - `space` (str): The space where the sink node resides. + - `externalId` (str): The external ID of the sink node. _(Pydantic field: `external_id`)_ + - `rawDb` (str): RAW database for storing annotation reports. _(Pydantic field: `raw_db`)_ + - `rawTableDocTag` (str): RAW table name for document-to-asset annotation links (e.g., `doc_tag`). _(Pydantic field: `raw_table_doc_tag`)_ + - `rawTableDocDoc` (str): RAW table name for document-to-document annotation links (e.g., `doc_doc`). _(Pydantic field: `raw_table_doc_doc`)_ + - `rawTableDocPattern` (str): RAW table name for pattern mode detections, creating a searchable catalog of potential entity matches (e.g., `doc_pattern`). _(Pydantic field: `raw_table_doc_pattern`)_ --- diff --git a/modules/contextualization/cdf_file_annotation/detailed_guides/CONFIG_PATTERNS.md b/modules/contextualization/cdf_file_annotation/detailed_guides/CONFIG_PATTERNS.md index 8015c183..b64748f6 100644 --- a/modules/contextualization/cdf_file_annotation/detailed_guides/CONFIG_PATTERNS.md +++ b/modules/contextualization/cdf_file_annotation/detailed_guides/CONFIG_PATTERNS.md @@ -99,7 +99,41 @@ launchFunction: # ... 
(rest of launchFunction config) ``` -### Recipe 4: Fine-Tuning the Diagram Detection API +### Recipe 4: Enabling and Configuring Pattern Mode + +**Goal:** Enable pattern-based detection alongside standard entity matching to create a comprehensive searchable catalog of potential entity occurrences in files. + +**Scenario:** You want to detect all text in files that matches patterns generated from entity aliases (e.g., "FT-101A" generates pattern "[FT]-000[A]"), in addition to standard exact entity matching. + +**Configuration:** +Enable `patternMode` in the `launchFunction` section and configure the sink node in `finalizeFunction.applyService`. + +```yaml +# In ep_file_annotation.config.yaml + +launchFunction: + patternMode: True # Enable pattern detection mode + # ... (other configs) + cacheService: + rawManualPatternsCatalog: "manual_patterns_catalog" # Table for manual pattern overrides + +finalizeFunction: + # ... (other configs) + applyService: + sinkNode: + space: "sp_pattern_review" # Space where pattern detections are linked + externalId: "pattern_detection_sink" # Sink node for review + rawTableDocPattern: "doc_pattern" # RAW table for pattern detections +``` + +**Pattern Mode Features:** + +- **Auto-generation**: Automatically creates regex-like patterns from entity aliases +- **Manual overrides**: Add custom patterns to RAW table at GLOBAL, site, or unit levels +- **Deduplication**: Automatically skips pattern detections that duplicate standard annotations +- **Separate catalog**: Pattern detections stored separately for review in `doc_pattern` RAW table + +### Recipe 5: Fine-Tuning the Diagram Detection API **Goal:** Adjust the behavior of the diagram detection model, for example, by making it more or less strict about fuzzy text matching. @@ -119,7 +153,7 @@ launchFunction: # ... 
(other DiagramDetectConfig properties) ``` -### Recipe 5: Combining Queries with OR Logic +### Recipe 6: Combining Queries with OR Logic **Goal:** To select files for processing that meet one of several distinct criteria. This is useful when you want to combine different sets of filters with a logical OR. @@ -165,7 +199,7 @@ prepareFunction: targetProperty: tags ``` -### Recipe 6: Annotating Files Without a Scope +### Recipe 7: Annotating Files Without a Scope **Goal:** To annotate files that do not have a `primaryScopeProperty` (e.g., `city`). This is useful for processing files that are not assigned to a specific city or for a global-level annotation process. @@ -190,7 +224,7 @@ launchFunction: This section covers high-level architectural decisions about how the template finds and partitions data. The choice between these patterns is fundamental and depends on your organization's requirements for governance, security, and operational structure. -### Recipe 7: Global Scoping (Searching Across All Spaces) +### Recipe 8: Global Scoping (Searching Across All Spaces) **Goal:** To run a single, unified annotation process that finds and annotates all new files based on their properties, regardless of which physical `instanceSpace` they reside in. @@ -220,7 +254,7 @@ dataModelViews: - When a single team uses a single, consistent set of rules to annotate all files across the organization. - For simpler systems where strict data partitioning between different domains is not a requirement. -### Recipe 8: Isolated Scoping (Targeting a Specific Space) +### Recipe 9: Isolated Scoping (Targeting a Specific Space) **Goal:** To run a dedicated annotation process that operates only within a single, physically separate data partition. 
diff --git a/modules/contextualization/cdf_file_annotation/detailed_guides/DEVELOPING.md b/modules/contextualization/cdf_file_annotation/detailed_guides/DEVELOPING.md index f389f58b..615cae2e 100644 --- a/modules/contextualization/cdf_file_annotation/detailed_guides/DEVELOPING.md +++ b/modules/contextualization/cdf_file_annotation/detailed_guides/DEVELOPING.md @@ -26,11 +26,17 @@ While any service can be replaced, these are the most common candidates for cust - **`AbstractLaunchService`**: The orchestrator for the launch function. You would implement this if your project requires a fundamentally different file batching, grouping, or processing workflow that can't be achieved with the `primary_scope_property` and `secondary_scope_property` configuration. +- **`AbstractFinalizeService`**: The orchestrator for the finalize function. Implement this if your project needs custom job claiming logic, result merging strategies, or unique annotation state update patterns. + - **`IDataModelService`**: The gateway to Cognite Data Fusion. Implement this if your project needs highly optimized or complex queries to fetch files and entities that go beyond the declarative `QueryConfig` filter system. -- **`IApplyService`**: The service responsible for writing annotations back to the data model. Implement this if your project has custom rules for how to set annotation properties (like status) or needs to create additional relationships in the data model. +- **`IRetrieveService`**: Handles retrieving diagram detection job results and claiming jobs with optimistic locking. Implement this if you need custom job claiming strategies or want to integrate with external job tracking systems. + +- **`IApplyService`**: The service responsible for writing annotations back to the data model and RAW tables. Implement this if your project has custom rules for confidence thresholds, deduplication logic, or needs to create additional relationships in the data model or external systems. 
+ +- **`ICacheService`**: Manages the in-memory entity cache and pattern generation. You might implement this if your project has a different caching strategy (e.g., different cache key logic, custom pattern generation algorithms, or fetching context from an external system). -- **`ICacheService`**: Manages the in-memory entity cache. You might implement this if your project has a different caching strategy (e.g., different cache key logic, or fetching context from an external system). +- **`IAnnotationService`**: Handles interaction with the Cognite Diagram Detect API. Implement this if you need custom retry logic, want to use a different annotation API, or need to pre/post-process annotation requests. ## How to Create a Custom Implementation @@ -102,7 +108,7 @@ class HighPriorityLaunchService(GeneralLaunchService): ### Step 2: Use Your Custom Implementation ```python -# In fn_dm_context_annotation_launch/handler.py +# In fn_file_annotation_launch/handler.py # ... (other imports) from services.LaunchService import AbstractLaunchService @@ -110,7 +116,7 @@ from services.LaunchService import AbstractLaunchService from services.my_custom_launch_service import HighPriorityLaunchService # 2. 
Instantiate your new custom class instead of GeneralLaunchService -def _create_launch_service(config, client, logger, tracker) -> AbstractLaunchService: +def _create_launch_service(config, client, logger, tracker, function_call_info) -> AbstractLaunchService: cache_instance: ICacheService = create_general_cache_service(config, client, logger) data_model_instance: IDataModelService = create_general_data_model_service( config, client, logger @@ -126,6 +132,7 @@ def _create_launch_service(config, client, logger, tracker) -> AbstractLaunchSer data_model_service=data_model_instance, cache_service=cache_instance, annotation_service=annotation_instance, + function_call_info=function_call_info, ) return launch_instance From cdd7683c4fefc03798d31fc1e47ce888ac3a669a Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Tue, 14 Oct 2025 17:58:30 -0500 Subject: [PATCH 091/128] updated the quickstart_setup to create assets from the equipment --- .../local_setup/quickstart_setup.ipynb | 117 +++++++++++------- 1 file changed, 72 insertions(+), 45 deletions(-) diff --git a/modules/contextualization/cdf_file_annotation/local_setup/quickstart_setup.ipynb b/modules/contextualization/cdf_file_annotation/local_setup/quickstart_setup.ipynb index e06317ae..fbccf141 100644 --- a/modules/contextualization/cdf_file_annotation/local_setup/quickstart_setup.ipynb +++ b/modules/contextualization/cdf_file_annotation/local_setup/quickstart_setup.ipynb @@ -16,9 +16,12 @@ "\n", "from cognite.client.data_classes.data_modeling import (\n", " Node,\n", + " NodeId,\n", " NodeList,\n", " NodeApplyList,\n", " ViewId,\n", + " NodeApply,\n", + " NodeOrEdgeData,\n", ")" ] }, @@ -101,7 +104,7 @@ "outputs": [], "source": [ "# Replace the value of organization with the one used in config..yaml\n", - "organization: str = \"tx\"\n", + "organization: str = \n", "file_view_name: str = f\"{organization}File\"\n", "\n", "# Create a view class\n", @@ -122,7 +125,7 @@ "source": [ "# retrieve instances of txFile\n", "files: 
NodeList[Node] = cdf_client.data_modeling.instances.list(instance_type=\"node\", sources=file_view.as_view_id(), limit=-1)\n", - "print(files[0])" + "print(files[1])" ] }, { @@ -137,6 +140,11 @@ "\n", "for file in file_node_apply_list:\n", " file.sources[0].properties[\"tags\"] = [\"ToAnnotate\", \"DetectInDiagrams\"]\n", + " alias = []\n", + " name = file.sources[0].properties[\"name\"]\n", + " alias.append(name.replace(\".pdf\", \"\"))\n", + " file.sources[0].properties[\"aliases\"] = alias\n", + "\n", "print(file_node_apply_list[0])" ] }, @@ -156,7 +164,7 @@ { "cell_type": "code", "execution_count": null, - "id": "701df86b", + "id": "9579ff52", "metadata": {}, "outputs": [], "source": [ @@ -167,65 +175,84 @@ " external_id=equipment_view_name,\n", " version=\"v1\",\n", " instance_space=\"springfield_instances\",\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9c40f234", - "metadata": {}, - "outputs": [], - "source": [ + ")\n", + "\n", "# retrieve instances of txEquipment\n", "equipments: NodeList[Node] = cdf_client.data_modeling.instances.list(instance_type=\"node\", sources=equipment_view.as_view_id(), limit=-1)\n", - "print(equipments[0])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "969ae758", - "metadata": {}, - "outputs": [], - "source": [ - "equipment_node_apply_list: NodeApplyList = equipments.as_write()\n", "\n", - "for equipment in equipment_node_apply_list:\n", - " equipment.sources[0].properties[\"tags\"] = [\"DetectInDiagrams\"]\n", - "print(equipment_node_apply_list[0])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "01eacbe1", - "metadata": {}, - "outputs": [], - "source": [ - "cdf_client.data_modeling.instances.apply(equipment_node_apply_list)" + "# Now lets do the same with the equipment nodes in the project so that we have entities to match against\n", + "asset_view_name: str = f\"{organization}Asset\"\n", + "asset_view: ViewPropertyConfig = 
ViewPropertyConfig(\n", + " schema_space=\"sp_enterprise_process_industry\",\n", + " external_id=asset_view_name,\n", + " version=\"v1\",\n", + " instance_space=\"springfield_instances\",\n", + ")\n", + "\n", + "asset_node_apply_list = []\n", + "for equipment in equipments:\n", + " external_id = \"asset:\"+equipment.external_id\n", + " space = equipment.space\n", + "\n", + " properties:dict = {}\n", + " equipment_name = equipment.properties[equipment_view.as_view_id()][\"name\"]\n", + "\n", + " properties[\"tags\"] = [\"DetectInDiagrams\"]\n", + " properties[\"name\"] = equipment_name\n", + " properties[\"description\"] = equipment.properties[equipment_view.as_view_id()][\"description\"]\n", + " properties[\"sourceId\"] = equipment.properties[equipment_view.as_view_id()][\"sourceId\"]\n", + " properties[\"sourceUpdatedUser\"] = equipment.properties[equipment_view.as_view_id()][\"sourceUpdatedUser\"]\n", + "\n", + " aliases = []\n", + " name_tokens = equipment_name.split(\"-\")\n", + " alt_alias = \"\"\n", + " aliases.append(equipment_name)\n", + " for index,token in enumerate(name_tokens):\n", + " if index == 0:\n", + " continue\n", + " if index == 1:\n", + " alt_alias = token\n", + " else:\n", + " alt_alias = alt_alias + \"-\" + token\n", + " aliases.append(alt_alias)\n", + " \n", + " properties[\"aliases\"] = aliases\n", + " asset_node_apply_list.append(\n", + " NodeApply(\n", + " space=equipment.space,\n", + " external_id=\"asset:\"+equipment.external_id,\n", + " sources=[\n", + " NodeOrEdgeData(\n", + " source=asset_view.as_view_id(),\n", + " properties=properties,\n", + " )\n", + " ],\n", + " )\n", + " )\n", + "\n", + "print(len(asset_node_apply_list))\n", + "print(asset_node_apply_list[0])\n" ] }, { "cell_type": "code", "execution_count": null, - "id": "17ee6c05", + "id": "aad9ad6b", "metadata": {}, "outputs": [], "source": [ - "# In case you're interested in seeing the instances of file annotation state\n", - "fileAnnotationState_view: ViewPropertyConfig = 
ViewPropertyConfig(\n", - " schema_space= \"sp_hdm\",\n", - " external_id=\"FileAnnotationState\",\n", - " version = \"v1.0.0\",\n", + "update_results = cdf_client.data_modeling.instances.apply(\n", + " nodes=asset_node_apply_list,\n", + " auto_create_direct_relations=True,\n", + " replace=True, # ensures we reset the properties of the node\n", ")\n", - "cdf_client.data_modeling.instances.list(sources=fileAnnotationState_view.as_view_id())" + "print(update_results)" ] } ], "metadata": { "kernelspec": { - "display_name": ".venv", + "display_name": ".venv (3.12.9)", "language": "python", "name": "python3" }, From 0f0a5f7c880a3fd129f77a9b76fd29c94e567ade Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Tue, 14 Oct 2025 18:00:00 -0500 Subject: [PATCH 092/128] extraction pipeline now deploys with up to date yaml guide --- ...ep_file_annotation.ExtractionPipeline.yaml | 42 ++++++++++--------- 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/modules/contextualization/cdf_file_annotation/extraction_pipelines/ep_file_annotation.ExtractionPipeline.yaml b/modules/contextualization/cdf_file_annotation/extraction_pipelines/ep_file_annotation.ExtractionPipeline.yaml index 7dd00a93..0a3993e1 100644 --- a/modules/contextualization/cdf_file_annotation/extraction_pipelines/ep_file_annotation.ExtractionPipeline.yaml +++ b/modules/contextualization/cdf_file_annotation/extraction_pipelines/ep_file_annotation.ExtractionPipeline.yaml @@ -54,10 +54,8 @@ documentation: > Configures the initial setup phase, primarily for selecting files to be annotated. Parsed by the `PrepareFunction` Pydantic model. - **Note:** For the query configurations below, you can provide a single query object or a list of query objects. If a list is provided, the queries are combined with a logical **OR**. 
- - **`getFilesForAnnotationResetQuery`** (`QueryConfig | list[QueryConfig]`, optional): - **Purpose:** Selects specific files to have their annotation status reset (e.g., remove "Annotated"/"AnnotationInProcess" tags) to make them eligible for re-annotation. @@ -80,25 +78,29 @@ documentation: > - `targetEntitiesSearchProperty` (str): Property on `targetEntitiesView` for matching (e.g., `aliases`). - `primaryScopeProperty` (str, optional): File property for primary grouping/context (e.g., `site`). If set to `None` or omitted, the function processes files without a primary scope grouping. _(Pydantic field: `primary_scope_property`)_ - `secondaryScopeProperty` (str, optional): File property for secondary grouping/context (e.g., `unit`). Defaults to `None`. _(Pydantic field: `secondary_scope_property`)_ + - `patternMode` (bool): Enables pattern-based detection mode alongside standard entity matching. When `True`, automatically generates regex-like patterns from entity aliases and detects all matching text in files. Defaults to `False`. _(Pydantic field: `pattern_mode`)_ + - `fileResourceProperty` (str, optional): Property on `fileView` to use for file-to-file link resource matching. Defaults to `None`. _(Pydantic field: `file_resource_property`)_ + - `targetEntitiesResourceProperty` (str, optional): Property on `targetEntitiesView` to use for resource matching. Defaults to `None`. _(Pydantic field: `target_entities_resource_property`)_ - **`dataModelService`** (`DataModelServiceConfig`): **Note:** For the query configurations below, you can provide a single query object or a list of query objects. If a list is provided, the queries are combined with a logical **OR**. - `getFilesToProcessQuery` (`QueryConfig | list[QueryConfig]`): Selects `AnnotationState` nodes ready for launching (e.g., status "New", "Retry"). - - `getTargetEntitiesQuery` (`QueryConfig | list[QueryConfig]`): Queries entities from `targetEntitiesView` for the cache. 
- - `getFileEntitiesQuery` (`QueryConfig | list[QueryConfig]`): Queries file entities from `fileView` for the cache. + - `getTargetEntitiesQuery` (`QueryConfig | list[QueryConfig]`): Queries entities from `targetEntitiesView` for the cache (e.g., assets tagged "DetectInDiagrams"). + - `getFileEntitiesQuery` (`QueryConfig | list[QueryConfig]`): Queries file entities from `fileView` for the cache, enabling file-to-file linking (e.g., files tagged "DetectInDiagrams"). - **`cacheService`** (`CacheServiceConfig`): - `cacheTimeLimit` (int): Cache validity in hours (e.g., `24`). - `rawDb` (str): RAW database for the entity cache (e.g., `db_file_annotation`). - `rawTableCache` (str): RAW table for the entity cache (e.g., `annotation_entities_cache`). + - `rawManualPatternsCatalog` (str): RAW table for storing manual pattern overrides at GLOBAL, site, or unit levels (e.g., `manual_patterns_catalog`). _(Pydantic field: `raw_manual_patterns_catalog`)_ - **`annotationService`** (`AnnotationServiceConfig`): - - `pageRange` (int): Parameter for creating start and end page for `FileReference`. - - `partialMatch` (bool): Parameter for `client.diagrams.detect()`. - - `minTokens` (int): Parameter for `client.diagrams.detect()`. - - `diagramDetectConfig` (`DiagramDetectConfigModel`, optional): Detailed API configuration. + - `pageRange` (int): Number of pages to process per batch for large documents. For files with more than `pageRange` pages, the file is processed iteratively in chunks (e.g., `50`). + - `partialMatch` (bool): Parameter for `client.diagrams.detect()`. Enables partial text matching. + - `minTokens` (int, optional): Parameter for `client.diagrams.detect()`. Minimum number of tokens required for a match. + - `diagramDetectConfig` (`DiagramDetectConfigModel`, optional): Detailed API configuration for diagram detection. - Contains fields like `connectionFlags` (`ConnectionFlagsConfig`), `customizeFuzziness` (`CustomizeFuzzinessConfig`), `readEmbeddedText`, etc. 
- The Pydantic model's `as_config()` method converts this into an SDK `DiagramDetectConfig` object. @@ -110,23 +112,24 @@ documentation: > - **Direct Parameters:** - - `cleanOldAnnotations` (bool): If `True`, deletes existing annotations before applying new ones. - - `maxRetryAttempts` (int): Max retries for a file if processing fails. + - `cleanOldAnnotations` (bool): If `True`, deletes existing annotations before applying new ones (only on the first run for multi-page files). _(Pydantic field: `clean_old_annotations`)_ + - `maxRetryAttempts` (int): Maximum number of retry attempts for a file before marking it as "Failed". _(Pydantic field: `max_retry_attempts`)_ - **`retrieveService`** (`RetrieveServiceConfig`): - - `getJobIdQuery` (`QueryConfig`): Selects `AnnotationState` nodes whose jobs are ready for result retrieval (e.g., status "Processing", `diagramDetectJobId` exists). + - `getJobIdQuery` (`QueryConfig`): Selects `AnnotationState` nodes whose jobs are ready for result retrieval. Uses optimistic locking to claim jobs (e.g., status "Processing", `diagramDetectJobId` exists). _(Pydantic field: `get_job_id_query`)_ - **`applyService`** (`ApplyServiceConfig`): - - `autoApprovalThreshold` (float): Confidence score for "Approved" status. - - `autoSuggestThreshold` (float): Confidence score for "Suggested" status. - - - **`reportService`** (`ReportServiceConfig`): - - `rawDb` (str): RAW DB for reports. - - `rawTableDocTag` (str): RAW table for document-tag links. - - `rawTableDocDoc` (str): RAW table for document-document links. - - `rawBatchSize` (int): Rows to batch before writing to RAW. + - `autoApprovalThreshold` (float): Confidence score threshold for automatically approving standard annotations (e.g., `1.0` for exact matches only). _(Pydantic field: `auto_approval_threshold`)_ + - `autoSuggestThreshold` (float): Confidence score threshold for suggesting standard annotations for review (e.g., `1.0`). 
_(Pydantic field: `auto_suggest_threshold`)_ + - `sinkNode` (`SinkNodeConfig`): Configuration for the target node where pattern mode annotations are linked for review. _(Pydantic field: `sink_node`)_ + - `space` (str): The space where the sink node resides. + - `externalId` (str): The external ID of the sink node. _(Pydantic field: `external_id`)_ + - `rawDb` (str): RAW database for storing annotation reports. _(Pydantic field: `raw_db`)_ + - `rawTableDocTag` (str): RAW table name for document-to-asset annotation links (e.g., `doc_tag`). _(Pydantic field: `raw_table_doc_tag`)_ + - `rawTableDocDoc` (str): RAW table name for document-to-document annotation links (e.g., `doc_doc`). _(Pydantic field: `raw_table_doc_doc`)_ + - `rawTableDocPattern` (str): RAW table name for pattern mode detections, creating a searchable catalog of potential entity matches (e.g., `doc_pattern`). _(Pydantic field: `raw_table_doc_pattern`)_ --- @@ -150,6 +153,5 @@ documentation: > - **`limit`** (Optional[int], default `-1`): Specifies the upper limit of instances that can be retrieved from the query. - The Python code uses `QueryConfig.build_filter()` (which internally uses `FilterConfig.as_filter()`) to convert these YAML definitions into Cognite SDK `Filter` objects for querying CDF. 
From a93d09e717ae88f6f79f4b98cc097837b35a45c6 Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Tue, 14 Oct 2025 18:12:52 -0500 Subject: [PATCH 093/128] edited length of ep documentation --- .../ep_file_annotation.ExtractionPipeline.yaml | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/modules/contextualization/cdf_file_annotation/extraction_pipelines/ep_file_annotation.ExtractionPipeline.yaml b/modules/contextualization/cdf_file_annotation/extraction_pipelines/ep_file_annotation.ExtractionPipeline.yaml index 0a3993e1..8af0b484 100644 --- a/modules/contextualization/cdf_file_annotation/extraction_pipelines/ep_file_annotation.ExtractionPipeline.yaml +++ b/modules/contextualization/cdf_file_annotation/extraction_pipelines/ep_file_annotation.ExtractionPipeline.yaml @@ -13,23 +13,6 @@ rawTables: tableName: {{ rawManualPatternsCatalog }} source: "Files" documentation: > - # Guide to Configuring the Annotation Function via YAML - - This document outlines how to use the `ep_file_annotation.config.yaml` file to control the behavior of the Annotation Function. The Python code, particularly `ConfigService.py`, uses Pydantic models to parse this YAML, making the function adaptable to different data models and operational parameters. - - ## Overall Structure - - The YAML configuration is organized into logical blocks that correspond to different phases and components of the toolkit: - - - `dataModelViews`: Defines common Data Model views used across functions. - - `prepareFunction`: Settings for the initial file preparation phase. - - `launchFunction`: Settings for launching annotation tasks. - - `finalizeFunction`: Settings for processing and finalizing annotation results. - - The entire structure is parsed into a main `Config` Pydantic model. - - --- - ## 1. `dataModelViews` This section specifies the Data Model views the function will interact with. Each view is defined using a structure mapping to the `ViewPropertyConfig` Pydantic model. 
From 9b2dc9a041ec8f3250ee3d711d46b7ab8a178da6 Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Mon, 13 Oct 2025 11:12:37 -0500 Subject: [PATCH 094/128] initial commit - added the repair service --- .../fn_file_annotation_repair/dependencies.py | 17 + .../fn_file_annotation_repair/handler.py | 43 +++ .../services/ConfigService.py | 305 ++++++++++++++++++ .../services/LoggerService.py | 97 ++++++ .../services/RepairService.py | 124 +++++++ 5 files changed, 586 insertions(+) create mode 100644 modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_repair/dependencies.py create mode 100644 modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_repair/handler.py create mode 100644 modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_repair/services/ConfigService.py create mode 100644 modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_repair/services/LoggerService.py create mode 100644 modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_repair/services/RepairService.py diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_repair/dependencies.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_repair/dependencies.py new file mode 100644 index 00000000..5e223ffd --- /dev/null +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_repair/dependencies.py @@ -0,0 +1,17 @@ +from cognite.client import CogniteClient +from services.ConfigService import Config, load_config_parameters +from services.LoggerService import CogniteFunctionLogger +from services.RepairService import GeneralRepairService + +def create_services(data: dict, client: CogniteClient): + """Factory function to create all necessary services.""" + config, client = load_config_parameters(client=client, function_data=data) + logger = CogniteFunctionLogger(log_level=data.get("logLevel", "INFO")) + + repair_service = 
GeneralRepairService( + client=client, + config=config, + logger=logger, + ) + + return repair_service, logger \ No newline at end of file diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_repair/handler.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_repair/handler.py new file mode 100644 index 00000000..68941633 --- /dev/null +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_repair/handler.py @@ -0,0 +1,43 @@ +import time +from datetime import datetime, timezone, timedelta +from cognite.client import CogniteClient +from dependencies import create_services + +def handle(data: dict, function_call_info: dict, client: CogniteClient): + """Main entry point for the Cognite Function.""" + start_time = datetime.now(timezone.utc) + repair_service, logger = create_services(data, client) + + try: + # Run in a loop for a maximum of 7 minutes + while datetime.now(timezone.utc) - start_time < timedelta(minutes=7): + result = repair_service.run() + if result == "Done": + logger.info("No more candidates to process. 
Exiting.", section="END") + break + time.sleep(10) # Pause between batches + + return {"status": "success", "message": "Repair function completed a cycle."} + except Exception as e: + logger.error(f"An unexpected error occurred: {e}", section="BOTH") + return {"status": "failure", "message": str(e)} + +def run_locally(config_file: dict, log_path: str | None = None): + """Entry point for local execution and debugging.""" + from dependencies import create_client, get_env_variables + + env_vars = get_env_variables() + client = create_client(env_vars) + + # Mock function_call_info for local runs + function_call_info = {"function_id": "local", "call_id": "local"} + + handle(config_file, function_call_info, client) + +if __name__ == "__main__": + # Example for running locally + config = { + "ExtractionPipelineExtId": "ep_file_annotation", # Replace with your pipeline ID + "logLevel": "DEBUG" + } + run_locally(config) \ No newline at end of file diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_repair/services/ConfigService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_repair/services/ConfigService.py new file mode 100644 index 00000000..15f71ede --- /dev/null +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_repair/services/ConfigService.py @@ -0,0 +1,305 @@ +from enum import Enum +from typing import Any, Literal, cast, Optional + +import yaml +from cognite.client.data_classes.contextualization import ( + DiagramDetectConfig, + ConnectionFlags, + CustomizeFuzziness, + DirectionWeights, +) +from cognite.client.data_classes.data_modeling import NodeId +from cognite.client.data_classes.filters import Filter +from cognite.client import CogniteClient +from cognite.client import data_modeling as dm +from cognite.client.exceptions import CogniteAPIError +from pydantic import BaseModel, Field +from pydantic.alias_generators import to_camel +from utils.DataStructures import 
AnnotationStatus, FilterOperator + + +# Configuration Classes +class ViewPropertyConfig(BaseModel, alias_generator=to_camel): + schema_space: str + instance_space: Optional[str] = None + external_id: str + version: str + annotation_type: Optional[Literal["diagrams.FileLink", "diagrams.AssetLink"]] = None + + def as_view_id(self) -> dm.ViewId: + return dm.ViewId(space=self.schema_space, external_id=self.external_id, version=self.version) + + def as_property_ref(self, property) -> list[str]: + return [self.schema_space, f"{self.external_id}/{self.version}", property] + + +class FilterConfig(BaseModel, alias_generator=to_camel): + values: Optional[list[AnnotationStatus | str] | AnnotationStatus | str] = None + negate: bool = False + operator: FilterOperator + target_property: str + + def as_filter(self, view_properties: ViewPropertyConfig) -> Filter: + property_reference = view_properties.as_property_ref(self.target_property) + + # Converts enum value into string -> i.e.) in the case of AnnotationStatus + if isinstance(self.values, list): + find_values = [v.value if isinstance(v, Enum) else v for v in self.values] + elif isinstance(self.values, Enum): + find_values = self.values.value + else: + find_values = self.values + + filter: Filter + if find_values is None: + if self.operator == FilterOperator.EXISTS: + filter = dm.filters.Exists(property=property_reference) + else: + raise ValueError(f"Operator {self.operator} requires a value") + elif self.operator == FilterOperator.IN: + if not isinstance(find_values, list): + raise ValueError(f"Operator 'IN' requires a list of values for property {self.target_property}") + filter = dm.filters.In(property=property_reference, values=find_values) + elif self.operator == FilterOperator.EQUALS: + filter = dm.filters.Equals(property=property_reference, value=find_values) + elif self.operator == FilterOperator.CONTAINSALL: + filter = dm.filters.ContainsAll(property=property_reference, values=find_values) + elif self.operator == 
FilterOperator.SEARCH: + filter = dm.filters.Search(property=property_reference, value=find_values) + else: + raise NotImplementedError(f"Operator {self.operator} is not implemented.") + + if self.negate: + return dm.filters.Not(filter) + else: + return filter + + +class QueryConfig(BaseModel, alias_generator=to_camel): + target_view: ViewPropertyConfig + filters: list[FilterConfig] + limit: Optional[int] = -1 + + def build_filter(self) -> Filter: + list_filters: list[Filter] = [f.as_filter(self.target_view) for f in self.filters] + + if len(list_filters) == 1: + return list_filters[0] + else: + return dm.filters.And(*list_filters) # NOTE: '*' Unpacks each filter in the list + + +class ConnectionFlagsConfig(BaseModel, alias_generator=to_camel): + no_text_inbetween: Optional[bool] = None + natural_reading_order: Optional[bool] = None + + def as_connection_flag(self) -> ConnectionFlags: + params = {key: value for key, value in self.model_dump().items() if value is not None} + return ConnectionFlags(**params) + + +class CustomizeFuzzinessConfig(BaseModel, alias_generator=to_camel): + fuzzy_score: Optional[float] = None + max_boxes: Optional[int] = None + min_chars: Optional[int] = None + + def as_customize_fuzziness(self) -> CustomizeFuzziness: + params = {key: value for key, value in self.model_dump().items() if value is not None} + return CustomizeFuzziness(**params) + + +class DirectionWeightsConfig(BaseModel, alias_generator=to_camel): + left: Optional[float] = None + right: Optional[float] = None + up: Optional[float] = None + down: Optional[float] = None + + def as_direction_weights(self) -> DirectionWeights: + params = {key: value for key, value in self.model_dump().items() if value is not None} + return DirectionWeights(**params) + + +class DiagramDetectConfigModel(BaseModel, alias_generator=to_camel): + # NOTE: configs come from V7 of the cognite python sdk cognite SDK + annotation_extract: Optional[bool] = None + case_sensitive: Optional[bool] = None + 
connection_flags: Optional[ConnectionFlagsConfig] = None + customize_fuzziness: Optional[CustomizeFuzzinessConfig] = None + direction_delta: Optional[float] = None + direction_weights: Optional[DirectionWeightsConfig] = None + min_fuzzy_score: Optional[float] = None + read_embedded_text: Optional[bool] = None + remove_leading_zeros: Optional[bool] = None + substitutions: Optional[dict[str, list[str]]] = None + + def as_config(self) -> DiagramDetectConfig: + params = {} + if self.annotation_extract is not None: + params["annotation_extract"] = self.annotation_extract + if self.case_sensitive is not None: + params["case_sensitive"] = self.case_sensitive + if self.connection_flags is not None: + params["connection_flags"] = self.connection_flags.as_connection_flag() + if self.customize_fuzziness is not None: + params["customize_fuzziness"] = self.customize_fuzziness.as_customize_fuzziness() + if self.direction_delta is not None: + params["direction_delta"] = self.direction_delta + if self.direction_weights is not None: + params["direction_weights"] = self.direction_weights.as_direction_weights() + if self.min_fuzzy_score is not None: + params["min_fuzzy_score"] = self.min_fuzzy_score + if self.read_embedded_text is not None: + params["read_embedded_text"] = self.read_embedded_text + if self.remove_leading_zeros is not None: + params["remove_leading_zeros"] = self.remove_leading_zeros + if self.substitutions is not None: + params["substitutions"] = self.substitutions + + return DiagramDetectConfig(**params) + + +# Launch Related Configs +class DataModelServiceConfig(BaseModel, alias_generator=to_camel): + get_files_to_process_query: QueryConfig | list[QueryConfig] + get_target_entities_query: QueryConfig | list[QueryConfig] + get_file_entities_query: QueryConfig | list[QueryConfig] + + +class CacheServiceConfig(BaseModel, alias_generator=to_camel): + cache_time_limit: int + raw_db: str + raw_table_cache: str + raw_manual_patterns_catalog: str + + +class 
AnnotationServiceConfig(BaseModel, alias_generator=to_camel): + page_range: int = Field(gt=0, le=50) + partial_match: bool = True + min_tokens: int = 1 + diagram_detect_config: Optional[DiagramDetectConfigModel] = None + + +class PrepareFunction(BaseModel, alias_generator=to_camel): + get_files_for_annotation_reset_query: Optional[QueryConfig | list[QueryConfig]] = None + get_files_to_annotate_query: QueryConfig | list[QueryConfig] + + +class LaunchFunction(BaseModel, alias_generator=to_camel): + batch_size: int = Field(gt=0, le=50) + primary_scope_property: str + secondary_scope_property: Optional[str] = None + file_search_property: str = "aliases" + target_entities_search_property: str = "aliases" + pattern_mode: bool + file_resource_property: Optional[str] = None + target_entities_resource_property: Optional[str] = None + data_model_service: DataModelServiceConfig + cache_service: CacheServiceConfig + annotation_service: AnnotationServiceConfig + + +# Finalize Related Configs +class RetrieveServiceConfig(BaseModel, alias_generator=to_camel): + get_job_id_query: QueryConfig | list[QueryConfig] + + +class ApplyServiceConfig(BaseModel, alias_generator=to_camel): + sink_node: NodeId + auto_approval_threshold: float = Field(gt=0.0, le=1.0) + auto_suggest_threshold: float = Field(gt=0.0, le=1.0) + + +class ReportServiceConfig(BaseModel, alias_generator=to_camel): + raw_db: str + raw_table_doc_tag: str + raw_table_doc_doc: str + raw_table_doc_pattern: str + raw_batch_size: int + + +class FinalizeFunction(BaseModel, alias_generator=to_camel): + clean_old_annotations: bool + max_retry_attempts: int + retrieve_service: RetrieveServiceConfig + apply_service: ApplyServiceConfig + report_service: ReportServiceConfig + + +class DataModelViews(BaseModel, alias_generator=to_camel): + core_annotation_view: ViewPropertyConfig + annotation_state_view: ViewPropertyConfig + file_view: ViewPropertyConfig + target_entities_view: ViewPropertyConfig + + +class Config(BaseModel, 
alias_generator=to_camel): + data_model_views: DataModelViews + prepare_function: PrepareFunction + launch_function: LaunchFunction + finalize_function: FinalizeFunction + + @classmethod + def parse_direct_relation(cls, value: Any) -> Any: + if isinstance(value, dict): + return dm.DirectRelationReference.load(value) + return value + + +# Functions to construct queries +def get_limit_from_query(query: QueryConfig | list[QueryConfig]) -> int: + """ + Determines the retrieval limit from a query configuration. + Handles 'None' by treating it as the default -1 (unlimited). + """ + default_limit = -1 + if isinstance(query, list): + if not query: + return default_limit + limits = [q.limit if q.limit is not None else default_limit for q in query] + return max(limits) + else: + return query.limit if query.limit is not None else default_limit + + +def build_filter_from_query(query: QueryConfig | list[QueryConfig]) -> Filter: + """ + Builds a Cognite Filter from a query configuration. + + If the query is a list, it builds a filter for each item and combines them with a logical OR. + If the query is a single object, it builds the filter directly from it. + """ + if isinstance(query, list): + list_filters: list[Filter] = [q.build_filter() for q in query] + if not list_filters: + raise ValueError("Query list cannot be empty.") + return dm.filters.Or(*list_filters) if len(list_filters) > 1 else list_filters[0] + else: + return query.build_filter() + + +def load_config_parameters( + client: CogniteClient, + function_data: dict[str, Any], +) -> Config: + """ + Retrieves the configuration parameters from the function data and loads the configuration from CDF. 
+ """ + if "ExtractionPipelineExtId" not in function_data: + raise ValueError("Missing key 'ExtractionPipelineExtId' in input data to the function") + + pipeline_ext_id = function_data["ExtractionPipelineExtId"] + try: + raw_config = client.extraction_pipelines.config.retrieve(pipeline_ext_id) + if raw_config.config is None: + raise ValueError(f"No config found for extraction pipeline: {pipeline_ext_id!r}") + except CogniteAPIError: + raise RuntimeError(f"Not able to retrieve pipeline config for extraction pipeline: {pipeline_ext_id!r}") + + loaded_yaml_data = yaml.safe_load(raw_config.config) + + if isinstance(loaded_yaml_data, dict): + return Config.model_validate(loaded_yaml_data) + else: + raise ValueError( + "Invalid configuration structure from CDF: \nExpected a YAML dictionary with a top-level 'config' key." + ) diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_repair/services/LoggerService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_repair/services/LoggerService.py new file mode 100644 index 00000000..17f24d6b --- /dev/null +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_repair/services/LoggerService.py @@ -0,0 +1,97 @@ +from typing import Literal +import os + + +class CogniteFunctionLogger: + def __init__( + self, + log_level: Literal["DEBUG", "INFO", "WARNING", "ERROR"] = "INFO", + write: bool = False, + filepath: str | None = None, + ): + self.log_level = log_level.upper() + self.write = write + self.filepath = filepath + self.file_handler = None + + if self.filepath and self.write: + try: + dir_name = os.path.dirname(self.filepath) + if dir_name: + os.makedirs(dir_name, exist_ok=True) + self.file_handler = open(self.filepath, "a", encoding="utf-8") + except Exception as e: + print(f"[LOGGER_SETUP_ERROR] Could not open log file {self.filepath}: {e}") + self.write = False + + def _format_message_lines(self, prefix: str, message: str) -> list[str]: + 
formatted_lines = [] + if "\n" not in message: + formatted_lines.append(f"{prefix} {message}") + else: + lines = message.split("\n") + formatted_lines.append(f"{prefix}{lines[0]}") + padding = " " * len(prefix) + for line_content in lines[1:]: + formatted_lines.append(f"{padding} {line_content}") + return formatted_lines + + def _print(self, prefix: str, message: str) -> None: + lines_to_log = self._format_message_lines(prefix, message) + if self.write and self.file_handler: + try: + for line in lines_to_log: + print(line) + self.file_handler.write(line + "\n") + self.file_handler.flush() + except Exception as e: + print(f"[LOGGER_SETUP_ERROR] Could not write to {self.filepath}: {e}") + elif not self.write: + for line in lines_to_log: + print(line) + + def debug(self, message: str, section: Literal["START", "END", "BOTH"] | None = None) -> None: + if section == "START" or section == "BOTH": + self._section() + if self.log_level == "DEBUG": + self._print("[DEBUG]", message) + if section == "END" or section == "BOTH": + self._section() + + def info(self, message: str, section: Literal["START", "END", "BOTH"] | None = None) -> None: + if section == "START" or section == "BOTH": + self._section() + if self.log_level in ("DEBUG", "INFO"): + self._print("[INFO]", message) + if section == "END" or section == "BOTH": + self._section() + + def warning(self, message: str, section: Literal["START", "END", "BOTH"] | None = None) -> None: + if section == "START" or section == "BOTH": + self._section() + if self.log_level in ("DEBUG", "INFO", "WARNING"): + self._print("[WARNING]", message) + if section == "END" or section == "BOTH": + self._section() + + def error(self, message: str, section: Literal["START", "END", "BOTH"] | None = None) -> None: + if section == "START" or section == "BOTH": + self._section() + self._print("[ERROR]", message) + if section == "END" or section == "BOTH": + self._section() + + def _section(self) -> None: + if self.write and self.file_handler: + 
self.file_handler.write( + "--------------------------------------------------------------------------------\n" + ) + print("--------------------------------------------------------------------------------") + + def close(self) -> None: + if self.file_handler: + try: + self.file_handler.close() + except Exception as e: + print(f"[LOGGER_CLEANUP_ERROR] Error closing log file: {e}") + self.file_handler = None diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_repair/services/RepairService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_repair/services/RepairService.py new file mode 100644 index 00000000..515359ef --- /dev/null +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_repair/services/RepairService.py @@ -0,0 +1,124 @@ +import abc +from cognite.client import CogniteClient +from cognite.client.data_classes.data_modeling import ( + EdgeList, + EdgeApply, + NodeOrEdgeData, + DirectRelationReference, +) +from services.ConfigService import Config +from services.LoggerService import CogniteFunctionLogger +from utils.DataStructures import DiagramAnnotationStatus + + +class IRepairService(abc.ABC): + @abc.abstractmethod + def run(self) -> str | None: + pass + + +class GeneralRepairService(IRepairService): + def __init__(self, client: CogniteClient, config: Config, logger: CogniteFunctionLogger): + self.client = client + self.config = config + self.logger = logger + self.core_annotation_view = self.config.data_model_views.core_annotation_view + self.file_view = self.config.data_model_views.file_view + self.target_entities_view = self.config.data_model_views.target_entities_view + self.sink_node_ref = DirectRelationReference( + space=self.config.finalize_function.sink_node.space, + external_id=self.config.finalize_function.sink_node.external_id, + ) + + def run(self) -> str | None: + """Main entrypoint for the repair service.""" + candidates = 
self._get_repair_candidates() + if not candidates: + self.logger.info("No repair candidates found.") + return "Done" + + self.logger.info(f"Found {len(candidates)} repair candidates. Starting processing.") + edges_to_update = [] + for edge in candidates: + properties = edge.properties[self.core_annotation_view.as_view_id()] + text_to_find = properties.get("startNodeText") + if not text_to_find: + continue + if properties.get("type") == "diagrams.FileLink": + search_space: str | None = self.file_view.instance_space + else: + search_space: str | None = self.target_entities_view.instance_space + found_nodes = self._find_global_entity(text_to_find, search_space) + edge_apply = self._prepare_edge_update(edge, found_nodes) + if edge_apply: + edges_to_update.append(edge_apply) + + if edges_to_update: + self.client.data_modeling.instances.apply(edges=edges_to_update) + self.logger.info(f"Successfully processed {len(edges_to_update)} edges.") + else: + self.logger.info("No edges were updated in this run.") + + return None # Continue running if more candidates might exist + + def _get_repair_candidates(self) -> EdgeList | None: + """Queries for suggested edges pointing to the sink node that haven't been repair-attempted.""" + return self.client.data_modeling.instances.list( + instance_type="edge", + sources=[self.core_annotation_view.as_view_id()], + filter={ + "and": [ + { + "equals": { + "property": ["edge", "endNode"], + "value": {"space": self.sink_node_ref.space, "externalId": self.sink_node_ref.external_id}, + } + }, + {"equals": {"property": self.core_annotation_view.as_property_ref("status"), "value": "Suggested"}}, + { + "not": { + "containsAny": { + "property": self.core_annotation_view.as_property_ref("tags"), + "values": ["repair-attempted"], + } + } + }, + ] + }, + limit=500, # Batch size + ) + + def _find_global_entity(self, text: str, space: str | None): + """Performs a global, un-scoped search for an entity matching the given text.""" + # NOTE: This approach 
is likely the slowest since we have to query against all instances, in a given space. + # Pros: The most accurate and guaranteed approach + # Cons: Will likely timeout as the amount of instances in a given space increase + return self.client.data_modeling.instances.list( + instance_type="node", + sources=[self.target_entities_view.as_view_id()], + filter={"equals": {"property": self.target_entities_view.as_property_ref("aliases"), "value": text}}, + space=space, + limit=2, # Limit to 2 to detect ambiguity + ) + + def _prepare_edge_update(self, edge: EdgeApply, found_nodes) -> EdgeApply | None: + """Prepares the EdgeApply object for the update based on the number of matches found.""" + edge_apply = edge.as_write() + properties = edge_apply.sources[0].properties + tags = properties.get("tags", []) + + if len(found_nodes) == 1: # Success + self.logger.info(f"Found single match for '{properties.get('startNodeText')}'. Promoting edge.") + edge_apply.end_node = found_nodes[0].as_direct_relation() + properties["status"] = DiagramAnnotationStatus.APPROVED.value + tags.append("repaired-auto") + elif len(found_nodes) == 0: # Failure + self.logger.info(f"Found no match for '{properties.get('startNodeText')}'. Rejecting edge.") + properties["status"] = DiagramAnnotationStatus.REJECTED.value + tags.append("repair-attempted") + else: # Ambiguous + self.logger.info(f"Found multiple matches for '{properties.get('startNodeText')}'. 
Marking as ambiguous.") + tags.extend(["repair-attempted", "ambiguous-match"]) + + properties["tags"] = tags + return edge_apply From 649c00638f760361a33c349f10126c7afa0e72dd Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Wed, 15 Oct 2025 11:08:30 -0500 Subject: [PATCH 095/128] changed repair service to promote service --- .../dependencies.py | 85 +++++++++++++++++++ .../handler.py | 41 ++++++--- .../services/ConfigService.py | 0 .../services/LoggerService.py | 0 .../services/PromoteService.py} | 33 +++---- .../fn_file_annotation_repair/dependencies.py | 17 ---- 6 files changed, 129 insertions(+), 47 deletions(-) create mode 100644 modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/dependencies.py rename modules/contextualization/cdf_file_annotation/functions/{fn_file_annotation_repair => fn_file_annotation_promote}/handler.py (57%) rename modules/contextualization/cdf_file_annotation/functions/{fn_file_annotation_repair => fn_file_annotation_promote}/services/ConfigService.py (100%) rename modules/contextualization/cdf_file_annotation/functions/{fn_file_annotation_repair => fn_file_annotation_promote}/services/LoggerService.py (100%) rename modules/contextualization/cdf_file_annotation/functions/{fn_file_annotation_repair/services/RepairService.py => fn_file_annotation_promote/services/PromoteService.py} (81%) delete mode 100644 modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_repair/dependencies.py diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/dependencies.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/dependencies.py new file mode 100644 index 00000000..326e50cb --- /dev/null +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/dependencies.py @@ -0,0 +1,85 @@ +import os + +from pathlib import Path +from dotenv import load_dotenv +from typing import Any, Tuple, Literal +from 
def get_env_variables() -> "EnvConfig":
    """Load and validate the CDF/IdP environment variables from .env.

    Raises:
        ValueError: if any required environment variable is missing.
    """
    print("Loading environment variables from .env...")

    project_path = (Path(__file__).parent / ".env").resolve()
    print(f"project_path is set to: {project_path}")

    load_dotenv()

    required_envvars = (
        "CDF_PROJECT",
        "CDF_CLUSTER",
        "IDP_TENANT_ID",
        "IDP_CLIENT_ID",
        "IDP_CLIENT_SECRET",
    )

    missing = [envvar for envvar in required_envvars if envvar not in os.environ]
    if missing:
        raise ValueError(f"Missing one or more env.vars: {missing}")

    return EnvConfig(
        cdf_project=os.getenv("CDF_PROJECT"),  # type: ignore
        cdf_cluster=os.getenv("CDF_CLUSTER"),  # type: ignore
        tenant_id=os.getenv("IDP_TENANT_ID"),  # type: ignore
        client_id=os.getenv("IDP_CLIENT_ID"),  # type: ignore
        client_secret=os.getenv("IDP_CLIENT_SECRET"),  # type: ignore
    )


def create_client(env_config: "EnvConfig", debug: bool = False):
    """Build an OAuth client-credentials CogniteClient for the configured project."""
    SCOPES = [f"https://{env_config.cdf_cluster}.cognitedata.com/.default"]
    TOKEN_URL = f"https://login.microsoftonline.com/{env_config.tenant_id}/oauth2/v2.0/token"
    creds = OAuthClientCredentials(
        token_url=TOKEN_URL,
        client_id=env_config.client_id,
        client_secret=env_config.client_secret,
        scopes=SCOPES,
    )
    # SECURITY NOTE(review): disabling SSL verification exposes credentials to
    # man-in-the-middle attacks. This looks like a local/dev workaround — confirm
    # it is not shipped to production.
    settings = {
        "disable_ssl": True,
    }
    global_config.apply_settings(settings)
    cnf = ClientConfig(
        client_name="DEV_Working",
        project=env_config.cdf_project,
        base_url=f"https://p001.plink.{env_config.cdf_cluster}.cognitedata.com",
        credentials=creds,
        debug=debug,
    )
    client = CogniteClient(cnf)
    return client


def create_logger_service(log_level, filepath: str | None):
    """Build the function logger; file output is enabled whenever a filepath is given."""
    write = bool(filepath)
    if log_level not in ("DEBUG", "INFO", "WARNING", "ERROR"):
        # BUGFIX: an invalid level previously discarded the requested file
        # logging entirely; now only the level falls back to the default.
        log_level = "INFO"
    return CogniteFunctionLogger(log_level=log_level, write=write, filepath=filepath)


def create_config_service(
    function_data: dict, client: "CogniteClient | None" = None
) -> "Tuple[Config, CogniteClient]":
    """Load the pipeline Config, creating a client from env vars when none is supplied."""
    if not client:
        env_config = get_env_variables()
        client = create_client(env_config)
    config = load_config_parameters(client=client, function_data=function_data)
    return config, client


def handle(data: dict, function_call_info: dict, client: CogniteClient):
    """Main entry point for the Cognite Function."""
    start_time = datetime.now(timezone.utc)

    config, client = create_config_service(function_data=data)
    logger = create_logger_service(data.get("logLevel", "DEBUG"), data.get("logPath"))
    promote_service = GeneralPromoteService(
        client=client,
        config=config,
        logger=logger,
    )

    try:
        # Run in a loop for a maximum of 7 minutes
        while datetime.now(timezone.utc) - start_time < timedelta(minutes=7):
            result = promote_service.run()
            if result == "Done":
                logger.info("No more candidates to process. Exiting.", section="END")
                break
            time.sleep(10)  # Pause between batches

        return {"status": "success", "message": "promote function completed a cycle."}
    except Exception as e:
        # Boundary handler: report the failure back to the function runtime.
        logger.error(f"An unexpected error occurred: {e}", section="BOTH")
        return {"status": "failure", "message": str(e)}


def run_locally(config_file: dict):
    """Entry point for local execution and debugging."""
    from dependencies import create_client, get_env_variables

    env_vars = get_env_variables()
    client = create_client(env_vars)

    # Mock function_call_info for local runs
    function_call_info = {"function_id": "local", "call_id": "local"}

    handle(config_file, function_call_info, client)


if __name__ == "__main__":
    # BUGFIX: guard sys.argv access — logLevel and logPath are now optional and
    # a missing pipeline id produces a usage message instead of an IndexError.
    if len(sys.argv) < 2:
        sys.exit("Usage: python handler.py <ExtractionPipelineExtId> [logLevel] [logPath]")
    config_file = {
        "ExtractionPipelineExtId": sys.argv[1],
        "logLevel": sys.argv[2] if len(sys.argv) > 2 else "DEBUG",
        "logPath": sys.argv[3] if len(sys.argv) > 3 else None,
    }
    run_locally(config_file)
a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_repair/services/LoggerService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/LoggerService.py similarity index 100% rename from modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_repair/services/LoggerService.py rename to modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/LoggerService.py diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_repair/services/RepairService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/PromoteService.py similarity index 81% rename from modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_repair/services/RepairService.py rename to modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/PromoteService.py index 515359ef..d3170410 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_repair/services/RepairService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/PromoteService.py @@ -11,13 +11,13 @@ from utils.DataStructures import DiagramAnnotationStatus -class IRepairService(abc.ABC): +class IPromoteService(abc.ABC): @abc.abstractmethod def run(self) -> str | None: pass -class GeneralRepairService(IRepairService): +class GeneralPromoteService(IPromoteService): def __init__(self, client: CogniteClient, config: Config, logger: CogniteFunctionLogger): self.client = client self.config = config @@ -26,18 +26,18 @@ def __init__(self, client: CogniteClient, config: Config, logger: CogniteFunctio self.file_view = self.config.data_model_views.file_view self.target_entities_view = self.config.data_model_views.target_entities_view self.sink_node_ref = DirectRelationReference( - space=self.config.finalize_function.sink_node.space, - 
external_id=self.config.finalize_function.sink_node.external_id, + space=self.config.finalize_function.apply_service.sink_node.space, + external_id=self.config.finalize_function.apply_service.sink_node.external_id, ) def run(self) -> str | None: - """Main entrypoint for the repair service.""" - candidates = self._get_repair_candidates() + """Main entrypoint for the Promote service.""" + candidates = self._get_Promote_candidates() if not candidates: - self.logger.info("No repair candidates found.") + self.logger.info("No Promote candidates found.") return "Done" - self.logger.info(f"Found {len(candidates)} repair candidates. Starting processing.") + self.logger.info(f"Found {len(candidates)} Promote candidates. Starting processing.") edges_to_update = [] for edge in candidates: properties = edge.properties[self.core_annotation_view.as_view_id()] @@ -55,14 +55,15 @@ def run(self) -> str | None: if edges_to_update: self.client.data_modeling.instances.apply(edges=edges_to_update) + self.client.raw.rows.insert self.logger.info(f"Successfully processed {len(edges_to_update)} edges.") else: self.logger.info("No edges were updated in this run.") return None # Continue running if more candidates might exist - def _get_repair_candidates(self) -> EdgeList | None: - """Queries for suggested edges pointing to the sink node that haven't been repair-attempted.""" + def _get_Promote_candidates(self) -> EdgeList | None: + """Queries for suggested edges pointing to the sink node that haven't been Promote-attempted.""" return self.client.data_modeling.instances.list( instance_type="edge", sources=[self.core_annotation_view.as_view_id()], @@ -79,7 +80,7 @@ def _get_repair_candidates(self) -> EdgeList | None: "not": { "containsAny": { "property": self.core_annotation_view.as_property_ref("tags"), - "values": ["repair-attempted"], + "values": ["Promote-attempted"], } } }, @@ -96,7 +97,7 @@ def _find_global_entity(self, text: str, space: str | None): return 
self.client.data_modeling.instances.list( instance_type="node", sources=[self.target_entities_view.as_view_id()], - filter={"equals": {"property": self.target_entities_view.as_property_ref("aliases"), "value": text}}, + filter={"in": {"property": self.target_entities_view.as_property_ref("aliases"), "values": [text]}}, space=space, limit=2, # Limit to 2 to detect ambiguity ) @@ -109,16 +110,16 @@ def _prepare_edge_update(self, edge: EdgeApply, found_nodes) -> EdgeApply | None if len(found_nodes) == 1: # Success self.logger.info(f"Found single match for '{properties.get('startNodeText')}'. Promoting edge.") - edge_apply.end_node = found_nodes[0].as_direct_relation() + edge_apply.end_node = DirectRelationReference(found_nodes[0].space, found_nodes[0].external_id) properties["status"] = DiagramAnnotationStatus.APPROVED.value - tags.append("repaired-auto") + tags.append("Promoteed-auto") elif len(found_nodes) == 0: # Failure self.logger.info(f"Found no match for '{properties.get('startNodeText')}'. Rejecting edge.") properties["status"] = DiagramAnnotationStatus.REJECTED.value - tags.append("repair-attempted") + tags.append("Promote-attempted") else: # Ambiguous self.logger.info(f"Found multiple matches for '{properties.get('startNodeText')}'. 
Marking as ambiguous.") - tags.extend(["repair-attempted", "ambiguous-match"]) + tags.extend(["Promote-attempted", "ambiguous-match"]) properties["tags"] = tags return edge_apply diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_repair/dependencies.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_repair/dependencies.py deleted file mode 100644 index 5e223ffd..00000000 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_repair/dependencies.py +++ /dev/null @@ -1,17 +0,0 @@ -from cognite.client import CogniteClient -from services.ConfigService import Config, load_config_parameters -from services.LoggerService import CogniteFunctionLogger -from services.RepairService import GeneralRepairService - -def create_services(data: dict, client: CogniteClient): - """Factory function to create all necessary services.""" - config, client = load_config_parameters(client=client, function_data=data) - logger = CogniteFunctionLogger(log_level=data.get("logLevel", "INFO")) - - repair_service = GeneralRepairService( - client=client, - config=config, - logger=logger, - ) - - return repair_service, logger \ No newline at end of file From 018621ee6eb20089a74a9fd539527a35e103614a Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Wed, 15 Oct 2025 16:44:12 -0500 Subject: [PATCH 096/128] built out cache mechanism and query plan for the text --- .../dependencies.py | 46 ++- .../fn_file_annotation_promote/handler.py | 15 +- .../services/CacheService.py | 292 +++++++++++++++ .../services/ConfigService.py | 7 +- .../services/EntitySearchService.py | 306 ++++++++++++++++ .../services/PromoteService.py | 241 ++++++++++--- .../utils/DataStructures.py | 331 ++++++++++++++++++ 7 files changed, 1181 insertions(+), 57 deletions(-) create mode 100644 modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/CacheService.py create mode 100644 
modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/EntitySearchService.py create mode 100644 modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/utils/DataStructures.py diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/dependencies.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/dependencies.py index 326e50cb..8d26fbb8 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/dependencies.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/dependencies.py @@ -9,6 +9,8 @@ from services.ConfigService import Config, load_config_parameters from services.LoggerService import CogniteFunctionLogger +from services.EntitySearchService import EntitySearchService +from services.CacheService import CacheService def get_env_variables() -> EnvConfig: @@ -56,7 +58,7 @@ def create_client(env_config: EnvConfig, debug: bool = False): cnf = ClientConfig( client_name="DEV_Working", project=env_config.cdf_project, - base_url=f"https://p001.plink.{env_config.cdf_cluster}.cognitedata.com", + base_url=f"https://{env_config.cdf_cluster}.cognitedata.com", credentials=creds, debug=debug, ) @@ -83,3 +85,45 @@ def create_config_service( client = create_client(env_config) config = load_config_parameters(client=client, function_data=function_data) return config, client + + +def create_entity_search_service( + config: Config, client: CogniteClient, logger: CogniteFunctionLogger +) -> EntitySearchService: + """Creates an EntitySearchService instance for finding entities by text.""" + # Get required configuration + core_annotation_view = config.data_model_views.core_annotation_view + file_view = config.data_model_views.file_view + target_entities_view = config.data_model_views.target_entities_view + regular_annotation_space = file_view.instance_space + + if not 
regular_annotation_space: + raise ValueError("regular_annotation_space (file_view.instance_space) is required but was None") + + return EntitySearchService( + client=client, + logger=logger, + core_annotation_view_id=core_annotation_view.as_view_id(), + file_view_id=file_view.as_view_id(), + target_entities_view_id=target_entities_view.as_view_id(), + regular_annotation_space=regular_annotation_space, + ) + + +def create_cache_service( + config: Config, client: CogniteClient, logger: CogniteFunctionLogger, entity_search_service: EntitySearchService +) -> CacheService: + """Creates a CacheService instance for caching text→entity mappings.""" + raw_db = config.finalize_function.apply_service.raw_db + file_view = config.data_model_views.file_view + target_entities_view = config.data_model_views.target_entities_view + + return CacheService( + client=client, + logger=logger, + raw_db=raw_db, + normalize_fn=entity_search_service.normalize, # Reuse normalization from entity search + file_view_id=file_view.as_view_id(), + target_entities_view_id=target_entities_view.as_view_id(), + cache_table_name="promote_text_to_entity_cache", + ) diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/handler.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/handler.py index b4b1e960..b08bfb4e 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/handler.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/handler.py @@ -3,7 +3,12 @@ import time from datetime import datetime, timezone, timedelta from cognite.client import CogniteClient -from dependencies import create_config_service, create_logger_service +from dependencies import ( + create_config_service, + create_logger_service, + create_entity_search_service, + create_cache_service, +) from services.PromoteService import GeneralPromoteService @@ -13,10 +18,18 @@ def 
import abc
from datetime import datetime, timezone, timedelta
from typing import Callable
from cognite.client import CogniteClient
from cognite.client.data_classes.data_modeling import Node, NodeList
from cognite.client.data_classes.data_modeling.ids import ViewId
from cognite.client.data_classes.raw import Row
from services.LoggerService import CogniteFunctionLogger


class ICacheService(abc.ABC):
    @abc.abstractmethod
    def get(self, text: str, annotation_type: str) -> Node | None:
        """Retrieves a cached entity node for the given text and annotation type."""
        pass

    @abc.abstractmethod
    def set(self, text: str, annotation_type: str, node: Node | None) -> None:
        """Caches an entity node for the given text and annotation type (None records a negative result)."""
        pass

    @abc.abstractmethod
    def get_from_memory(self, text: str, annotation_type: str) -> Node | None:
        """Retrieves from in-memory cache only (no persistent storage lookup)."""
        pass


class CacheService(ICacheService):
    """
    Manages two-tier caching for text → entity mappings.

    TIER 1: In-memory cache (this run only)
    - Ultra-fast lookup (<1ms)
    - Includes negative caching (None for no match)

    TIER 2: Persistent RAW cache (all runs)
    - Fast lookup (5-10ms)
    - Benefits all future runs
    """

    def __init__(
        self,
        client: CogniteClient,
        logger: CogniteFunctionLogger,
        raw_db: str,
        normalize_fn: Callable[[str], str],
        file_view_id: ViewId,
        target_entities_view_id: ViewId,
        cache_table_name: str = "promote_text_to_entity_cache",
    ):
        """
        Initializes the cache service.

        Args:
            client: Cognite client
            logger: Logger instance
            raw_db: RAW database name
            normalize_fn: Function to normalize text for cache keys
            file_view_id: View ID for file entities
            target_entities_view_id: View ID for target entities (assets, etc.)
            cache_table_name: Name of the RAW table for persistent cache
        """
        self.client = client
        self.logger = logger
        self.raw_db = raw_db
        self.normalize = normalize_fn
        self.file_view_id = file_view_id
        self.target_entities_view_id = target_entities_view_id
        self.cache_table_name = cache_table_name

        # In-memory cache: {(text, type): (space, ext_id) or None}
        self._memory_cache: dict[tuple[str, str], tuple[str, str] | None] = {}

    def _view_for(self, annotation_type: str) -> ViewId:
        """Returns the view to resolve cached node ids against for the given annotation type."""
        return self.file_view_id if annotation_type == "diagrams.FileLink" else self.target_entities_view_id

    def _retrieve_node(self, space: str, ext_id: str, annotation_type: str) -> Node | None:
        """
        Fetches a single node by (space, external_id).

        Shared by every cache-lookup path so the retrieve/extract logic lives in one place.
        """
        retrieved = self.client.data_modeling.instances.retrieve_nodes(
            nodes=(space, ext_id), sources=self._view_for(annotation_type)
        )
        return self._extract_single_node(retrieved) if retrieved else None

    def get(self, text: str, annotation_type: str) -> Node | None:
        """
        Retrieves a cached entity node for the given text and annotation type.

        Checks in-memory cache first, then falls back to persistent RAW cache.

        Args:
            text: The text to look up
            annotation_type: Type of annotation ("diagrams.FileLink" or "diagrams.AssetLink")

        Returns:
            Cached Node if found, None on cache miss or a cached negative result
        """
        cache_key = (text, annotation_type)

        # TIER 1: In-memory cache (instant)
        if cache_key in self._memory_cache:
            cached_result = self._memory_cache[cache_key]
            if cached_result is None:
                # Negative cache entry
                return None

            space, ext_id = cached_result
            try:
                node = self._retrieve_node(space, ext_id, annotation_type)
                if node is not None:
                    self.logger.debug(f"✓ In-memory cache HIT for '{text}'")
                    return node
            except Exception as e:
                self.logger.warning(f"Failed to retrieve cached node for '{text}': {e}")
                # Invalidate this cache entry
                del self._memory_cache[cache_key]
            return None

        # TIER 2: Persistent RAW cache (fast)
        cached_node = self._get_from_persistent_cache(text, annotation_type)
        if cached_node:
            self.logger.info(f"✓ Persistent cache HIT for '{text}'")
            # Populate in-memory cache for future lookups in this run
            self._memory_cache[cache_key] = (cached_node.space, cached_node.external_id)
            return cached_node

        # Cache miss
        return None

    def get_from_memory(self, text: str, annotation_type: str) -> Node | None:
        """
        Retrieves from in-memory cache only (no persistent storage lookup).

        Useful for checking if we've already looked up this text in this run.

        Args:
            text: The text to look up
            annotation_type: Type of annotation

        Returns:
            Cached Node if found in memory, None otherwise
        """
        cached_result = self._memory_cache.get((text, annotation_type))
        if cached_result is None:
            # Key absent or negative entry — both are "not in memory"
            return None

        space, ext_id = cached_result
        try:
            return self._retrieve_node(space, ext_id, annotation_type)
        except Exception:
            # Best-effort lookup: a retrieval failure is treated as a miss
            return None

    def set(self, text: str, annotation_type: str, node: Node | None) -> None:
        """
        Caches an entity node for the given text and annotation type.

        Only unambiguous single matches should be cached. Updates both in-memory
        and persistent caches.

        Args:
            text: The text being cached
            annotation_type: Type of annotation
            node: The entity node to cache, or None for negative caching
        """
        cache_key = (text, annotation_type)

        if node is None:
            # Negative cache entry (remember that no match was found)
            self._memory_cache[cache_key] = None
            self.logger.debug(f"✓ Cached negative result for '{text}'")
            return

        # Positive cache entry
        self._memory_cache[cache_key] = (node.space, node.external_id)
        self._set_in_persistent_cache(text, annotation_type, node)
        self.logger.debug(f"✓ Cached unambiguous match for '{text}' → {node.external_id}")

    def _get_from_persistent_cache(self, text: str, annotation_type: str) -> Node | None:
        """
        Checks persistent RAW cache for text → entity mapping.

        Returns:
            Node if cache hit, None if miss
        """
        try:
            # Normalize text for consistent cache keys
            cache_key = self.normalize(text)

            row = self.client.raw.rows.retrieve(
                db_name=self.raw_db,
                table_name=self.cache_table_name,
                key=cache_key,
            )

            if not row or not row.columns:
                return None

            # Verify annotation type matches
            if row.columns.get("annotationType") != annotation_type:
                return None

            end_node_space = row.columns.get("endNodeSpace")
            end_node_ext_id = row.columns.get("endNode")
            if not end_node_space or not end_node_ext_id:
                return None

            return self._retrieve_node(end_node_space, end_node_ext_id, annotation_type)

        except Exception as e:
            # Cache miss or error - just continue without cache
            self.logger.debug(f"Cache check failed for '{text}': {e}")
            return None

    def _set_in_persistent_cache(self, text: str, annotation_type: str, node: Node) -> None:
        """
        Updates persistent RAW cache with text → entity mapping.
        Only caches unambiguous single matches.
        """
        try:
            cache_data = Row(
                key=self.normalize(text),
                columns={
                    "originalText": text,
                    "endNode": node.external_id,
                    "endNodeSpace": node.space,
                    "annotationType": annotation_type,
                    "lastUpdateTimeUtcIso": datetime.now(timezone.utc).isoformat(),
                },
            )

            self.client.raw.rows.insert(
                db_name=self.raw_db,
                table_name=self.cache_table_name,
                row=cache_data,
                ensure_parent=True,
            )

        except Exception as e:
            # Don't fail the run if cache update fails
            self.logger.warning(f"Failed to update cache for '{text}': {e}")

    def _extract_single_node(self, retrieved: Node | NodeList) -> Node | None:
        """
        Extracts a single Node from the retrieved result.

        Handles both single Node and NodeList returns from the SDK.
        """
        if isinstance(retrieved, NodeList) and len(retrieved) > 0:
            first_node = retrieved[0]
            return first_node if isinstance(first_node, Node) else None
        elif isinstance(retrieved, Node):
            return retrieved
        else:
            return None

    def get_stats(self) -> dict[str, int]:
        """
        Returns statistics about the in-memory cache.

        Returns:
            Dictionary with cache statistics
        """
        total_entries = len(self._memory_cache)
        negative_entries = sum(1 for v in self._memory_cache.values() if v is None)

        return {
            "total_entries": total_entries,
            "positive_entries": total_entries - negative_entries,
            "negative_entries": negative_entries,
        }

    def clear_memory_cache(self) -> None:
        """Clears the in-memory cache. Useful for testing."""
        self._memory_cache.clear()
        self.logger.debug("In-memory cache cleared")
import abc
import re
from typing import Callable
from cognite.client import CogniteClient
from cognite.client.data_classes.data_modeling import Node, NodeList, ViewId
from cognite.client.data_classes.filters import Filter, Equals, In
from services.LoggerService import CogniteFunctionLogger


class IEntitySearchService(abc.ABC):
    @abc.abstractmethod
    def find_entity(self, text: str, annotation_type: str, entity_space: str) -> list[Node]:
        """Finds entities matching the given text using multiple strategies."""
        pass


class EntitySearchService(IEntitySearchService):
    """
    Finds entities by text using multiple search strategies.

    Search Strategy:
    1. Existing annotations (fast, reliable, leverages proven connections)
    2. Global entity search (slow, comprehensive, fallback)

    Utilities:
    - Text variation generation (handles case, leading zeros)
    - Text normalization (for comparison)
    """

    def __init__(
        self,
        client: CogniteClient,
        logger: CogniteFunctionLogger,
        core_annotation_view_id: ViewId,
        file_view_id: ViewId,
        target_entities_view_id: ViewId,
        regular_annotation_space: str,
    ):
        """
        Initializes the entity search service.

        Args:
            client: Cognite client
            logger: Logger instance
            core_annotation_view_id: View ID for annotation edges
            file_view_id: View ID for file entities
            target_entities_view_id: View ID for target entities (assets, etc.)
            regular_annotation_space: Space where regular (non-pattern) annotations are stored
        """
        self.client = client
        self.logger = logger
        self.core_annotation_view_id = core_annotation_view_id
        self.file_view_id = file_view_id
        self.target_entities_view_id = target_entities_view_id
        self.regular_annotation_space = regular_annotation_space

    def find_entity(self, text: str, annotation_type: str, entity_space: str) -> list[Node]:
        """
        Finds entities matching the given text using multiple strategies.

        This is the main entry point for entity search.

        Strategy:
        1. Try existing annotations (fast, 50-100ms)
        2. Fall back to global search (slow, 500ms-2s)

        Args:
            text: Text to search for (e.g., "V-123", "G18A-921")
            annotation_type: Type of annotation ("diagrams.FileLink" or "diagrams.AssetLink")
            entity_space: Space to search in for global fallback

        Returns:
            List of matched nodes:
            - [] if no match found
            - [node] if single unambiguous match
            - [node1, node2] if ambiguous (multiple matches)
        """
        # STRATEGY 1: Query existing annotations (primary, fast)
        found_nodes = self.find_from_existing_annotations(text, annotation_type)

        if not found_nodes:
            # STRATEGY 2: Global entity search (fallback, slow)
            self.logger.debug(f"No match in existing annotations for '{text}'. Trying global entity search.")
            found_nodes = self.find_global_entity(text, entity_space)

        return found_nodes

    def find_from_existing_annotations(self, text: str, annotation_type: str) -> list[Node]:
        """
        Searches for existing successful annotations with matching startNodeText.

        This is MUCH faster than querying all entity aliases because:
        1. Queries edges directly with server-side filtering (indexed and fast)
        2. Uses IN filter with text variations to handle common differences
        3. Only searches proven successful annotations
        4. Handles cross-scope scenarios naturally (entity in different site/unit)

        Args:
            text: The text to search for (e.g., "G18A-921")
            annotation_type: "diagrams.FileLink" or "diagrams.AssetLink"

        Returns:
            List of matched entity nodes (0, 1, or 2+ for ambiguous)
        """
        try:
            # Generate variations of the search text
            text_variations = self.generate_text_variations(text)
            self.logger.debug(f"Searching for text variations: {text_variations}")

            # Query edges directly with IN filter (server-side, fast!)
            # These are annotation edges from regular diagram detect (not pattern mode)
            text_filter: Filter = In(self.core_annotation_view_id.as_property_ref("startNodeText"), text_variations)
            edges = self.client.data_modeling.instances.list(
                instance_type="edge",
                sources=[self.core_annotation_view_id],
                filter=text_filter,
                space=self.regular_annotation_space,  # Where regular annotations live
                limit=1000,  # Reasonable limit
            )

            if not edges:
                return []

            # Count occurrences of each endNode
            matched_end_nodes: dict[tuple[str, str], int] = {}  # {(space, externalId): count}
            for edge in edges:
                # Check annotation type matches
                edge_props = edge.properties.get(self.core_annotation_view_id, {})
                if edge_props.get("type") != annotation_type:
                    continue  # Skip edges of different type

                # Extract endNode from the edge
                end_node_ref = edge.end_node
                if end_node_ref:
                    key = (end_node_ref.space, end_node_ref.external_id)
                    matched_end_nodes[key] = matched_end_nodes.get(key, 0) + 1

            if not matched_end_nodes:
                return []

            # If multiple different endNodes found, it's ambiguous
            if len(matched_end_nodes) > 1:
                self.logger.warning(
                    f"Found {len(matched_end_nodes)} different entities for '{text}' in existing annotations. "
                    f"This indicates data quality issues or legitimate ambiguity."
                )
                # Return list of most common matches (limit to 2 for ambiguity detection)
                sorted_matches = sorted(matched_end_nodes.items(), key=lambda x: x[1], reverse=True)
                top_matches = [match[0] for match in sorted_matches[:2]]
            else:
                # Single consistent match found
                top_matches = [next(iter(matched_end_nodes))]

            # Fetch the actual node objects for the matched entities
            view_to_use = self.file_view_id if annotation_type == "diagrams.FileLink" else self.target_entities_view_id

            matched_nodes: list[Node] = []
            for space, ext_id in top_matches:
                retrieved = self.client.data_modeling.instances.retrieve_nodes(
                    nodes=(space, ext_id), sources=view_to_use
                )
                # FIX: NodeList is not a `list` subclass, so the previous
                # `isinstance(retrieved, list)` check silently appended whole
                # NodeLists. Check NodeList/Node explicitly (consistent with
                # CacheService._extract_single_node).
                if isinstance(retrieved, NodeList):
                    matched_nodes.extend(retrieved)
                elif isinstance(retrieved, Node):
                    matched_nodes.append(retrieved)

            if matched_nodes:
                self.logger.info(
                    f"Found {len(matched_nodes)} match(es) for '{text}' from existing annotations "
                    f"(appeared {matched_end_nodes.get((matched_nodes[0].space, matched_nodes[0].external_id), 0)} times)"
                )

            return matched_nodes

        except Exception as e:
            self.logger.error(f"Error searching existing annotations for '{text}': {e}")
            return []

    def find_global_entity(self, text: str, entity_space: str) -> list[Node]:
        """
        Performs a global, un-scoped search for an entity matching the given text.
        Uses normalized matching to handle variations like "V-0912" vs "V-912".

        NOTE: This approach queries all instances in a given space.
        Pros: The most accurate and guaranteed approach
        Cons: Will likely timeout as the amount of instances in a given space increase

        Args:
            text: Text to search for
            entity_space: Space to search in

        Returns:
            List of matched nodes (0, 1, or 2 for ambiguity detection)
        """
        normalized_text = self.normalize(text)

        # Fetch all entities in the space (with reasonable limit)
        # NOTE: We can't do normalized matching server-side, so we fetch and filter client-side
        try:
            entities = self.client.data_modeling.instances.list(
                instance_type="node",
                sources=[self.target_entities_view_id],
                space=entity_space,
                limit=1000,  # Reasonable limit to prevent timeouts
            )
        except Exception as e:
            self.logger.error(f"Error fetching entities from space '{entity_space}': {e}")
            return []

        # Client-side normalized matching against aliases
        matches: list[Node] = []
        for entity in entities:
            entity_props = entity.properties.get(self.target_entities_view_id, {})
            aliases = entity_props.get("aliases", [])

            # Ensure aliases is iterable
            if not isinstance(aliases, list):
                continue

            # Check if any alias matches after normalization
            for alias in aliases:
                if isinstance(alias, str) and self.normalize(alias) == normalized_text:
                    matches.append(entity)
                    # Stop after finding 2 matches (ambiguous case)
                    if len(matches) >= 2:
                        self.logger.warning(
                            f"Found multiple entities with alias matching '{text}' (normalized: '{normalized_text}'). "
                            f"This is ambiguous."
                        )
                        return matches[:2]
                    break  # Move to next entity after finding match

        if matches:
            self.logger.info(f"Found {len(matches)} match(es) for '{text}' via global entity search")

        return matches

    def generate_text_variations(self, text: str) -> list[str]:
        """
        Generates common variations of a text string to improve matching.

        The returned order is deterministic (original first), which keeps the
        IN-filter payload and logs stable across runs.

        Examples:
            "14-V-0937" → ["14-V-0937", "14-v-0937", "14-V-937", "14-v-937"]

        Args:
            text: Original text from pattern detection

        Returns:
            List of unique text variations (original + common transformations)
        """

        def strip_leading_zeros_in_text(s: str) -> str:
            # Remove leading zeros from every digit run ("V-0937" → "V-937")
            return re.sub(r"\b0+(\d+)", r"\1", s)

        # dict preserves insertion order and deduplicates (unlike a set,
        # whose iteration order is not deterministic)
        ordered: dict[str, None] = {}
        for candidate in (
            text,
            text.lower(),
            strip_leading_zeros_in_text(text),
            strip_leading_zeros_in_text(text.lower()),
        ):
            ordered[candidate] = None
        return list(ordered)

    def normalize(self, s: str) -> str:
        """
        Normalizes a string for comparison.

        Process:
        1. Ensures it's a string
        2. Removes all non-alphanumeric characters
        3. Converts to lowercase
        4. Removes leading zeros from any sequence of digits

        Examples:
            "V-0912" -> "v912"
            "FT-101A" -> "ft101a"
            "P&ID-0001" -> "pid1"

        Args:
            s: String to normalize

        Returns:
            Normalized string
        """
        if not isinstance(s, str):
            return ""

        # Step 1: Basic cleaning (e.g., "V-0912" -> "v0912")
        s = re.sub(r"[^a-zA-Z0-9]", "", s).lower()

        # Step 2: Collapse each digit run through int() to drop leading zeros
        # ("v0912" -> "v912")
        return re.sub(r"\d+", lambda m: str(int(m.group(0))), s)
a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/PromoteService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/PromoteService.py @@ -1,13 +1,20 @@ import abc +from typing import Any from cognite.client import CogniteClient +from cognite.client.data_classes import RowWrite from cognite.client.data_classes.data_modeling import ( + Edge, EdgeList, EdgeApply, + Node, NodeOrEdgeData, DirectRelationReference, + NodeList, ) from services.ConfigService import Config from services.LoggerService import CogniteFunctionLogger +from services.CacheService import CacheService +from services.EntitySearchService import EntitySearchService from utils.DataStructures import DiagramAnnotationStatus @@ -18,7 +25,14 @@ def run(self) -> str | None: class GeneralPromoteService(IPromoteService): - def __init__(self, client: CogniteClient, config: Config, logger: CogniteFunctionLogger): + def __init__( + self, + client: CogniteClient, + config: Config, + logger: CogniteFunctionLogger, + entity_search_service: EntitySearchService, + cache_service: CacheService, + ): self.client = client self.config = config self.logger = logger @@ -29,40 +43,88 @@ def __init__(self, client: CogniteClient, config: Config, logger: CogniteFunctio space=self.config.finalize_function.apply_service.sink_node.space, external_id=self.config.finalize_function.apply_service.sink_node.external_id, ) + self.raw_db = self.config.finalize_function.apply_service.raw_db + self.raw_pattern_table = self.config.finalize_function.apply_service.raw_table_doc_pattern + self.raw_doc_doc_table = self.config.finalize_function.apply_service.raw_table_doc_doc + self.raw_doc_tag_table = self.config.finalize_function.apply_service.raw_table_doc_tag + + # Injected service dependencies + self.entity_search_service = entity_search_service + self.cache_service = cache_service def run(self) -> str | None: """Main entrypoint for the Promote 
service.""" - candidates = self._get_Promote_candidates() + candidates = self._get_promote_candidates() if not candidates: self.logger.info("No Promote candidates found.") return "Done" self.logger.info(f"Found {len(candidates)} Promote candidates. Starting processing.") - edges_to_update = [] + + # Group candidates by (startNodeText, annotationType) for deduplication + grouped_candidates: dict[tuple[str, str], list[Edge]] = {} for edge in candidates: properties = edge.properties[self.core_annotation_view.as_view_id()] - text_to_find = properties.get("startNodeText") - if not text_to_find: + text = properties.get("startNodeText") + annotation_type = edge.type.external_id + + if text and annotation_type: + key = (text, annotation_type) + if key not in grouped_candidates: + grouped_candidates[key] = [] + grouped_candidates[key].append(edge) + + self.logger.info( + f"Grouped {len(candidates)} candidates into {len(grouped_candidates)} unique text/type combinations. " + f"Deduplication savings: {len(candidates) - len(grouped_candidates)} queries avoided." + ) + + edges_to_update = [] + raw_rows_to_update = [] + + # Process each unique text/type combination once + for (text_to_find, annotation_type), edges_with_same_text in grouped_candidates.items(): + entity_space = ( + self.file_view.instance_space + if annotation_type == "diagrams.FileLink" + else self.target_entities_view.instance_space + ) + + if not entity_space: + self.logger.warning(f"Could not determine entity space for type '{annotation_type}'. 
Skipping.") continue - if properties.get("type") == "diagrams.FileLink": - search_space: str | None = self.file_view.instance_space - else: - search_space: str | None = self.target_entities_view.instance_space - found_nodes = self._find_global_entity(text_to_find, search_space) - edge_apply = self._prepare_edge_update(edge, found_nodes) - if edge_apply: - edges_to_update.append(edge_apply) + + # Strategy: Check cache → query edges → fallback to global search + found_nodes = self._find_entity_with_cache(text_to_find, annotation_type, entity_space) + + # Apply the same result to ALL edges with this text + for edge in edges_with_same_text: + edge_apply, raw_row = self._prepare_edge_update(edge, found_nodes) + + if edge_apply: + edges_to_update.append(edge_apply) + if raw_row: + raw_rows_to_update.append(raw_row) if edges_to_update: self.client.data_modeling.instances.apply(edges=edges_to_update) - self.client.raw.rows.insert - self.logger.info(f"Successfully processed {len(edges_to_update)} edges.") - else: + self.logger.info(f"Successfully updated {len(edges_to_update)} edges in data model.") + + if raw_rows_to_update: + self.client.raw.rows.insert( + db_name=self.raw_db, + table_name=self.raw_pattern_table, + row=raw_rows_to_update, + ensure_parent=True, + ) + self.logger.info(f"Successfully updated {len(raw_rows_to_update)} rows in RAW table.") + + if not edges_to_update and not raw_rows_to_update: self.logger.info("No edges were updated in this run.") return None # Continue running if more candidates might exist - def _get_Promote_candidates(self) -> EdgeList | None: + def _get_promote_candidates(self) -> EdgeList | None: """Queries for suggested edges pointing to the sink node that haven't been Promote-attempted.""" return self.client.data_modeling.instances.list( instance_type="edge", @@ -89,37 +151,118 @@ def _get_Promote_candidates(self) -> EdgeList | None: limit=500, # Batch size ) - def _find_global_entity(self, text: str, space: str | None): - """Performs a 
global, un-scoped search for an entity matching the given text.""" - # NOTE: This approach is likely the slowest since we have to query against all instances, in a given space. - # Pros: The most accurate and guaranteed approach - # Cons: Will likely timeout as the amount of instances in a given space increase - return self.client.data_modeling.instances.list( - instance_type="node", - sources=[self.target_entities_view.as_view_id()], - filter={"in": {"property": self.target_entities_view.as_property_ref("aliases"), "values": [text]}}, - space=space, - limit=2, # Limit to 2 to detect ambiguity - ) + def _find_entity_with_cache(self, text: str, annotation_type: str, entity_space: str) -> list | None: + """ + Finds entity for text using multi-tier caching strategy. + + Strategy: + 1. Check cache (in-memory + persistent RAW) + 2. Use EntitySearchService (annotation edges → global search) + 3. Update cache if unambiguous match found + + Args: + text: Text to search for + annotation_type: Type of annotation + entity_space: Space to search in + + Returns: + List of matched nodes (empty if no match, 2+ if ambiguous) + """ + # TIER 1 & 2: Check cache (in-memory + persistent) + cached_node = self.cache_service.get(text, annotation_type) + if cached_node is not None: + return [cached_node] - def _prepare_edge_update(self, edge: EdgeApply, found_nodes) -> EdgeApply | None: - """Prepares the EdgeApply object for the update based on the number of matches found.""" + # Check if we've already determined there's no match + # (negative caching is handled internally by cache service) + if self.cache_service.get_from_memory(text, annotation_type) is None: + # We've checked this before in this run and found nothing + if (text, annotation_type) in self.cache_service._memory_cache: + return [] + + # TIER 3 & 4: Use EntitySearchService (edges → global search) + found_nodes = self.entity_search_service.find_entity(text, annotation_type, entity_space) + + # Update cache based on result + 
if found_nodes and len(found_nodes) == 1: + # Unambiguous match - cache it + self.cache_service.set(text, annotation_type, found_nodes[0]) + elif not found_nodes: + # No match - cache negative result + self.cache_service.set(text, annotation_type, None) + # Don't cache ambiguous results (len > 1) + + return found_nodes + + def _prepare_edge_update(self, edge: Edge, found_nodes) -> tuple[EdgeApply | None, RowWrite | None]: + """ + Prepares the EdgeApply and RowWrite objects for updating both data model and RAW table. + Returns a tuple of (edge_apply, raw_row) where either can be None if update is not needed. + """ + # Get the current edge properties before creating the write version + edge_props = edge.properties.get(self.core_annotation_view.as_view_id(), {}) + current_tags = edge_props.get("tags", []) + updated_tags = list(current_tags) if isinstance(current_tags, list) else [] + + # Now create the write version edge_apply = edge.as_write() - properties = edge_apply.sources[0].properties - tags = properties.get("tags", []) - - if len(found_nodes) == 1: # Success - self.logger.info(f"Found single match for '{properties.get('startNodeText')}'. Promoting edge.") - edge_apply.end_node = DirectRelationReference(found_nodes[0].space, found_nodes[0].external_id) - properties["status"] = DiagramAnnotationStatus.APPROVED.value - tags.append("Promoteed-auto") - elif len(found_nodes) == 0: # Failure - self.logger.info(f"Found no match for '{properties.get('startNodeText')}'. Rejecting edge.") - properties["status"] = DiagramAnnotationStatus.REJECTED.value - tags.append("Promote-attempted") - else: # Ambiguous - self.logger.info(f"Found multiple matches for '{properties.get('startNodeText')}'. 
Marking as ambiguous.") - tags.extend(["Promote-attempted", "ambiguous-match"]) - - properties["tags"] = tags - return edge_apply + + # Fetch existing RAW row to preserve all data + raw_data: dict[str, Any] = {} + try: + existing_row = self.client.raw.rows.retrieve( + db_name=self.raw_db, table_name=self.raw_pattern_table, key=edge.external_id + ) + if existing_row and existing_row.columns: + raw_data = {k: v for k, v in existing_row.columns.items()} + except Exception as e: + self.logger.warning(f"Could not retrieve RAW row for edge {edge.external_id}: {e}") + + # Prepare update properties for the edge + update_properties: dict = {} + + if len(found_nodes) == 1: # Success - single match found + matched_node = found_nodes[0] + self.logger.info(f"Found single match for '{edge_props.get('startNodeText')}'. Promoting edge.") + + # Update edge to point to the found entity + edge_apply.end_node = DirectRelationReference(matched_node.space, matched_node.external_id) + update_properties["status"] = DiagramAnnotationStatus.APPROVED.value + updated_tags.append("promoted-auto") + + # Update RAW row with new end node information + raw_data["endNode"] = matched_node.external_id + raw_data["endNodeSpace"] = matched_node.space + raw_data["status"] = DiagramAnnotationStatus.APPROVED.value + + # Get resource type from the matched entity + entity_props = matched_node.properties.get(self.target_entities_view.as_view_id(), {}) + resource_type = entity_props.get("resourceType") or entity_props.get("type") + if resource_type: + raw_data["endNodeResourceType"] = resource_type + + elif len(found_nodes) == 0: # Failure - no match found + self.logger.info(f"Found no match for '{edge_props.get('startNodeText')}'. 
Rejecting edge.") + update_properties["status"] = DiagramAnnotationStatus.REJECTED.value + updated_tags.append("promote-attempted") + + # Update RAW row status + raw_data["status"] = DiagramAnnotationStatus.REJECTED.value + + else: # Ambiguous - multiple matches found + self.logger.info(f"Found multiple matches for '{edge_props.get('startNodeText')}'. Marking as ambiguous.") + updated_tags.extend(["promote-attempted", "ambiguous-match"]) + + # Don't change status, just add tags to RAW + raw_data["status"] = edge_props.get("status", DiagramAnnotationStatus.SUGGESTED.value) + + # Update edge properties + update_properties["tags"] = updated_tags + edge_apply.sources[0] = NodeOrEdgeData( + source=self.core_annotation_view.as_view_id(), properties=update_properties + ) + + # Create RowWrite object for RAW table update + raw_row = RowWrite(key=edge.external_id, columns=raw_data) if raw_data else None + + return edge_apply, raw_row diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/utils/DataStructures.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/utils/DataStructures.py new file mode 100644 index 00000000..b8b418bb --- /dev/null +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/utils/DataStructures.py @@ -0,0 +1,331 @@ +from dataclasses import dataclass, asdict, field +from typing import Literal, cast +from enum import Enum +from datetime import datetime, timezone, timedelta + +from cognite.client.data_classes.data_modeling import ( + Node, + NodeId, + NodeApply, + NodeOrEdgeData, + ViewId, +) +from cognite.client.data_classes.contextualization import ( + FileReference, +) + + +@dataclass +class EnvConfig: + """ + Data structure holding the configs to connect to CDF client locally + """ + + cdf_project: str + cdf_cluster: str + tenant_id: str + client_id: str + client_secret: str + + +class DiagramAnnotationStatus(str, Enum): + SUGGESTED = 
"Suggested" + APPROVED = "Approved" + REJECTED = "Rejected" + + +class AnnotationStatus(str, Enum): + """ + Defines the types of values that the annotationStatus property can be for the Annotation State Instances. + Inherits from 'str' so that the enum members are also string instances, + making them directly usable where a string is expected (e.g., serialization). + Holds the different values that the annotationStatus property can be for the Annotation State Instances. + """ + + NEW = "New" + RETRY = "Retry" + PROCESSING = "Processing" + FINALIZING = "Finalizing" + ANNOTATED = "Annotated" + FAILED = "Failed" + + +class FilterOperator(str, Enum): + """ + Defines the types of filter operations that can be specified in the configuration. + Inherits from 'str' so that the enum members are also string instances, + making them directly usable where a string is expected (e.g., serialization). + """ + + EQUALS = "Equals" # Checks for equality against a single value. + EXISTS = "Exists" # Checks if a property exists (is not null). + CONTAINSALL = "ContainsAll" # Checks if an item contains all specified values for a given property + IN = "In" # Checks if a value is within a list of specified values. Not implementing CONTAINSANY b/c IN is usually more suitable + SEARCH = "Search" # Performs full text search on a specified property + + +@dataclass +class AnnotationState: + """ + Data structure holding the mpcAnnotationState view properties. Time will convert to Timestamp when ingested into CDF. 
+ """ + + annotationStatus: AnnotationStatus + linkedFile: dict[str, str] = field(default_factory=dict) + attemptCount: int = 0 + annotationMessage: str | None = None + diagramDetectJobId: int | None = None + sourceCreatedTime: str = field( + default_factory=lambda: datetime.now(timezone.utc).replace(microsecond=0).isoformat() + ) + sourceUpdatedTime: str = field( + default_factory=lambda: datetime.now(timezone.utc).replace(microsecond=0).isoformat() + ) + sourceCreatedUser: str = "fn_dm_context_annotation_launch" + sourceUpdatedUser: str = "fn_dm_context_annotation_launch" + + def _create_external_id(self) -> str: + """ + Create a deterministic external ID so that we can replace mpcAnnotationState of files that have been updated and aren't new + """ + prefix = "an_state" + linked_file_space = self.linkedFile["space"] + linked_file_id = self.linkedFile["externalId"] + return f"{prefix}_{linked_file_space}_{linked_file_id}" + + def to_dict(self) -> dict: + return asdict(self) + + def to_node_apply(self, node_space: str, annotation_state_view: ViewId) -> NodeApply: + external_id: str = self._create_external_id() + + return NodeApply( + space=node_space, + external_id=external_id, + sources=[ + NodeOrEdgeData( + source=annotation_state_view, + properties=self.to_dict(), + ) + ], + ) + + +@dataclass +class FileProcessingBatch: + primary_scope_value: str + secondary_scope_value: str | None + files: list[Node] + + +@dataclass +class entity: + """ + data structure for the 'entities' fed into diagram detect, + { + "external_id": file.external_id, + "name": file.properties[job_config.file_view.as_view_id()]["name"], + "space": file.space, + "annotation_type": job_config.file_view.type, + "resource_type": file.properties[job_config.file_view.as_view_id()][{resource_type}], + "search_property": file.properties[job_config.file_view.as_view_id()][{search_property}], + } + """ + + external_id: str + name: str + space: str + annotation_type: Literal["diagrams.FileLink", 
"diagrams.AssetLink"] | None + resource_type: str + search_property: list[str] = field(default_factory=list) + + def to_dict(self): + return asdict(self) + + +@dataclass +class BatchOfNodes: + nodes: list[Node] = field(default_factory=list) + ids: list[NodeId] = field(default_factory=list) + apply: list[NodeApply] = field(default_factory=list) + + def add(self, node: Node): + self.nodes.append(node) + node_id = node.as_id() + self.ids.append(node_id) + return + + def clear(self): + self.nodes.clear() + self.ids.clear() + self.apply.clear() + return + + def update_node_properties(self, new_properties: dict, view_id: ViewId): + for node in self.nodes: + node_apply = NodeApply( + space=node.space, + external_id=node.external_id, + existing_version=None, + sources=[ + NodeOrEdgeData( + source=view_id, + properties=new_properties, + ) + ], + ) + self.apply.append(node_apply) + return + + +@dataclass +class BatchOfPairedNodes: + """ + Where nodeA is an instance of the file view and nodeB is an instance of the annotation state view + """ + + file_to_state_map: dict[NodeId, Node] + batch_files: BatchOfNodes = field(default_factory=BatchOfNodes) + batch_states: BatchOfNodes = field(default_factory=BatchOfNodes) + file_references: list[FileReference] = field(default_factory=list) + + def add_pair(self, file_node: Node, file_reference: FileReference): + self.file_references.append(file_reference) + self.batch_files.add(file_node) + file_node_id: NodeId = file_node.as_id() + state_node: Node = self.file_to_state_map[file_node_id] + self.batch_states.add(state_node) + + def create_file_reference( + self, + file_node_id: NodeId, + page_range: int, + annotation_state_view_id: ViewId, + ) -> FileReference: + """ + Create a file reference that has a page range for annotation. + The current implementation of the detect api 20230101-beta only allows annotation of files up to 50 pages. + Thus, this is my idea of how we can enables annotating files that are more than 50 pages long. 
+ + The annotatedPageCount and pageCount properties won't be set in the initial creation of the annotation state nodes. + That's because we don't know how many pages are in the pdf until we run the diagram detect job where the page count gets returned from the results of the job. + Thus, annotatedPageCount and pageCount get set in the finalize function. + The finalize function will set the page count properties based on the page count that returned from diagram detect job results. + - If the pdf has less than 50 pages, say 3 pages, then... + - annotationStatus property will get set to 'complete' + - annotatedPageCount and pageCount properties will be set to 3. + - Elif the pdf has more than 50 pages, say 80, then... + - annotationStatus property will get set to 'new' + - annotatedPageCount set to 50 + - pageCount set to 80 + - attemptCount doesn't get incremented + + NOTE: Chose to create the file_reference here b/c I already have access to the file node and state node. + If I chose to have this logic in the launchService then we'd have to iterate on all of the nodes that have already been added. + Thus -> O(N) + O(N) to create the BatchOfPairedNodes and then to create the file references + Instead, this approach makes it just O(N) + """ + annotation_state_node: Node = self.file_to_state_map[file_node_id] + annotated_page_count: int | None = cast( + int, + annotation_state_node.properties[annotation_state_view_id].get("annotatedPageCount"), + ) + page_count: int | None = cast( + int, + annotation_state_node.properties[annotation_state_view_id].get("pageCount"), + ) + if not annotated_page_count or not page_count: + file_reference: FileReference = FileReference( + file_instance_id=file_node_id, + first_page=1, + last_page=page_range, + ) + else: + # NOTE: adding 1 here since that annotated_page_count variable holds the last page that was annotated. Thus we want to annotate the following page + # e.g.) 
first run annotates pages 1-50 second run would annotate 51-100 + first_page = annotated_page_count + 1 + last_page = annotated_page_count + page_range + if page_count <= last_page: + last_page = page_count + file_reference: FileReference = FileReference( + file_instance_id=file_node_id, + first_page=first_page, + last_page=last_page, + ) + + return file_reference + + def clear_pair(self): + self.batch_files.clear() + self.batch_states.clear() + self.file_references.clear() + + def size(self) -> int: + return len(self.file_references) + + def is_empty(self) -> bool: + if self.file_references: + return False + return True + + +@dataclass +class PerformanceTracker: + """ + Keeps track of metrics + """ + + files_success: int = 0 + files_failed: int = 0 + total_runs: int = 0 + total_time_delta: timedelta = timedelta(0) + latest_run_time: datetime = field(default_factory=lambda: datetime.now(timezone.utc)) + + def _run_time(self) -> timedelta: + time_delta = datetime.now(timezone.utc) - self.latest_run_time + return time_delta + + def _average_run_time(self) -> timedelta: + if self.total_runs == 0: + return timedelta(0) + return self.total_time_delta / self.total_runs + + def add_files(self, success: int, failed: int = 0): + self.files_success += success + self.files_failed += failed + + def generate_local_report(self) -> str: + self.total_runs += 1 + time_delta = self._run_time() + self.total_time_delta += time_delta + self.latest_run_time = datetime.now(timezone.utc) + + report = f"run time: {time_delta}" + return report + + def generate_overall_report(self) -> str: + report = f" Run started {datetime.now(timezone.utc)}\n- total runs: {self.total_runs}\n- total files processed: {self.files_success+self.files_failed}\n- successful files: {self.files_success}\n- failed files: {self.files_failed}\n- total run time: {self.total_time_delta}\n- average run time: {self._average_run_time()}" + return report + + def generate_ep_run( + self, + caller: Literal["Launch", 
"Finalize"], + function_id: str | None, + call_id: str | None, + ) -> str: + """Generates the report string for the extraction pipeline run.""" + report = ( + f"(caller:{caller}, function_id:{function_id}, call_id:{call_id}) - " + f"total files processed: {self.files_success + self.files_failed} - " + f"successful files: {self.files_success} - " + f"failed files: {self.files_failed}" + ) + return report + + def reset(self) -> None: + self.files_success = 0 + self.files_failed = 0 + self.total_runs: int = 0 + self.total_time_delta = timedelta(0) + self.latest_run_time = datetime.now(timezone.utc) + print("PerformanceTracker state has been reset") From 33be3f46b43786bf965b19952c5d51dd4e3e121e Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Thu, 16 Oct 2025 09:31:23 -0500 Subject: [PATCH 097/128] added a sourceUser for rows uploaded to promote_text cache --- .../services/CacheService.py | 5 +++++ .../services/EntitySearchService.py | 5 +++-- .../services/PromoteService.py | 11 ++++++----- 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/CacheService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/CacheService.py index 169dbef5..1b9ffa15 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/CacheService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/CacheService.py @@ -68,6 +68,7 @@ def __init__( self.file_view_id = file_view_id self.target_entities_view_id = target_entities_view_id self.cache_table_name = cache_table_name + self.function_id = "fn_file_annotation_promote" # In-memory cache: {(text, type): (space, ext_id) or None} self._memory_cache: dict[tuple[str, str], tuple[str, str] | None] = {} @@ -229,6 +230,9 @@ def _set_in_persistent_cache(self, text: str, annotation_type: str, node: Node) """ Updates 
persistent RAW cache with text → entity mapping. Only caches unambiguous single matches. + # NOTE: This cache has two entry points. One entry point is automatically generated connections (e.g. from this code) + # The second entry point is from the streamlit app. Manual promotions through the streamlit app will have the result cached into the RAW table. + # The sourceCreatedUser will be the functionId for auto generated cache rows and will be a usersId for the manual promotions. """ try: cache_key = self.normalize(text) @@ -241,6 +245,7 @@ def _set_in_persistent_cache(self, text: str, annotation_type: str, node: Node) "endNodeSpace": node.space, "annotationType": annotation_type, "lastUpdateTimeUtcIso": datetime.now(timezone.utc).isoformat(), + "sourceCreatedUser": self.function_id, }, ) diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/EntitySearchService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/EntitySearchService.py index fef1a521..6e08fab7 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/EntitySearchService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/EntitySearchService.py @@ -107,8 +107,9 @@ def find_from_existing_annotations(self, text: str, annotation_type: str) -> lis text_variations = self.generate_text_variations(text) self.logger.debug(f"Searching for text variations: {text_variations}") - # Query edges directly with IN filter (server-side, fast!) 
- # These are annotation edges from regular diagram detect (not pattern mode) + # Query edges directly with IN filter + # These are annotation edges that are from regular diagram detect (not pattern mode) + # NOTE: manually promoted results from pattern mode are added to the text_filter: Filter = In(self.core_annotation_view_id.as_property_ref("startNodeText"), text_variations) edges = self.client.data_modeling.instances.list( instance_type="edge", diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/PromoteService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/PromoteService.py index dec70ccf..43f0e573 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/PromoteService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/PromoteService.py @@ -125,7 +125,7 @@ def run(self) -> str | None: return None # Continue running if more candidates might exist def _get_promote_candidates(self) -> EdgeList | None: - """Queries for suggested edges pointing to the sink node that haven't been Promote-attempted.""" + """Queries for suggested edges pointing to the sink node that haven't been PromoteAttempted.""" return self.client.data_modeling.instances.list( instance_type="edge", sources=[self.core_annotation_view.as_view_id()], @@ -142,7 +142,7 @@ def _get_promote_candidates(self) -> EdgeList | None: "not": { "containsAny": { "property": self.core_annotation_view.as_property_ref("tags"), - "values": ["Promote-attempted"], + "values": ["PromoteAttempted"], } } }, @@ -228,7 +228,7 @@ def _prepare_edge_update(self, edge: Edge, found_nodes) -> tuple[EdgeApply | Non # Update edge to point to the found entity edge_apply.end_node = DirectRelationReference(matched_node.space, matched_node.external_id) update_properties["status"] = DiagramAnnotationStatus.APPROVED.value - 
updated_tags.append("promoted-auto") + updated_tags.append("PromotedAuto") # Update RAW row with new end node information raw_data["endNode"] = matched_node.external_id @@ -244,20 +244,21 @@ def _prepare_edge_update(self, edge: Edge, found_nodes) -> tuple[EdgeApply | Non elif len(found_nodes) == 0: # Failure - no match found self.logger.info(f"Found no match for '{edge_props.get('startNodeText')}'. Rejecting edge.") update_properties["status"] = DiagramAnnotationStatus.REJECTED.value - updated_tags.append("promote-attempted") + updated_tags.append("PromoteAttempted") # Update RAW row status raw_data["status"] = DiagramAnnotationStatus.REJECTED.value else: # Ambiguous - multiple matches found self.logger.info(f"Found multiple matches for '{edge_props.get('startNodeText')}'. Marking as ambiguous.") - updated_tags.extend(["promote-attempted", "ambiguous-match"]) + updated_tags.extend(["PromoteAttempted", "AmbiguousMatch"]) # Don't change status, just add tags to RAW raw_data["status"] = edge_props.get("status", DiagramAnnotationStatus.SUGGESTED.value) # Update edge properties update_properties["tags"] = updated_tags + raw_data["tags"] = updated_tags edge_apply.sources[0] = NodeOrEdgeData( source=self.core_annotation_view.as_view_id(), properties=update_properties ) From 254956fd8ca544899341d5fad41e2ff1c3f1a64e Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Thu, 16 Oct 2025 10:10:07 -0500 Subject: [PATCH 098/128] updated docstring and type hints --- .../dependencies.py | 137 ++++++++++--- .../fn_file_annotation_promote/handler.py | 78 ++++++-- .../services/CacheService.py | 107 ++++++++--- .../services/EntitySearchService.py | 83 +++++--- .../services/PromoteService.py | 181 ++++++++++++++---- 5 files changed, 455 insertions(+), 131 deletions(-) diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/dependencies.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/dependencies.py index 
8d26fbb8..44cff27b 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/dependencies.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/dependencies.py @@ -2,7 +2,7 @@ from pathlib import Path from dotenv import load_dotenv -from typing import Any, Tuple, Literal +from typing import Any, Tuple, Literal, cast from cognite.client import CogniteClient, ClientConfig, global_config from cognite.client.credentials import OAuthClientCredentials from utils.DataStructures import EnvConfig @@ -14,14 +14,30 @@ def get_env_variables() -> EnvConfig: + """ + Loads environment variables required for CDF authentication from .env file. + + Required environment variables: + - CDF_PROJECT: CDF project name + - CDF_CLUSTER: CDF cluster (e.g., westeurope-1) + - IDP_TENANT_ID: Azure AD tenant ID + - IDP_CLIENT_ID: Azure AD application client ID + - IDP_CLIENT_SECRET: Azure AD application client secret + + Returns: + EnvConfig object containing all required environment variables + + Raises: + ValueError: If any required environment variables are missing + """ print("Loading environment variables from .env...") - project_path = (Path(__file__).parent / ".env").resolve() + project_path: Path = (Path(__file__).parent / ".env").resolve() print(f"project_path is set to: {project_path}") load_dotenv() - required_envvars = ( + required_envvars: tuple[str, ...] 
= ( "CDF_PROJECT", "CDF_CLUSTER", "IDP_TENANT_ID", @@ -29,7 +45,7 @@ def get_env_variables() -> EnvConfig: "IDP_CLIENT_SECRET", ) - missing = [envvar for envvar in required_envvars if envvar not in os.environ] + missing: list[str] = [envvar for envvar in required_envvars if envvar not in os.environ] if missing: raise ValueError(f"Missing one or more env.vars: {missing}") @@ -42,31 +58,52 @@ def get_env_variables() -> EnvConfig: ) -def create_client(env_config: EnvConfig, debug: bool = False): - SCOPES = [f"https://{env_config.cdf_cluster}.cognitedata.com/.default"] - TOKEN_URL = f"https://login.microsoftonline.com/{env_config.tenant_id}/oauth2/v2.0/token" - creds = OAuthClientCredentials( +def create_client(env_config: EnvConfig, debug: bool = False) -> CogniteClient: + """ + Creates an authenticated CogniteClient using OAuth client credentials flow. + + Args: + env_config: Environment configuration containing CDF connection details + debug: Whether to enable debug mode on the client (default: False) + + Returns: + Authenticated CogniteClient instance + """ + SCOPES: list[str] = [f"https://{env_config.cdf_cluster}.cognitedata.com/.default"] + TOKEN_URL: str = f"https://login.microsoftonline.com/{env_config.tenant_id}/oauth2/v2.0/token" + creds: OAuthClientCredentials = OAuthClientCredentials( token_url=TOKEN_URL, client_id=env_config.client_id, client_secret=env_config.client_secret, scopes=SCOPES, ) - settings = { + settings: dict[str, bool] = { "disable_ssl": True, } global_config.apply_settings(settings) - cnf = ClientConfig( + cnf: ClientConfig = ClientConfig( client_name="DEV_Working", project=env_config.cdf_project, base_url=f"https://{env_config.cdf_cluster}.cognitedata.com", credentials=creds, debug=debug, ) - client = CogniteClient(cnf) + client: CogniteClient = CogniteClient(cnf) return client -def create_logger_service(log_level, filepath: str | None): +def create_logger_service(log_level: str, filepath: str | None) -> CogniteFunctionLogger: + """ + 
Creates a logger service for tracking function execution. + + Args: + log_level: Logging level ("DEBUG", "INFO", "WARNING", "ERROR") + filepath: Optional file path for writing logs to disk + + Returns: + CogniteFunctionLogger instance configured with specified settings + """ + write: bool if filepath: write = True else: @@ -74,28 +111,63 @@ def create_logger_service(log_level, filepath: str | None): if log_level not in ["DEBUG", "INFO", "WARNING", "ERROR"]: return CogniteFunctionLogger() else: - return CogniteFunctionLogger(log_level=log_level, write=write, filepath=filepath) + # Cast to Literal type to satisfy type checker + validated_log_level: Literal["DEBUG", "INFO", "WARNING", "ERROR"] = cast( + Literal["DEBUG", "INFO", "WARNING", "ERROR"], log_level + ) + return CogniteFunctionLogger(log_level=validated_log_level, write=write, filepath=filepath) def create_config_service( function_data: dict[str, Any], client: CogniteClient | None = None ) -> Tuple[Config, CogniteClient]: + """ + Creates configuration service and CogniteClient for the function. + + Loads configuration from CDF based on the ExtractionPipelineExtId provided in function_data. + If no client is provided, creates one using environment variables. 
+ + Args: + function_data: Dictionary containing function input data (must include ExtractionPipelineExtId) + client: Optional pre-initialized CogniteClient (if None, creates new client) + + Returns: + Tuple of (Config, CogniteClient) + """ if not client: - env_config = get_env_variables() + env_config: EnvConfig = get_env_variables() client = create_client(env_config) - config = load_config_parameters(client=client, function_data=function_data) + config: Config = load_config_parameters(client=client, function_data=function_data) return config, client def create_entity_search_service( config: Config, client: CogniteClient, logger: CogniteFunctionLogger ) -> EntitySearchService: - """Creates an EntitySearchService instance for finding entities by text.""" + """ + Creates an EntitySearchService instance for finding entities by text. + + Factory function that initializes EntitySearchService with all required dependencies + extracted from the configuration. + + Args: + config: Configuration object containing data model views + client: CogniteClient for API interactions + logger: Logger instance for tracking execution + + Returns: + Initialized EntitySearchService instance + + Raises: + ValueError: If regular_annotation_space (file_view.instance_space) is None + """ # Get required configuration - core_annotation_view = config.data_model_views.core_annotation_view - file_view = config.data_model_views.file_view - target_entities_view = config.data_model_views.target_entities_view - regular_annotation_space = file_view.instance_space + from services.ConfigService import ViewPropertyConfig + + core_annotation_view: ViewPropertyConfig = config.data_model_views.core_annotation_view + file_view: ViewPropertyConfig = config.data_model_views.file_view + target_entities_view: ViewPropertyConfig = config.data_model_views.target_entities_view + regular_annotation_space: str | None = file_view.instance_space if not regular_annotation_space: raise 
ValueError("regular_annotation_space (file_view.instance_space) is required but was None") @@ -113,10 +185,27 @@ def create_entity_search_service( def create_cache_service( config: Config, client: CogniteClient, logger: CogniteFunctionLogger, entity_search_service: EntitySearchService ) -> CacheService: - """Creates a CacheService instance for caching text→entity mappings.""" - raw_db = config.finalize_function.apply_service.raw_db - file_view = config.data_model_views.file_view - target_entities_view = config.data_model_views.target_entities_view + """ + Creates a CacheService instance for caching text→entity mappings. + + Factory function that initializes CacheService with all required dependencies. + Importantly, reuses the normalize() function from EntitySearchService to ensure + consistent text normalization between caching and searching. + + Args: + config: Configuration object containing RAW database settings and data model views + client: CogniteClient for API interactions + logger: Logger instance for tracking execution + entity_search_service: EntitySearchService instance (to reuse normalize function) + + Returns: + Initialized CacheService instance + """ + from services.ConfigService import ViewPropertyConfig + + raw_db: str = config.finalize_function.apply_service.raw_db + file_view: ViewPropertyConfig = config.data_model_views.file_view + target_entities_view: ViewPropertyConfig = config.data_model_views.target_entities_view return CacheService( client=client, diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/handler.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/handler.py index b08bfb4e..d156c52f 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/handler.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/handler.py @@ -10,21 +10,55 @@ create_cache_service, ) from 
services.PromoteService import GeneralPromoteService +from services.ConfigService import Config +from services.LoggerService import CogniteFunctionLogger +from services.EntitySearchService import EntitySearchService +from services.CacheService import CacheService -def handle(data: dict, function_call_info: dict, client: CogniteClient): - """Main entry point for the Cognite Function.""" - start_time = datetime.now(timezone.utc) +def handle(data: dict, function_call_info: dict, client: CogniteClient) -> dict[str, str]: + """ + Main entry point for the Cognite Function - promotes pattern-mode annotations. + This function runs in a loop for up to 7 minutes, processing batches of pattern-mode + annotations. For each batch: + 1. Retrieves candidate edges (pattern-mode annotations pointing to sink node) + 2. Searches for matching entities using EntitySearchService (with caching) + 3. Updates edges and RAW tables based on search results + 4. Pauses 10 seconds between batches + + Pattern-mode annotations are created when diagram detection finds text matching + regex patterns but can't match it to the provided entity list. This function + attempts to resolve those annotations post-hoc. 
+ + Args: + data: Function input data containing: + - ExtractionPipelineExtId: ID of extraction pipeline for config + - logLevel: Logging level (DEBUG, INFO, WARNING, ERROR) + - logPath: Optional path for writing logs to file + function_call_info: Metadata about the function call (not currently used) + client: Pre-initialized CogniteClient for API interactions + + Returns: + Dictionary with execution status: + - {"status": "success", "message": "..."} on normal completion + - {"status": "failure", "message": "..."} on error + + Raises: + Exception: Any unexpected errors are caught, logged, and returned in status dict + """ + start_time: datetime = datetime.now(timezone.utc) + + config: Config config, client = create_config_service(function_data=data) - logger = create_logger_service(data.get("logLevel", "DEBUG"), data.get("logPath")) + logger: CogniteFunctionLogger = create_logger_service(data.get("logLevel", "DEBUG"), data.get("logPath")) # Create service dependencies - entity_search_service = create_entity_search_service(config, client, logger) - cache_service = create_cache_service(config, client, logger, entity_search_service) + entity_search_service: EntitySearchService = create_entity_search_service(config, client, logger) + cache_service: CacheService = create_cache_service(config, client, logger, entity_search_service) # Create promote service with injected dependencies - promote_service = GeneralPromoteService( + promote_service: GeneralPromoteService = GeneralPromoteService( client=client, config=config, logger=logger, @@ -35,7 +69,7 @@ def handle(data: dict, function_call_info: dict, client: CogniteClient): try: # Run in a loop for a maximum of 7 minutes while datetime.now(timezone.utc) - start_time < timedelta(minutes=7): - result = promote_service.run() + result: str | None = promote_service.run() if result == "Done": logger.info("No more candidates to process. 
Exiting.", section="END") break @@ -47,15 +81,33 @@ def handle(data: dict, function_call_info: dict, client: CogniteClient): return {"status": "failure", "message": str(e)} -def run_locally(config_file: dict): - """Entry point for local execution and debugging.""" +def run_locally(config_file: dict) -> None: + """ + Entry point for local execution and debugging. + + Runs the promote function locally using environment variables for authentication + instead of Cognite Functions runtime. Useful for development and testing. + + Args: + config_file: Configuration dictionary containing: + - ExtractionPipelineExtId: ID of extraction pipeline for config + - logLevel: Logging level (DEBUG, INFO, WARNING, ERROR) + - logPath: Path for writing logs to file + + Returns: + None (execution results are logged) + + Raises: + ValueError: If required environment variables are missing + """ from dependencies import create_client, get_env_variables + from utils.DataStructures import EnvConfig - env_vars = get_env_variables() - client = create_client(env_vars) + env_vars: EnvConfig = get_env_variables() + client: CogniteClient = create_client(env_vars) # Mock function_call_info for local runs - function_call_info = {"function_id": "local", "call_id": "local"} + function_call_info: dict[str, str] = {"function_id": "local", "call_id": "local"} handle(config_file, function_call_info, client) diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/CacheService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/CacheService.py index 1b9ffa15..12711918 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/CacheService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/CacheService.py @@ -1,6 +1,6 @@ import abc from datetime import datetime, timezone, timedelta -from typing import Callable +from 
typing import Callable, Any from cognite.client import CogniteClient from cognite.client.data_classes.data_modeling import Node, NodeList from cognite.client.data_classes.data_modeling.ids import ViewId @@ -9,34 +9,73 @@ class ICacheService(abc.ABC): + """ + Interface for services that cache text → entity mappings to improve lookup performance. + """ + @abc.abstractmethod def get(self, text: str, annotation_type: str) -> Node | None: - """Retrieves a cached entity node for the given text and annotation type.""" + """ + Retrieves a cached entity node for the given text and annotation type. + + Args: + text: Text to look up + annotation_type: Type of annotation + + Returns: + Cached Node if found, None if cache miss + """ pass @abc.abstractmethod - def set(self, text: str, annotation_type: str, node: Node) -> None: - """Caches an entity node for the given text and annotation type.""" + def set(self, text: str, annotation_type: str, node: Node | None) -> None: + """ + Caches an entity node for the given text and annotation type. + + Args: + text: Text being cached + annotation_type: Type of annotation + node: Entity node to cache, or None for negative caching + """ pass @abc.abstractmethod def get_from_memory(self, text: str, annotation_type: str) -> Node | None: - """Retrieves from in-memory cache only (no persistent storage lookup).""" + """ + Retrieves from in-memory cache only (no persistent storage lookup). + + Args: + text: Text to look up + annotation_type: Type of annotation + + Returns: + Cached Node if found in memory, None otherwise + """ pass class CacheService(ICacheService): """ - Manages two-tier caching for text → entity mappings. + Manages two-tier caching for text → entity mappings to dramatically improve performance. 
- TIER 1: In-memory cache (this run only) + **TIER 1: In-Memory Cache** (This Run Only): - Ultra-fast lookup (<1ms) - - Includes negative caching (None for no match) + - Dictionary stored in memory: {(text, type): (space, id) or None} + - Includes negative caching (remembers "no match found") + - Cleared when function execution ends - TIER 2: Persistent RAW cache (all runs) + **TIER 2: Persistent RAW Cache** (All Runs): - Fast lookup (5-10ms) - - Benefits all future runs + - Stored in RAW table: promote_text_to_entity_cache + - Benefits all future function runs indefinitely - Tracks hit count for analytics + - Only caches unambiguous single matches + + **Performance Impact:** + - First lookup: 50-100ms (query annotation edges) + - Cached lookup (same run): <1ms (5000x faster) + - Cached lookup (future run): 5-10ms (10-20x faster) + - Self-improving: Gets faster as cache fills """ def __init__( @@ -86,24 +125,30 @@ def get(self, text: str, annotation_type: str) -> Node | None: Returns: Cached Node if found, None if cache miss """ - cache_key = (text, annotation_type) + cache_key: tuple[str, str] = (text, annotation_type) # TIER 1: In-memory cache (instant) if cache_key in self._memory_cache: - cached_result = self._memory_cache[cache_key] + cached_result: tuple[str, str] | None = self._memory_cache[cache_key] if cached_result is None: # Negative cache entry return None # Retrieve the node from cache + space: str + ext_id: str space, ext_id = cached_result - view_id = self.file_view_id if annotation_type == "diagrams.FileLink" else self.target_entities_view_id + view_id: ViewId = ( + self.file_view_id if annotation_type == "diagrams.FileLink" else self.target_entities_view_id + ) try: - retrieved = self.client.data_modeling.instances.retrieve_nodes(nodes=(space, ext_id), sources=view_id) + retrieved: Any = self.client.data_modeling.instances.retrieve_nodes( + nodes=(space, ext_id), sources=view_id + ) if retrieved: self.logger.debug(f"✓ In-memory cache HIT for 
'{text}'") - node = self._extract_single_node(retrieved) + node: Node | None = self._extract_single_node(retrieved) return node except Exception as e: self.logger.warning(f"Failed to retrieve cached node for '{text}': {e}") @@ -112,7 +157,7 @@ def get(self, text: str, annotation_type: str) -> Node | None: return None # TIER 2: Persistent RAW cache (fast) - cached_node = self._get_from_persistent_cache(text, annotation_type) + cached_node: Node | None = self._get_from_persistent_cache(text, annotation_type) if cached_node: self.logger.info(f"✓ Persistent cache HIT for '{text}'") # Populate in-memory cache for future lookups in this run @@ -135,19 +180,21 @@ def get_from_memory(self, text: str, annotation_type: str) -> Node | None: Returns: Cached Node if found in memory, None otherwise """ - cache_key = (text, annotation_type) + cache_key: tuple[str, str] = (text, annotation_type) if cache_key not in self._memory_cache: return None - cached_result = self._memory_cache[cache_key] + cached_result: tuple[str, str] | None = self._memory_cache[cache_key] if cached_result is None: return None + space: str + ext_id: str space, ext_id = cached_result - view_id = self.file_view_id if annotation_type == "diagrams.FileLink" else self.target_entities_view_id + view_id: ViewId = self.file_view_id if annotation_type == "diagrams.FileLink" else self.target_entities_view_id try: - retrieved = self.client.data_modeling.instances.retrieve_nodes(nodes=(space, ext_id), sources=view_id) + retrieved: Any = self.client.data_modeling.instances.retrieve_nodes(nodes=(space, ext_id), sources=view_id) if retrieved: return self._extract_single_node(retrieved) except Exception: @@ -166,7 +213,7 @@ def set(self, text: str, annotation_type: str, node: Node | None) -> None: annotation_type: Type of annotation node: The entity node to cache, or None for negative caching """ - cache_key = (text, annotation_type) + cache_key: tuple[str, str] = (text, annotation_type) if node is None: # Negative cache 
entry (remember that no match was found) @@ -188,9 +235,9 @@ def _get_from_persistent_cache(self, text: str, annotation_type: str) -> Node | """ try: # Normalize text for consistent cache keys - cache_key = self.normalize(text) + cache_key: str = self.normalize(text) - row = self.client.raw.rows.retrieve( + row: Any = self.client.raw.rows.retrieve( db_name=self.raw_db, table_name=self.cache_table_name, key=cache_key, @@ -204,15 +251,17 @@ def _get_from_persistent_cache(self, text: str, annotation_type: str) -> Node | return None # Retrieve the cached node - end_node_space = row.columns.get("endNodeSpace") - end_node_ext_id = row.columns.get("endNode") + end_node_space: Any = row.columns.get("endNodeSpace") + end_node_ext_id: Any = row.columns.get("endNode") if not end_node_space or not end_node_ext_id: return None - view_id = self.file_view_id if annotation_type == "diagrams.FileLink" else self.target_entities_view_id + view_id: ViewId = ( + self.file_view_id if annotation_type == "diagrams.FileLink" else self.target_entities_view_id + ) - retrieved = self.client.data_modeling.instances.retrieve_nodes( + retrieved: Any = self.client.data_modeling.instances.retrieve_nodes( nodes=(end_node_space, end_node_ext_id), sources=view_id ) @@ -235,9 +284,9 @@ def _set_in_persistent_cache(self, text: str, annotation_type: str, node: Node) # The sourceCreatedUser will be the functionId for auto generated cache rows and will be a usersId for the manual promotions. 
""" try: - cache_key = self.normalize(text) + cache_key: str = self.normalize(text) - cache_data = Row( + cache_data: Row = Row( key=cache_key, columns={ "originalText": text, diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/EntitySearchService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/EntitySearchService.py index 6e08fab7..ebab1908 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/EntitySearchService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/EntitySearchService.py @@ -1,6 +1,6 @@ import abc import re -from typing import Callable +from typing import Callable, Any from cognite.client import CogniteClient from cognite.client.data_classes.data_modeling import Node, NodeList, ViewId from cognite.client.data_classes.filters import Filter, Equals, In @@ -8,23 +8,46 @@ class IEntitySearchService(abc.ABC): + """ + Interface for services that find entities by text using various search strategies. + """ + @abc.abstractmethod def find_entity(self, text: str, annotation_type: str, entity_space: str) -> list[Node]: - """Finds entities matching the given text using multiple strategies.""" + """ + Finds entities matching the given text using multiple strategies. + + Args: + text: Text to search for + annotation_type: Type of annotation being searched + entity_space: Space to search in for global fallback + + Returns: + List of matched Node objects + """ pass class EntitySearchService(IEntitySearchService): """ - Finds entities by text using multiple search strategies. + Finds entities by text using multiple search strategies with automatic fallback. - Search Strategy: - 1. Existing annotations (fast, reliable, leverages proven connections) - 2. 
Global entity search (slow, comprehensive, fallback) + This service implements a two-tier search strategy for finding entities: - Utilities: - - Text variation generation (handles case, leading zeros) - - Text normalization (for comparison) + **Strategy 1 - Existing Annotations** (Primary, Fast: 50-100ms): + - Queries annotation edges from regular diagram detect + - Uses server-side IN filter with text variations + - Returns entities that were successfully annotated before + - Handles cross-scope scenarios naturally (entity in different site/unit) + + **Strategy 2 - Global Entity Search** (Fallback, Slow: 500ms-2s): + - Fetches all entities in space (limit 1000) + - Client-side normalized matching against aliases + - Comprehensive but may timeout with large entity counts + + **Utilities:** + - `generate_text_variations()`: Creates common variations (case, leading zeros) + - `normalize()`: Normalizes text for comparison (removes special chars, lowercase, strips zeros) """ def __init__( @@ -76,7 +99,7 @@ def find_entity(self, text: str, annotation_type: str, entity_space: str) -> lis - [node1, node2] if ambiguous (multiple matches) """ # STRATEGY 1: Query existing annotations (primary, fast) - found_nodes = self.find_from_existing_annotations(text, annotation_type) + found_nodes: list[Node] = self.find_from_existing_annotations(text, annotation_type) if not found_nodes: # STRATEGY 2: Global entity search (fallback, slow) @@ -104,14 +127,14 @@ def find_from_existing_annotations(self, text: str, annotation_type: str) -> lis """ try: # Generate variations of the search text - text_variations = self.generate_text_variations(text) + text_variations: list[str] = self.generate_text_variations(text) self.logger.debug(f"Searching for text variations: {text_variations}") # Query edges directly with IN filter # These are annotation edges that are from regular diagram detect (not pattern mode) - # NOTE: manually promoted results from pattern mode are added to the + # NOTE: 
manually promoted results from pattern mode are added to the text_filter: Filter = In(self.core_annotation_view_id.as_property_ref("startNodeText"), text_variations) - edges = self.client.data_modeling.instances.list( + edges: Any = self.client.data_modeling.instances.list( instance_type="edge", sources=[self.core_annotation_view_id], filter=text_filter, @@ -123,43 +146,48 @@ def find_from_existing_annotations(self, text: str, annotation_type: str) -> lis return [] # Count occurrences of each endNode - matched_end_nodes = {} # {(space, externalId): count} + matched_end_nodes: dict[tuple[str, str], int] = {} # {(space, externalId): count} for edge in edges: # Check annotation type matches - edge_props = edge.properties.get(self.core_annotation_view_id, {}) - edge_type = edge_props.get("type") + edge_props: dict[str, Any] = edge.properties.get(self.core_annotation_view_id, {}) + edge_type: Any = edge_props.get("type") if edge_type != annotation_type: continue # Skip edges of different type # Extract endNode from the edge - end_node_ref = edge.end_node + end_node_ref: Any = edge.end_node if end_node_ref: - key = (end_node_ref.space, end_node_ref.external_id) + key: tuple[str, str] = (end_node_ref.space, end_node_ref.external_id) matched_end_nodes[key] = matched_end_nodes.get(key, 0) + 1 if not matched_end_nodes: return [] # If multiple different endNodes found, it's ambiguous + top_matches: list[tuple[str, str]] if len(matched_end_nodes) > 1: self.logger.warning( f"Found {len(matched_end_nodes)} different entities for '{text}' in existing annotations. " f"This indicates data quality issues or legitimate ambiguity." 
) # Return list of most common matches (limit to 2 for ambiguity detection) - sorted_matches = sorted(matched_end_nodes.items(), key=lambda x: x[1], reverse=True) + sorted_matches: list[tuple[tuple[str, str], int]] = sorted( + matched_end_nodes.items(), key=lambda x: x[1], reverse=True + ) top_matches = [match[0] for match in sorted_matches[:2]] else: # Single consistent match found top_matches = [list(matched_end_nodes.keys())[0]] # Fetch the actual node objects for the matched entities - view_to_use = self.file_view_id if annotation_type == "diagrams.FileLink" else self.target_entities_view_id + view_to_use: ViewId = ( + self.file_view_id if annotation_type == "diagrams.FileLink" else self.target_entities_view_id + ) - matched_nodes = [] + matched_nodes: list[Node] = [] for space, ext_id in top_matches: - retrieved = self.client.data_modeling.instances.retrieve_nodes( + retrieved: Any = self.client.data_modeling.instances.retrieve_nodes( nodes=(space, ext_id), sources=view_to_use ) # Handle both single Node and NodeList returns @@ -198,10 +226,11 @@ def find_global_entity(self, text: str, entity_space: str) -> list[Node]: List of matched nodes (0, 1, or 2 for ambiguity detection) """ # Normalize the search text - normalized_text = self.normalize(text) + normalized_text: str = self.normalize(text) # Fetch all entities in the space (with reasonable limit) # NOTE: We can't do normalized matching server-side, so we fetch and filter client-side + entities: Any try: entities = self.client.data_modeling.instances.list( instance_type="node", @@ -214,10 +243,10 @@ def find_global_entity(self, text: str, entity_space: str) -> list[Node]: return [] # Client-side normalized matching against aliases - matches = [] + matches: list[Node] = [] for entity in entities: - entity_props = entity.properties.get(self.target_entities_view_id, {}) - aliases = entity_props.get("aliases", []) + entity_props: dict[str, Any] = entity.properties.get(self.target_entities_view_id, {}) + 
aliases: Any = entity_props.get("aliases", []) # Ensure aliases is iterable if not isinstance(aliases, list): @@ -255,7 +284,7 @@ def generate_text_variations(self, text: str) -> list[str]: Returns: List of text variations (original + common transformations) """ - variations = set() + variations: set[str] = set() variations.add(text) # Always include original # Add lowercase version diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/PromoteService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/PromoteService.py index 43f0e573..08c70f6a 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/PromoteService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/PromoteService.py @@ -1,5 +1,5 @@ import abc -from typing import Any +from typing import Any, Literal from cognite.client import CogniteClient from cognite.client.data_classes import RowWrite from cognite.client.data_classes.data_modeling import ( @@ -19,12 +19,35 @@ class IPromoteService(abc.ABC): + """ + Interface for services that promote pattern-mode annotations by finding entities + and updating annotation edges. + """ + @abc.abstractmethod - def run(self) -> str | None: + def run(self) -> Literal["Done"] | None: + """ + Main execution method for promoting pattern-mode annotations. + + Returns: + "Done" if no more candidates need processing, None if processing should continue. + """ pass class GeneralPromoteService(IPromoteService): + """ + Promotes pattern-mode annotations by finding matching entities and updating annotation edges. + + This service retrieves candidate pattern-mode annotations (edges pointing to sink node), + searches for matching entities using EntitySearchService (with caching via CacheService), + and updates both the data model edges and RAW tables with the results. 
+ + Pattern-mode annotations are created during diagram detection when entities can't be + matched to the provided entity list but match regex patterns. This service attempts to + resolve those annotations by searching existing annotations and entity aliases. + """ + def __init__( self, client: CogniteClient, @@ -33,6 +56,16 @@ def __init__( entity_search_service: EntitySearchService, cache_service: CacheService, ): + """ + Initialize the promote service with required dependencies. + + Args: + client: CogniteClient for API interactions + config: Configuration object containing data model views and settings + logger: Logger instance for tracking execution + entity_search_service: Service for finding entities by text (injected) + cache_service: Service for caching text→entity mappings (injected) + """ self.client = client self.config = config self.logger = logger @@ -52,9 +85,31 @@ def __init__( self.entity_search_service = entity_search_service self.cache_service = cache_service - def run(self) -> str | None: - """Main entrypoint for the Promote service.""" - candidates = self._get_promote_candidates() + def run(self) -> Literal["Done"] | None: + """ + Main execution method for promoting pattern-mode annotations. + + Process flow: + 1. Retrieve candidate edges (pattern-mode annotations not yet promoted) + 2. Group candidates by (text, type) for deduplication + 3. For each unique text/type: + - Check cache for previous results + - Search for matching entity via EntitySearchService + - Update cache with results + 4. Prepare edge and RAW table updates + 5. Apply updates to data model and RAW tables + + Args: + None + + Returns: + "Done" if no candidates found (processing complete), + None if candidates were processed (more batches may exist). + + Raises: + Exception: Any unexpected errors during processing are logged and re-raised. 
+ """ + candidates: EdgeList | None = self._get_promote_candidates() if not candidates: self.logger.info("No Promote candidates found.") return "Done" @@ -64,12 +119,12 @@ def run(self) -> str | None: # Group candidates by (startNodeText, annotationType) for deduplication grouped_candidates: dict[tuple[str, str], list[Edge]] = {} for edge in candidates: - properties = edge.properties[self.core_annotation_view.as_view_id()] - text = properties.get("startNodeText") - annotation_type = edge.type.external_id + properties: dict[str, Any] = edge.properties[self.core_annotation_view.as_view_id()] + text: Any = properties.get("startNodeText") + annotation_type: str = edge.type.external_id if text and annotation_type: - key = (text, annotation_type) + key: tuple[str, str] = (text, annotation_type) if key not in grouped_candidates: grouped_candidates[key] = [] grouped_candidates[key].append(edge) @@ -79,12 +134,12 @@ def run(self) -> str | None: f"Deduplication savings: {len(candidates) - len(grouped_candidates)} queries avoided." 
) - edges_to_update = [] - raw_rows_to_update = [] + edges_to_update: list[EdgeApply] = [] + raw_rows_to_update: list[RowWrite] = [] # Process each unique text/type combination once for (text_to_find, annotation_type), edges_with_same_text in grouped_candidates.items(): - entity_space = ( + entity_space: str | None = ( self.file_view.instance_space if annotation_type == "diagrams.FileLink" else self.target_entities_view.instance_space @@ -95,10 +150,12 @@ def run(self) -> str | None: continue # Strategy: Check cache → query edges → fallback to global search - found_nodes = self._find_entity_with_cache(text_to_find, annotation_type, entity_space) + found_nodes: list[Node] | list = self._find_entity_with_cache(text_to_find, annotation_type, entity_space) # Apply the same result to ALL edges with this text for edge in edges_with_same_text: + edge_apply: EdgeApply | None + raw_row: RowWrite | None edge_apply, raw_row = self._prepare_edge_update(edge, found_nodes) if edge_apply: @@ -125,7 +182,21 @@ def run(self) -> str | None: return None # Continue running if more candidates might exist def _get_promote_candidates(self) -> EdgeList | None: - """Queries for suggested edges pointing to the sink node that haven't been PromoteAttempted.""" + """ + Retrieves pattern-mode annotation edges that are candidates for promotion. + + Queries for edges where: + - End node is the sink node (placeholder for unresolved entities) + - Status is "Suggested" (not yet approved/rejected) + - Tags do not contain "PromoteAttempted" (haven't been processed yet) + + Args: + None + + Returns: + EdgeList of candidate edges, or None if no candidates found. + Limited to 500 edges per batch for performance. 
+ """ return self.client.data_modeling.instances.list( instance_type="edge", sources=[self.core_annotation_view.as_view_id()], @@ -151,25 +222,34 @@ def _get_promote_candidates(self) -> EdgeList | None: limit=500, # Batch size ) - def _find_entity_with_cache(self, text: str, annotation_type: str, entity_space: str) -> list | None: + def _find_entity_with_cache(self, text: str, annotation_type: str, entity_space: str) -> list[Node] | list: """ Finds entity for text using multi-tier caching strategy. - Strategy: - 1. Check cache (in-memory + persistent RAW) - 2. Use EntitySearchService (annotation edges → global search) - 3. Update cache if unambiguous match found + Caching strategy: + - TIER 1: In-memory cache (this run, <1ms) + - TIER 2: Persistent RAW cache (all runs, 5-10ms) + - TIER 3: EntitySearchService (annotation edges, 50-100ms) + - TIER 4: EntitySearchService fallback (global search, 500ms-2s) + + Caching behavior: + - Only caches unambiguous single matches (len(found_nodes) == 1) + - Caches negative results (no match found) to avoid repeated lookups + - Does NOT cache ambiguous results (multiple matches) Args: - text: Text to search for - annotation_type: Type of annotation - entity_space: Space to search in + text: Text to search for (e.g., "V-123", "G18A-921") + annotation_type: Type of annotation ("diagrams.FileLink" or "diagrams.AssetLink") + entity_space: Space to search in for global fallback Returns: - List of matched nodes (empty if no match, 2+ if ambiguous) + List of matched Node objects: + - Empty list [] if no match found + - Single-element list [node] if unambiguous match + - Two-element list [node1, node2] if ambiguous (data quality issue) """ # TIER 1 & 2: Check cache (in-memory + persistent) - cached_node = self.cache_service.get(text, annotation_type) + cached_node: Node | None = self.cache_service.get(text, annotation_type) if cached_node is not None: return [cached_node] @@ -181,7 +261,7 @@ def _find_entity_with_cache(self, text: str, 
annotation_type: str, entity_space: return [] # TIER 3 & 4: Use EntitySearchService (edges → global search) - found_nodes = self.entity_search_service.find_entity(text, annotation_type, entity_space) + found_nodes: list[Node] = self.entity_search_service.find_entity(text, annotation_type, entity_space) # Update cache based on result if found_nodes and len(found_nodes) == 1: @@ -194,23 +274,48 @@ def _find_entity_with_cache(self, text: str, annotation_type: str, entity_space: return found_nodes - def _prepare_edge_update(self, edge: Edge, found_nodes) -> tuple[EdgeApply | None, RowWrite | None]: + def _prepare_edge_update( + self, edge: Edge, found_nodes: list[Node] | list + ) -> tuple[EdgeApply | None, RowWrite | None]: """ - Prepares the EdgeApply and RowWrite objects for updating both data model and RAW table. - Returns a tuple of (edge_apply, raw_row) where either can be None if update is not needed. + Prepares updates for both data model edge and RAW table based on entity search results. + + Handles three scenarios: + 1. Single match (len==1): Mark as "Approved", point edge to entity, add "PromotedAuto" tag + 2. No match (len==0): Mark as "Rejected", keep pointing to sink, add "PromoteAttempted" tag + 3. Ambiguous (len>=2): Keep "Suggested", add "PromoteAttempted" and "AmbiguousMatch" tags + + For all cases: + - Retrieves existing RAW row to preserve all data + - Updates edge properties (status, tags, endNode if match found) + - Updates RAW row with same changes + - Returns both for atomic update + + Args: + edge: The annotation edge to update (pattern-mode annotation) + found_nodes: List of matched entity nodes from entity search + - [] = no match + - [node] = single unambiguous match + - [node1, node2] = ambiguous (multiple matches) + + Returns: + Tuple of (EdgeApply, RowWrite): + - EdgeApply: Edge update for data model + - RowWrite: Row update for RAW table + Both will always be returned (never None). 
""" # Get the current edge properties before creating the write version - edge_props = edge.properties.get(self.core_annotation_view.as_view_id(), {}) - current_tags = edge_props.get("tags", []) - updated_tags = list(current_tags) if isinstance(current_tags, list) else [] + edge_props: Any = edge.properties.get(self.core_annotation_view.as_view_id(), {}) + current_tags: Any = edge_props.get("tags", []) + updated_tags: list[str] = list(current_tags) if isinstance(current_tags, list) else [] # Now create the write version - edge_apply = edge.as_write() + edge_apply: EdgeApply = edge.as_write() # Fetch existing RAW row to preserve all data raw_data: dict[str, Any] = {} try: - existing_row = self.client.raw.rows.retrieve( + existing_row: Any = self.client.raw.rows.retrieve( db_name=self.raw_db, table_name=self.raw_pattern_table, key=edge.external_id ) if existing_row and existing_row.columns: @@ -219,10 +324,10 @@ def _prepare_edge_update(self, edge: Edge, found_nodes) -> tuple[EdgeApply | Non self.logger.warning(f"Could not retrieve RAW row for edge {edge.external_id}: {e}") # Prepare update properties for the edge - update_properties: dict = {} + update_properties: dict[str, Any] = {} if len(found_nodes) == 1: # Success - single match found - matched_node = found_nodes[0] + matched_node: Node = found_nodes[0] self.logger.info(f"Found single match for '{edge_props.get('startNodeText')}'. 
Promoting edge.") # Update edge to point to the found entity @@ -236,8 +341,8 @@ def _prepare_edge_update(self, edge: Edge, found_nodes) -> tuple[EdgeApply | Non raw_data["status"] = DiagramAnnotationStatus.APPROVED.value # Get resource type from the matched entity - entity_props = matched_node.properties.get(self.target_entities_view.as_view_id(), {}) - resource_type = entity_props.get("resourceType") or entity_props.get("type") + entity_props: Any = matched_node.properties.get(self.target_entities_view.as_view_id(), {}) + resource_type: Any = entity_props.get("resourceType") or entity_props.get("type") if resource_type: raw_data["endNodeResourceType"] = resource_type @@ -264,6 +369,6 @@ def _prepare_edge_update(self, edge: Edge, found_nodes) -> tuple[EdgeApply | Non ) # Create RowWrite object for RAW table update - raw_row = RowWrite(key=edge.external_id, columns=raw_data) if raw_data else None + raw_row: RowWrite | None = RowWrite(key=edge.external_id, columns=raw_data) if raw_data else None return edge_apply, raw_row From c26494308000476053127018f69fd04e40d204d0 Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Thu, 16 Oct 2025 11:32:10 -0500 Subject: [PATCH 099/128] improved logging --- .../fn_file_annotation_promote/handler.py | 19 ++- .../services/CacheService.py | 27 +++-- .../services/PromoteService.py | 63 +++++++--- .../utils/DataStructures.py | 111 ++++++++++++++++++ 4 files changed, 191 insertions(+), 29 deletions(-) diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/handler.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/handler.py index d156c52f..38de4852 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/handler.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/handler.py @@ -14,6 +14,7 @@ from services.LoggerService import CogniteFunctionLogger from services.EntitySearchService 
import EntitySearchService from services.CacheService import CacheService +from utils.DataStructures import PromoteTracker def handle(data: dict, function_call_info: dict, client: CogniteClient) -> dict[str, str]: @@ -52,6 +53,7 @@ def handle(data: dict, function_call_info: dict, client: CogniteClient) -> dict[ config: Config config, client = create_config_service(function_data=data) logger: CogniteFunctionLogger = create_logger_service(data.get("logLevel", "DEBUG"), data.get("logPath")) + tracker: PromoteTracker = PromoteTracker() # Create service dependencies entity_search_service: EntitySearchService = create_entity_search_service(config, client, logger) @@ -62,10 +64,12 @@ def handle(data: dict, function_call_info: dict, client: CogniteClient) -> dict[ client=client, config=config, logger=logger, + tracker=tracker, entity_search_service=entity_search_service, cache_service=cache_service, ) + run_status: str = "success" try: # Run in a loop for a maximum of 7 minutes while datetime.now(timezone.utc) - start_time < timedelta(minutes=7): @@ -73,12 +77,19 @@ def handle(data: dict, function_call_info: dict, client: CogniteClient) -> dict[ if result == "Done": logger.info("No more candidates to process. 
Exiting.", section="END") break - time.sleep(10) # Pause between batches + # Log batch report and pause between batches + logger.info(tracker.generate_local_report(), section="START") + time.sleep(10) - return {"status": "success", "message": "promote function completed a cycle."} + return {"status": run_status, "data": data} except Exception as e: - logger.error(f"An unexpected error occurred: {e}", section="BOTH") - return {"status": "failure", "message": str(e)} + run_status = "failure" + msg: str = f"{str(e)}" + logger.error(f"An unexpected error occurred: {msg}", section="BOTH") + return {"status": run_status, "message": msg} + finally: + # Generate overall summary report + logger.info(tracker.generate_overall_report(), section="BOTH") def run_locally(config_file: dict) -> None: diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/CacheService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/CacheService.py index 12711918..ca13de50 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/CacheService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/CacheService.py @@ -61,15 +61,17 @@ class CacheService(ICacheService): **TIER 1: In-Memory Cache** (This Run Only): - Ultra-fast lookup (<1ms) - Dictionary stored in memory: {(text, type): (space, id) or None} - - Includes negative caching (remembers "no match found") + - **Includes negative caching** (remembers "no match found" to avoid repeated searches) - Cleared when function execution ends + - Used for: Both positive matches AND negative results (not found) **TIER 2: Persistent RAW Cache** (All Runs): - Fast lookup (5-10ms) - Stored in RAW table: promote_text_to_entity_cache - Benefits all future function runs indefinitely - Tracks hit count for analytics - - Only caches unambiguous single matches + - **Only caches 
positive matches** (unambiguous single entities found) + - Does NOT cache negative results (to allow for new entities added over time) **Performance Impact:** - First lookup: 50-100ms (query annotation edges) @@ -147,11 +149,11 @@ def get(self, text: str, annotation_type: str) -> Node | None: nodes=(space, ext_id), sources=view_id ) if retrieved: - self.logger.debug(f"✓ In-memory cache HIT for '{text}'") + self.logger.debug(f"✓ [CACHE] In-memory cache HIT for '{text}'") node: Node | None = self._extract_single_node(retrieved) return node except Exception as e: - self.logger.warning(f"Failed to retrieve cached node for '{text}': {e}") + self.logger.warning(f"[CACHE] Failed to retrieve cached node for '{text}': {e}") # Invalidate this cache entry del self._memory_cache[cache_key] return None @@ -159,7 +161,7 @@ def get(self, text: str, annotation_type: str) -> Node | None: # TIER 2: Persistent RAW cache (fast) cached_node: Node | None = self._get_from_persistent_cache(text, annotation_type) if cached_node: - self.logger.info(f"✓ Persistent cache HIT for '{text}'") + self.logger.info(f"✓ [CACHE] Persistent cache HIT for '{text}'") # Populate in-memory cache for future lookups in this run self._memory_cache[cache_key] = (cached_node.space, cached_node.external_id) return cached_node @@ -206,25 +208,28 @@ def set(self, text: str, annotation_type: str, node: Node | None) -> None: """ Caches an entity node for the given text and annotation type. - Only caches unambiguous single matches. Updates both in-memory and persistent caches. 
+ Caching behavior: + - Positive matches (node provided): Cached in BOTH in-memory AND persistent RAW + - Negative results (node=None): Cached ONLY in-memory (allows for new entities over time) Args: text: The text being cached annotation_type: Type of annotation - node: The entity node to cache, or None for negative caching + node: The entity node to cache, or None for negative caching (in-memory only) """ cache_key: tuple[str, str] = (text, annotation_type) if node is None: - # Negative cache entry (remember that no match was found) + # Negative cache entry (IN-MEMORY ONLY - not persisted to RAW) + # This avoids repeated searches within the same run but allows new entities added later self._memory_cache[cache_key] = None - self.logger.debug(f"✓ Cached negative result for '{text}'") + self.logger.debug(f"✓ [CACHE] Cached negative result for '{text}' (in-memory only)") return - # Positive cache entry + # Positive cache entry (BOTH in-memory AND persistent RAW) self._memory_cache[cache_key] = (node.space, node.external_id) self._set_in_persistent_cache(text, annotation_type, node) - self.logger.debug(f"✓ Cached unambiguous match for '{text}' → {node.external_id}") + self.logger.debug(f"✓ [CACHE] Cached positive match for '{text}' → {node.external_id} (in-memory + RAW)") def _get_from_persistent_cache(self, text: str, annotation_type: str) -> Node | None: """ diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/PromoteService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/PromoteService.py index 08c70f6a..3944c66c 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/PromoteService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/PromoteService.py @@ -15,7 +15,7 @@ from services.LoggerService import CogniteFunctionLogger from services.CacheService import 
CacheService from services.EntitySearchService import EntitySearchService -from utils.DataStructures import DiagramAnnotationStatus +from utils.DataStructures import DiagramAnnotationStatus, PromoteTracker class IPromoteService(abc.ABC): @@ -53,6 +53,7 @@ def __init__( client: CogniteClient, config: Config, logger: CogniteFunctionLogger, + tracker: PromoteTracker, entity_search_service: EntitySearchService, cache_service: CacheService, ): @@ -63,12 +64,14 @@ def __init__( client: CogniteClient for API interactions config: Configuration object containing data model views and settings logger: Logger instance for tracking execution + tracker: Performance tracker for metrics (edges promoted/rejected/ambiguous) entity_search_service: Service for finding entities by text (injected) cache_service: Service for caching text→entity mappings (injected) """ self.client = client self.config = config self.logger = logger + self.tracker = tracker self.core_annotation_view = self.config.data_model_views.core_annotation_view self.file_view = self.config.data_model_views.file_view self.target_entities_view = self.config.data_model_views.target_entities_view @@ -109,9 +112,11 @@ def run(self) -> Literal["Done"] | None: Raises: Exception: Any unexpected errors during processing are logged and re-raised. """ + self.logger.info("Starting Promote batch", section="START") + candidates: EdgeList | None = self._get_promote_candidates() if not candidates: - self.logger.info("No Promote candidates found.") + self.logger.info("No Promote candidates found.", section="END") return "Done" self.logger.info(f"Found {len(candidates)} Promote candidates. Starting processing.") @@ -130,13 +135,21 @@ def run(self) -> Literal["Done"] | None: grouped_candidates[key].append(edge) self.logger.info( - f"Grouped {len(candidates)} candidates into {len(grouped_candidates)} unique text/type combinations. " - f"Deduplication savings: {len(candidates) - len(grouped_candidates)} queries avoided." 
+ message=f"Grouped {len(candidates)} candidates into {len(grouped_candidates)} unique text/type combinations.", + ) + self.logger.info( + message=f"Deduplication savings: {len(candidates) - len(grouped_candidates)} queries avoided.", + section="END", ) edges_to_update: list[EdgeApply] = [] raw_rows_to_update: list[RowWrite] = [] + # Track results for this batch + batch_promoted: int = 0 + batch_rejected: int = 0 + batch_ambiguous: int = 0 + # Process each unique text/type combination once for (text_to_find, annotation_type), edges_with_same_text in grouped_candidates.items(): entity_space: str | None = ( @@ -152,20 +165,36 @@ def run(self) -> Literal["Done"] | None: # Strategy: Check cache → query edges → fallback to global search found_nodes: list[Node] | list = self._find_entity_with_cache(text_to_find, annotation_type, entity_space) + # Determine result type for tracking + num_edges: int = len(edges_with_same_text) + if len(found_nodes) == 1: + batch_promoted += num_edges + elif len(found_nodes) == 0: + batch_rejected += num_edges + else: # Multiple matches + batch_ambiguous += num_edges + # Apply the same result to ALL edges with this text for edge in edges_with_same_text: - edge_apply: EdgeApply | None - raw_row: RowWrite | None edge_apply, raw_row = self._prepare_edge_update(edge, found_nodes) - if edge_apply: + if edge_apply is not None: edges_to_update.append(edge_apply) - if raw_row: + if raw_row is not None: raw_rows_to_update.append(raw_row) + # Update tracker with batch results + self.tracker.add_edges(promoted=batch_promoted, rejected=batch_rejected, ambiguous=batch_ambiguous) + if edges_to_update: self.client.data_modeling.instances.apply(edges=edges_to_update) - self.logger.info(f"Successfully updated {len(edges_to_update)} edges in data model.") + self.logger.info( + f"Successfully updated {len(edges_to_update)} edges in data model:\n" + f" ├─ Promoted: {batch_promoted}\n" + f" ├─ Rejected: {batch_rejected}\n" + f" └─ Ambiguous: {batch_ambiguous}", 
+ section="END", + ) if raw_rows_to_update: self.client.raw.rows.insert( @@ -174,10 +203,10 @@ def run(self) -> Literal["Done"] | None: row=raw_rows_to_update, ensure_parent=True, ) - self.logger.info(f"Successfully updated {len(raw_rows_to_update)} rows in RAW table.") + self.logger.info(f"Successfully updated {len(raw_rows_to_update)} rows in RAW table.", section="END") if not edges_to_update and not raw_rows_to_update: - self.logger.info("No edges were updated in this run.") + self.logger.info("No edges were updated in this run.", section="END") return None # Continue running if more candidates might exist @@ -328,7 +357,9 @@ def _prepare_edge_update( if len(found_nodes) == 1: # Success - single match found matched_node: Node = found_nodes[0] - self.logger.info(f"Found single match for '{edge_props.get('startNodeText')}'. Promoting edge.") + self.logger.info( + f"✓ Found single match for '{edge_props.get('startNodeText')}' → {matched_node.external_id}. \n\t- Promoting edge: ({edge.space}, {edge.external_id})\n\t- Start node: ({edge.start_node.space}, {edge.start_node.external_id})." + ) # Update edge to point to the found entity edge_apply.end_node = DirectRelationReference(matched_node.space, matched_node.external_id) @@ -347,7 +378,9 @@ def _prepare_edge_update( raw_data["endNodeResourceType"] = resource_type elif len(found_nodes) == 0: # Failure - no match found - self.logger.info(f"Found no match for '{edge_props.get('startNodeText')}'. Rejecting edge.") + self.logger.info( + f"✗ No match found for '{edge_props.get('startNodeText')}'.\n\t- Rejecting edge: ({edge.space}, {edge.external_id})\n\t- Start node: ({edge.start_node.space}, {edge.start_node.external_id})." 
+ ) update_properties["status"] = DiagramAnnotationStatus.REJECTED.value updated_tags.append("PromoteAttempted") @@ -355,7 +388,9 @@ def _prepare_edge_update( raw_data["status"] = DiagramAnnotationStatus.REJECTED.value else: # Ambiguous - multiple matches found - self.logger.info(f"Found multiple matches for '{edge_props.get('startNodeText')}'. Marking as ambiguous.") + self.logger.info( + f"⚠ Multiple matches found for '{edge_props.get('startNodeText')}'.\n\t- Ambiguous edge: ({edge.space}, {edge.external_id})\n\t- Start node: ({edge.start_node.space}, {edge.start_node.external_id})." + ) updated_tags.extend(["PromoteAttempted", "AmbiguousMatch"]) # Don't change status, just add tags to RAW diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/utils/DataStructures.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/utils/DataStructures.py index b8b418bb..3f817694 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/utils/DataStructures.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/utils/DataStructures.py @@ -329,3 +329,114 @@ def reset(self) -> None: self.total_time_delta = timedelta(0) self.latest_run_time = datetime.now(timezone.utc) print("PerformanceTracker state has been reset") + + +@dataclass +class PromoteTracker: + """ + Tracks metrics for the promote function. 
+ + Metrics: + - edges_promoted: Edges successfully promoted (single match found) + - edges_rejected: Edges rejected (no match found) + - edges_ambiguous: Edges with ambiguous matches (multiple entities found) + - total_runs: Number of batches processed + - total_time_delta: Cumulative runtime + """ + + edges_promoted: int = 0 + edges_rejected: int = 0 + edges_ambiguous: int = 0 + total_runs: int = 0 + total_time_delta: timedelta = field(default_factory=lambda: timedelta(0)) + latest_run_time: datetime = field(default_factory=lambda: datetime.now(timezone.utc)) + + def _run_time(self) -> timedelta: + """Calculates time since last run started.""" + time_delta: timedelta = datetime.now(timezone.utc) - self.latest_run_time + return time_delta + + def _average_run_time(self) -> timedelta: + """Calculates average time per batch.""" + if self.total_runs == 0: + return timedelta(0) + return self.total_time_delta / self.total_runs + + def add_edges(self, promoted: int = 0, rejected: int = 0, ambiguous: int = 0) -> None: + """ + Adds edge counts to the tracker. + + Args: + promoted: Number of edges successfully promoted + rejected: Number of edges rejected (no match) + ambiguous: Number of edges with ambiguous matches + """ + self.edges_promoted += promoted + self.edges_rejected += rejected + self.edges_ambiguous += ambiguous + + def generate_local_report(self) -> str: + """ + Generates a report for the current batch. + + Returns: + String report with run time + """ + self.total_runs += 1 + time_delta: timedelta = self._run_time() + self.total_time_delta += time_delta + self.latest_run_time = datetime.now(timezone.utc) + + report: str = f"Batch run time: {time_delta}" + return report + + def generate_overall_report(self) -> str: + """ + Generates a comprehensive report for all runs. 
+ + Returns: + String report with all metrics + """ + total_edges: int = self.edges_promoted + self.edges_rejected + self.edges_ambiguous + report: str = ( + f"Promote Function Summary\n" + f"- Total runs: {self.total_runs}\n" + f"- Total edges processed: {total_edges}\n" + f" ├─ Promoted (auto): {self.edges_promoted}\n" + f" ├─ Rejected (no match): {self.edges_rejected}\n" + f" └─ Ambiguous (multiple matches): {self.edges_ambiguous}\n" + f"- Total run time: {self.total_time_delta}\n" + f"- Average run time: {self._average_run_time()}" + ) + return report + + def generate_ep_run(self, function_id: str | None, call_id: str | None) -> str: + """ + Generates a report string for extraction pipeline logging. + + Args: + function_id: Cognite Function ID + call_id: Cognite Function call ID + + Returns: + String report for extraction pipeline + """ + total_edges: int = self.edges_promoted + self.edges_rejected + self.edges_ambiguous + report: str = ( + f"(caller:Promote, function_id:{function_id}, call_id:{call_id}) - " + f"total edges processed: {total_edges} - " + f"promoted: {self.edges_promoted} - " + f"rejected: {self.edges_rejected} - " + f"ambiguous: {self.edges_ambiguous}" + ) + return report + + def reset(self) -> None: + """Resets all tracker metrics to initial state.""" + self.edges_promoted = 0 + self.edges_rejected = 0 + self.edges_ambiguous = 0 + self.total_runs = 0 + self.total_time_delta = timedelta(0) + self.latest_run_time = datetime.now(timezone.utc) + print("PromoteTracker state has been reset") From b0938dbc7daa1c64db67ff2c01c7c3bb8b108b73 Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Thu, 16 Oct 2025 13:00:42 -0500 Subject: [PATCH 100/128] added a promoteFunction config section --- .../cdf_file_annotation/default.config.yaml | 3 + .../ep_file_annotation.config.yaml | 35 +++ .../CONFIGURATION_GUIDE.md | 295 ++++++++++++++++++ .../dependencies.py | 11 +- .../services/CacheService.py | 2 +- .../services/ConfigService.py | 65 ++++ 
.../services/PromoteService.py | 56 +++- .../functions/functions.Function.yaml | 10 + 8 files changed, 461 insertions(+), 16 deletions(-) create mode 100644 modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/CONFIGURATION_GUIDE.md diff --git a/modules/contextualization/cdf_file_annotation/default.config.yaml b/modules/contextualization/cdf_file_annotation/default.config.yaml index 9389ec03..39c60941 100644 --- a/modules/contextualization/cdf_file_annotation/default.config.yaml +++ b/modules/contextualization/cdf_file_annotation/default.config.yaml @@ -19,6 +19,7 @@ rawTableDocDoc: annotation_documents_docs rawTableDocPattern: annotation_documents_patterns rawTableCache: annotation_entities_cache rawManualPatternsCatalog: manual_patterns_catalog +rawTablePromoteCache: promote_text_cache # used in /extraction_pipelines extractionPipelineExternalId: ep_file_annotation @@ -32,6 +33,8 @@ launchFunctionExternalId: fn_file_annotation_launch #NOTE: if this is changed, t launchFunctionVersion: v1.0.0 finalizeFunctionExternalId: fn_file_annotation_finalize #NOTE: if this is changed, then the folder holding the finalize function must be named the same as the new external ID finalizeFunctionVersion: v1.0.0 +promoteFunctionExternalId: fn_file_annotation_promote #NOTE: if this is changed, then the folder holding the promote function must be named the same as the new external ID +promoteFunctionVersion: v1.0.0 functionClientId: ${IDP_CLIENT_ID} functionClientSecret: ${IDP_CLIENT_SECRET} diff --git a/modules/contextualization/cdf_file_annotation/extraction_pipelines/ep_file_annotation.config.yaml b/modules/contextualization/cdf_file_annotation/extraction_pipelines/ep_file_annotation.config.yaml index 5316ef9e..fb18dd21 100644 --- a/modules/contextualization/cdf_file_annotation/extraction_pipelines/ep_file_annotation.config.yaml +++ b/modules/contextualization/cdf_file_annotation/extraction_pipelines/ep_file_annotation.config.yaml @@ -129,4 +129,39 
@@ config: rawTableDocTag: {{ rawTableDocTag }} rawTableDocDoc: {{ rawTableDocDoc }} rawTableDocPattern: {{ rawTableDocPattern }} + promoteFunction: + # Query configuration for finding candidate edges to promote + getCandidatesQuery: + targetView: + schemaSpace: cdf_cdm + externalId: CogniteDiagramAnnotation + version: v1 + filters: + - values: "Suggested" # Only process suggested annotations + negate: False + operator: Equals + targetProperty: status + - values: ["PromoteAttempted"] # Skip already attempted edges + negate: True + operator: In + targetProperty: tags + limit: 500 # Number of edges to process per batch + # RAW database configuration + rawDb: {{ rawDb }} + rawTableDocPattern: {{ rawTableDocPattern }} + rawTableDocTag: {{ rawTableDocTag }} + rawTableDocDoc: {{ rawTableDocDoc }} + # Entity search service configuration + entitySearchService: + enableExistingAnnotationsSearch: true # Primary: Query annotation edges (fast, checks existing annotation edges) + enableGlobalEntitySearch: true # Fallback: Global entity search - (slow, unstable as instance count grows) + maxEntitySearchLimit: 1000 # Max entities to fetch in global search + textNormalization: + removeSpecialCharacters: true # Remove non-alphanumeric characters + convertToLowercase: true # Normalize case + stripLeadingZeros: true # Handle "V-0912" vs "V-912" + generateVariations: true # Generate common text variations for matching + # Cache service configuration + cacheService: + cacheTableName: {{ rawTablePromoteCache }} diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/CONFIGURATION_GUIDE.md b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/CONFIGURATION_GUIDE.md new file mode 100644 index 00000000..b19cb081 --- /dev/null +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/CONFIGURATION_GUIDE.md @@ -0,0 +1,295 @@ +# Promote Function Configuration Guide + +## Overview + +The 
promote function now supports a dedicated `promoteFunction` configuration section in the extraction pipeline config, following the same pattern as `launchFunction` and `finalizeFunction`. This makes the function fully configurable and enables environment-specific tuning. + +The configuration is **organized by service interface**, matching the architectural pattern used throughout the file annotation system. + +## What Was Added + +### 1. Configuration Classes (ConfigService.py) + +The configuration follows a service-oriented structure, grouping settings by their respective service interfaces. + +#### `EntitySearchServiceConfig` + +Configuration for the EntitySearchService, controls entity search strategies and text normalization: + +- `enableExistingAnnotationsSearch` (bool): Enable primary search via annotation edges (fast, 50-100ms) +- `enableGlobalEntitySearch` (bool): Enable fallback global entity search (slow, 500ms-2s) +- `maxEntitySearchLimit` (int): Max entities to fetch in global search (default: 1000, max: 10000) +- `textNormalization` (TextNormalizationConfig): Text normalization settings (nested) + +#### `TextNormalizationConfig` + +Controls text normalization and variation generation (nested within `EntitySearchServiceConfig`): + +- `removeSpecialCharacters` (bool): Remove non-alphanumeric characters +- `convertToLowercase` (bool): Normalize case +- `stripLeadingZeros` (bool): Strip leading zeros (e.g., "V-0912" → "V-912") +- `generateVariations` (bool): Generate common text variations for matching + +#### `PromoteCacheServiceConfig` + +Configuration for the CacheService, controls caching behavior: + +- `cacheTableName` (str): RAW table for text→entity cache (default: "promote_text_to_entity_cache") + +#### `PromoteFunctionConfig` + +Main configuration for the promote function: + +- `getCandidatesQuery` (QueryConfig): Query for finding candidate edges to promote (includes `limit` field for batch size) +- `rawDb` (str): RAW database name +- 
`rawTableDocPattern` (str): RAW table for pattern-mode annotations +- `rawTableDocTag` (str): RAW table for tag annotations +- `rawTableDocDoc` (str): RAW table for document annotations +- `entitySearchService` (EntitySearchServiceConfig): Entity search service configuration +- `cacheService` (PromoteCacheServiceConfig): Cache service configuration + +**Note**: Batch size is controlled via the `limit` field in `getCandidatesQuery`. If set to `-1` (unlimited), defaults to 500. + +### 2. Updated Files + +- **ConfigService.py**: Added new config classes organized by service +- **PromoteService.py**: Updated to use config values instead of hardcoded constants +- **dependencies.py**: Updated to use config values when creating services +- **ep_file_annotation.config.yaml**: Added example `promoteFunction` section + +## Configuration Example + +Here's the complete `promoteFunction` section added to the extraction pipeline config: + +```yaml +promoteFunction: + # Query configuration for finding candidate edges to promote + getCandidatesQuery: + targetView: + schemaSpace: cdf_cdm + externalId: CogniteDiagramAnnotation + version: v1 + filters: + - values: "Suggested" # Only process suggested annotations + negate: False + operator: Equals + targetProperty: status + - values: ["PromoteAttempted"] # Skip already attempted edges + negate: True + operator: In + targetProperty: tags + limit: 500 # Number of edges to process per batch + + # RAW database configuration + rawDb: { { rawDb } } + rawTableDocPattern: { { rawTableDocPattern } } + rawTableDocTag: { { rawTableDocTag } } + rawTableDocDoc: { { rawTableDocDoc } } + + # Entity search service configuration + entitySearchService: + enableExistingAnnotationsSearch: true # Primary: Query annotation edges (fast) + enableGlobalEntitySearch: true # Fallback: Global entity search (slow) + maxEntitySearchLimit: 1000 # Max entities to fetch in global search + textNormalization: + removeSpecialCharacters: true # Remove non-alphanumeric 
characters + convertToLowercase: true # Normalize case + stripLeadingZeros: true # Handle "V-0912" vs "V-912" + generateVariations: true # Generate common text variations + + # Cache service configuration + cacheService: + cacheTableName: "promote_text_to_entity_cache" +``` + +## Service-Oriented Structure + +The configuration mirrors the actual service architecture: + +### EntitySearchService + +Handles finding entities by text using multiple strategies: + +- Primary: Query existing annotation edges +- Fallback: Global entity search +- Text normalization for matching + +**Config Section:** `entitySearchService` + +### CacheService + +Manages two-tier caching (in-memory + persistent RAW): + +- In-memory cache for this run +- Persistent RAW cache across all runs + +**Config Section:** `cacheService` + +This structure matches the patterns established in: + +- `launchFunction` → `dataModelService`, `cacheService`, `annotationService` +- `finalizeFunction` → `retrieveService`, `applyService` + +## Backward Compatibility + +The implementation includes full backward compatibility: + +1. **Optional Config Section**: The `promoteFunction` section is optional in the Config class +2. **Fallback Behavior**: If `promoteFunction` is not present, the function falls back to: + - RAW database config from `finalizeFunction.applyService` + - Hardcoded filter for candidate queries + - Default values for all other settings +3. **Warning Logs**: When falling back to old behavior, a warning is logged + +## Usage Examples + +### Example 1: Increase Batch Size for Better Performance + +```yaml +promoteFunction: + getCandidatesQuery: + targetView: + schemaSpace: cdf_cdm + externalId: CogniteDiagramAnnotation + version: v1 + filters: + - values: "Suggested" + operator: Equals + targetProperty: status + limit: 1000 # Process more edges per batch (increased from default 500) + # ... 
rest of config +``` + +### Example 2: Disable Global Search (Only Use Existing Annotations) + +```yaml +promoteFunction: + entitySearchService: + enableExistingAnnotationsSearch: true + enableGlobalEntitySearch: false # Skip slow global search + maxEntitySearchLimit: 1000 + textNormalization: + removeSpecialCharacters: true + convertToLowercase: true + stripLeadingZeros: true + generateVariations: true + # ... rest of config +``` + +### Example 3: Custom Query Filter + +```yaml +promoteFunction: + getCandidatesQuery: + targetView: + schemaSpace: cdf_cdm + externalId: CogniteDiagramAnnotation + version: v1 + filters: + - values: "Suggested" + operator: Equals + targetProperty: status + - values: ["HighPriority"] # Only promote high-priority edges + operator: In + targetProperty: tags + # ... rest of config +``` + +### Example 4: Separate Cache Per Environment + +```yaml +promoteFunction: + cacheService: + cacheTableName: "promote_text_to_entity_cache_prod" # Environment-specific cache + # ... rest of config +``` + +### Example 5: Adjust Text Normalization + +```yaml +promoteFunction: + entitySearchService: + enableExistingAnnotationsSearch: true + enableGlobalEntitySearch: true + maxEntitySearchLimit: 1000 + textNormalization: + removeSpecialCharacters: true + convertToLowercase: false # Preserve case sensitivity + stripLeadingZeros: false # Keep leading zeros + generateVariations: true + # ... rest of config +``` + +## Migration Guide + +### For Existing Deployments + +1. **No Immediate Action Required**: The function continues to work without the new config section +2. **Recommended**: Add the `promoteFunction` section to gain benefits: + - Flexible candidate filtering + - Performance tuning per environment + - Explicit configuration visibility + - Service-oriented organization + +### Adding Configuration to Existing Pipeline + +1. Open your extraction pipeline config file (e.g., `ep_file_annotation.config.yaml`) +2. 
Add the `promoteFunction` section after `finalizeFunction` +3. Customize values as needed for your environment +4. Deploy the updated configuration + +## Configuration Benefits + +1. **Service-Oriented**: Configuration mirrors actual service architecture +2. **Flexibility**: Easily adjust query filters without code changes +3. **Performance Tuning**: Optimize batch sizes and search strategies per environment +4. **Visibility**: All settings are explicitly documented in config +5. **Consistency**: Follows same pattern as launch/finalize functions +6. **Environment-Specific**: Different configs for dev/test/prod + +## Service Configuration Details + +### EntitySearchService Configuration + +Controls how the promote function finds matching entities: + +| Setting | Type | Default | Description | +| --------------------------------- | ---- | ------- | ------------------------------- | +| `enableExistingAnnotationsSearch` | bool | true | Primary search strategy (fast) | +| `enableGlobalEntitySearch` | bool | true | Fallback search strategy (slow) | +| `maxEntitySearchLimit` | int | 1000 | Max entities in global search | + +**Text Normalization Settings:** + +| Setting | Type | Default | Description | +| ------------------------- | ---- | ------- | ----------------------------- | +| `removeSpecialCharacters` | bool | true | Remove non-alphanumeric chars | +| `convertToLowercase` | bool | true | Case-insensitive matching | +| `stripLeadingZeros` | bool | true | Handle "V-0912" vs "V-912" | +| `generateVariations` | bool | true | Generate text variations | + +### CacheService Configuration + +Controls caching behavior for performance optimization: + +| Setting | Type | Default | Description | +| ---------------- | ---- | ------------------------------ | ------------------------------ | +| `cacheTableName` | str | "promote_text_to_entity_cache" | RAW table for persistent cache | + +## Future Enhancements + +The config structure supports easy addition of new features: + +- 
Retry logic configuration +- Tagging customization +- Batch processing delays +- Feature flags for A/B testing +- Additional service configurations + +## Questions? + +For questions or issues, refer to: + +- Main README: `cdf_file_annotation/README.md` +- Code documentation in ConfigService.py +- Example configs in extraction_pipelines/ diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/dependencies.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/dependencies.py index 44cff27b..f5788604 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/dependencies.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/dependencies.py @@ -203,10 +203,17 @@ def create_cache_service( """ from services.ConfigService import ViewPropertyConfig - raw_db: str = config.finalize_function.apply_service.raw_db file_view: ViewPropertyConfig = config.data_model_views.file_view target_entities_view: ViewPropertyConfig = config.data_model_views.target_entities_view + # Use promote_function config if available, otherwise fallback to finalize_function + if config.promote_function: + raw_db: str = config.promote_function.raw_db + cache_table_name: str = config.promote_function.cache_service.cache_table_name + else: + raw_db = config.finalize_function.apply_service.raw_db + cache_table_name = "promote_text_to_entity_cache" # Default + return CacheService( client=client, logger=logger, @@ -214,5 +221,5 @@ def create_cache_service( normalize_fn=entity_search_service.normalize, # Reuse normalization from entity search file_view_id=file_view.as_view_id(), target_entities_view_id=target_entities_view.as_view_id(), - cache_table_name="promote_text_to_entity_cache", + cache_table_name=cache_table_name, ) diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/CacheService.py 
b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/CacheService.py index ca13de50..c04b4bfd 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/CacheService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/CacheService.py @@ -277,7 +277,7 @@ def _get_from_persistent_cache(self, text: str, annotation_type: str) -> Node | except Exception as e: # Cache miss or error - just continue without cache - self.logger.debug(f"Cache check failed for '{text}': {e}") + self.logger.debug(f"[CACHE] Cache check failed for '{text}': {e}") return None def _set_in_persistent_cache(self, text: str, annotation_type: str, node: Node) -> None: diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/ConfigService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/ConfigService.py index 8b5bd257..90042022 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/ConfigService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/ConfigService.py @@ -220,6 +220,70 @@ class FinalizeFunction(BaseModel, alias_generator=to_camel): apply_service: ApplyServiceConfig +# Promote Related Configs +class TextNormalizationConfig(BaseModel, alias_generator=to_camel): + """ + Configuration for text normalization and variation generation. + + Controls how text is normalized for matching and what variations are generated + to improve match rates across different naming conventions. 
+ """ + + remove_special_characters: bool = True + convert_to_lowercase: bool = True + strip_leading_zeros: bool = True + generate_variations: bool = True + + +class EntitySearchServiceConfig(BaseModel, alias_generator=to_camel): + """ + Configuration for the EntitySearchService in the promote function. + + Controls entity search strategies and text normalization behavior: + - Primary: Query existing annotation edges (fast, 50-100ms) + - Fallback: Global entity search (slow, 500ms-2s) + - Text normalization for improved matching + """ + + enable_existing_annotations_search: bool = True + enable_global_entity_search: bool = True + max_entity_search_limit: int = Field(default=1000, gt=0, le=10000) + text_normalization: TextNormalizationConfig + + +class PromoteCacheServiceConfig(BaseModel, alias_generator=to_camel): + """ + Configuration for the CacheService in the promote function. + + Controls caching behavior for text→entity mappings. + """ + + cache_table_name: str + + +class PromoteFunctionConfig(BaseModel, alias_generator=to_camel): + """ + Configuration for the promote function. + + The promote function resolves pattern-mode annotations by finding matching entities + and updating annotation edges from pointing to a sink node to pointing to actual entities. + + Configuration is organized by service interface: + - entitySearchService: Controls entity search strategies + - cacheService: Controls caching behavior + + Batch size is controlled via getCandidatesQuery.limit field. 
+ """ + + get_candidates_query: QueryConfig | list[QueryConfig] + raw_db: str + raw_table_doc_pattern: str + raw_table_doc_tag: str + raw_table_doc_doc: str + entity_search_service: EntitySearchServiceConfig + cache_service: PromoteCacheServiceConfig + + class DataModelViews(BaseModel, alias_generator=to_camel): core_annotation_view: ViewPropertyConfig annotation_state_view: ViewPropertyConfig @@ -232,6 +296,7 @@ class Config(BaseModel, alias_generator=to_camel): prepare_function: PrepareFunction launch_function: LaunchFunction finalize_function: FinalizeFunction + promote_function: Optional[PromoteFunctionConfig] = None @classmethod def parse_direct_relation(cls, value: Any) -> Any: diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/PromoteService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/PromoteService.py index 3944c66c..7bc1f425 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/PromoteService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/PromoteService.py @@ -11,7 +11,7 @@ DirectRelationReference, NodeList, ) -from services.ConfigService import Config +from services.ConfigService import Config, build_filter_from_query, get_limit_from_query from services.LoggerService import CogniteFunctionLogger from services.CacheService import CacheService from services.EntitySearchService import EntitySearchService @@ -75,14 +75,29 @@ def __init__( self.core_annotation_view = self.config.data_model_views.core_annotation_view self.file_view = self.config.data_model_views.file_view self.target_entities_view = self.config.data_model_views.target_entities_view + + # Sink node reference (from finalize_function config as it's shared) self.sink_node_ref = DirectRelationReference( space=self.config.finalize_function.apply_service.sink_node.space, 
external_id=self.config.finalize_function.apply_service.sink_node.external_id, ) - self.raw_db = self.config.finalize_function.apply_service.raw_db - self.raw_pattern_table = self.config.finalize_function.apply_service.raw_table_doc_pattern - self.raw_doc_doc_table = self.config.finalize_function.apply_service.raw_table_doc_doc - self.raw_doc_tag_table = self.config.finalize_function.apply_service.raw_table_doc_tag + + # RAW database and table configuration + # Prefer promote_function config if available, otherwise fallback to finalize_function config + if self.config.promote_function: + self.raw_db = self.config.promote_function.raw_db + self.raw_pattern_table = self.config.promote_function.raw_table_doc_pattern + self.raw_doc_doc_table = self.config.promote_function.raw_table_doc_doc + self.raw_doc_tag_table = self.config.promote_function.raw_table_doc_tag + else: + # Backward compatibility: use finalize_function config + self.logger.warning( + "promote_function config not found. Using finalize_function config for backward compatibility." + ) + self.raw_db = self.config.finalize_function.apply_service.raw_db + self.raw_pattern_table = self.config.finalize_function.apply_service.raw_table_doc_pattern + self.raw_doc_doc_table = self.config.finalize_function.apply_service.raw_table_doc_doc + self.raw_doc_tag_table = self.config.finalize_function.apply_service.raw_table_doc_tag # Injected service dependencies self.entity_search_service = entity_search_service @@ -214,7 +229,10 @@ def _get_promote_candidates(self) -> EdgeList | None: """ Retrieves pattern-mode annotation edges that are candidates for promotion. - Queries for edges where: + Uses query configuration from promote_function config if available, otherwise falls back + to hardcoded filter for backward compatibility. 
+ + Default query criteria (when no config): - End node is the sink node (placeholder for unresolved entities) - Status is "Suggested" (not yet approved/rejected) - Tags do not contain "PromoteAttempted" (haven't been processed yet) @@ -224,12 +242,18 @@ def _get_promote_candidates(self) -> EdgeList | None: Returns: EdgeList of candidate edges, or None if no candidates found. - Limited to 500 edges per batch for performance. + Limited by getCandidatesQuery.limit (default 500 if -1/unlimited). """ - return self.client.data_modeling.instances.list( - instance_type="edge", - sources=[self.core_annotation_view.as_view_id()], - filter={ + # Use query config if available + if self.config.promote_function and self.config.promote_function.get_candidates_query: + query_filter = build_filter_from_query(self.config.promote_function.get_candidates_query) + limit = get_limit_from_query(self.config.promote_function.get_candidates_query) + # If limit is -1 (unlimited), use sensible default + if limit == -1: + limit = 500 + else: + # Backward compatibility: hardcoded filter + query_filter = { "and": [ { "equals": { @@ -247,8 +271,14 @@ def _get_promote_candidates(self) -> EdgeList | None: } }, ] - }, - limit=500, # Batch size + } + limit = 500 # Default batch size + + return self.client.data_modeling.instances.list( + instance_type="edge", + sources=[self.core_annotation_view.as_view_id()], + filter=query_filter, + limit=limit, ) def _find_entity_with_cache(self, text: str, annotation_type: str, entity_space: str) -> list[Node] | list: diff --git a/modules/contextualization/cdf_file_annotation/functions/functions.Function.yaml b/modules/contextualization/cdf_file_annotation/functions/functions.Function.yaml index 11e711bf..7d95801b 100644 --- a/modules/contextualization/cdf_file_annotation/functions/functions.Function.yaml +++ b/modules/contextualization/cdf_file_annotation/functions/functions.Function.yaml @@ -18,3 +18,13 @@ runtime: "py311" functionPath: "handler.py" +- name: 
Promote File Annotations + externalId: {{ promoteFunctionExternalId }} + owner: "Anonymous" + description: "Automatically promote suggested pattern mode annotations created by the finalize function if it exists." + metadata: + version: {{ promoteFunctionVersion }} + + runtime: "py311" + functionPath: "handler.py" + From 2de30be47de78d838579dfd3eb39242125d5f51f Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Thu, 16 Oct 2025 17:25:02 -0500 Subject: [PATCH 101/128] implemented the promote service flags --- .../ep_file_annotation.config.yaml | 10 +- .../CONFIGURATION_GUIDE.md | 295 ------------------ .../dependencies.py | 45 +-- .../services/CacheService.py | 29 +- .../services/ConfigService.py | 6 +- .../services/EntitySearchService.py | 132 +++++--- 6 files changed, 120 insertions(+), 397 deletions(-) delete mode 100644 modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/CONFIGURATION_GUIDE.md diff --git a/modules/contextualization/cdf_file_annotation/extraction_pipelines/ep_file_annotation.config.yaml b/modules/contextualization/cdf_file_annotation/extraction_pipelines/ep_file_annotation.config.yaml index fb18dd21..79b4f41b 100644 --- a/modules/contextualization/cdf_file_annotation/extraction_pipelines/ep_file_annotation.config.yaml +++ b/modules/contextualization/cdf_file_annotation/extraction_pipelines/ep_file_annotation.config.yaml @@ -146,22 +146,18 @@ config: operator: In targetProperty: tags limit: 500 # Number of edges to process per batch - # RAW database configuration rawDb: {{ rawDb }} rawTableDocPattern: {{ rawTableDocPattern }} rawTableDocTag: {{ rawTableDocTag }} rawTableDocDoc: {{ rawTableDocDoc }} - # Entity search service configuration entitySearchService: enableExistingAnnotationsSearch: true # Primary: Query annotation edges (fast, checks existing annotation edges) enableGlobalEntitySearch: true # Fallback: Global entity search - (slow, unstable as instance count grows) maxEntitySearchLimit: 1000 # Max entities to 
fetch in global search textNormalization: - removeSpecialCharacters: true # Remove non-alphanumeric characters - convertToLowercase: true # Normalize case - stripLeadingZeros: true # Handle "V-0912" vs "V-912" - generateVariations: true # Generate common text variations for matching - # Cache service configuration + removeSpecialCharacters: true # Remove non-alphanumeric characters (e.g., "V-0912" → "V0912") + convertToLowercase: true # Convert to lowercase (e.g., "V0912" → "v0912") + stripLeadingZeros: true # Remove leading zeros (e.g., "v0912" → "v912") cacheService: cacheTableName: {{ rawTablePromoteCache }} diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/CONFIGURATION_GUIDE.md b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/CONFIGURATION_GUIDE.md deleted file mode 100644 index b19cb081..00000000 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/CONFIGURATION_GUIDE.md +++ /dev/null @@ -1,295 +0,0 @@ -# Promote Function Configuration Guide - -## Overview - -The promote function now supports a dedicated `promoteFunction` configuration section in the extraction pipeline config, following the same pattern as `launchFunction` and `finalizeFunction`. This makes the function fully configurable and enables environment-specific tuning. - -The configuration is **organized by service interface**, matching the architectural pattern used throughout the file annotation system. - -## What Was Added - -### 1. Configuration Classes (ConfigService.py) - -The configuration follows a service-oriented structure, grouping settings by their respective service interfaces. 
- -#### `EntitySearchServiceConfig` - -Configuration for the EntitySearchService, controls entity search strategies and text normalization: - -- `enableExistingAnnotationsSearch` (bool): Enable primary search via annotation edges (fast, 50-100ms) -- `enableGlobalEntitySearch` (bool): Enable fallback global entity search (slow, 500ms-2s) -- `maxEntitySearchLimit` (int): Max entities to fetch in global search (default: 1000, max: 10000) -- `textNormalization` (TextNormalizationConfig): Text normalization settings (nested) - -#### `TextNormalizationConfig` - -Controls text normalization and variation generation (nested within `EntitySearchServiceConfig`): - -- `removeSpecialCharacters` (bool): Remove non-alphanumeric characters -- `convertToLowercase` (bool): Normalize case -- `stripLeadingZeros` (bool): Strip leading zeros (e.g., "V-0912" → "V-912") -- `generateVariations` (bool): Generate common text variations for matching - -#### `PromoteCacheServiceConfig` - -Configuration for the CacheService, controls caching behavior: - -- `cacheTableName` (str): RAW table for text→entity cache (default: "promote_text_to_entity_cache") - -#### `PromoteFunctionConfig` - -Main configuration for the promote function: - -- `getCandidatesQuery` (QueryConfig): Query for finding candidate edges to promote (includes `limit` field for batch size) -- `rawDb` (str): RAW database name -- `rawTableDocPattern` (str): RAW table for pattern-mode annotations -- `rawTableDocTag` (str): RAW table for tag annotations -- `rawTableDocDoc` (str): RAW table for document annotations -- `entitySearchService` (EntitySearchServiceConfig): Entity search service configuration -- `cacheService` (PromoteCacheServiceConfig): Cache service configuration - -**Note**: Batch size is controlled via the `limit` field in `getCandidatesQuery`. If set to `-1` (unlimited), defaults to 500. - -### 2. 
Updated Files - -- **ConfigService.py**: Added new config classes organized by service -- **PromoteService.py**: Updated to use config values instead of hardcoded constants -- **dependencies.py**: Updated to use config values when creating services -- **ep_file_annotation.config.yaml**: Added example `promoteFunction` section - -## Configuration Example - -Here's the complete `promoteFunction` section added to the extraction pipeline config: - -```yaml -promoteFunction: - # Query configuration for finding candidate edges to promote - getCandidatesQuery: - targetView: - schemaSpace: cdf_cdm - externalId: CogniteDiagramAnnotation - version: v1 - filters: - - values: "Suggested" # Only process suggested annotations - negate: False - operator: Equals - targetProperty: status - - values: ["PromoteAttempted"] # Skip already attempted edges - negate: True - operator: In - targetProperty: tags - limit: 500 # Number of edges to process per batch - - # RAW database configuration - rawDb: { { rawDb } } - rawTableDocPattern: { { rawTableDocPattern } } - rawTableDocTag: { { rawTableDocTag } } - rawTableDocDoc: { { rawTableDocDoc } } - - # Entity search service configuration - entitySearchService: - enableExistingAnnotationsSearch: true # Primary: Query annotation edges (fast) - enableGlobalEntitySearch: true # Fallback: Global entity search (slow) - maxEntitySearchLimit: 1000 # Max entities to fetch in global search - textNormalization: - removeSpecialCharacters: true # Remove non-alphanumeric characters - convertToLowercase: true # Normalize case - stripLeadingZeros: true # Handle "V-0912" vs "V-912" - generateVariations: true # Generate common text variations - - # Cache service configuration - cacheService: - cacheTableName: "promote_text_to_entity_cache" -``` - -## Service-Oriented Structure - -The configuration mirrors the actual service architecture: - -### EntitySearchService - -Handles finding entities by text using multiple strategies: - -- Primary: Query existing 
annotation edges -- Fallback: Global entity search -- Text normalization for matching - -**Config Section:** `entitySearchService` - -### CacheService - -Manages two-tier caching (in-memory + persistent RAW): - -- In-memory cache for this run -- Persistent RAW cache across all runs - -**Config Section:** `cacheService` - -This structure matches the patterns established in: - -- `launchFunction` → `dataModelService`, `cacheService`, `annotationService` -- `finalizeFunction` → `retrieveService`, `applyService` - -## Backward Compatibility - -The implementation includes full backward compatibility: - -1. **Optional Config Section**: The `promoteFunction` section is optional in the Config class -2. **Fallback Behavior**: If `promoteFunction` is not present, the function falls back to: - - RAW database config from `finalizeFunction.applyService` - - Hardcoded filter for candidate queries - - Default values for all other settings -3. **Warning Logs**: When falling back to old behavior, a warning is logged - -## Usage Examples - -### Example 1: Increase Batch Size for Better Performance - -```yaml -promoteFunction: - getCandidatesQuery: - targetView: - schemaSpace: cdf_cdm - externalId: CogniteDiagramAnnotation - version: v1 - filters: - - values: "Suggested" - operator: Equals - targetProperty: status - limit: 1000 # Process more edges per batch (increased from default 500) - # ... rest of config -``` - -### Example 2: Disable Global Search (Only Use Existing Annotations) - -```yaml -promoteFunction: - entitySearchService: - enableExistingAnnotationsSearch: true - enableGlobalEntitySearch: false # Skip slow global search - maxEntitySearchLimit: 1000 - textNormalization: - removeSpecialCharacters: true - convertToLowercase: true - stripLeadingZeros: true - generateVariations: true - # ... 
rest of config -``` - -### Example 3: Custom Query Filter - -```yaml -promoteFunction: - getCandidatesQuery: - targetView: - schemaSpace: cdf_cdm - externalId: CogniteDiagramAnnotation - version: v1 - filters: - - values: "Suggested" - operator: Equals - targetProperty: status - - values: ["HighPriority"] # Only promote high-priority edges - operator: In - targetProperty: tags - # ... rest of config -``` - -### Example 4: Separate Cache Per Environment - -```yaml -promoteFunction: - cacheService: - cacheTableName: "promote_text_to_entity_cache_prod" # Environment-specific cache - # ... rest of config -``` - -### Example 5: Adjust Text Normalization - -```yaml -promoteFunction: - entitySearchService: - enableExistingAnnotationsSearch: true - enableGlobalEntitySearch: true - maxEntitySearchLimit: 1000 - textNormalization: - removeSpecialCharacters: true - convertToLowercase: false # Preserve case sensitivity - stripLeadingZeros: false # Keep leading zeros - generateVariations: true - # ... rest of config -``` - -## Migration Guide - -### For Existing Deployments - -1. **No Immediate Action Required**: The function continues to work without the new config section -2. **Recommended**: Add the `promoteFunction` section to gain benefits: - - Flexible candidate filtering - - Performance tuning per environment - - Explicit configuration visibility - - Service-oriented organization - -### Adding Configuration to Existing Pipeline - -1. Open your extraction pipeline config file (e.g., `ep_file_annotation.config.yaml`) -2. Add the `promoteFunction` section after `finalizeFunction` -3. Customize values as needed for your environment -4. Deploy the updated configuration - -## Configuration Benefits - -1. **Service-Oriented**: Configuration mirrors actual service architecture -2. **Flexibility**: Easily adjust query filters without code changes -3. **Performance Tuning**: Optimize batch sizes and search strategies per environment -4. 
**Visibility**: All settings are explicitly documented in config -5. **Consistency**: Follows same pattern as launch/finalize functions -6. **Environment-Specific**: Different configs for dev/test/prod - -## Service Configuration Details - -### EntitySearchService Configuration - -Controls how the promote function finds matching entities: - -| Setting | Type | Default | Description | -| --------------------------------- | ---- | ------- | ------------------------------- | -| `enableExistingAnnotationsSearch` | bool | true | Primary search strategy (fast) | -| `enableGlobalEntitySearch` | bool | true | Fallback search strategy (slow) | -| `maxEntitySearchLimit` | int | 1000 | Max entities in global search | - -**Text Normalization Settings:** - -| Setting | Type | Default | Description | -| ------------------------- | ---- | ------- | ----------------------------- | -| `removeSpecialCharacters` | bool | true | Remove non-alphanumeric chars | -| `convertToLowercase` | bool | true | Case-insensitive matching | -| `stripLeadingZeros` | bool | true | Handle "V-0912" vs "V-912" | -| `generateVariations` | bool | true | Generate text variations | - -### CacheService Configuration - -Controls caching behavior for performance optimization: - -| Setting | Type | Default | Description | -| ---------------- | ---- | ------------------------------ | ------------------------------ | -| `cacheTableName` | str | "promote_text_to_entity_cache" | RAW table for persistent cache | - -## Future Enhancements - -The config structure supports easy addition of new features: - -- Retry logic configuration -- Tagging customization -- Batch processing delays -- Feature flags for A/B testing -- Additional service configurations - -## Questions? 
- -For questions or issues, refer to: - -- Main README: `cdf_file_annotation/README.md` -- Code documentation in ConfigService.py -- Example configs in extraction_pipelines/ diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/dependencies.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/dependencies.py index f5788604..831b5447 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/dependencies.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/dependencies.py @@ -147,11 +147,10 @@ def create_entity_search_service( """ Creates an EntitySearchService instance for finding entities by text. - Factory function that initializes EntitySearchService with all required dependencies - extracted from the configuration. + Factory function that initializes EntitySearchService with configuration. Args: - config: Configuration object containing data model views + config: Configuration object containing data model views and entity search settings client: CogniteClient for API interactions logger: Logger instance for tracking execution @@ -161,25 +160,7 @@ def create_entity_search_service( Raises: ValueError: If regular_annotation_space (file_view.instance_space) is None """ - # Get required configuration - from services.ConfigService import ViewPropertyConfig - - core_annotation_view: ViewPropertyConfig = config.data_model_views.core_annotation_view - file_view: ViewPropertyConfig = config.data_model_views.file_view - target_entities_view: ViewPropertyConfig = config.data_model_views.target_entities_view - regular_annotation_space: str | None = file_view.instance_space - - if not regular_annotation_space: - raise ValueError("regular_annotation_space (file_view.instance_space) is required but was None") - - return EntitySearchService( - client=client, - logger=logger, - 
core_annotation_view_id=core_annotation_view.as_view_id(), - file_view_id=file_view.as_view_id(), - target_entities_view_id=target_entities_view.as_view_id(), - regular_annotation_space=regular_annotation_space, - ) + return EntitySearchService(config=config, client=client, logger=logger) def create_cache_service( @@ -188,7 +169,7 @@ def create_cache_service( """ Creates a CacheService instance for caching text→entity mappings. - Factory function that initializes CacheService with all required dependencies. + Factory function that initializes CacheService with configuration. Importantly, reuses the normalize() function from EntitySearchService to ensure consistent text normalization between caching and searching. @@ -201,25 +182,9 @@ def create_cache_service( Returns: Initialized CacheService instance """ - from services.ConfigService import ViewPropertyConfig - - file_view: ViewPropertyConfig = config.data_model_views.file_view - target_entities_view: ViewPropertyConfig = config.data_model_views.target_entities_view - - # Use promote_function config if available, otherwise fallback to finalize_function - if config.promote_function: - raw_db: str = config.promote_function.raw_db - cache_table_name: str = config.promote_function.cache_service.cache_table_name - else: - raw_db = config.finalize_function.apply_service.raw_db - cache_table_name = "promote_text_to_entity_cache" # Default - return CacheService( + config=config, client=client, logger=logger, - raw_db=raw_db, normalize_fn=entity_search_service.normalize, # Reuse normalization from entity search - file_view_id=file_view.as_view_id(), - target_entities_view_id=target_entities_view.as_view_id(), - cache_table_name=cache_table_name, ) diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/CacheService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/CacheService.py index c04b4bfd..b1ace62c 100644 --- 
a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/CacheService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/CacheService.py @@ -6,6 +6,7 @@ from cognite.client.data_classes.data_modeling.ids import ViewId from cognite.client.data_classes.raw import Row from services.LoggerService import CogniteFunctionLogger +from services.ConfigService import Config, ViewPropertyConfig class ICacheService(abc.ABC): @@ -82,33 +83,37 @@ class CacheService(ICacheService): def __init__( self, + config: Config, client: CogniteClient, logger: CogniteFunctionLogger, - raw_db: str, normalize_fn: Callable[[str], str], - file_view_id: ViewId, - target_entities_view_id: ViewId, - cache_table_name: str = "promote_text_to_entity_cache", ): """ Initializes the cache service. Args: + config: Configuration object containing data model views and cache settings client: Cognite client logger: Logger instance - raw_db: RAW database name normalize_fn: Function to normalize text for cache keys - file_view_id: View ID for file entities - target_entities_view_id: View ID for target entities (assets, etc.) 
- cache_table_name: Name of the RAW table for persistent cache """ self.client = client self.logger = logger - self.raw_db = raw_db + self.config = config self.normalize = normalize_fn - self.file_view_id = file_view_id - self.target_entities_view_id = target_entities_view_id - self.cache_table_name = cache_table_name + + # Extract view configurations + file_view: ViewPropertyConfig = config.data_model_views.file_view + target_entities_view: ViewPropertyConfig = config.data_model_views.target_entities_view + + # Extract view IDs + self.file_view_id = file_view.as_view_id() + self.target_entities_view_id = target_entities_view.as_view_id() + + # Extract RAW database and cache table configuration + self.raw_db: str = config.promote_function.raw_db + self.cache_table_name: str = config.promote_function.cache_service.cache_table_name + self.function_id = "fn_file_annotation_promote" # In-memory cache: {(text, type): (space, ext_id) or None} diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/ConfigService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/ConfigService.py index 90042022..765eb539 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/ConfigService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/ConfigService.py @@ -227,12 +227,14 @@ class TextNormalizationConfig(BaseModel, alias_generator=to_camel): Controls how text is normalized for matching and what variations are generated to improve match rates across different naming conventions. + + These flags affect both the normalize() function (for cache keys and direct matching) + and generate_text_variations() function (for query-based matching). 
""" remove_special_characters: bool = True convert_to_lowercase: bool = True strip_leading_zeros: bool = True - generate_variations: bool = True class EntitySearchServiceConfig(BaseModel, alias_generator=to_camel): @@ -296,7 +298,7 @@ class Config(BaseModel, alias_generator=to_camel): prepare_function: PrepareFunction launch_function: LaunchFunction finalize_function: FinalizeFunction - promote_function: Optional[PromoteFunctionConfig] = None + promote_function: PromoteFunctionConfig @classmethod def parse_direct_relation(cls, value: Any) -> Any: diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/EntitySearchService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/EntitySearchService.py index ebab1908..6fc11a01 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/EntitySearchService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/EntitySearchService.py @@ -5,6 +5,7 @@ from cognite.client.data_classes.data_modeling import Node, NodeList, ViewId from cognite.client.data_classes.filters import Filter, Equals, In from services.LoggerService import CogniteFunctionLogger +from services.ConfigService import Config, ViewPropertyConfig class IEntitySearchService(abc.ABC): @@ -52,30 +53,37 @@ class EntitySearchService(IEntitySearchService): def __init__( self, + config: Config, client: CogniteClient, logger: CogniteFunctionLogger, - core_annotation_view_id: ViewId, - file_view_id: ViewId, - target_entities_view_id: ViewId, - regular_annotation_space: str, ): """ Initializes the entity search service. 
Args: + config: Configuration object containing data model views and entity search settings client: Cognite client logger: Logger instance - core_annotation_view_id: View ID for annotation edges - file_view_id: View ID for file entities - target_entities_view_id: View ID for target entities (assets, etc.) - regular_annotation_space: Space where regular (non-pattern) annotations are stored + + Raises: + ValueError: If regular_annotation_space (file_view.instance_space) is None """ self.client = client self.logger = logger - self.core_annotation_view_id = core_annotation_view_id - self.file_view_id = file_view_id - self.target_entities_view_id = target_entities_view_id - self.regular_annotation_space = regular_annotation_space + self.config = config + + # Extract view IDs + self.core_annotation_view_id = config.data_model_views.core_annotation_view.as_view_id() + self.file_view_id = config.data_model_views.file_view.as_view_id() + self.target_entities_view_id = config.data_model_views.target_entities_view.as_view_id() + + # Extract regular annotation space + self.regular_annotation_space: str | None = config.data_model_views.file_view.instance_space + if not self.regular_annotation_space: + raise ValueError("regular_annotation_space (file_view.instance_space) is required but was None") + + # Extract text normalization config + self.text_normalization_config = config.promote_function.entity_search_service.text_normalization def find_entity(self, text: str, annotation_type: str, entity_space: str) -> list[Node]: """ @@ -274,63 +282,105 @@ def generate_text_variations(self, text: str) -> list[str]: """ Generates common variations of a text string to improve matching. 
- Examples: - "14-V-0937" → ["14-V-0937", "14-V-937", "14-v-0937", "14-v-937"] - "P&ID-001" → ["P&ID-001", "P&ID-1", "p&id-001", "p&id-1"] + Respects text_normalization_config settings: + - removeSpecialCharacters: Generate variations without special characters + - convertToLowercase: Generate lowercase variations + - stripLeadingZeros: Generate variations with leading zeros removed + + Examples (all flags enabled): + "V-0912" → ["V-0912", "v-0912", "V-912", "v-912", "V0912", "v0912", "V912", "v912"] + "P&ID-001" → ["P&ID-001", "p&id-001", "P&ID-1", "p&id-1", "PID001", "pid001", "PID1", "pid1"] + + Examples (all flags disabled): + "V-0912" → ["V-0912"] # Only original Args: text: Original text from pattern detection Returns: - List of text variations (original + common transformations) + List of text variations based on config settings """ variations: set[str] = set() variations.add(text) # Always include original - # Add lowercase version - variations.add(text.lower()) - - # Remove leading zeros from number sequences + # Helper function to strip leading zeros def strip_leading_zeros_in_text(s: str) -> str: return re.sub(r"\b0+(\d+)", r"\1", s) - variations.add(strip_leading_zeros_in_text(text)) - variations.add(strip_leading_zeros_in_text(text.lower())) - - return list(variations) + # Helper function to remove special characters + def remove_special_chars(s: str) -> str: + return re.sub(r"[^a-zA-Z0-9]", "", s) + + # Generate all combinations of transformations systematically + # We'll build up variations by applying each transformation flag + base_variations: set[str] = {text} + + # Apply removeSpecialCharacters transformations + if self.text_normalization_config.remove_special_characters: + new_variations: set[str] = set() + for v in base_variations: + new_variations.add(remove_special_chars(v)) + base_variations.update(new_variations) + + # Apply convertToLowercase transformations + if self.text_normalization_config.convert_to_lowercase: + new_variations = set() 
+ for v in base_variations: + new_variations.add(v.lower()) + base_variations.update(new_variations) + + # Apply stripLeadingZeros transformations + if self.text_normalization_config.strip_leading_zeros: + new_variations = set() + for v in base_variations: + new_variations.add(strip_leading_zeros_in_text(v)) + base_variations.update(new_variations) + + return list(base_variations) def normalize(self, s: str) -> str: """ - Normalizes a string for comparison. + Normalizes a string for comparison based on text_normalization_config settings. - Process: - 1. Ensures it's a string - 2. Removes all non-alphanumeric characters - 3. Converts to lowercase - 4. Removes leading zeros from any sequence of digits + Applies transformations in sequence based on config: + 1. removeSpecialCharacters: Remove non-alphanumeric characters + 2. convertToLowercase: Convert to lowercase + 3. stripLeadingZeros: Remove leading zeros from number sequences - Examples: + Examples (all flags enabled): "V-0912" -> "v912" "FT-101A" -> "ft101a" "P&ID-0001" -> "pid1" + Examples (all flags disabled): + "V-0912" -> "V-0912" # No transformation + + Examples (only removeSpecialCharacters): + "V-0912" -> "V0912" # Special chars removed, case and zeros preserved + Args: s: String to normalize Returns: - Normalized string + Normalized string based on config settings """ if not isinstance(s, str): return "" - # Step 1: Basic cleaning (e.g., "V-0912" -> "v0912") - s = re.sub(r"[^a-zA-Z0-9]", "", s).lower() + # Apply transformations based on config + if self.text_normalization_config.remove_special_characters: + s = re.sub(r"[^a-zA-Z0-9]", "", s) + + if self.text_normalization_config.convert_to_lowercase: + s = s.lower() + + if self.text_normalization_config.strip_leading_zeros: + # Define a replacer function that converts any matched number to an int and back to a string + def strip_leading_zeros(match): + # match.group(0) is the matched string (e.g., "0912") + return str(int(match.group(0))) - # Step 2: 
Define a replacer function that converts any matched number to an int and back to a string - def strip_leading_zeros(match): - # match.group(0) is the matched string (e.g., "0912") - return str(int(match.group(0))) + # Apply the replacer function to all sequences of digits (\d+) in the string + s = re.sub(r"\d+", strip_leading_zeros, s) - # Step 3: Apply the replacer function to all sequences of digits (\d+) in the string - # This turns "v0912" into "v912" - return re.sub(r"\d+", strip_leading_zeros, s) + return s From ef2b7466c7edb5cb36916c1dc72e3d9e4d45ae15 Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Thu, 16 Oct 2025 17:54:40 -0500 Subject: [PATCH 102/128] refactored the existing_annotation and global_entity functions --- .../cdf_file_annotation/default.config.yaml | 2 +- .../services/CacheService.py | 10 +- .../services/ConfigService.py | 8 +- .../services/EntitySearchService.py | 129 +++++++++--------- .../services/PromoteService.py | 10 +- 5 files changed, 81 insertions(+), 78 deletions(-) diff --git a/modules/contextualization/cdf_file_annotation/default.config.yaml b/modules/contextualization/cdf_file_annotation/default.config.yaml index 39c60941..a020bfc8 100644 --- a/modules/contextualization/cdf_file_annotation/default.config.yaml +++ b/modules/contextualization/cdf_file_annotation/default.config.yaml @@ -19,7 +19,7 @@ rawTableDocDoc: annotation_documents_docs rawTableDocPattern: annotation_documents_patterns rawTableCache: annotation_entities_cache rawManualPatternsCatalog: manual_patterns_catalog -rawTablePromoteCache: promote_text_cache +rawTablePromoteCache: annotation_tags_cache # used in /extraction_pipelines extractionPipelineExternalId: ep_file_annotation diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/CacheService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/CacheService.py index b1ace62c..6c79ba58 100644 --- 
a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/CacheService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/CacheService.py @@ -60,14 +60,14 @@ class CacheService(ICacheService): Manages two-tier caching for text → entity mappings to dramatically improve performance. **TIER 1: In-Memory Cache** (This Run Only): - - Ultra-fast lookup (<1ms) + - Ultra-fast lookup (in-memory dictionary) - Dictionary stored in memory: {(text, type): (space, id) or None} - **Includes negative caching** (remembers "no match found" to avoid repeated searches) - Cleared when function execution ends - Used for: Both positive matches AND negative results (not found) **TIER 2: Persistent RAW Cache** (All Runs): - - Fast lookup (5-10ms) + - Fast lookup (single database query) - Stored in RAW table: promote_text_to_entity_cache - Benefits all future function runs indefinitely - Tracks hit count for analytics @@ -75,9 +75,9 @@ class CacheService(ICacheService): - Does NOT cache negative results (to allow for new entities added over time) **Performance Impact:** - - First lookup: 50-100ms (query annotation edges) - - Cached lookup (same run): <1ms (5000x faster) - - Cached lookup (future run): 5-10ms (10-20x faster) + - First lookup: Slowest (query annotation edges + entity retrieval) + - Cached lookup (same run): Fastest (in-memory dictionary) + - Cached lookup (future run): Fast (single database query) - Self-improving: Gets faster as cache fills """ diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/ConfigService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/ConfigService.py index 765eb539..5da1b779 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/ConfigService.py +++ 
b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/ConfigService.py @@ -242,9 +242,11 @@ class EntitySearchServiceConfig(BaseModel, alias_generator=to_camel): Configuration for the EntitySearchService in the promote function. Controls entity search strategies and text normalization behavior: - - Primary: Query existing annotation edges (fast, 50-100ms) - - Fallback: Global entity search (slow, 500ms-2s) - - Text normalization for improved matching + - Primary: Query existing annotation edges (server-side IN filter on startNodeText) + - Fallback: Global entity search (server-side IN filter on entity aliases) + - Text normalization for generating search variations + + Both strategies use efficient server-side filtering for optimal performance. """ enable_existing_annotations_search: bool = True diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/EntitySearchService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/EntitySearchService.py index 6fc11a01..03e411c7 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/EntitySearchService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/EntitySearchService.py @@ -35,20 +35,22 @@ class EntitySearchService(IEntitySearchService): This service implements a two-tier search strategy for finding entities: - **Strategy 1 - Existing Annotations** (Primary, Fast: 50-100ms): + **Strategy 1 - Existing Annotations** (Primary, Fast): - Queries annotation edges from regular diagram detect - - Uses server-side IN filter with text variations + - Uses server-side IN filter with text variations on edge startNodeText - Returns entities that were successfully annotated before - Handles cross-scope scenarios naturally (entity in different site/unit) + - Most efficient: Queries proven successful matches 
first - **Strategy 2 - Global Entity Search** (Fallback, Slow: 500ms-2s): - - Fetches all entities in space (limit 1000) - - Client-side normalized matching against aliases - - Comprehensive but may timeout with large entity counts + **Strategy 2 - Global Entity Search** (Fallback): + - Queries all entities in specified space + - Uses server-side IN filter with text variations on entity aliases + - Comprehensive search across all entities when no previous annotation exists + - Efficient: Server-side indexed filtering on aliases property **Utilities:** - - `generate_text_variations()`: Creates common variations (case, leading zeros) - - `normalize()`: Normalizes text for comparison (removes special chars, lowercase, strips zeros) + - `generate_text_variations()`: Creates common variations (case, leading zeros, special chars) + - `normalize()`: Normalizes text for cache keys (removes special chars, lowercase, strips zeros) """ def __init__( @@ -92,8 +94,11 @@ def find_entity(self, text: str, annotation_type: str, entity_space: str) -> lis This is the main entry point for entity search. Strategy: - 1. Try existing annotations (fast, 50-100ms) - 2. Fall back to global search (slow, 500ms-2s) + 1. Generate text variations once (e.g., "V-0912" → ["V-0912", "v-0912", "V-912", "v-912", ...]) + 2. Try existing annotations (fast, queries edges from previous successful matches) + 3. Fall back to global search (queries all entities in space with IN filter on aliases) + + Both strategies use server-side filtering with text variations for efficiency. 
Args: text: Text to search for (e.g., "V-123", "G18A-921") @@ -106,17 +111,21 @@ def find_entity(self, text: str, annotation_type: str, entity_space: str) -> lis - [node] if single unambiguous match - [node1, node2] if ambiguous (multiple matches) """ + # Generate text variations once for use in both strategies + text_variations: list[str] = self.generate_text_variations(text) + self.logger.info(f"Generated {len(text_variations)} text variation(s) for '{text}': {text_variations}") + # STRATEGY 1: Query existing annotations (primary, fast) - found_nodes: list[Node] = self.find_from_existing_annotations(text, annotation_type) + found_nodes: list[Node] = self.find_from_existing_annotations(text_variations, annotation_type) if not found_nodes: - # STRATEGY 2: Global entity search (fallback, slow) + # STRATEGY 2: Global entity search (fallback) self.logger.debug(f"No match in existing annotations for '{text}'. Trying global entity search.") - found_nodes = self.find_global_entity(text, entity_space) + found_nodes = self.find_global_entity(text_variations, entity_space) return found_nodes - def find_from_existing_annotations(self, text: str, annotation_type: str) -> list[Node]: + def find_from_existing_annotations(self, text_variations: list[str], annotation_type: str) -> list[Node]: """ Searches for existing successful annotations with matching startNodeText. @@ -127,17 +136,16 @@ def find_from_existing_annotations(self, text: str, annotation_type: str) -> lis 4. 
Handles cross-scope scenarios naturally (entity in different site/unit) Args: - text: The text to search for (e.g., "G18A-921") + text_variations: List of text variations to search for (e.g., ["V-0912", "v-0912", "V-912", ...]) annotation_type: "diagrams.FileLink" or "diagrams.AssetLink" Returns: List of matched entity nodes (0, 1, or 2+ for ambiguous) """ - try: - # Generate variations of the search text - text_variations: list[str] = self.generate_text_variations(text) - self.logger.debug(f"Searching for text variations: {text_variations}") + # Use first text variation (original text) for logging + original_text: str = text_variations[0] if text_variations else "unknown" + try: # Query edges directly with IN filter # These are annotation edges that are from regular diagram detect (not pattern mode) # NOTE: manually promoted results from pattern mode are added to the @@ -176,7 +184,7 @@ def find_from_existing_annotations(self, text: str, annotation_type: str) -> lis top_matches: list[tuple[str, str]] if len(matched_end_nodes) > 1: self.logger.warning( - f"Found {len(matched_end_nodes)} different entities for '{text}' in existing annotations. " + f"Found {len(matched_end_nodes)} different entities for '{original_text}' in existing annotations. " f"This indicates data quality issues or legitimate ambiguity." 
) # Return list of most common matches (limit to 2 for ambiguity detection) @@ -207,77 +215,70 @@ def find_from_existing_annotations(self, text: str, annotation_type: str) -> lis if matched_nodes: self.logger.info( - f"Found {len(matched_nodes)} match(es) for '{text}' from existing annotations " + f"Found {len(matched_nodes)} match(es) for '{original_text}' from existing annotations " f"(appeared {matched_end_nodes.get((matched_nodes[0].space, matched_nodes[0].external_id), 0)} times)" ) return matched_nodes except Exception as e: - self.logger.error(f"Error searching existing annotations for '{text}': {e}") + self.logger.error(f"Error searching existing annotations for '{original_text}': {e}") return [] - def find_global_entity(self, text: str, entity_space: str) -> list[Node]: + def find_global_entity(self, text_variations: list[str], entity_space: str) -> list[Node]: """ - Performs a global, un-scoped search for an entity matching the given text. - Uses normalized matching to handle variations like "V-0912" vs "V-912". + Performs a global, un-scoped search for an entity matching the given text variations. + Uses server-side IN filter with text variations to handle different naming conventions. - NOTE: This approach queries all instances in a given space. - Pros: The most accurate and guaranteed approach - Cons: Will likely timeout as the amount of instances in a given space increase + This approach uses server-side filtering on the aliases property, making it efficient + and scalable even with large numbers of entities in a space. 
Args: - text: Text to search for + text_variations: List of text variations to search for (e.g., ["V-0912", "v-0912", "V-912", ...]) entity_space: Space to search in Returns: List of matched nodes (0, 1, or 2 for ambiguity detection) """ - # Normalize the search text - normalized_text: str = self.normalize(text) + # Use first text variation (original text) for logging + original_text: str = text_variations[0] if text_variations else "unknown" - # Fetch all entities in the space (with reasonable limit) - # NOTE: We can't do normalized matching server-side, so we fetch and filter client-side - entities: Any try: - entities = self.client.data_modeling.instances.list( + # Query entities with IN filter on aliases property + aliases_filter: Filter = In(self.target_entities_view_id.as_property_ref("aliases"), text_variations) + + entities: Any = self.client.data_modeling.instances.list( instance_type="node", sources=[self.target_entities_view_id], + filter=aliases_filter, space=entity_space, limit=1000, # Reasonable limit to prevent timeouts ) + + if not entities: + return [] + + # Convert to list and check for ambiguity + matched_entities: list[Node] = list(entities) + + if len(matched_entities) > 1: + self.logger.warning( + f"Found {len(matched_entities)} entities with aliases matching '{original_text}' in space '{entity_space}'. " + f"This is ambiguous. Returning first 2 for ambiguity detection." 
+ ) + return matched_entities[:2] + + if matched_entities: + self.logger.info( + f"Found {len(matched_entities)} match(es) for '{original_text}' via global entity search" + ) + + return matched_entities + except Exception as e: - self.logger.error(f"Error fetching entities from space '{entity_space}': {e}") + self.logger.error(f"Error searching for entity '{original_text}' in space '{entity_space}': {e}") return [] - # Client-side normalized matching against aliases - matches: list[Node] = [] - for entity in entities: - entity_props: dict[str, Any] = entity.properties.get(self.target_entities_view_id, {}) - aliases: Any = entity_props.get("aliases", []) - - # Ensure aliases is iterable - if not isinstance(aliases, list): - continue - - # Check if any alias matches after normalization - for alias in aliases: - if isinstance(alias, str) and self.normalize(alias) == normalized_text: - matches.append(entity) - # Stop after finding 2 matches (ambiguous case) - if len(matches) >= 2: - self.logger.warning( - f"Found multiple entities with alias matching '{text}' (normalized: '{normalized_text}'). " - f"This is ambiguous." - ) - return matches[:2] - break # Move to next entity after finding match - - if matches: - self.logger.info(f"Found {len(matches)} match(es) for '{text}' via global entity search") - - return matches - def generate_text_variations(self, text: str) -> list[str]: """ Generates common variations of a text string to improve matching. 
diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/PromoteService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/PromoteService.py index 7bc1f425..210d7f1f 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/PromoteService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/PromoteService.py @@ -285,11 +285,11 @@ def _find_entity_with_cache(self, text: str, annotation_type: str, entity_space: """ Finds entity for text using multi-tier caching strategy. - Caching strategy: - - TIER 1: In-memory cache (this run, <1ms) - - TIER 2: Persistent RAW cache (all runs, 5-10ms) - - TIER 3: EntitySearchService (annotation edges, 50-100ms) - - TIER 4: EntitySearchService fallback (global search, 500ms-2s) + Caching strategy (fastest to slowest): + - TIER 1: In-memory cache (this run only, in-memory dictionary) + - TIER 2: Persistent RAW cache (all runs, single database query) + - TIER 3: EntitySearchService (annotation edges, server-side IN filter on startNodeText) + - TIER 4: EntitySearchService fallback (global search, server-side IN filter on aliases) Caching behavior: - Only caches unambiguous single matches (len(found_nodes) == 1) From db9fadf32d5fc232896b657c81a8c1b45dfab726 Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Thu, 16 Oct 2025 18:03:10 -0500 Subject: [PATCH 103/128] provide the correct viewId to use as source when global entity search --- .../services/EntitySearchService.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/EntitySearchService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/EntitySearchService.py index 03e411c7..60d5534e 100644 --- 
a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/EntitySearchService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/EntitySearchService.py @@ -121,7 +121,11 @@ def find_entity(self, text: str, annotation_type: str, entity_space: str) -> lis if not found_nodes: # STRATEGY 2: Global entity search (fallback) self.logger.debug(f"No match in existing annotations for '{text}'. Trying global entity search.") - found_nodes = self.find_global_entity(text_variations, entity_space) + if annotation_type == "diagrams.FileLink": + source: ViewId = self.file_view_id + else: + source = self.target_entities_view_id + found_nodes = self.find_global_entity(text_variations, source, entity_space) return found_nodes @@ -225,7 +229,7 @@ def find_from_existing_annotations(self, text_variations: list[str], annotation_ self.logger.error(f"Error searching existing annotations for '{original_text}': {e}") return [] - def find_global_entity(self, text_variations: list[str], entity_space: str) -> list[Node]: + def find_global_entity(self, text_variations: list[str], source: ViewId, entity_space: str) -> list[Node]: """ Performs a global, un-scoped search for an entity matching the given text variations. Uses server-side IN filter with text variations to handle different naming conventions. 
@@ -249,7 +253,7 @@ def find_global_entity(self, text_variations: list[str], entity_space: str) -> l entities: Any = self.client.data_modeling.instances.list( instance_type="node", - sources=[self.target_entities_view_id], + sources=source, filter=aliases_filter, space=entity_space, limit=1000, # Reasonable limit to prevent timeouts From f31d3c518b7707362ca6f1894f5728edd9016959 Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Thu, 16 Oct 2025 18:22:51 -0500 Subject: [PATCH 104/128] retired find_from_existing_annotations() function --- .../services/ConfigService.py | 8 +- .../services/EntitySearchService.py | 87 +++++++++++-------- .../services/PromoteService.py | 3 +- 3 files changed, 55 insertions(+), 43 deletions(-) diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/ConfigService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/ConfigService.py index 5da1b779..c5435ecc 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/ConfigService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/ConfigService.py @@ -241,12 +241,12 @@ class EntitySearchServiceConfig(BaseModel, alias_generator=to_camel): """ Configuration for the EntitySearchService in the promote function. - Controls entity search strategies and text normalization behavior: - - Primary: Query existing annotation edges (server-side IN filter on startNodeText) - - Fallback: Global entity search (server-side IN filter on entity aliases) + Controls entity search and text normalization behavior: + - Queries entities directly (server-side IN filter on entity/file aliases) - Text normalization for generating search variations - Both strategies use efficient server-side filtering for optimal performance. 
+ Uses efficient server-side filtering on the smaller entity dataset rather than + the larger annotation edge dataset for better performance at scale. """ enable_existing_annotations_search: bool = True diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/EntitySearchService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/EntitySearchService.py index 60d5534e..abb0a4cf 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/EntitySearchService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/EntitySearchService.py @@ -31,22 +31,23 @@ def find_entity(self, text: str, annotation_type: str, entity_space: str) -> lis class EntitySearchService(IEntitySearchService): """ - Finds entities by text using multiple search strategies with automatic fallback. + Finds entities by text using server-side filtering on entity aliases. 
- This service implements a two-tier search strategy for finding entities: + This service queries entities directly using an IN filter on the aliases property, + which is more efficient than querying annotation edges: - **Strategy 1 - Existing Annotations** (Primary, Fast): - - Queries annotation edges from regular diagram detect - - Uses server-side IN filter with text variations on edge startNodeText - - Returns entities that were successfully annotated before - - Handles cross-scope scenarios naturally (entity in different site/unit) - - Most efficient: Queries proven successful matches first + **Why query entities directly instead of annotation edges?** + - Entity dataset is smaller and stable (~1,000-10,000 entities) + - Annotation edges grow quadratically (Files × Entities = potentially millions) + - Neither startNodeText nor aliases properties are indexed + - Without indexes, smaller dataset = better performance + - Entity count doesn't increase as more files are annotated - **Strategy 2 - Global Entity Search** (Fallback): - - Queries all entities in specified space - - Uses server-side IN filter with text variations on entity aliases - - Comprehensive search across all entities when no previous annotation exists - - Efficient: Server-side indexed filtering on aliases property + **Search Strategy:** + - Generate text variations (e.g., "V-0912" → ["V-0912", "v-0912", "V-912", "v912", ...]) + - Query entities with server-side IN filter on aliases property + - Uses text variations to handle different naming conventions + - Returns matches from specified entity space **Utilities:** - `generate_text_variations()`: Creates common variations (case, leading zeros, special chars) @@ -89,21 +90,24 @@ def __init__( def find_entity(self, text: str, annotation_type: str, entity_space: str) -> list[Node]: """ - Finds entities matching the given text using multiple strategies. + Finds entities matching the given text by querying entity aliases. 
This is the main entry point for entity search. Strategy: - 1. Generate text variations once (e.g., "V-0912" → ["V-0912", "v-0912", "V-912", "v-912", ...]) - 2. Try existing annotations (fast, queries edges from previous successful matches) - 3. Fall back to global search (queries all entities in space with IN filter on aliases) + 1. Generate text variations (e.g., "V-0912" → ["V-0912", "v-0912", "V-912", "v912", ...]) + 2. Query entities with server-side IN filter on aliases property - Both strategies use server-side filtering with text variations for efficiency. + Note: We query entities directly rather than annotation edges because: + - Entity dataset is smaller and more stable (~1,000-10,000 entities) + - Annotation edges grow quadratically (Files × Entities = potentially millions) + - Neither startNodeText nor aliases properties are indexed + - Without indexes, smaller dataset = better performance Args: text: Text to search for (e.g., "V-123", "G18A-921") annotation_type: Type of annotation ("diagrams.FileLink" or "diagrams.AssetLink") - entity_space: Space to search in for global fallback + entity_space: Space to search in Returns: List of matched nodes: @@ -111,33 +115,41 @@ def find_entity(self, text: str, annotation_type: str, entity_space: str) -> lis - [node] if single unambiguous match - [node1, node2] if ambiguous (multiple matches) """ - # Generate text variations once for use in both strategies + # Generate text variations once text_variations: list[str] = self.generate_text_variations(text) self.logger.info(f"Generated {len(text_variations)} text variation(s) for '{text}': {text_variations}") - # STRATEGY 1: Query existing annotations (primary, fast) - found_nodes: list[Node] = self.find_from_existing_annotations(text_variations, annotation_type) + # Determine which view to query based on annotation type + if annotation_type == "diagrams.FileLink": + source: ViewId = self.file_view_id + else: + source = self.target_entities_view_id - if not 
found_nodes: - # STRATEGY 2: Global entity search (fallback) - self.logger.debug(f"No match in existing annotations for '{text}'. Trying global entity search.") - if annotation_type == "diagrams.FileLink": - source: ViewId = self.file_view_id - else: - source = self.target_entities_view_id - found_nodes = self.find_global_entity(text_variations, source, entity_space) + # Query entities directly by aliases + found_nodes: list[Node] = self.find_global_entity(text_variations, source, entity_space) return found_nodes def find_from_existing_annotations(self, text_variations: list[str], annotation_type: str) -> list[Node]: """ - Searches for existing successful annotations with matching startNodeText. + [UNUSED] Searches for existing successful annotations with matching startNodeText. + + ** WHY THIS FUNCTION IS NOT USED: ** + While this was originally designed as a "smart" optimization to find proven matches, + it actually queries the LARGER dataset: + + - Annotation edges grow quadratically: O(Files × Entities) = potentially millions + - Entity/file nodes grow linearly: O(Entities) = thousands + - Neither startNodeText nor aliases properties are indexed + - Without indexes, querying the smaller dataset (entities) is always faster + + Performance comparison at scale: + - This function: Scans ~500,000+ annotation edges (grows over time) + - Global entity search: Scans ~1,000-10,000 entities (relatively stable) + + Result: Global entity search is 50-500x faster at scale. - This is MUCH faster than querying all entity aliases because: - 1. Queries edges directly with server-side filtering (indexed and fast) - 2. Uses IN filter with text variations to handle common differences - 3. Only searches proven successful annotations - 4. Handles cross-scope scenarios naturally (entity in different site/unit) + This function is kept for reference but should not be used in production. 
Args: text_variations: List of text variations to search for (e.g., ["V-0912", "v-0912", "V-912", ...]) @@ -239,6 +251,7 @@ def find_global_entity(self, text_variations: list[str], source: ViewId, entity_ Args: text_variations: List of text variations to search for (e.g., ["V-0912", "v-0912", "V-912", ...]) + source: View to query (file_view or target_entities_view) entity_space: Space to search in Returns: @@ -249,7 +262,7 @@ def find_global_entity(self, text_variations: list[str], source: ViewId, entity_ try: # Query entities with IN filter on aliases property - aliases_filter: Filter = In(self.target_entities_view_id.as_property_ref("aliases"), text_variations) + aliases_filter: Filter = In(source.as_property_ref("aliases"), text_variations) entities: Any = self.client.data_modeling.instances.list( instance_type="node", diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/PromoteService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/PromoteService.py index 210d7f1f..6b872bae 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/PromoteService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/PromoteService.py @@ -288,8 +288,7 @@ def _find_entity_with_cache(self, text: str, annotation_type: str, entity_space: Caching strategy (fastest to slowest): - TIER 1: In-memory cache (this run only, in-memory dictionary) - TIER 2: Persistent RAW cache (all runs, single database query) - - TIER 3: EntitySearchService (annotation edges, server-side IN filter on startNodeText) - - TIER 4: EntitySearchService fallback (global search, server-side IN filter on aliases) + - TIER 3: EntitySearchService (global entity search, server-side IN filter on aliases) Caching behavior: - Only caches unambiguous single matches (len(found_nodes) == 1) From 
7ee95b019c99fb1d36c251350052c62dea093d4a Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Thu, 16 Oct 2025 18:42:44 -0500 Subject: [PATCH 105/128] updated the README with promote phase --- .../cdf_file_annotation/README.md | 257 ++++++++++++------ 1 file changed, 175 insertions(+), 82 deletions(-) diff --git a/modules/contextualization/cdf_file_annotation/README.md b/modules/contextualization/cdf_file_annotation/README.md index 4feb4cde..086f0b5b 100644 --- a/modules/contextualization/cdf_file_annotation/README.md +++ b/modules/contextualization/cdf_file_annotation/README.md @@ -10,11 +10,12 @@ The Annotation template is a framework designed to automate the process of annot - **Dual Annotation Modes**: Simultaneously runs standard entity matching and pattern-based detection mode: - **Standard Mode**: Links files to known entities in your data model with confidence-based approval thresholds. - **Pattern Mode**: Automatically generates regex-like patterns from entity aliases and detects all matching text in files, creating a comprehensive searchable catalog of potential entities for review and approval. +- **Automatic Pattern Promotion:** Post-processes pattern-mode annotations to automatically resolve cross-scope entity references using intelligent text matching and multi-tier caching, dramatically reducing manual review burden. - **Intelligent Pattern Generation:** Automatically analyzes entity aliases to generate pattern samples, with support for manual pattern overrides at global, site, or unit levels. - **Large Document Support (\>50 Pages):** Automatically handles files with more than 50 pages by breaking them into manageable chunks, processing them iteratively, and tracking the overall progress. - **Parallel Execution Ready:** Designed for concurrent execution with a robust optimistic locking mechanism to prevent race conditions when multiple finalize function instances run in parallel. 
- **Comprehensive Reporting:** Annotations stored in three dedicated RAW tables (doc-to-doc links, doc-to-tag links, and pattern detections) plus extraction pipeline logs for full traceability. -- **Local Running and Debugging:** Both the launch and finalize handler can be run locally and have default setups in the 'Run & Debug' tab in VSCode. Requires a .env file to be placed in the directory. +- **Local Running and Debugging:** All function handlers can be run locally and have default setups in the 'Run & Debug' tab in VSCode. Requires a .env file to be placed in the directory. ## Getting Started @@ -29,7 +30,7 @@ For a quick overview, deploying this annotation module into a new Cognite Data F ## How It Works -The template operates in three main phases, orchestrated by CDF Workflows. Since the prepare phase is relatively small, it is bundled in with the launch phase. However, conceptually it should be treated as a separate process. +The template operates in four main phases, orchestrated by CDF Workflows. Since the prepare phase is relatively small, it is bundled in with the launch phase. However, conceptually it should be treated as a separate process. ### Prepare Phase @@ -59,40 +60,40 @@ The template operates in three main phases, orchestrated by CDF Workflows. Since
Click to view Mermaid flowchart for Launch Phase -```mermaid -flowchart TD - Start([Start Launch Phase]) --> QueryFiles[Query AnnotationState
for New or Retry status] - QueryFiles --> CheckFiles{Any files
to process?} - CheckFiles -->|No| End([End]) - CheckFiles -->|Yes| GroupFiles[Group files by
primary scope
e.g., site, unit] - - GroupFiles --> NextScope{Next scope
group?} - NextScope -->|Yes| CheckCache{Valid cache
exists in RAW?} - - CheckCache -->|No - Stale/Missing| QueryEntities[Query data model for
entities within scope] - QueryEntities --> GenPatterns[Auto-generate pattern samples
from entity aliases
e.g., FT-101A → #91;FT#93;-000#91;A#93;] - GenPatterns --> GetManual[Retrieve manual pattern
overrides from RAW catalog
GLOBAL, site, or unit level] - GetManual --> MergePatterns[Merge and deduplicate
auto-generated and
manual patterns] - MergePatterns --> StoreCache[Store entity list and
pattern samples in
RAW cache] - StoreCache --> UseCache[Use entities and patterns] - - CheckCache -->|Yes - Valid| LoadCache[Load entities and
patterns from RAW cache] - LoadCache --> UseCache - - UseCache --> ProcessBatch[Process files in batches
up to max batch size] - ProcessBatch --> SubmitJobs[Submit Diagram Detect jobs:
1 Standard annotation
2 Pattern mode if enabled] - SubmitJobs --> UpdateState[Update AnnotationState:
- Set status to Processing
- Store both job IDs] - UpdateState --> NextScope - NextScope -->|No more groups| QueryFiles - - style Start fill:#d4f1d4 - style End fill:#f1d4d4 - style CheckFiles fill:#fff4e6 - style CheckCache fill:#fff4e6 - style NextScope fill:#fff4e6 - style UseCache fill:#e6f3ff - style UpdateState fill:#e6f3ff -``` + ```mermaid + flowchart TD + Start([Start Launch Phase]) --> QueryFiles[Query AnnotationState
for New or Retry status] + QueryFiles --> CheckFiles{Any files
to process?} + CheckFiles -->|No| End([End]) + CheckFiles -->|Yes| GroupFiles[Group files by
primary scope
e.g., site, unit] + + GroupFiles --> NextScope{Next scope
group?} + NextScope -->|Yes| CheckCache{Valid cache
exists in RAW?} + + CheckCache -->|No - Stale/Missing| QueryEntities[Query data model for
entities within scope] + QueryEntities --> GenPatterns[Auto-generate pattern samples
from entity aliases
e.g., FT-101A → #91;FT#93;-000#91;A#93;] + GenPatterns --> GetManual[Retrieve manual pattern
overrides from RAW catalog
GLOBAL, site, or unit level] + GetManual --> MergePatterns[Merge and deduplicate
auto-generated and
manual patterns] + MergePatterns --> StoreCache[Store entity list and
pattern samples in
RAW cache] + StoreCache --> UseCache[Use entities and patterns] + + CheckCache -->|Yes - Valid| LoadCache[Load entities and
patterns from RAW cache] + LoadCache --> UseCache + + UseCache --> ProcessBatch[Process files in batches
up to max batch size] + ProcessBatch --> SubmitJobs[Submit Diagram Detect jobs:
1 Standard annotation
2 Pattern mode if enabled] + SubmitJobs --> UpdateState[Update AnnotationState:
- Set status to Processing
- Store both job IDs] + UpdateState --> NextScope + NextScope -->|No more groups| QueryFiles + + style Start fill:#d4f1d4 + style End fill:#f1d4d4 + style CheckFiles fill:#fff4e6 + style CheckCache fill:#fff4e6 + style NextScope fill:#fff4e6 + style UseCache fill:#e6f3ff + style UpdateState fill:#e6f3ff + ```
@@ -114,52 +115,130 @@ flowchart TD
Click to view Mermaid flowchart for Finalize Phase -```mermaid -flowchart TD - Start([Start Finalize Phase]) --> QueryState[Query for ONE AnnotationState
with Processing status
Use optimistic locking to claim it] - QueryState --> CheckState{Found annotation
state instance?} - CheckState -->|No| End([End]) - CheckState -->|Yes| GetJobId[Extract job ID and
pattern mode job ID] - - GetJobId --> FindFiles[Find ALL files with
the same job ID] - FindFiles --> CheckJobs{Both standard
and pattern jobs
complete?} - CheckJobs -->|No| ResetStatus[Update AnnotationStates
back to Processing
Wait 30 seconds] - ResetStatus --> QueryState - - CheckJobs -->|Yes| RetrieveResults[Retrieve results from
both completed jobs] - RetrieveResults --> MergeResults[Merge regular and pattern
results by file ID
Creates unified result per file] - MergeResults --> LoopFiles[For each file in merged results] - - LoopFiles --> ProcessResults[Process file results:
- Create stable hash for deduplication
- Filter standard by confidence threshold
- Skip pattern duplicates] - - ProcessResults --> CheckClean{First run for
multi-page file?} - CheckClean -->|Yes| CleanOld[Clean old annotations] - CheckClean -->|No| CreateEdges - CleanOld --> CreateEdges[Create edges in data model] - - CreateEdges --> StandardEdges[Standard annotations:
Link file to entities
Write to doc_tag and doc_doc RAW tables] - StandardEdges --> PatternEdges[Pattern annotations:
Link file to sink node
Write to doc_pattern RAW table] - - PatternEdges --> UpdateTag[Update file tag:
AnnotationInProcess → Annotated] - UpdateTag --> PrepareUpdate[Prepare AnnotationState update:
- Annotated if complete
- Failed if error
- New if more pages remain
Track page progress] - - PrepareUpdate --> MoreFiles{More files in
merged results?} - MoreFiles -->|Yes| LoopFiles - MoreFiles -->|No| BatchUpdate[Batch update ALL
AnnotationState instances
for this job] - - BatchUpdate --> QueryState - - style Start fill:#d4f1d4 - style End fill:#f1d4d4 - style CheckState fill:#fff4e6 - style CheckJobs fill:#fff4e6 - style CheckClean fill:#fff4e6 - style MoreFiles fill:#fff4e6 - style MergeResults fill:#e6f3ff - style ProcessResults fill:#e6f3ff - style CreateEdges fill:#e6f3ff - style BatchUpdate fill:#e6f3ff -``` + ```mermaid + flowchart TD + Start([Start Finalize Phase]) --> QueryState[Query for ONE AnnotationState
with Processing status
Use optimistic locking to claim it] + QueryState --> CheckState{Found annotation
state instance?} + CheckState -->|No| End([End]) + CheckState -->|Yes| GetJobId[Extract job ID and
pattern mode job ID] + + GetJobId --> FindFiles[Find ALL files with
the same job ID] + FindFiles --> CheckJobs{Both standard
and pattern jobs
complete?} + CheckJobs -->|No| ResetStatus[Update AnnotationStates
back to Processing
Wait 30 seconds] + ResetStatus --> QueryState + + CheckJobs -->|Yes| RetrieveResults[Retrieve results from
both completed jobs] + RetrieveResults --> MergeResults[Merge regular and pattern
results by file ID
Creates unified result per file] + MergeResults --> LoopFiles[For each file in merged results] + + LoopFiles --> ProcessResults[Process file results:
- Create stable hash for deduplication
- Filter standard by confidence threshold
- Skip pattern duplicates] + + ProcessResults --> CheckClean{First run for
multi-page file?} + CheckClean -->|Yes| CleanOld[Clean old annotations] + CheckClean -->|No| CreateEdges + CleanOld --> CreateEdges[Create edges in data model] + + CreateEdges --> StandardEdges[Standard annotations:
Link file to entities
Write to doc_tag and doc_doc RAW tables] + StandardEdges --> PatternEdges[Pattern annotations:
Link file to sink node
Write to doc_pattern RAW table] + + PatternEdges --> UpdateTag[Update file tag:
AnnotationInProcess → Annotated] + UpdateTag --> PrepareUpdate[Prepare AnnotationState update:
- Annotated if complete
- Failed if error
- New if more pages remain
Track page progress] + + PrepareUpdate --> MoreFiles{More files in
merged results?} + MoreFiles -->|Yes| LoopFiles + MoreFiles -->|No| BatchUpdate[Batch update ALL
AnnotationState instances
for this job] + + BatchUpdate --> QueryState + + style Start fill:#d4f1d4 + style End fill:#f1d4d4 + style CheckState fill:#fff4e6 + style CheckJobs fill:#fff4e6 + style CheckClean fill:#fff4e6 + style MoreFiles fill:#fff4e6 + style MergeResults fill:#e6f3ff + style ProcessResults fill:#e6f3ff + style CreateEdges fill:#e6f3ff + style BatchUpdate fill:#e6f3ff + ``` + +
+ +### Promote Phase + +- **Goal**: Automatically resolve pattern-mode annotations by finding matching entities and updating edges from sink node to actual entities. +- **Process**: + 1. Queries for pattern-mode annotation edges (edges pointing to the sink node with status "Suggested"). + 2. Groups candidates by unique text to process each text only once per batch. + 3. For each unique text: + - Generates text variations to handle different naming conventions (case, special characters, leading zeros). + - Searches for matching entities using a multi-tier caching strategy: + - **TIER 1**: In-memory cache (fastest, this run only). + - **TIER 2**: Persistent RAW cache (shared across runs and with manual promotions). + - **TIER 3**: Entity search via data model (queries smaller, stable entity dataset). + - Updates all edges with the same text based on search results. + 4. Updates edges and RAW tables based on results: + - **Approved**: Single unambiguous match found → edge points to actual entity, added "PromotedAuto" tag. + - **Rejected**: No match found → edge stays on sink node, added "PromoteAttempted" tag. + - **Suggested**: Multiple ambiguous matches → kept for manual review, added "AmbiguousMatch" tag. + 5. Runs continuously (designed for repeated execution) until all resolvable pattern annotations are promoted. +
+Click to view Mermaid flowchart for Promote Phase + + ```mermaid + flowchart TD + Start([Start Promote Phase]) --> QueryEdges[Query for pattern-mode edges
pointing to sink node
with Suggested status] + QueryEdges --> CheckEdges{Any edges
to promote?} + CheckEdges -->|No| End([End]) + CheckEdges -->|Yes| GroupText[Group edges by
unique text + type
Process each text once] + + GroupText --> NextText{Next unique
text?} + NextText -->|Yes| GenVariations[Generate text variations
Case, special chars, zeros
e.g., V-0912 → 8 variations] + + GenVariations --> CheckMemCache{In-memory
cache hit?} + CheckMemCache -->|Yes| UseMemCache[Use cached entity
TIER 1: Fastest] + CheckMemCache -->|No| CheckRAWCache{Persistent RAW
cache hit?} + + CheckRAWCache -->|Yes| UseRAWCache[Use cached entity
TIER 2: Fast
Populate in-memory cache] + CheckRAWCache -->|No| SearchEntities[Query entities via
data model
TIER 3: Server-side IN filter
on aliases property] + + SearchEntities --> CacheResult{Match found
and unambiguous?} + CacheResult -->|Yes| CachePositive[Cache positive result
in-memory + RAW] + CacheResult -->|No match| CacheNegative[Cache negative result
in-memory only] + CacheResult -->|Ambiguous| NoCache[Don't cache
ambiguous results] + + UseMemCache --> ProcessResult + UseRAWCache --> ProcessResult + CachePositive --> ProcessResult[Determine result type:
Single match, No match,
or Ambiguous] + CacheNegative --> ProcessResult + NoCache --> ProcessResult + + ProcessResult --> UpdateEdges{Result type?} + UpdateEdges -->|Single Match| ApproveEdges[Update ALL edges with this text:
- Point to matched entity
- Status: Approved
- Tag: PromotedAuto
- Update RAW pattern table] + UpdateEdges -->|No Match| RejectEdges[Update ALL edges with this text:
- Keep on sink node
- Status: Rejected
- Tag: PromoteAttempted
- Update RAW pattern table] + UpdateEdges -->|Ambiguous| FlagEdges[Update ALL edges with this text:
- Keep on sink node
- Status: Suggested
- Tags: PromoteAttempted,
AmbiguousMatch
- Update RAW pattern table] + + ApproveEdges --> BatchUpdate[Batch update edges
and RAW rows in CDF] + RejectEdges --> BatchUpdate + FlagEdges --> BatchUpdate + + BatchUpdate --> NextText + NextText -->|No more texts| QueryEdges + + style Start fill:#d4f1d4 + style End fill:#f1d4d4 + style CheckEdges fill:#fff4e6 + style CheckMemCache fill:#fff4e6 + style CheckRAWCache fill:#fff4e6 + style CacheResult fill:#fff4e6 + style UpdateEdges fill:#fff4e6 + style NextText fill:#fff4e6 + style UseMemCache fill:#e6ffe6 + style UseRAWCache fill:#e6f3ff + style SearchEntities fill:#ffe6e6 + style ProcessResult fill:#e6f3ff + style BatchUpdate fill:#e6f3ff + ```
@@ -182,6 +261,11 @@ Key configuration sections include: - `cleanOldAnnotations`: Whether to remove existing annotations before applying new ones. - `maxRetryAttempts`: Retry limit for failed files. - `sinkNode`: Target node for pattern mode annotations pending review. +- `promoteFunction`: Configures automatic resolution of pattern-mode annotations: + - `getCandidatesQuery`: Query to find pattern-mode edges to promote (batch size controlled via limit). + - `entitySearchService`: Controls entity search and text normalization (case, special chars, leading zeros). + - `cacheService`: Configuration for the persistent text→entity cache shared across runs and with manual promotions. + - `rawDb` / `rawTableDocPattern`: Location of RAW tables for storing promotion results. This file allows for deep customization. For example, you can use a list of query configurations to combine them with `OR` logic, or you can set `primaryScopeProperty` to `None` to process files that are not tied to a specific scope. Manual pattern samples can be added to the RAW catalog at `GLOBAL`, site, or unit levels to override or supplement auto-generated patterns. @@ -246,6 +330,15 @@ When processing tens of thousands of files, naively fetching context for each fi This cache is loaded once per scope and reused for all files in that batch, drastically reducing the number of queries to CDF and improving overall throughput. The pattern generation process extracts common naming conventions from aliases, creating regex-like patterns that can match variations (e.g., detecting "FT-102A" even if only "FT-101A" was in the training data). +### Efficient Entity Search for Pattern Promotion + +The promote function's entity search strategy is deliberately optimized for scale: + +- **Dataset Size Analysis:** When pattern-mode annotations need resolution, there are two potential query strategies: query annotation edges (to find proven matches) or query entities directly. 
Without property indexes on either `startNodeText` (edges) or `aliases` (entities), the smaller dataset wins.
+- **Growth Patterns:** Annotation edges grow with the product of files and entities, O(Files × Entities), potentially reaching hundreds of thousands or millions. Entity counts grow linearly and remain relatively stable at thousands.
+- **Design Choice:** The promote function queries entities directly via server-side IN filters on the aliases property, avoiding the much larger annotation edge dataset. This provides 50-500x better performance at scale.
+- **Self-Improving Cache:** The persistent RAW cache accumulates successful text→entity mappings over time and is shared between automated promotions and manual promotions from the Streamlit dashboard, creating a self-improving system.
+
 ### Interface-Based Extensibility
 
 The template is designed around a core set of abstract interfaces (e.g., `IDataModelService`, `ILaunchService`). This is a foundational architectural choice that enables scalability and long-term viability.
From b2b77d11f1c755ec49933fcbe2ea9c3700fd55b6 Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Thu, 16 Oct 2025 23:05:53 -0500 Subject: [PATCH 106/128] updated supporting modules configs --- .../ep_file_annotation.ExtractionPipeline.yaml | 4 ++++ .../ep_file_annotation.config.yaml | 1 - .../raw/tbl_file_annotation.Tables.yaml | 8 +++++++- .../wf_file_annotation.WorkflowVersion.yaml | 17 +++++++++++++++++ 4 files changed, 28 insertions(+), 2 deletions(-) diff --git a/modules/contextualization/cdf_file_annotation/extraction_pipelines/ep_file_annotation.ExtractionPipeline.yaml b/modules/contextualization/cdf_file_annotation/extraction_pipelines/ep_file_annotation.ExtractionPipeline.yaml index 8af0b484..103ad3bc 100644 --- a/modules/contextualization/cdf_file_annotation/extraction_pipelines/ep_file_annotation.ExtractionPipeline.yaml +++ b/modules/contextualization/cdf_file_annotation/extraction_pipelines/ep_file_annotation.ExtractionPipeline.yaml @@ -7,10 +7,14 @@ rawTables: tableName: {{ rawTableDocTag }} - dbName: {{ rawDb }} tableName: {{ rawTableDocDoc }} + - dbName: {{ rawDb }} + tableName: {{ rawTableDocPattern }} - dbName: {{ rawDb }} tableName: {{ rawTableCache }} - dbName: {{ rawDb }} tableName: {{ rawManualPatternsCatalog }} + - dbName: {{ rawDb }} + tableName: {{ rawTablePromoteCache }} source: "Files" documentation: > ## 1. 
`dataModelViews` diff --git a/modules/contextualization/cdf_file_annotation/extraction_pipelines/ep_file_annotation.config.yaml b/modules/contextualization/cdf_file_annotation/extraction_pipelines/ep_file_annotation.config.yaml index 79b4f41b..1f3936e5 100644 --- a/modules/contextualization/cdf_file_annotation/extraction_pipelines/ep_file_annotation.config.yaml +++ b/modules/contextualization/cdf_file_annotation/extraction_pipelines/ep_file_annotation.config.yaml @@ -130,7 +130,6 @@ config: rawTableDocDoc: {{ rawTableDocDoc }} rawTableDocPattern: {{ rawTableDocPattern }} promoteFunction: - # Query configuration for finding candidate edges to promote getCandidatesQuery: targetView: schemaSpace: cdf_cdm diff --git a/modules/contextualization/cdf_file_annotation/raw/tbl_file_annotation.Tables.yaml b/modules/contextualization/cdf_file_annotation/raw/tbl_file_annotation.Tables.yaml index 7f6e9a2b..6a3a5abb 100644 --- a/modules/contextualization/cdf_file_annotation/raw/tbl_file_annotation.Tables.yaml +++ b/modules/contextualization/cdf_file_annotation/raw/tbl_file_annotation.Tables.yaml @@ -4,8 +4,14 @@ - dbName: {{ rawDb }} tableName: {{ rawTableDocDoc }} +- dbName: {{ rawDb }} + tableName: {{ rawTableDocPattern }} + - dbName: {{ rawDb }} tableName: {{ rawTableCache }} - dbName: {{ rawDb }} - tableName: {{ rawManualPatternsCatalog }} \ No newline at end of file + tableName: {{ rawManualPatternsCatalog }} + +- dbName: {{ rawDb }} + tableName: {{ rawTablePromoteCache }} \ No newline at end of file diff --git a/modules/contextualization/cdf_file_annotation/workflows/wf_file_annotation.WorkflowVersion.yaml b/modules/contextualization/cdf_file_annotation/workflows/wf_file_annotation.WorkflowVersion.yaml index a1d6d30d..58235917 100644 --- a/modules/contextualization/cdf_file_annotation/workflows/wf_file_annotation.WorkflowVersion.yaml +++ b/modules/contextualization/cdf_file_annotation/workflows/wf_file_annotation.WorkflowVersion.yaml @@ -104,3 +104,20 @@ workflowDefinition: 
retries: 0 timeout: 600 onFailure: "abortWorkflow" + + - externalId: fn_promote + type: "function" + parameters: + function: + externalId: {{ promoteFunctionExternalId }} + data: + { + "ExtractionPipelineExtId": {{ extractionPipelineExternalId }}, + "logLevel": "INFO", + } + isAsyncComplete: false + name: Promote File Annotations + description: Auto promote tags + retries: 0 + timeout: 600 + onFailure: "abortWorkflow" From 373afb8560b43063173b4109733293f7e08ce249 Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Thu, 16 Oct 2025 23:14:02 -0500 Subject: [PATCH 107/128] refactored the handler to run independent of 7min timeout --- .../fn_file_annotation_promote/handler.py | 42 ++++++++++++++++--- 1 file changed, 37 insertions(+), 5 deletions(-) diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/handler.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/handler.py index 38de4852..74383783 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/handler.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/handler.py @@ -71,7 +71,7 @@ def handle(data: dict, function_call_info: dict, client: CogniteClient) -> dict[ run_status: str = "success" try: - # Run in a loop for a maximum of 7 minutes + # Run in a loop for a maximum of 7 minutes b/c serverless functions can run for max 10 minutes before hardware dies while datetime.now(timezone.utc) - start_time < timedelta(minutes=7): result: str | None = promote_service.run() if result == "Done": @@ -79,8 +79,6 @@ def handle(data: dict, function_call_info: dict, client: CogniteClient) -> dict[ break # Log batch report and pause between batches logger.info(tracker.generate_local_report(), section="START") - time.sleep(10) - return {"status": run_status, "data": data} except Exception as e: run_status = "failure" @@ -118,9 +116,43 @@ def run_locally(config_file: dict) -> None: 
client: CogniteClient = create_client(env_vars) # Mock function_call_info for local runs - function_call_info: dict[str, str] = {"function_id": "local", "call_id": "local"} + config: Config + config, client = create_config_service(function_data=config_file) + logger: CogniteFunctionLogger = create_logger_service( + config_file.get("logLevel", "DEBUG"), config_file.get("logPath") + ) + tracker: PromoteTracker = PromoteTracker() + + # Create service dependencies + entity_search_service: EntitySearchService = create_entity_search_service(config, client, logger) + cache_service: CacheService = create_cache_service(config, client, logger, entity_search_service) + + # Create promote service with injected dependencies + promote_service: GeneralPromoteService = GeneralPromoteService( + client=client, + config=config, + logger=logger, + tracker=tracker, + entity_search_service=entity_search_service, + cache_service=cache_service, + ) - handle(config_file, function_call_info, client) + try: + # Run in a loop for a maximum of 7 minutes b/c serverless functions can run for max 10 minutes before hardware dies + while True: + result: str | None = promote_service.run() + if result == "Done": + logger.info("No more candidates to process. 
Exiting.", section="END") + break + # Log batch report and pause between batches + logger.info(tracker.generate_local_report(), section="START") + except Exception as e: + run_status = "failure" + msg: str = f"{str(e)}" + logger.error(f"An unexpected error occurred: {msg}", section="BOTH") + finally: + # Generate overall summary report + logger.info(tracker.generate_overall_report(), section="BOTH") if __name__ == "__main__": From 850fbea7595afd98b222a2211c9c7ae2129dbfe8 Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Fri, 17 Oct 2025 14:25:31 -0500 Subject: [PATCH 108/128] added error handling to the promote service --- .../services/PromoteService.py | 132 ++++++++++-------- 1 file changed, 71 insertions(+), 61 deletions(-) diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/PromoteService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/PromoteService.py index 6b872bae..1b4f032e 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/PromoteService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/PromoteService.py @@ -1,4 +1,5 @@ import abc +import time from typing import Any, Literal from cognite.client import CogniteClient from cognite.client.data_classes import RowWrite @@ -129,10 +130,16 @@ def run(self) -> Literal["Done"] | None: """ self.logger.info("Starting Promote batch", section="START") - candidates: EdgeList | None = self._get_promote_candidates() - if not candidates: - self.logger.info("No Promote candidates found.", section="END") - return "Done" + try: + candidates: EdgeList | None = self._get_promote_candidates() + if not candidates: + self.logger.info("No Promote candidates found.", section="END") + return "Done" + except Exception as e: + self.logger.error(f"Ran into the following error: {str(e)}") + self.logger.info("Retrying in 15 seconds") + 
time.sleep(15) + return self.logger.info(f"Found {len(candidates)} Promote candidates. Starting processing.") @@ -165,63 +172,66 @@ def run(self) -> Literal["Done"] | None: batch_rejected: int = 0 batch_ambiguous: int = 0 - # Process each unique text/type combination once - for (text_to_find, annotation_type), edges_with_same_text in grouped_candidates.items(): - entity_space: str | None = ( - self.file_view.instance_space - if annotation_type == "diagrams.FileLink" - else self.target_entities_view.instance_space - ) - - if not entity_space: - self.logger.warning(f"Could not determine entity space for type '{annotation_type}'. Skipping.") - continue - - # Strategy: Check cache → query edges → fallback to global search - found_nodes: list[Node] | list = self._find_entity_with_cache(text_to_find, annotation_type, entity_space) - - # Determine result type for tracking - num_edges: int = len(edges_with_same_text) - if len(found_nodes) == 1: - batch_promoted += num_edges - elif len(found_nodes) == 0: - batch_rejected += num_edges - else: # Multiple matches - batch_ambiguous += num_edges - - # Apply the same result to ALL edges with this text - for edge in edges_with_same_text: - edge_apply, raw_row = self._prepare_edge_update(edge, found_nodes) - - if edge_apply is not None: - edges_to_update.append(edge_apply) - if raw_row is not None: - raw_rows_to_update.append(raw_row) - - # Update tracker with batch results - self.tracker.add_edges(promoted=batch_promoted, rejected=batch_rejected, ambiguous=batch_ambiguous) - - if edges_to_update: - self.client.data_modeling.instances.apply(edges=edges_to_update) - self.logger.info( - f"Successfully updated {len(edges_to_update)} edges in data model:\n" - f" ├─ Promoted: {batch_promoted}\n" - f" ├─ Rejected: {batch_rejected}\n" - f" └─ Ambiguous: {batch_ambiguous}", - section="END", - ) - - if raw_rows_to_update: - self.client.raw.rows.insert( - db_name=self.raw_db, - table_name=self.raw_pattern_table, - row=raw_rows_to_update, - 
ensure_parent=True, - ) - self.logger.info(f"Successfully updated {len(raw_rows_to_update)} rows in RAW table.", section="END") - - if not edges_to_update and not raw_rows_to_update: - self.logger.info("No edges were updated in this run.", section="END") + try: + # Process each unique text/type combination once + for (text_to_find, annotation_type), edges_with_same_text in grouped_candidates.items(): + entity_space: str | None = ( + self.file_view.instance_space + if annotation_type == "diagrams.FileLink" + else self.target_entities_view.instance_space + ) + + if not entity_space: + self.logger.warning(f"Could not determine entity space for type '{annotation_type}'. Skipping.") + continue + + # Strategy: Check cache → query edges → fallback to global search + found_nodes: list[Node] | list = self._find_entity_with_cache( + text_to_find, annotation_type, entity_space + ) + + # Determine result type for tracking + num_edges: int = len(edges_with_same_text) + if len(found_nodes) == 1: + batch_promoted += num_edges + elif len(found_nodes) == 0: + batch_rejected += num_edges + else: # Multiple matches + batch_ambiguous += num_edges + + # Apply the same result to ALL edges with this text + for edge in edges_with_same_text: + edge_apply, raw_row = self._prepare_edge_update(edge, found_nodes) + + if edge_apply is not None: + edges_to_update.append(edge_apply) + if raw_row is not None: + raw_rows_to_update.append(raw_row) + finally: + # Update tracker with batch results + self.tracker.add_edges(promoted=batch_promoted, rejected=batch_rejected, ambiguous=batch_ambiguous) + + if edges_to_update: + self.client.data_modeling.instances.apply(edges=edges_to_update) + self.logger.info( + f"Successfully updated {len(edges_to_update)} edges in data model:\n" + f" ├─ Promoted: {batch_promoted}\n" + f" ├─ Rejected: {batch_rejected}\n" + f" └─ Ambiguous: {batch_ambiguous}", + section="END", + ) + + if raw_rows_to_update: + self.client.raw.rows.insert( + db_name=self.raw_db, + 
table_name=self.raw_pattern_table, + row=raw_rows_to_update, + ensure_parent=True, + ) + self.logger.info(f"Successfully updated {len(raw_rows_to_update)} rows in RAW table.", section="END") + + if not edges_to_update and not raw_rows_to_update: + self.logger.info("No edges were updated in this run.", section="END") return None # Continue running if more candidates might exist From c7466f617919353d9de05ee0ec1fd73ac89b390b Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Fri, 17 Oct 2025 15:52:18 -0500 Subject: [PATCH 109/128] updated configs and added new flags to promote function --- .../ep_file_annotation.config.yaml | 2 + .../services/ConfigService.py | 71 +++++++++++++++++++ .../services/ConfigService.py | 71 +++++++++++++++++++ .../services/ConfigService.py | 2 + .../services/PromoteService.py | 26 +++---- 5 files changed, 157 insertions(+), 15 deletions(-) diff --git a/modules/contextualization/cdf_file_annotation/extraction_pipelines/ep_file_annotation.config.yaml b/modules/contextualization/cdf_file_annotation/extraction_pipelines/ep_file_annotation.config.yaml index 1f3936e5..25664d9e 100644 --- a/modules/contextualization/cdf_file_annotation/extraction_pipelines/ep_file_annotation.config.yaml +++ b/modules/contextualization/cdf_file_annotation/extraction_pipelines/ep_file_annotation.config.yaml @@ -149,6 +149,8 @@ config: rawTableDocPattern: {{ rawTableDocPattern }} rawTableDocTag: {{ rawTableDocTag }} rawTableDocDoc: {{ rawTableDocDoc }} + delete_rejected_edges: true + delete_suggested_edges: true entitySearchService: enableExistingAnnotationsSearch: true # Primary: Query annotation edges (fast, checks existing annotation edges) enableGlobalEntitySearch: true # Fallback: Global entity search - (slow, unstable as instance count grows) diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ConfigService.py 
b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ConfigService.py index 8b5bd257..f1d2584d 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ConfigService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ConfigService.py @@ -220,6 +220,76 @@ class FinalizeFunction(BaseModel, alias_generator=to_camel): apply_service: ApplyServiceConfig +# Promote Related Configs +class TextNormalizationConfig(BaseModel, alias_generator=to_camel): + """ + Configuration for text normalization and variation generation. + + Controls how text is normalized for matching and what variations are generated + to improve match rates across different naming conventions. + + These flags affect both the normalize() function (for cache keys and direct matching) + and generate_text_variations() function (for query-based matching). + """ + + remove_special_characters: bool = True + convert_to_lowercase: bool = True + strip_leading_zeros: bool = True + + +class EntitySearchServiceConfig(BaseModel, alias_generator=to_camel): + """ + Configuration for the EntitySearchService in the promote function. + + Controls entity search and text normalization behavior: + - Queries entities directly (server-side IN filter on entity/file aliases) + - Text normalization for generating search variations + + Uses efficient server-side filtering on the smaller entity dataset rather than + the larger annotation edge dataset for better performance at scale. + """ + + enable_existing_annotations_search: bool = True + enable_global_entity_search: bool = True + max_entity_search_limit: int = Field(default=1000, gt=0, le=10000) + text_normalization: TextNormalizationConfig + + +class PromoteCacheServiceConfig(BaseModel, alias_generator=to_camel): + """ + Configuration for the CacheService in the promote function. + + Controls caching behavior for text→entity mappings. 
+ """ + + cache_table_name: str + + +class PromoteFunctionConfig(BaseModel, alias_generator=to_camel): + """ + Configuration for the promote function. + + The promote function resolves pattern-mode annotations by finding matching entities + and updating annotation edges from pointing to a sink node to pointing to actual entities. + + Configuration is organized by service interface: + - entitySearchService: Controls entity search strategies + - cacheService: Controls caching behavior + + Batch size is controlled via getCandidatesQuery.limit field. + """ + + get_candidates_query: QueryConfig | list[QueryConfig] + raw_db: str + raw_table_doc_pattern: str + raw_table_doc_tag: str + raw_table_doc_doc: str + delete_rejected_edges: bool + delete_suggested_edges: bool + entity_search_service: EntitySearchServiceConfig + cache_service: PromoteCacheServiceConfig + + class DataModelViews(BaseModel, alias_generator=to_camel): core_annotation_view: ViewPropertyConfig annotation_state_view: ViewPropertyConfig @@ -232,6 +302,7 @@ class Config(BaseModel, alias_generator=to_camel): prepare_function: PrepareFunction launch_function: LaunchFunction finalize_function: FinalizeFunction + promote_function: PromoteFunctionConfig @classmethod def parse_direct_relation(cls, value: Any) -> Any: diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/ConfigService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/ConfigService.py index 8b5bd257..f1d2584d 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/ConfigService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/ConfigService.py @@ -220,6 +220,76 @@ class FinalizeFunction(BaseModel, alias_generator=to_camel): apply_service: ApplyServiceConfig +# Promote Related Configs +class TextNormalizationConfig(BaseModel, alias_generator=to_camel): + """ 
+ Configuration for text normalization and variation generation. + + Controls how text is normalized for matching and what variations are generated + to improve match rates across different naming conventions. + + These flags affect both the normalize() function (for cache keys and direct matching) + and generate_text_variations() function (for query-based matching). + """ + + remove_special_characters: bool = True + convert_to_lowercase: bool = True + strip_leading_zeros: bool = True + + +class EntitySearchServiceConfig(BaseModel, alias_generator=to_camel): + """ + Configuration for the EntitySearchService in the promote function. + + Controls entity search and text normalization behavior: + - Queries entities directly (server-side IN filter on entity/file aliases) + - Text normalization for generating search variations + + Uses efficient server-side filtering on the smaller entity dataset rather than + the larger annotation edge dataset for better performance at scale. + """ + + enable_existing_annotations_search: bool = True + enable_global_entity_search: bool = True + max_entity_search_limit: int = Field(default=1000, gt=0, le=10000) + text_normalization: TextNormalizationConfig + + +class PromoteCacheServiceConfig(BaseModel, alias_generator=to_camel): + """ + Configuration for the CacheService in the promote function. + + Controls caching behavior for text→entity mappings. + """ + + cache_table_name: str + + +class PromoteFunctionConfig(BaseModel, alias_generator=to_camel): + """ + Configuration for the promote function. + + The promote function resolves pattern-mode annotations by finding matching entities + and updating annotation edges from pointing to a sink node to pointing to actual entities. + + Configuration is organized by service interface: + - entitySearchService: Controls entity search strategies + - cacheService: Controls caching behavior + + Batch size is controlled via getCandidatesQuery.limit field. 
+ """ + + get_candidates_query: QueryConfig | list[QueryConfig] + raw_db: str + raw_table_doc_pattern: str + raw_table_doc_tag: str + raw_table_doc_doc: str + delete_rejected_edges: bool + delete_suggested_edges: bool + entity_search_service: EntitySearchServiceConfig + cache_service: PromoteCacheServiceConfig + + class DataModelViews(BaseModel, alias_generator=to_camel): core_annotation_view: ViewPropertyConfig annotation_state_view: ViewPropertyConfig @@ -232,6 +302,7 @@ class Config(BaseModel, alias_generator=to_camel): prepare_function: PrepareFunction launch_function: LaunchFunction finalize_function: FinalizeFunction + promote_function: PromoteFunctionConfig @classmethod def parse_direct_relation(cls, value: Any) -> Any: diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/ConfigService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/ConfigService.py index c5435ecc..f1d2584d 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/ConfigService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/ConfigService.py @@ -284,6 +284,8 @@ class PromoteFunctionConfig(BaseModel, alias_generator=to_camel): raw_table_doc_pattern: str raw_table_doc_tag: str raw_table_doc_doc: str + delete_rejected_edges: bool + delete_suggested_edges: bool entity_search_service: EntitySearchServiceConfig cache_service: PromoteCacheServiceConfig diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/PromoteService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/PromoteService.py index 1b4f032e..1613e610 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/PromoteService.py +++ 
b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/PromoteService.py @@ -84,21 +84,14 @@ def __init__( ) # RAW database and table configuration - # Prefer promote_function config if available, otherwise fallback to finalize_function config - if self.config.promote_function: - self.raw_db = self.config.promote_function.raw_db - self.raw_pattern_table = self.config.promote_function.raw_table_doc_pattern - self.raw_doc_doc_table = self.config.promote_function.raw_table_doc_doc - self.raw_doc_tag_table = self.config.promote_function.raw_table_doc_tag - else: - # Backward compatibility: use finalize_function config - self.logger.warning( - "promote_function config not found. Using finalize_function config for backward compatibility." - ) - self.raw_db = self.config.finalize_function.apply_service.raw_db - self.raw_pattern_table = self.config.finalize_function.apply_service.raw_table_doc_pattern - self.raw_doc_doc_table = self.config.finalize_function.apply_service.raw_table_doc_doc - self.raw_doc_tag_table = self.config.finalize_function.apply_service.raw_table_doc_tag + self.raw_db = self.config.promote_function.raw_db + self.raw_pattern_table = self.config.promote_function.raw_table_doc_pattern + self.raw_doc_doc_table = self.config.promote_function.raw_table_doc_doc + self.raw_doc_tag_table = self.config.promote_function.raw_table_doc_tag + + # Promote flags + self.delete_rejected_edges: bool = self.config.promote_function.delete_rejected_edges + self.delete_suggested_edges: bool = self.config.promote_function.delete_suggested_edges # Injected service dependencies self.entity_search_service = entity_search_service @@ -166,6 +159,9 @@ def run(self) -> Literal["Done"] | None: edges_to_update: list[EdgeApply] = [] raw_rows_to_update: list[RowWrite] = [] + # TODO: think about whether we need to delete the corresponding raw row of edges that we delete OR if it should be placed in another RAW table when rejected + # 
raw_rows_to_delete: list[RowWrite] = [] + edges_to_delete: list[EdgeApply] = [] # Track results for this batch batch_promoted: int = 0 From ddb543a22a0aca81be78f9b8d86a58eb6944cdcd Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Fri, 17 Oct 2025 16:12:09 -0500 Subject: [PATCH 110/128] delete edges based off flags --- .../services/PromoteService.py | 33 ++++++++++++++----- 1 file changed, 25 insertions(+), 8 deletions(-) diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/PromoteService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/PromoteService.py index 1613e610..44e631a6 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/PromoteService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/PromoteService.py @@ -5,6 +5,7 @@ from cognite.client.data_classes import RowWrite from cognite.client.data_classes.data_modeling import ( Edge, + EdgeId, EdgeList, EdgeApply, Node, @@ -152,7 +153,7 @@ def run(self) -> Literal["Done"] | None: self.logger.info( message=f"Grouped {len(candidates)} candidates into {len(grouped_candidates)} unique text/type combinations.", ) - self.logger.info( + self.logger.debug( message=f"Deduplication savings: {len(candidates) - len(grouped_candidates)} queries avoided.", section="END", ) @@ -161,7 +162,7 @@ def run(self) -> Literal["Done"] | None: raw_rows_to_update: list[RowWrite] = [] # TODO: think about whether we need to delete the corresponding raw row of edges that we delete OR if it should be placed in another RAW table when rejected # raw_rows_to_delete: list[RowWrite] = [] - edges_to_delete: list[EdgeApply] = [] + edges_to_delete: list[EdgeId] = [] # Track results for this batch batch_promoted: int = 0 @@ -186,23 +187,35 @@ def run(self) -> Literal["Done"] | None: text_to_find, annotation_type, entity_space ) - # Determine result type 
for tracking + # Determine result type for tracking AND deletion decision num_edges: int = len(edges_with_same_text) + should_delete: bool = False + if len(found_nodes) == 1: batch_promoted += num_edges + should_delete = False # Never delete promoted edges elif len(found_nodes) == 0: batch_rejected += num_edges + should_delete = self.delete_rejected_edges else: # Multiple matches batch_ambiguous += num_edges + should_delete = self.delete_suggested_edges # Apply the same result to ALL edges with this text for edge in edges_with_same_text: edge_apply, raw_row = self._prepare_edge_update(edge, found_nodes) - if edge_apply is not None: - edges_to_update.append(edge_apply) - if raw_row is not None: - raw_rows_to_update.append(raw_row) + if should_delete: + # Delete the edge but still update RAW row to track what happened + edges_to_delete.append(EdgeId(edge.space, edge.external_id)) + if raw_row is not None: + raw_rows_to_update.append(raw_row) + else: + # Update both edge and RAW row + if edge_apply is not None: + edges_to_update.append(edge_apply) + if raw_row is not None: + raw_rows_to_update.append(raw_row) finally: # Update tracker with batch results self.tracker.add_edges(promoted=batch_promoted, rejected=batch_rejected, ambiguous=batch_ambiguous) @@ -217,6 +230,10 @@ def run(self) -> Literal["Done"] | None: section="END", ) + if edges_to_delete: + self.client.data_modeling.instances.delete(edges=edges_to_delete) + self.logger.info(f"Successfully deleted {len(edges_to_delete)} edges from data model.", section="END") + if raw_rows_to_update: self.client.raw.rows.insert( db_name=self.raw_db, @@ -226,7 +243,7 @@ def run(self) -> Literal["Done"] | None: ) self.logger.info(f"Successfully updated {len(raw_rows_to_update)} rows in RAW table.", section="END") - if not edges_to_update and not raw_rows_to_update: + if not edges_to_update and not edges_to_delete and not raw_rows_to_update: self.logger.info("No edges were updated in this run.", section="END") return None # 
Continue running if more candidates might exist From 43c3db8920a030f8f8a8c0aa8337347fd5ee182e Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Fri, 17 Oct 2025 16:29:59 -0500 Subject: [PATCH 111/128] made flags camel case in ep_config --- .../extraction_pipelines/ep_file_annotation.config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/contextualization/cdf_file_annotation/extraction_pipelines/ep_file_annotation.config.yaml b/modules/contextualization/cdf_file_annotation/extraction_pipelines/ep_file_annotation.config.yaml index 25664d9e..d1196466 100644 --- a/modules/contextualization/cdf_file_annotation/extraction_pipelines/ep_file_annotation.config.yaml +++ b/modules/contextualization/cdf_file_annotation/extraction_pipelines/ep_file_annotation.config.yaml @@ -149,8 +149,8 @@ config: rawTableDocPattern: {{ rawTableDocPattern }} rawTableDocTag: {{ rawTableDocTag }} rawTableDocDoc: {{ rawTableDocDoc }} - delete_rejected_edges: true - delete_suggested_edges: true + deleteRejectedEdges: true + deleteSuggestedEdges: true entitySearchService: enableExistingAnnotationsSearch: true # Primary: Query annotation edges (fast, checks existing annotation edges) enableGlobalEntitySearch: true # Fallback: Global entity search - (slow, unstable as instance count grows) From 066685a846d6b9b54d3e83b96902cc2596e9f136 Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Fri, 17 Oct 2025 16:34:50 -0500 Subject: [PATCH 112/128] clearer logging --- .../fn_file_annotation_promote/services/EntitySearchService.py | 2 +- .../fn_file_annotation_promote/services/PromoteService.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/EntitySearchService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/EntitySearchService.py index abb0a4cf..f1dc1cb4 100644 --- 
a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/EntitySearchService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/EntitySearchService.py @@ -286,7 +286,7 @@ def find_global_entity(self, text_variations: list[str], source: ViewId, entity_ return matched_entities[:2] if matched_entities: - self.logger.info( + self.logger.debug( f"Found {len(matched_entities)} match(es) for '{original_text}' via global entity search" ) diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/PromoteService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/PromoteService.py index 44e631a6..5aa7e7eb 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/PromoteService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/PromoteService.py @@ -227,7 +227,7 @@ def run(self) -> Literal["Done"] | None: f" ├─ Promoted: {batch_promoted}\n" f" ├─ Rejected: {batch_rejected}\n" f" └─ Ambiguous: {batch_ambiguous}", - section="END", + section="BOTH", ) if edges_to_delete: From 7f8598454f1823a899edaea282cf6d424114dad8 Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Sat, 18 Oct 2025 18:24:13 -0500 Subject: [PATCH 113/128] created a launch finalize and promote workflow version --- .../cdf_file_annotation/default.config.yaml | 4 +++- .../wf_file_annotation.WorkflowTrigger.yaml | 18 ++++++++++++++++++ .../wf_file_annotation.WorkflowVersion.yaml | 14 ++++++++++++-- 3 files changed, 33 insertions(+), 3 deletions(-) diff --git a/modules/contextualization/cdf_file_annotation/default.config.yaml b/modules/contextualization/cdf_file_annotation/default.config.yaml index a020bfc8..cf1766e7 100644 --- a/modules/contextualization/cdf_file_annotation/default.config.yaml +++ 
b/modules/contextualization/cdf_file_annotation/default.config.yaml @@ -41,7 +41,9 @@ functionClientSecret: ${IDP_CLIENT_SECRET} # used in /workflows workflowSchedule: "3-59/15 * * * *" # NOTE: runs every 15 minutes with a 3 minute offset workflowExternalId: wf_file_annotation -workflowVersion: v1 +launchWorkflowVersion: v1_launch +finalizeWorkflowVersion: v1_finalize +promoteWorkflowVersion: v1_promote # used in /auth groupSourceId: # source ID from Azure AD for the corresponding groups \ No newline at end of file diff --git a/modules/contextualization/cdf_file_annotation/workflows/wf_file_annotation.WorkflowTrigger.yaml b/modules/contextualization/cdf_file_annotation/workflows/wf_file_annotation.WorkflowTrigger.yaml index 18d1c21f..92953383 100644 --- a/modules/contextualization/cdf_file_annotation/workflows/wf_file_annotation.WorkflowTrigger.yaml +++ b/modules/contextualization/cdf_file_annotation/workflows/wf_file_annotation.WorkflowTrigger.yaml @@ -4,6 +4,24 @@ triggerRule: cronExpression: "{{workflowSchedule}}" workflowExternalId: {{workflowExternalId}} workflowVersion: {{workflowVersion}} +authentication: + clientId: {{functionClientId}} + clientSecret: {{functionClientSecret}} +externalId: {{workflowExternalId}} +triggerRule: + triggerType: schedule + cronExpression: "{{workflowSchedule}}" +workflowExternalId: {{workflowExternalId}} +workflowVersion: {{workflowVersion}} +authentication: + clientId: {{functionClientId}} + clientSecret: {{functionClientSecret}} +externalId: {{workflowExternalId}} +triggerRule: + triggerType: schedule + cronExpression: "{{workflowSchedule}}" +workflowExternalId: {{workflowExternalId}} +workflowVersion: {{workflowVersion}} authentication: clientId: {{functionClientId}} clientSecret: {{functionClientSecret}} \ No newline at end of file diff --git a/modules/contextualization/cdf_file_annotation/workflows/wf_file_annotation.WorkflowVersion.yaml 
b/modules/contextualization/cdf_file_annotation/workflows/wf_file_annotation.WorkflowVersion.yaml index 58235917..0c5665e0 100644 --- a/modules/contextualization/cdf_file_annotation/workflows/wf_file_annotation.WorkflowVersion.yaml +++ b/modules/contextualization/cdf_file_annotation/workflows/wf_file_annotation.WorkflowVersion.yaml @@ -1,7 +1,7 @@ workflowExternalId: {{ workflowExternalId }} -version: "v1" +version: {{ launchWorkflowVersion }} workflowDefinition: - description: "A workflow for annotating P&ID and documents." + description: "Create diagram detect jobs for files marked to annotate." tasks: - externalId: fn_launch type: "function" @@ -20,6 +20,11 @@ workflowDefinition: timeout: 600 onFailure: "abortWorkflow" +workflowExternalId: {{ workflowExternalId }} +version: {{ finalizeWorkflowVersion }} +workflowDefinition: + description: "Process the diagram detect jobs created by the launch workflow" + tasks: - externalId: fn_finalize_thread_1 type: "function" parameters: @@ -105,6 +110,11 @@ workflowDefinition: timeout: 600 onFailure: "abortWorkflow" +workflowExternalId: {{ workflowExternalId }} +version: {{ promoteWorkflowVersion }} +workflowDefinition: + description: "Attempt to automatically promote annotation edges created from the pattern mode results in the finalize workflow" + tasks: - externalId: fn_promote type: "function" parameters: From a5ee5027184834ce0f2b23bad0c025d79bdfdef3 Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Sat, 18 Oct 2025 18:26:24 -0500 Subject: [PATCH 114/128] added a workflow trigger for each workflow version --- .../workflows/wf_file_annotation.WorkflowTrigger.yaml | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/modules/contextualization/cdf_file_annotation/workflows/wf_file_annotation.WorkflowTrigger.yaml b/modules/contextualization/cdf_file_annotation/workflows/wf_file_annotation.WorkflowTrigger.yaml index 92953383..e7b75763 100644 --- 
a/modules/contextualization/cdf_file_annotation/workflows/wf_file_annotation.WorkflowTrigger.yaml +++ b/modules/contextualization/cdf_file_annotation/workflows/wf_file_annotation.WorkflowTrigger.yaml @@ -3,25 +3,27 @@ triggerRule: triggerType: schedule cronExpression: "{{workflowSchedule}}" workflowExternalId: {{workflowExternalId}} -workflowVersion: {{workflowVersion}} +workflowVersion: {{launchWorkflowVersion}} authentication: clientId: {{functionClientId}} clientSecret: {{functionClientSecret}} + externalId: {{workflowExternalId}} triggerRule: triggerType: schedule cronExpression: "{{workflowSchedule}}" workflowExternalId: {{workflowExternalId}} -workflowVersion: {{workflowVersion}} +workflowVersion: {{finalizeWorkflowVersion}} authentication: clientId: {{functionClientId}} clientSecret: {{functionClientSecret}} + externalId: {{workflowExternalId}} triggerRule: triggerType: schedule cronExpression: "{{workflowSchedule}}" workflowExternalId: {{workflowExternalId}} -workflowVersion: {{workflowVersion}} +workflowVersion: {{promoteWorkflowVersion}} authentication: clientId: {{functionClientId}} clientSecret: {{functionClientSecret}} \ No newline at end of file From ed7ea8c3a99b87a5bc4690c743e6c41ea52cb00e Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Sat, 18 Oct 2025 18:56:27 -0500 Subject: [PATCH 115/128] made different versions and triggers an array in yaml --- .../cdf_file_annotation/default.config.yaml | 5 +- .../wf_file_annotation.WorkflowTrigger.yaml | 54 ++-- .../wf_file_annotation.WorkflowVersion.yaml | 254 +++++++++--------- 3 files changed, 158 insertions(+), 155 deletions(-) diff --git a/modules/contextualization/cdf_file_annotation/default.config.yaml b/modules/contextualization/cdf_file_annotation/default.config.yaml index cf1766e7..43989983 100644 --- a/modules/contextualization/cdf_file_annotation/default.config.yaml +++ b/modules/contextualization/cdf_file_annotation/default.config.yaml @@ -39,11 +39,14 @@ functionClientId: ${IDP_CLIENT_ID} 
functionClientSecret: ${IDP_CLIENT_SECRET} # used in /workflows -workflowSchedule: "3-59/15 * * * *" # NOTE: runs every 15 minutes with a 3 minute offset +workflowSchedule: "3-59/10 * * * *" # NOTE: runs every 10 minutes with a 3 minute offset workflowExternalId: wf_file_annotation launchWorkflowVersion: v1_launch +launchWorkflowTrigger: wf_launch_trigger finalizeWorkflowVersion: v1_finalize +finalizeWorkflowTrigger: wf_finalize_trigger promoteWorkflowVersion: v1_promote +promoteWorkflowTrigger: wf_promote_trigger # used in /auth groupSourceId: # source ID from Azure AD for the corresponding groups \ No newline at end of file diff --git a/modules/contextualization/cdf_file_annotation/workflows/wf_file_annotation.WorkflowTrigger.yaml b/modules/contextualization/cdf_file_annotation/workflows/wf_file_annotation.WorkflowTrigger.yaml index e7b75763..55981dbe 100644 --- a/modules/contextualization/cdf_file_annotation/workflows/wf_file_annotation.WorkflowTrigger.yaml +++ b/modules/contextualization/cdf_file_annotation/workflows/wf_file_annotation.WorkflowTrigger.yaml @@ -1,29 +1,29 @@ -externalId: {{workflowExternalId}} -triggerRule: - triggerType: schedule - cronExpression: "{{workflowSchedule}}" -workflowExternalId: {{workflowExternalId}} -workflowVersion: {{launchWorkflowVersion}} -authentication: - clientId: {{functionClientId}} - clientSecret: {{functionClientSecret}} +- externalId: {{launchWorkflowTrigger}} + triggerRule: + triggerType: schedule + cronExpression: "{{workflowSchedule}}" + workflowExternalId: {{workflowExternalId}} + workflowVersion: {{launchWorkflowVersion}} + authentication: + clientId: {{functionClientId}} + clientSecret: {{functionClientSecret}} -externalId: {{workflowExternalId}} -triggerRule: - triggerType: schedule - cronExpression: "{{workflowSchedule}}" -workflowExternalId: {{workflowExternalId}} -workflowVersion: {{finalizeWorkflowVersion}} -authentication: - clientId: {{functionClientId}} - clientSecret: {{functionClientSecret}} +- 
externalId: {{finalizeWorkflowTrigger}} + triggerRule: + triggerType: schedule + cronExpression: "{{workflowSchedule}}" + workflowExternalId: {{workflowExternalId}} + workflowVersion: {{finalizeWorkflowVersion}} + authentication: + clientId: {{functionClientId}} + clientSecret: {{functionClientSecret}} -externalId: {{workflowExternalId}} -triggerRule: - triggerType: schedule - cronExpression: "{{workflowSchedule}}" -workflowExternalId: {{workflowExternalId}} -workflowVersion: {{promoteWorkflowVersion}} -authentication: - clientId: {{functionClientId}} - clientSecret: {{functionClientSecret}} \ No newline at end of file +- externalId: {{promoteWorkflowTrigger}} + triggerRule: + triggerType: schedule + cronExpression: "{{workflowSchedule}}" + workflowExternalId: {{workflowExternalId}} + workflowVersion: {{promoteWorkflowVersion}} + authentication: + clientId: {{functionClientId}} + clientSecret: {{functionClientSecret}} diff --git a/modules/contextualization/cdf_file_annotation/workflows/wf_file_annotation.WorkflowVersion.yaml b/modules/contextualization/cdf_file_annotation/workflows/wf_file_annotation.WorkflowVersion.yaml index 0c5665e0..0449c353 100644 --- a/modules/contextualization/cdf_file_annotation/workflows/wf_file_annotation.WorkflowVersion.yaml +++ b/modules/contextualization/cdf_file_annotation/workflows/wf_file_annotation.WorkflowVersion.yaml @@ -1,133 +1,133 @@ -workflowExternalId: {{ workflowExternalId }} -version: {{ launchWorkflowVersion }} -workflowDefinition: - description: "Create diagram detect jobs for files marked to annotate." 
- tasks: - - externalId: fn_launch - type: "function" - parameters: - function: - externalId: {{ launchFunctionExternalId }} - data: - { - "ExtractionPipelineExtId": {{ extractionPipelineExternalId }}, - "logLevel": "INFO", - } - isAsyncComplete: false - name: Launch File Annotations - description: Launch - retries: 0 - timeout: 600 - onFailure: "abortWorkflow" +- workflowExternalId: {{ workflowExternalId }} + version: {{ launchWorkflowVersion }} + workflowDefinition: + description: "Create diagram detect jobs for files marked to annotate." + tasks: + - externalId: fn_launch + type: "function" + parameters: + function: + externalId: {{ launchFunctionExternalId }} + data: + { + "ExtractionPipelineExtId": {{ extractionPipelineExternalId }}, + "logLevel": "INFO", + } + isAsyncComplete: false + name: Launch File Annotations + description: Launch + retries: 0 + timeout: 600 + onFailure: "abortWorkflow" -workflowExternalId: {{ workflowExternalId }} -version: {{ finalizeWorkflowVersion }} -workflowDefinition: - description: "Process the diagram detect jobs created by the launch workflow" - tasks: - - externalId: fn_finalize_thread_1 - type: "function" - parameters: - function: - externalId: {{ finalizeFunctionExternalId }} - data: - { - "ExtractionPipelineExtId": {{ extractionPipelineExternalId }}, - "logLevel": "INFO", - } - isAsyncComplete: false - name: Finalize File Annotations - Thread 1 - description: Finalize - retries: 0 - timeout: 600 - onFailure: "abortWorkflow" +- workflowExternalId: {{ workflowExternalId }} + version: {{ finalizeWorkflowVersion }} + workflowDefinition: + description: "Process the diagram detect jobs created by the launch workflow" + tasks: + - externalId: fn_finalize_thread_1 + type: "function" + parameters: + function: + externalId: {{ finalizeFunctionExternalId }} + data: + { + "ExtractionPipelineExtId": {{ extractionPipelineExternalId }}, + "logLevel": "INFO", + } + isAsyncComplete: false + name: Finalize File Annotations - Thread 1 + 
description: Finalize + retries: 0 + timeout: 600 + onFailure: "abortWorkflow" - - externalId: fn_finalize_thread_2 - type: "function" - parameters: - function: - externalId: {{ finalizeFunctionExternalId }} - data: - { - "ExtractionPipelineExtId": {{ extractionPipelineExternalId }}, - "logLevel": "INFO", - } - isAsyncComplete: false - name: Finalize File Annotations - Thread 2 - description: Finalize - retries: 0 - timeout: 600 - onFailure: "abortWorkflow" + - externalId: fn_finalize_thread_2 + type: "function" + parameters: + function: + externalId: {{ finalizeFunctionExternalId }} + data: + { + "ExtractionPipelineExtId": {{ extractionPipelineExternalId }}, + "logLevel": "INFO", + } + isAsyncComplete: false + name: Finalize File Annotations - Thread 2 + description: Finalize + retries: 0 + timeout: 600 + onFailure: "abortWorkflow" - - externalId: fn_finalize_thread_3 - type: "function" - parameters: - function: - externalId: {{ finalizeFunctionExternalId }} - data: - { - "ExtractionPipelineExtId": {{ extractionPipelineExternalId }}, - "logLevel": "INFO", - } - isAsyncComplete: false - name: Finalize File Annotations - Thread 3 - description: Finalize - retries: 0 - timeout: 600 - onFailure: "abortWorkflow" + - externalId: fn_finalize_thread_3 + type: "function" + parameters: + function: + externalId: {{ finalizeFunctionExternalId }} + data: + { + "ExtractionPipelineExtId": {{ extractionPipelineExternalId }}, + "logLevel": "INFO", + } + isAsyncComplete: false + name: Finalize File Annotations - Thread 3 + description: Finalize + retries: 0 + timeout: 600 + onFailure: "abortWorkflow" - - externalId: fn_finalize_thread_4 - type: "function" - parameters: - function: - externalId: {{ finalizeFunctionExternalId }} - data: - { - "ExtractionPipelineExtId": {{ extractionPipelineExternalId }}, - "logLevel": "INFO", - } - isAsyncComplete: false - name: Finalize File Annotations - Thread 4 - description: Finalize - retries: 0 - timeout: 600 - onFailure: "abortWorkflow" + - 
externalId: fn_finalize_thread_4 + type: "function" + parameters: + function: + externalId: {{ finalizeFunctionExternalId }} + data: + { + "ExtractionPipelineExtId": {{ extractionPipelineExternalId }}, + "logLevel": "INFO", + } + isAsyncComplete: false + name: Finalize File Annotations - Thread 4 + description: Finalize + retries: 0 + timeout: 600 + onFailure: "abortWorkflow" - - externalId: fn_finalize_thread_5 - type: "function" - parameters: - function: - externalId: {{ finalizeFunctionExternalId }} - data: - { - "ExtractionPipelineExtId": {{ extractionPipelineExternalId }}, - "logLevel": "INFO", - } - isAsyncComplete: false - name: Finalize File Annotations - Thread 5 - description: Finalize - retries: 0 - timeout: 600 - onFailure: "abortWorkflow" + - externalId: fn_finalize_thread_5 + type: "function" + parameters: + function: + externalId: {{ finalizeFunctionExternalId }} + data: + { + "ExtractionPipelineExtId": {{ extractionPipelineExternalId }}, + "logLevel": "INFO", + } + isAsyncComplete: false + name: Finalize File Annotations - Thread 5 + description: Finalize + retries: 0 + timeout: 600 + onFailure: "abortWorkflow" -workflowExternalId: {{ workflowExternalId }} -version: {{ promoteWorkflowVersion }} -workflowDefinition: - description: "Attempt to automatically promote annotation edges created from the pattern mode results in the finalize workflow" - tasks: - - externalId: fn_promote - type: "function" - parameters: - function: - externalId: {{ promoteFunctionExternalId }} - data: - { - "ExtractionPipelineExtId": {{ extractionPipelineExternalId }}, - "logLevel": "INFO", - } - isAsyncComplete: false - name: Promote File Annotations - description: Auto promote tags - retries: 0 - timeout: 600 - onFailure: "abortWorkflow" +- workflowExternalId: {{ workflowExternalId }} + version: {{ promoteWorkflowVersion }} + workflowDefinition: + description: "Attempt to automatically promote annotation edges created from the pattern mode results in the finalize workflow" 
+ tasks: + - externalId: fn_promote + type: "function" + parameters: + function: + externalId: {{ promoteFunctionExternalId }} + data: + { + "ExtractionPipelineExtId": {{ extractionPipelineExternalId }}, + "logLevel": "INFO", + } + isAsyncComplete: false + name: Promote File Annotations + description: Auto promote tags + retries: 0 + timeout: 600 + onFailure: "abortWorkflow" From 70d4308cd1c4a0df12b2d2d8279f2e8ca9449fbd Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Sun, 19 Oct 2025 09:55:44 -0500 Subject: [PATCH 116/128] removed the prepare function from launch service --- .../fn_file_annotation_launch/handler.py | 21 +-- .../services/LaunchService.py | 140 +----------------- 2 files changed, 6 insertions(+), 155 deletions(-) diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/handler.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/handler.py index fedd7396..8b3f5439 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/handler.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/handler.py @@ -26,7 +26,7 @@ def handle(data: dict, function_call_info: dict, client: CogniteClient) -> dict: 2. Create an instance of the launch function and create implementations of the interfaces 3. Run the launch instance until... 4. It's been 7 minutes - 5. There are no files left that need to be annoated + 5. There are no files left that need to be launched NOTE: Cognite functions have a run-time limit of 10 minutes. Don't want the function to die at the 10minute mark since there's no guarantee all code will execute. Thus we set a timelimit of 7 minutes (conservative) so that code execution is guaranteed. 
@@ -52,15 +52,6 @@ def handle(data: dict, function_call_info: dict, client: CogniteClient) -> dict: run_status: str = "success" try: - while datetime.now(timezone.utc) - start_time < timedelta(minutes=7): - if launch_instance.prepare() == "Done": - break - logger_instance.info(tracker_instance.generate_local_report()) - - overall_report: str = tracker_instance.generate_overall_report() - logger_instance.info(overall_report, "BOTH") - tracker_instance.reset() - while datetime.now(timezone.utc) - start_time < timedelta(minutes=7): if launch_instance.run() == "Done": return {"status": run_status, "data": data} @@ -90,7 +81,7 @@ def run_locally(config_file: dict[str, str], log_path: str | None = None): 1. Create an instance of config, logger, and tracker 2. Create an instance of the Launch function and create implementations of the interfaces 3. Run the launch instance until - 4. There are no files left that need to be annoated + 4. There are no files left that need to be launched """ log_level = config_file.get("logLevel", "DEBUG") config_instance, client = create_config_service(function_data=config_file) @@ -109,14 +100,6 @@ def run_locally(config_file: dict[str, str], log_path: str | None = None): function_call_info={"function_id": None, "call_id": None}, ) try: - while True: - if launch_instance.prepare() == "Done": - break - logger_instance.info(tracker_instance.generate_local_report()) - - logger_instance.info(tracker_instance.generate_overall_report(), "BOTH") - tracker_instance.reset() - while True: if launch_instance.run() == "Done": break diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/LaunchService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/LaunchService.py index 8323791a..62c44cfe 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/LaunchService.py +++ 
b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/LaunchService.py @@ -28,8 +28,8 @@ class AbstractLaunchService(abc.ABC): """ - Orchestrates the file annotation launch process. This service prepares files for annotation, - manages batching and caching, and initiates diagram detection jobs. + Orchestrates the file annotation launch process. This service manages batching and caching, + and initiates diagram detection jobs for files ready to be annotated. """ def __init__( @@ -50,10 +50,6 @@ def __init__( self.cache_service = cache_service self.annotation_service = annotation_service - @abc.abstractmethod - def prepare(self) -> str | None: - pass - @abc.abstractmethod def run(self) -> str | None: pass @@ -61,8 +57,8 @@ def run(self) -> str | None: class GeneralLaunchService(AbstractLaunchService): """ - Orchestrates the file annotation launch process. This service prepares files for annotation, - manages batching and caching, and initiates diagram detection jobs. + Orchestrates the file annotation launch process. This service manages batching and caching, + and initiates diagram detection jobs for files ready to be annotated. """ def __init__( @@ -102,134 +98,6 @@ def __init__( self.function_id: int | None = function_call_info.get("function_id") self.call_id: int | None = function_call_info.get("call_id") - self.reset_files: bool = False - if self.config.prepare_function.get_files_for_annotation_reset_query: - self.reset_files = True - - # NOTE: I believe this code should be encapsulated as a separate CDF function named prepFunction. Due to the amount of cdf functions we can spin up, we're coupling this within the launchFunction. - def prepare(self) -> Literal["Done"] | None: - """ - Prepares files for annotation by creating annotation state instances. - - Retrieves files marked "ToAnnotate", creates corresponding FileAnnotationState instances, - and updates file tags to indicate processing has started. 
Can also reset files if configured. - - Args: - None - - Returns: - "Done" if no more files need preparation, None if processing should continue. - - Raises: - CogniteAPIError: If query timeout or other API errors occur (408 errors are handled gracefully). - ValueError: If annotation state view instance space is not configured. - """ - self.logger.info( - message=f"Starting Prepare Function", - section="START", - ) - try: - if self.reset_files: - file_nodes_to_reset: NodeList | None = self.data_model_service.get_files_for_annotation_reset() - if not file_nodes_to_reset: - self.logger.info( - "No files found with the getFilesForAnnotationReset query provided in the config file" - ) - else: - self.logger.info(f"Resetting {len(file_nodes_to_reset)} files") - reset_node_apply: list[NodeApply] = [] - for file_node in file_nodes_to_reset: - file_node_apply: NodeApply = file_node.as_write() - tags_property: list[str] = cast(list[str], file_node_apply.sources[0].properties["tags"]) - if "AnnotationInProcess" in tags_property: - tags_property.remove("AnnotationInProcess") - if "Annotated" in tags_property: - tags_property.remove("Annotated") - if "AnnotationFailed" in tags_property: - tags_property.remove("AnnotationFailed") - - reset_node_apply.append(file_node_apply) - update_results = self.data_model_service.update_annotation_state(reset_node_apply) - self.logger.info( - f"Removed the AnnotationInProcess/Annotated/AnnotationFailed tag of {len(update_results)} files" - ) - self.reset_files = False - except CogniteAPIError as e: - # NOTE: Reliant on the CogniteAPI message to stay the same across new releases. If unexpected changes were to occur please refer to this section of the code and check if error message is now different. - if ( - e.code == 408 - and e.message == "Graph query timed out. Reduce load or contention, or optimise your query." - ): - # NOTE: 408 indicates a timeout error. Keep retrying the query if a timeout occurs. 
- self.logger.error(message=f"Ran into the following error:\n{str(e)}") - return - else: - raise e - - try: - file_nodes: NodeList | None = self.data_model_service.get_files_to_annotate() - if not file_nodes: - self.logger.info( - message=f"No files found to prepare", - section="END", - ) - return "Done" - self.logger.info(f"Preparing {len(file_nodes)} files") - except CogniteAPIError as e: - # NOTE: Reliant on the CogniteAPI message to stay the same across new releases. If unexpected changes were to occur please refer to this section of the code and check if error message is now different. - if ( - e.code == 408 - and e.message == "Graph query timed out. Reduce load or contention, or optimise your query." - ): - # NOTE: 408 indicates a timeout error. Keep retrying the query if a timeout occurs. - self.logger.error(message=f"Ran into the following error:\n{str(e)}") - return - else: - raise e - - annotation_state_instances: list[NodeApply] = [] - file_apply_instances: list[NodeApply] = [] - for file_node in file_nodes: - node_id = {"space": file_node.space, "externalId": file_node.external_id} - annotation_instance = AnnotationState( - annotationStatus=AnnotationStatus.NEW, - linkedFile=node_id, - ) - if not self.annotation_state_view.instance_space: - msg = ( - "Need an instance space in DataModelViews/AnnotationStateView config to store the annotation state" - ) - self.logger.error(msg) - raise ValueError(msg) - annotation_instance_space: str = self.annotation_state_view.instance_space - - annotation_node_apply: NodeApply = annotation_instance.to_node_apply( - node_space=annotation_instance_space, - annotation_state_view=self.annotation_state_view.as_view_id(), - ) - annotation_state_instances.append(annotation_node_apply) - - file_node_apply: NodeApply = file_node.as_write() - tags_property: list[str] = cast(list[str], file_node_apply.sources[0].properties["tags"]) - if "AnnotationInProcess" not in tags_property: - tags_property.append("AnnotationInProcess") - 
file_apply_instances.append(file_node_apply) - - try: - create_results = self.data_model_service.create_annotation_state(annotation_state_instances) - self.logger.info(message=f"Created {len(create_results)} annotation state instances") - update_results = self.data_model_service.update_annotation_state(file_apply_instances) - self.logger.info( - message=f"Added 'AnnotationInProcess' to the tag property for {len(update_results)} files", - section="END", - ) - except Exception as e: - self.logger.error(message=f"Ran into the following error:\n{str(e)}", section="END") - raise - - self.tracker.add_files(success=len(file_nodes)) - return - def run(self) -> Literal["Done"] | None: """ Main execution loop for launching diagram detection jobs. From 3886f2c80f4783bba2422ba15a924f168560466e Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Sun, 19 Oct 2025 09:56:16 -0500 Subject: [PATCH 117/128] created separate prepare function --- .../fn_file_annotation_prepare/__init__.py | 0 .../dependencies.py | 96 +++++ .../fn_file_annotation_prepare/handler.py | 146 +++++++ .../requirements.txt | 24 ++ .../services/ConfigService.py | 371 ++++++++++++++++ .../services/DataModelService.py | 403 ++++++++++++++++++ .../services/LoggerService.py | 169 ++++++++ .../services/PipelineService.py | 66 +++ .../services/PrepareService.py | 209 +++++++++ .../utils/DataStructures.py | 331 ++++++++++++++ 10 files changed, 1815 insertions(+) create mode 100644 modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_prepare/__init__.py create mode 100644 modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_prepare/dependencies.py create mode 100644 modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_prepare/handler.py create mode 100644 modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_prepare/requirements.txt create mode 100644 
modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_prepare/services/ConfigService.py create mode 100644 modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_prepare/services/DataModelService.py create mode 100644 modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_prepare/services/LoggerService.py create mode 100644 modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_prepare/services/PipelineService.py create mode 100644 modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_prepare/services/PrepareService.py create mode 100644 modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_prepare/utils/DataStructures.py diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_prepare/__init__.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_prepare/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_prepare/dependencies.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_prepare/dependencies.py new file mode 100644 index 00000000..55d4e7bd --- /dev/null +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_prepare/dependencies.py @@ -0,0 +1,96 @@ +import os + +from pathlib import Path +from dotenv import load_dotenv +from typing import Any, Tuple, Literal +from cognite.client import CogniteClient, ClientConfig +from cognite.client.credentials import OAuthClientCredentials + +from utils.DataStructures import EnvConfig +from services.LoggerService import CogniteFunctionLogger +from services.ConfigService import Config, load_config_parameters +from services.DataModelService import GeneralDataModelService +from services.PipelineService import GeneralPipelineService + + +def get_env_variables() -> EnvConfig: + print("Loading environment 
variables from .env...") + + project_path = (Path(__file__).parent / ".env").resolve() + print(f"project_path is set to: {project_path}") + + load_dotenv() + + required_envvars = ( + "CDF_PROJECT", + "CDF_CLUSTER", + "IDP_TENANT_ID", + "IDP_CLIENT_ID", + "IDP_CLIENT_SECRET", + ) + + missing = [envvar for envvar in required_envvars if envvar not in os.environ] + if missing: + raise ValueError(f"Missing one or more env.vars: {missing}") + + return EnvConfig( + cdf_project=os.getenv("CDF_PROJECT"), # type: ignore + cdf_cluster=os.getenv("CDF_CLUSTER"), # type: ignore + tenant_id=os.getenv("IDP_TENANT_ID"), # type: ignore + client_id=os.getenv("IDP_CLIENT_ID"), # type: ignore + client_secret=os.getenv("IDP_CLIENT_SECRET"), # type: ignore + ) + + +def create_client(env_config: EnvConfig, debug: bool = False): + SCOPES = [f"https://{env_config.cdf_cluster}.cognitedata.com/.default"] + TOKEN_URL = f"https://login.microsoftonline.com/{env_config.tenant_id}/oauth2/v2.0/token" + creds = OAuthClientCredentials( + token_url=TOKEN_URL, + client_id=env_config.client_id, + client_secret=env_config.client_secret, + scopes=SCOPES, + ) + cnf = ClientConfig( + client_name="DEV_Working", + project=env_config.cdf_project, + base_url=f"https://{env_config.cdf_cluster}.cognitedata.com", # NOTE: base_url might need to be adjusted if on PSAAS or Private Link + credentials=creds, + debug=debug, + ) + client = CogniteClient(cnf) + return client + + +def create_config_service( + function_data: dict[str, Any], client: CogniteClient | None = None +) -> Tuple[Config, CogniteClient]: + if client is None: + env_config = get_env_variables() + client = create_client(env_config) + config = load_config_parameters(client=client, function_data=function_data) + return config, client + + +def create_logger_service(log_level): + if log_level not in ["DEBUG", "INFO", "WARNING", "ERROR"]: + return CogniteFunctionLogger() + else: + return CogniteFunctionLogger(log_level=log_level) + + +def 
create_write_logger_service(log_level, filepath): + if log_level not in ["DEBUG", "INFO", "WARNING", "ERROR"]: + return CogniteFunctionLogger(write=True, filepath=filepath) + else: + return CogniteFunctionLogger(log_level=log_level, write=True, filepath=filepath) + + +def create_general_data_model_service( + config: Config, client: CogniteClient, logger: CogniteFunctionLogger +) -> GeneralDataModelService: + return GeneralDataModelService(config=config, client=client, logger=logger) + + +def create_general_pipeline_service(client: CogniteClient, pipeline_ext_id: str) -> GeneralPipelineService: + return GeneralPipelineService(pipeline_ext_id, client) diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_prepare/handler.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_prepare/handler.py new file mode 100644 index 00000000..e8c71e95 --- /dev/null +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_prepare/handler.py @@ -0,0 +1,146 @@ +import sys +from cognite.client import CogniteClient +from datetime import datetime, timezone, timedelta + +from dependencies import ( + create_config_service, + create_logger_service, + create_write_logger_service, + create_general_data_model_service, + create_general_pipeline_service, +) +from services.PrepareService import GeneralPrepareService, LocalPrepareService, AbstractPrepareService +from services.DataModelService import IDataModelService +from services.PipelineService import IPipelineService +from utils.DataStructures import PerformanceTracker + + +def handle(data: dict, function_call_info: dict, client: CogniteClient) -> dict: + """ + Main entry point for the cognite function. + 1. Create an instance of config, logger, and tracker + 2. Create an instance of the prepare function and create implementations of the interfaces + 3. Run the prepare instance until... + 4. It's been 7 minutes + 5. 
There are no files left that need to be prepared + NOTE: Cognite functions have a run-time limit of 10 minutes. + Don't want the function to die at the 10-minute mark since there's no guarantee all code will execute. + Thus we set a time limit of 7 minutes (conservative) so that code execution is guaranteed. + documentation on calling a function can be found here... https://api-docs.cognite.com/20230101/tag/Function-calls/operation/postFunctionsCall + """ + start_time = datetime.now(timezone.utc) + log_level = data.get("logLevel", "INFO") + + config_instance, client = create_config_service(function_data=data, client=client) + logger_instance = create_logger_service(log_level) + tracker_instance = PerformanceTracker() + pipeline_instance: IPipelineService = create_general_pipeline_service( + client, pipeline_ext_id=data["ExtractionPipelineExtId"] + ) + + prepare_instance: AbstractPrepareService = _create_prepare_service( + config=config_instance, + client=client, + logger=logger_instance, + tracker=tracker_instance, + function_call_info=function_call_info, + ) + + run_status: str = "success" + try: + while datetime.now(timezone.utc) - start_time < timedelta(minutes=7): + if prepare_instance.run() == "Done": + return {"status": run_status, "data": data} + logger_instance.info(tracker_instance.generate_local_report()) + return {"status": run_status, "data": data} + except Exception as e: + run_status = "failure" + msg = f"{str(e)}" + logger_instance.error(message=msg, section="BOTH") + return {"status": run_status, "message": msg} + finally: + logger_instance.info(tracker_instance.generate_overall_report(), "BOTH") + # only want to report on the count of successful and failed files in ep_logs if there were files that were processed or an error occurred + # else run log will be too messy. 
+ if tracker_instance.files_failed != 0 or tracker_instance.files_success != 0 or run_status == "failure": + function_id = function_call_info.get("function_id") + call_id = function_call_info.get("call_id") + pipeline_instance.update_extraction_pipeline( + msg=tracker_instance.generate_ep_run("Prepare", function_id, call_id) + ) + pipeline_instance.upload_extraction_pipeline(status=run_status) + + +def run_locally(config_file: dict[str, str], log_path: str | None = None): + """ + Main entry point for the cognite function. + 1. Create an instance of config, logger, and tracker + 2. Create an instance of the Prepare function and create implementations of the interfaces + 3. Run the prepare instance until + 4. There are no files left that need to be prepared + """ + log_level = config_file.get("logLevel", "DEBUG") + config_instance, client = create_config_service(function_data=config_file) + + if log_path: + logger_instance = create_write_logger_service(log_level=log_level, filepath=log_path) + else: + logger_instance = create_logger_service(log_level=log_level) + tracker_instance = PerformanceTracker() + + prepare_instance: AbstractPrepareService = _create_local_prepare_service( + config=config_instance, + client=client, + logger=logger_instance, + tracker=tracker_instance, + function_call_info={"function_id": None, "call_id": None}, + ) + try: + while True: + if prepare_instance.run() == "Done": + break + logger_instance.info(tracker_instance.generate_local_report()) + except Exception as e: + logger_instance.error( + message=f"Ran into the following error: \n{e}", + section="END", + ) + finally: + logger_instance.info(tracker_instance.generate_overall_report(), "BOTH") + logger_instance.close() + + +def _create_prepare_service(config, client, logger, tracker, function_call_info) -> AbstractPrepareService: + data_model_instance: IDataModelService = create_general_data_model_service(config, client, logger) + prepare_instance: AbstractPrepareService = 
GeneralPrepareService( + client=client, + config=config, + logger=logger, + tracker=tracker, + data_model_service=data_model_instance, + function_call_info=function_call_info, + ) + return prepare_instance + + +def _create_local_prepare_service(config, client, logger, tracker, function_call_info) -> AbstractPrepareService: + data_model_instance: IDataModelService = create_general_data_model_service(config, client, logger) + prepare_instance: AbstractPrepareService = LocalPrepareService( + client=client, + config=config, + logger=logger, + tracker=tracker, + data_model_service=data_model_instance, + function_call_info=function_call_info, + ) + return prepare_instance + + +if __name__ == "__main__": + # NOTE: Receives the arguments from .vscode/launch.json. Mimics arguments that are passed into the serverless function. + config_file = { + "ExtractionPipelineExtId": sys.argv[1], + "logLevel": sys.argv[2], + } + log_path = sys.argv[3] + run_locally(config_file, log_path) diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_prepare/requirements.txt b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_prepare/requirements.txt new file mode 100644 index 00000000..bd7f2bc3 --- /dev/null +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_prepare/requirements.txt @@ -0,0 +1,24 @@ +annotated-types==0.7.0 +certifi==2025.4.26 +cffi==1.17.1 +charset-normalizer==3.4.2 +cognite-sdk==7.76.0 +cryptography==44.0.3 +dotenv==0.9.9 +idna==3.10 +msal==1.32.3 +oauthlib==3.2.2 +packaging==25.0 +protobuf==6.30.2 +pycparser==2.22 +pydantic==2.11.4 +pydantic_core==2.33.2 +PyJWT==2.10.1 +python-dotenv==1.1.0 +PyYAML==6.0.2 +requests==2.32.3 +requests-oauthlib==1.3.1 +typing-inspection==0.4.0 +typing_extensions==4.13.2 +urllib3==2.5.0 + diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_prepare/services/ConfigService.py 
b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_prepare/services/ConfigService.py new file mode 100644 index 00000000..f1d2584d --- /dev/null +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_prepare/services/ConfigService.py @@ -0,0 +1,371 @@ +from enum import Enum +from typing import Any, Literal, cast, Optional + +import yaml +from cognite.client.data_classes.contextualization import ( + DiagramDetectConfig, + ConnectionFlags, + CustomizeFuzziness, + DirectionWeights, +) +from cognite.client.data_classes.data_modeling import NodeId +from cognite.client.data_classes.filters import Filter +from cognite.client import CogniteClient +from cognite.client import data_modeling as dm +from cognite.client.exceptions import CogniteAPIError +from pydantic import BaseModel, Field +from pydantic.alias_generators import to_camel +from utils.DataStructures import AnnotationStatus, FilterOperator + + +# Configuration Classes +class ViewPropertyConfig(BaseModel, alias_generator=to_camel): + schema_space: str + instance_space: Optional[str] = None + external_id: str + version: str + annotation_type: Optional[Literal["diagrams.FileLink", "diagrams.AssetLink"]] = None + + def as_view_id(self) -> dm.ViewId: + return dm.ViewId(space=self.schema_space, external_id=self.external_id, version=self.version) + + def as_property_ref(self, property) -> list[str]: + return [self.schema_space, f"{self.external_id}/{self.version}", property] + + +class FilterConfig(BaseModel, alias_generator=to_camel): + values: Optional[list[AnnotationStatus | str] | AnnotationStatus | str] = None + negate: bool = False + operator: FilterOperator + target_property: str + + def as_filter(self, view_properties: ViewPropertyConfig) -> Filter: + property_reference = view_properties.as_property_ref(self.target_property) + + # Converts enum value into string -> i.e.) 
in the case of AnnotationStatus + if isinstance(self.values, list): + find_values = [v.value if isinstance(v, Enum) else v for v in self.values] + elif isinstance(self.values, Enum): + find_values = self.values.value + else: + find_values = self.values + + filter: Filter + if find_values is None: + if self.operator == FilterOperator.EXISTS: + filter = dm.filters.Exists(property=property_reference) + else: + raise ValueError(f"Operator {self.operator} requires a value") + elif self.operator == FilterOperator.IN: + if not isinstance(find_values, list): + raise ValueError(f"Operator 'IN' requires a list of values for property {self.target_property}") + filter = dm.filters.In(property=property_reference, values=find_values) + elif self.operator == FilterOperator.EQUALS: + filter = dm.filters.Equals(property=property_reference, value=find_values) + elif self.operator == FilterOperator.CONTAINSALL: + filter = dm.filters.ContainsAll(property=property_reference, values=find_values) + elif self.operator == FilterOperator.SEARCH: + filter = dm.filters.Search(property=property_reference, value=find_values) + else: + raise NotImplementedError(f"Operator {self.operator} is not implemented.") + + if self.negate: + return dm.filters.Not(filter) + else: + return filter + + +class QueryConfig(BaseModel, alias_generator=to_camel): + target_view: ViewPropertyConfig + filters: list[FilterConfig] + limit: Optional[int] = -1 + + def build_filter(self) -> Filter: + list_filters: list[Filter] = [f.as_filter(self.target_view) for f in self.filters] + + if len(list_filters) == 1: + return list_filters[0] + else: + return dm.filters.And(*list_filters) # NOTE: '*' Unpacks each filter in the list + + +class ConnectionFlagsConfig(BaseModel, alias_generator=to_camel): + no_text_inbetween: Optional[bool] = None + natural_reading_order: Optional[bool] = None + + def as_connection_flag(self) -> ConnectionFlags: + params = {key: value for key, value in self.model_dump().items() if value is not None} 
+ return ConnectionFlags(**params) + + +class CustomizeFuzzinessConfig(BaseModel, alias_generator=to_camel): + fuzzy_score: Optional[float] = None + max_boxes: Optional[int] = None + min_chars: Optional[int] = None + + def as_customize_fuzziness(self) -> CustomizeFuzziness: + params = {key: value for key, value in self.model_dump().items() if value is not None} + return CustomizeFuzziness(**params) + + +class DirectionWeightsConfig(BaseModel, alias_generator=to_camel): + left: Optional[float] = None + right: Optional[float] = None + up: Optional[float] = None + down: Optional[float] = None + + def as_direction_weights(self) -> DirectionWeights: + params = {key: value for key, value in self.model_dump().items() if value is not None} + return DirectionWeights(**params) + + +class DiagramDetectConfigModel(BaseModel, alias_generator=to_camel): + # NOTE: configs come from V7 of the cognite python SDK + annotation_extract: Optional[bool] = None + case_sensitive: Optional[bool] = None + connection_flags: Optional[ConnectionFlagsConfig] = None + customize_fuzziness: Optional[CustomizeFuzzinessConfig] = None + direction_delta: Optional[float] = None + direction_weights: Optional[DirectionWeightsConfig] = None + min_fuzzy_score: Optional[float] = None + read_embedded_text: Optional[bool] = None + remove_leading_zeros: Optional[bool] = None + substitutions: Optional[dict[str, list[str]]] = None + + def as_config(self) -> DiagramDetectConfig: + params = {} + if self.annotation_extract is not None: + params["annotation_extract"] = self.annotation_extract + if self.case_sensitive is not None: + params["case_sensitive"] = self.case_sensitive + if self.connection_flags is not None: + params["connection_flags"] = self.connection_flags.as_connection_flag() + if self.customize_fuzziness is not None: + params["customize_fuzziness"] = self.customize_fuzziness.as_customize_fuzziness() + if self.direction_delta is not None: + params["direction_delta"] = self.direction_delta + 
if self.direction_weights is not None: + params["direction_weights"] = self.direction_weights.as_direction_weights() + if self.min_fuzzy_score is not None: + params["min_fuzzy_score"] = self.min_fuzzy_score + if self.read_embedded_text is not None: + params["read_embedded_text"] = self.read_embedded_text + if self.remove_leading_zeros is not None: + params["remove_leading_zeros"] = self.remove_leading_zeros + if self.substitutions is not None: + params["substitutions"] = self.substitutions + + return DiagramDetectConfig(**params) + + +# Launch Related Configs +class DataModelServiceConfig(BaseModel, alias_generator=to_camel): + get_files_to_process_query: QueryConfig | list[QueryConfig] + get_target_entities_query: QueryConfig | list[QueryConfig] + get_file_entities_query: QueryConfig | list[QueryConfig] + + +class CacheServiceConfig(BaseModel, alias_generator=to_camel): + cache_time_limit: int + raw_db: str + raw_table_cache: str + raw_manual_patterns_catalog: str + + +class AnnotationServiceConfig(BaseModel, alias_generator=to_camel): + page_range: int = Field(gt=0, le=50) + partial_match: bool = True + min_tokens: int = 1 + diagram_detect_config: Optional[DiagramDetectConfigModel] = None + + +class PrepareFunction(BaseModel, alias_generator=to_camel): + get_files_for_annotation_reset_query: Optional[QueryConfig | list[QueryConfig]] = None + get_files_to_annotate_query: QueryConfig | list[QueryConfig] + + +class LaunchFunction(BaseModel, alias_generator=to_camel): + batch_size: int = Field(gt=0, le=50) + primary_scope_property: str + secondary_scope_property: Optional[str] = None + file_search_property: str = "aliases" + target_entities_search_property: str = "aliases" + pattern_mode: bool + file_resource_property: Optional[str] = None + target_entities_resource_property: Optional[str] = None + data_model_service: DataModelServiceConfig + cache_service: CacheServiceConfig + annotation_service: AnnotationServiceConfig + + +# Finalize Related Configs +class 
RetrieveServiceConfig(BaseModel, alias_generator=to_camel): + get_job_id_query: QueryConfig | list[QueryConfig] + + +class ApplyServiceConfig(BaseModel, alias_generator=to_camel): + auto_approval_threshold: float = Field(gt=0.0, le=1.0) + auto_suggest_threshold: float = Field(gt=0.0, le=1.0) + sink_node: NodeId + raw_db: str + raw_table_doc_tag: str + raw_table_doc_doc: str + raw_table_doc_pattern: str + + +class FinalizeFunction(BaseModel, alias_generator=to_camel): + clean_old_annotations: bool + max_retry_attempts: int + retrieve_service: RetrieveServiceConfig + apply_service: ApplyServiceConfig + + +# Promote Related Configs +class TextNormalizationConfig(BaseModel, alias_generator=to_camel): + """ + Configuration for text normalization and variation generation. + + Controls how text is normalized for matching and what variations are generated + to improve match rates across different naming conventions. + + These flags affect both the normalize() function (for cache keys and direct matching) + and generate_text_variations() function (for query-based matching). + """ + + remove_special_characters: bool = True + convert_to_lowercase: bool = True + strip_leading_zeros: bool = True + + +class EntitySearchServiceConfig(BaseModel, alias_generator=to_camel): + """ + Configuration for the EntitySearchService in the promote function. + + Controls entity search and text normalization behavior: + - Queries entities directly (server-side IN filter on entity/file aliases) + - Text normalization for generating search variations + + Uses efficient server-side filtering on the smaller entity dataset rather than + the larger annotation edge dataset for better performance at scale. 
+ """ + + enable_existing_annotations_search: bool = True + enable_global_entity_search: bool = True + max_entity_search_limit: int = Field(default=1000, gt=0, le=10000) + text_normalization: TextNormalizationConfig + + +class PromoteCacheServiceConfig(BaseModel, alias_generator=to_camel): + """ + Configuration for the CacheService in the promote function. + + Controls caching behavior for text→entity mappings. + """ + + cache_table_name: str + + +class PromoteFunctionConfig(BaseModel, alias_generator=to_camel): + """ + Configuration for the promote function. + + The promote function resolves pattern-mode annotations by finding matching entities + and updating annotation edges from pointing to a sink node to pointing to actual entities. + + Configuration is organized by service interface: + - entitySearchService: Controls entity search strategies + - cacheService: Controls caching behavior + + Batch size is controlled via getCandidatesQuery.limit field. + """ + + get_candidates_query: QueryConfig | list[QueryConfig] + raw_db: str + raw_table_doc_pattern: str + raw_table_doc_tag: str + raw_table_doc_doc: str + delete_rejected_edges: bool + delete_suggested_edges: bool + entity_search_service: EntitySearchServiceConfig + cache_service: PromoteCacheServiceConfig + + +class DataModelViews(BaseModel, alias_generator=to_camel): + core_annotation_view: ViewPropertyConfig + annotation_state_view: ViewPropertyConfig + file_view: ViewPropertyConfig + target_entities_view: ViewPropertyConfig + + +class Config(BaseModel, alias_generator=to_camel): + data_model_views: DataModelViews + prepare_function: PrepareFunction + launch_function: LaunchFunction + finalize_function: FinalizeFunction + promote_function: PromoteFunctionConfig + + @classmethod + def parse_direct_relation(cls, value: Any) -> Any: + if isinstance(value, dict): + return dm.DirectRelationReference.load(value) + return value + + +# Functions to construct queries +def get_limit_from_query(query: QueryConfig | 
list[QueryConfig]) -> int: + """ + Determines the retrieval limit from a query configuration. + Handles 'None' by treating it as the default -1 (unlimited). + """ + default_limit = -1 + if isinstance(query, list): + if not query: + return default_limit + limits = [q.limit if q.limit is not None else default_limit for q in query] + return max(limits) + else: + return query.limit if query.limit is not None else default_limit + + +def build_filter_from_query(query: QueryConfig | list[QueryConfig]) -> Filter: + """ + Builds a Cognite Filter from a query configuration. + + If the query is a list, it builds a filter for each item and combines them with a logical OR. + If the query is a single object, it builds the filter directly from it. + """ + if isinstance(query, list): + list_filters: list[Filter] = [q.build_filter() for q in query] + if not list_filters: + raise ValueError("Query list cannot be empty.") + return dm.filters.Or(*list_filters) if len(list_filters) > 1 else list_filters[0] + else: + return query.build_filter() + + +def load_config_parameters( + client: CogniteClient, + function_data: dict[str, Any], +) -> Config: + """ + Retrieves the configuration parameters from the function data and loads the configuration from CDF. 
+ """ + if "ExtractionPipelineExtId" not in function_data: + raise ValueError("Missing key 'ExtractionPipelineExtId' in input data to the function") + + pipeline_ext_id = function_data["ExtractionPipelineExtId"] + try: + raw_config = client.extraction_pipelines.config.retrieve(pipeline_ext_id) + if raw_config.config is None: + raise ValueError(f"No config found for extraction pipeline: {pipeline_ext_id!r}") + except CogniteAPIError: + raise RuntimeError(f"Not able to retrieve pipeline config for extraction pipeline: {pipeline_ext_id!r}") + + loaded_yaml_data = yaml.safe_load(raw_config.config) + + if isinstance(loaded_yaml_data, dict): + return Config.model_validate(loaded_yaml_data) + else: + raise ValueError( + "Invalid configuration structure from CDF: \nExpected a YAML dictionary with a top-level 'config' key." + ) diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_prepare/services/DataModelService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_prepare/services/DataModelService.py new file mode 100644 index 00000000..ee374874 --- /dev/null +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_prepare/services/DataModelService.py @@ -0,0 +1,403 @@ +import abc +from datetime import datetime, timezone, timedelta +from cognite.client import CogniteClient +from cognite.client.data_classes.data_modeling import ( + Node, + NodeId, + NodeList, + NodeApply, + NodeApplyResultList, + instances, + InstancesApplyResult, +) +from cognite.client.data_classes.filters import ( + Filter, + Equals, + In, + Range, + Exists, +) + +from services.ConfigService import ( + Config, + ViewPropertyConfig, + build_filter_from_query, + get_limit_from_query, +) +from services.LoggerService import CogniteFunctionLogger +from utils.DataStructures import AnnotationStatus + + +class IDataModelService(abc.ABC): + """ + Interface for interacting with data model instances in CDF + """ + + 
@abc.abstractmethod + def get_files_for_annotation_reset(self) -> NodeList | None: + pass + + @abc.abstractmethod + def get_files_to_annotate(self) -> NodeList | None: + pass + + @abc.abstractmethod + def get_files_to_process( + self, + ) -> tuple[NodeList, dict[NodeId, Node]] | tuple[None, None]: + pass + + @abc.abstractmethod + def update_annotation_state( + self, + list_node_apply: list[NodeApply], + ) -> NodeApplyResultList: + pass + + @abc.abstractmethod + def create_annotation_state( + self, + list_node_apply: list[NodeApply], + ) -> NodeApplyResultList: + pass + + @abc.abstractmethod + def get_instances_entities( + self, primary_scope_value: str, secondary_scope_value: str | None + ) -> tuple[NodeList, NodeList]: + pass + + +class GeneralDataModelService(IDataModelService): + """ + Implementation used for real runs + """ + + def __init__(self, config: Config, client: CogniteClient, logger: CogniteFunctionLogger): + self.client: CogniteClient = client + self.config: Config = config + self.logger: CogniteFunctionLogger = logger + + self.annotation_state_view: ViewPropertyConfig = config.data_model_views.annotation_state_view + self.file_view: ViewPropertyConfig = config.data_model_views.file_view + self.target_entities_view: ViewPropertyConfig = config.data_model_views.target_entities_view + + self.get_files_to_annotate_retrieve_limit: int | None = get_limit_from_query( + config.prepare_function.get_files_to_annotate_query + ) + self.get_files_to_process_retrieve_limit: int | None = get_limit_from_query( + config.launch_function.data_model_service.get_files_to_process_query + ) + + self.filter_files_to_annotate: Filter = build_filter_from_query( + config.prepare_function.get_files_to_annotate_query + ) + self.filter_files_to_process: Filter = build_filter_from_query( + config.launch_function.data_model_service.get_files_to_process_query + ) + self.filter_target_entities: Filter = build_filter_from_query( + 
config.launch_function.data_model_service.get_target_entities_query + ) + self.filter_file_entities: Filter = build_filter_from_query( + config.launch_function.data_model_service.get_file_entities_query + ) + + def get_files_for_annotation_reset(self) -> NodeList | None: + """ + Retrieves files that need their annotation status reset based on configuration. + + Args: + None + + Returns: + NodeList of file instances to reset, or None if no reset query is configured. + + NOTE: Not building the filter in the object instantiation because the filter will only ever be used once throughout all runs of prepare + Furthermore, there is an implicit guarantee that a filter will be returned b/c launch checks if the query exists. + """ + if not self.config.prepare_function.get_files_for_annotation_reset_query: + return + + filter_files_for_annotation_reset: Filter = build_filter_from_query( + self.config.prepare_function.get_files_for_annotation_reset_query + ) + result: NodeList | None = self.client.data_modeling.instances.list( + instance_type="node", + sources=self.file_view.as_view_id(), + space=self.file_view.instance_space, + limit=-1, # NOTE: this should always be kept at -1 so that all files defined in the query will get reset + filter=filter_files_for_annotation_reset, + ) + return result + + def get_files_to_annotate(self) -> NodeList | None: + """ + Retrieves files ready for annotation processing based on their tag status. + + Queries for files marked "ToAnnotate" that don't have 'AnnotationInProcess' or 'Annotated' tags. + The specific query filters are defined in the getFilesToAnnotate config parameter. + + Args: + None + + Returns: + NodeList of file instances ready for annotation, or None if no files found. 
+ """ + result: NodeList | None = self.client.data_modeling.instances.list( + instance_type="node", + sources=self.file_view.as_view_id(), + space=self.file_view.instance_space, + limit=self.get_files_to_annotate_retrieve_limit, # NOTE: the amount of instances that are returned may or may not matter depending on how the memory constraints of azure/aws functions + filter=self.filter_files_to_annotate, + ) + + return result + + def get_files_to_process( + self, + ) -> tuple[NodeList, dict[NodeId, Node]] | tuple[None, None]: + """ + Retrieves files with annotation state instances that are ready for diagram detection. + + Queries for FileAnnotationStateInstances based on the getFilesToProcess config parameter, + extracts the linked file NodeIds, and retrieves the corresponding file nodes. + + Args: + None + + Returns: + A tuple containing: + - NodeList of file instances to process + - Dictionary mapping file NodeIds to their annotation state Node instances + Returns (None, None) if no files are found. 
+ """ + annotation_state_filter = self._get_annotation_state_filter() + annotation_state_instances: NodeList = self.client.data_modeling.instances.list( + instance_type="node", + sources=self.annotation_state_view.as_view_id(), + space=self.annotation_state_view.instance_space, + limit=self.get_files_to_process_retrieve_limit, + filter=annotation_state_filter, + ) + + if not annotation_state_instances: + return None, None + + file_to_state_map: dict[NodeId, Node] = {} + list_file_node_ids: list[NodeId] = [] + + for node in annotation_state_instances: + file_reference = node.properties.get(self.annotation_state_view.as_view_id()).get("linkedFile") + if self.file_view.instance_space is None or self.file_view.instance_space == file_reference["space"]: + file_node_id = NodeId( + space=file_reference["space"], + external_id=file_reference["externalId"], + ) + + file_to_state_map[file_node_id] = node + list_file_node_ids.append(file_node_id) + + file_instances: NodeList = self.client.data_modeling.instances.retrieve_nodes( + nodes=list_file_node_ids, + sources=self.file_view.as_view_id(), + ) + + return file_instances, file_to_state_map + + def _get_annotation_state_filter(self) -> Filter: + """ + Builds a filter for annotation state instances, including automatic retry logic for stuck jobs. + + Combines the configured filter with a fallback filter that catches annotation state instances + stuck in Processing/Finalizing status for more than 12 hours. + + Args: + None + + Returns: + Combined Filter for querying annotation state instances. + + NOTE: filter = (getFilesToProcess filter || (annotationStatus == Processing && now() - lastUpdatedTime) > 1440 minutes) + - getFilesToProcess filter comes from extraction pipeline + - (annotationStatus == Processing | Finalizing && now() - lastUpdatedTime) > 720 minutes/12 hours -> hardcoded -> reprocesses any file that's stuck + - Edge case that occurs very rarely but can happen. 
+ NOTE: Implementation of a more complex query that can't be handled in config should come from an implementation of the interface. + """ + annotation_status_property = self.annotation_state_view.as_property_ref("annotationStatus") + annotation_last_updated_property = self.annotation_state_view.as_property_ref("sourceUpdatedTime") + # NOTE: While this number is hard coded, I believe it doesn't need to be configured. Number comes from my experience with the pipeline. Feel free to change if your experience leads to a different number + latest_permissible_time_utc = datetime.now(timezone.utc) - timedelta(minutes=720) + latest_permissible_time_utc = latest_permissible_time_utc.isoformat(timespec="milliseconds") + filter_stuck = In( + annotation_status_property, + [AnnotationStatus.PROCESSING, AnnotationStatus.FINALIZING], + ) & Range(annotation_last_updated_property, lt=latest_permissible_time_utc) + + filter = self.filter_files_to_process | filter_stuck # | == OR + return filter + + def update_annotation_state(self, list_node_apply: list[NodeApply]) -> NodeApplyResultList: + """ + Updates existing annotation state nodes with new property values. + + Args: + list_node_apply: List of NodeApply objects containing updated properties. + + Returns: + NodeApplyResultList containing the results of the update operation. + """ + update_results: InstancesApplyResult = self.client.data_modeling.instances.apply( + nodes=list_node_apply, + replace=False, # ensures we don't delete other properties in the view + ) + return update_results.nodes + + def create_annotation_state(self, list_node_apply: list[NodeApply]) -> NodeApplyResultList: + """ + Creates new annotation state nodes, replacing any existing nodes with the same IDs. + + Args: + list_node_apply: List of NodeApply objects to create as new annotation state instances. + + Returns: + NodeApplyResultList containing the results of the creation operation. 
+ """ + update_results: InstancesApplyResult = self.client.data_modeling.instances.apply( + nodes=list_node_apply, + auto_create_direct_relations=True, + replace=True, # ensures we reset the properties of the node + ) + return update_results.nodes + + def get_instances_entities( + self, primary_scope_value: str, secondary_scope_value: str | None + ) -> tuple[NodeList, NodeList]: + """ + Retrieves target entities and file entities for use in diagram detection. + + Queries the data model for entities (assets) and files that match the configured filters + and scope values, which will be used to create the entity cache for diagram detection. + + Args: + primary_scope_value: Primary scope identifier (e.g., site, facility). + secondary_scope_value: Optional secondary scope identifier (e.g., unit, area). + + Returns: + A tuple containing: + - NodeList of target entity instances (typically assets) + - NodeList of file entity instances + + NOTE: 1. grab assets that meet the filter requirement + NOTE: 2. 
grab files that meet the filter requirement + """ + target_filter: Filter = self._get_target_entities_filter(primary_scope_value, secondary_scope_value) + file_filter: Filter = self._get_file_entities_filter(primary_scope_value, secondary_scope_value) + + target_entities: NodeList = self.client.data_modeling.instances.list( + instance_type="node", + sources=self.target_entities_view.as_view_id(), + space=self.target_entities_view.instance_space, + filter=target_filter, + limit=-1, # NOTE: this should always be kept at -1 so that all entities are retrieved + ) + file_entities: NodeList = self.client.data_modeling.instances.list( + instance_type="node", + sources=self.file_view.as_view_id(), + space=self.file_view.instance_space, + filter=file_filter, + limit=-1, # NOTE: this should always be kept at -1 so that all entities are retrieved + ) + return target_entities, file_entities + + def _get_target_entities_filter(self, primary_scope_value: str, secondary_scope_value: str | None) -> Filter: + """ + Builds a filter for target entities (assets) based on scope and configuration. + + Creates a filter combining scope-specific filtering with global 'ScopeWideDetect' entities. + + Args: + primary_scope_value: Primary scope identifier for filtering entities. + secondary_scope_value: Optional secondary scope identifier for more specific filtering. + + Returns: + Combined Filter for querying target entities. + + NOTE: Create a filter that... 
+ - grabs assets in the primary_scope_value and secondary_scope_value provided with detectInDiagram in the tags property + or + - grabs assets in the primary_scope_value with ScopeWideDetect in the tags property (hard coded) -> provides an option to include entities outside of the secondary_scope_value + """ + filter_primary_scope: Filter = Equals( + property=self.target_entities_view.as_property_ref(self.config.launch_function.primary_scope_property), + value=primary_scope_value, + ) + filter_entities: Filter = self.filter_target_entities + # NOTE: ScopeWideDetect is an optional string that allows annotating across scopes + filter_scope_wide: Filter = In( + property=self.target_entities_view.as_property_ref("tags"), + values=["ScopeWideDetect"], + ) + if not primary_scope_value: + target_filter = filter_entities | filter_scope_wide + elif secondary_scope_value: + filter_secondary_scope: Filter = Equals( + property=self.target_entities_view.as_property_ref( + self.config.launch_function.secondary_scope_property + ), + value=secondary_scope_value, + ) + target_filter = (filter_primary_scope & filter_secondary_scope & filter_entities) | ( + filter_primary_scope & filter_scope_wide + ) + else: + target_filter = (filter_primary_scope & filter_entities) | (filter_primary_scope & filter_scope_wide) + return target_filter + + def _get_file_entities_filter(self, primary_scope_value: str, secondary_scope_value: str | None) -> Filter: + """ + Builds a filter for file entities based on scope and configuration. + + Creates a filter combining scope-specific filtering with global 'ScopeWideDetect' files, + ensuring file entities have the required search properties. + + Args: + primary_scope_value: Primary scope identifier for filtering file entities. + secondary_scope_value: Optional secondary scope identifier for more specific filtering. + + Returns: + Combined Filter for querying file entities. + + NOTE: Create a filter that... 
+ - grabs assets in the primary_scope_value and secondary_scope_value provided with DetectInDiagram in the tags property + or + - grabs assets in the primary_scope_value with ScopeWideDetect in the tags property (hard coded) -> provides an option to include entities outside of the secondary_scope_value + """ + filter_primary_scope: Filter = Equals( + property=self.file_view.as_property_ref(self.config.launch_function.primary_scope_property), + value=primary_scope_value, + ) + filter_entities: Filter = self.filter_file_entities + filter_search_property_exists: Filter = Exists( + property=self.file_view.as_property_ref(self.config.launch_function.file_search_property), + ) + # NOTE: ScopeWideDetect is an optional string that allows annotating across scopes + filter_scope_wide: Filter = In( + property=self.file_view.as_property_ref("tags"), + values=["ScopeWideDetect"], + ) + if not primary_scope_value: + file_filter = (filter_entities & filter_search_property_exists) | (filter_scope_wide) + elif secondary_scope_value: + filter_secondary_scope: Filter = Equals( + property=self.file_view.as_property_ref(self.config.launch_function.secondary_scope_property), + value=secondary_scope_value, + ) + file_filter = ( + filter_primary_scope & filter_entities & filter_secondary_scope & filter_search_property_exists + ) | (filter_primary_scope & filter_scope_wide) + else: + file_filter = (filter_primary_scope & filter_entities & filter_search_property_exists) | ( + filter_primary_scope & filter_scope_wide + ) + + return file_filter diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_prepare/services/LoggerService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_prepare/services/LoggerService.py new file mode 100644 index 00000000..773b7797 --- /dev/null +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_prepare/services/LoggerService.py @@ -0,0 +1,169 @@ +from typing import Literal 
+import os + + +class CogniteFunctionLogger: + def __init__( + self, + log_level: Literal["DEBUG", "INFO", "WARNING", "ERROR"] = "INFO", + write: bool = False, + filepath: str | None = None, + ): + self.log_level = log_level.upper() + self.write = write + self.filepath = filepath + self.file_handler = None + + if self.filepath and self.write: + try: + dir_name = os.path.dirname(self.filepath) + if dir_name: + os.makedirs(dir_name, exist_ok=True) + self.file_handler = open(self.filepath, "a", encoding="utf-8") + except Exception as e: + print(f"[LOGGER_SETUP_ERROR] Could not open log file {self.filepath}: {e}") + self.write = False + + def _format_message_lines(self, prefix: str, message: str) -> list[str]: + """ + Formats multi-line messages with consistent indentation. + + Args: + prefix: The log level prefix (e.g., "[INFO]", "[ERROR]"). + message: The message to format. + + Returns: + List of formatted message lines with proper indentation. + """ + formatted_lines = [] + if "\n" not in message: + formatted_lines.append(f"{prefix} {message}") + else: + lines = message.split("\n") + formatted_lines.append(f"{prefix}{lines[0]}") + padding = " " * len(prefix) + for line_content in lines[1:]: + formatted_lines.append(f"{padding} {line_content}") + return formatted_lines + + def _print(self, prefix: str, message: str) -> None: + """ + Prints formatted log messages to console and optionally to file. + + Args: + prefix: The log level prefix to prepend to the message. + message: The message to log. 
+ + Returns: + None + """ + lines_to_log = self._format_message_lines(prefix, message) + if self.write and self.file_handler: + try: + for line in lines_to_log: + print(line) + self.file_handler.write(line + "\n") + self.file_handler.flush() + except Exception as e: + print(f"[LOGGER_SETUP_ERROR] Could not write to {self.filepath}: {e}") + elif not self.write: + for line in lines_to_log: + print(line) + + def debug(self, message: str, section: Literal["START", "END", "BOTH"] | None = None) -> None: + """ + Logs a debug-level message. + + Args: + message: The debug message to log. + section: Optional section separator position (START, END, or BOTH). + + Returns: + None + """ + if section == "START" or section == "BOTH": + self._section() + if self.log_level == "DEBUG": + self._print("[DEBUG]", message) + if section == "END" or section == "BOTH": + self._section() + + def info(self, message: str, section: Literal["START", "END", "BOTH"] | None = None) -> None: + """ + Logs an info-level message. + + Args: + message: The informational message to log. + section: Optional section separator position (START, END, or BOTH). + + Returns: + None + """ + if section == "START" or section == "BOTH": + self._section() + if self.log_level in ("DEBUG", "INFO"): + self._print("[INFO]", message) + if section == "END" or section == "BOTH": + self._section() + + def warning(self, message: str, section: Literal["START", "END", "BOTH"] | None = None) -> None: + """ + Logs a warning-level message. + + Args: + message: The warning message to log. + section: Optional section separator position (START, END, or BOTH). + + Returns: + None + """ + if section == "START" or section == "BOTH": + self._section() + if self.log_level in ("DEBUG", "INFO", "WARNING"): + self._print("[WARNING]", message) + if section == "END" or section == "BOTH": + self._section() + + def error(self, message: str, section: Literal["START", "END", "BOTH"] | None = None) -> None: + """ + Logs an error-level message. 
+ + Args: + message: The error message to log. + section: Optional section separator position (START, END, or BOTH). + + Returns: + None + """ + if section == "START" or section == "BOTH": + self._section() + self._print("[ERROR]", message) + if section == "END" or section == "BOTH": + self._section() + + def _section(self) -> None: + """ + Prints a visual separator line for log sections. + + Returns: + None + """ + if self.write and self.file_handler: + self.file_handler.write( + "--------------------------------------------------------------------------------\n" + ) + print("--------------------------------------------------------------------------------") + + def close(self) -> None: + """ + Closes the file handler if file logging is enabled. + + Returns: + None + """ + if self.file_handler: + try: + self.file_handler.close() + except Exception as e: + print(f"[LOGGER_CLEANUP_ERROR] Error closing log file: {e}") + self.file_handler = None diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_prepare/services/PipelineService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_prepare/services/PipelineService.py new file mode 100644 index 00000000..7cf5d885 --- /dev/null +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_prepare/services/PipelineService.py @@ -0,0 +1,66 @@ +import abc + +from typing import Literal +from cognite.client import CogniteClient +from cognite.client.data_classes import ExtractionPipelineRunWrite + + +class IPipelineService(abc.ABC): + """ + Interface for creating and updating extraction pipeline logs. 
+ """ + + @abc.abstractmethod + def update_extraction_pipeline(self, msg: str) -> None: + pass + + @abc.abstractmethod + def upload_extraction_pipeline( + self, + status: Literal["success", "failure", "seen"], + ) -> None: + pass + + +class GeneralPipelineService(IPipelineService): + """ + Implementation of the pipeline interface + """ + + def __init__(self, pipeline_ext_id: str, client: CogniteClient): + self.client: CogniteClient = client + self.ep_write: ExtractionPipelineRunWrite = ExtractionPipelineRunWrite( + extpipe_external_id=pipeline_ext_id, + status="seen", + ) + + def update_extraction_pipeline(self, msg: str) -> None: + """ + Appends a message to the extraction pipeline run log. + + Args: + msg: The message to append to the pipeline log. + + Returns: + None + """ + if not self.ep_write.message: + self.ep_write.message = msg + else: + self.ep_write.message = f"{self.ep_write.message}\n{msg}" + + def upload_extraction_pipeline( + self, + status: Literal["success", "failure", "seen"], + ) -> None: + """ + Creates an extraction pipeline run with accumulated status and messages. + + Args: + status: The run status to report (success, failure, or seen). 
+ + Returns: + None + """ + self.ep_write.status = status + self.client.extraction_pipelines.runs.create(self.ep_write) diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_prepare/services/PrepareService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_prepare/services/PrepareService.py new file mode 100644 index 00000000..2bbeeb14 --- /dev/null +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_prepare/services/PrepareService.py @@ -0,0 +1,209 @@ +import abc +from typing import cast, Literal +from cognite.client import CogniteClient +from cognite.client.exceptions import CogniteAPIError +from cognite.client.data_classes.data_modeling import ( + NodeList, + NodeApply, +) + +from services.ConfigService import Config, ViewPropertyConfig +from services.DataModelService import IDataModelService +from services.LoggerService import CogniteFunctionLogger +from utils.DataStructures import ( + AnnotationStatus, + AnnotationState, + PerformanceTracker, +) + + +class AbstractPrepareService(abc.ABC): + """ + Orchestrates the file annotation prepare process. This service prepares files for annotation + by creating annotation state instances for files marked ToAnnotate. + """ + + def __init__( + self, + client: CogniteClient, + config: Config, + logger: CogniteFunctionLogger, + tracker: PerformanceTracker, + data_model_service: IDataModelService, + ): + self.client = client + self.config = config + self.logger = logger + self.tracker = tracker + self.data_model_service = data_model_service + + @abc.abstractmethod + def run(self) -> str | None: + pass + + +class GeneralPrepareService(AbstractPrepareService): + """ + Orchestrates the file annotation prepare process. This service prepares files for annotation + by creating annotation state instances for files marked ToAnnotate. 
+ """ + + def __init__( + self, + client: CogniteClient, + config: Config, + logger: CogniteFunctionLogger, + tracker: PerformanceTracker, + data_model_service: IDataModelService, + function_call_info: dict, + ): + super().__init__( + client, + config, + logger, + tracker, + data_model_service, + ) + + self.annotation_state_view: ViewPropertyConfig = config.data_model_views.annotation_state_view + self.file_view: ViewPropertyConfig = config.data_model_views.file_view + + self.function_id: int | None = function_call_info.get("function_id") + self.call_id: int | None = function_call_info.get("call_id") + + self.reset_files: bool = False + if self.config.prepare_function.get_files_for_annotation_reset_query: + self.reset_files = True + + def run(self) -> Literal["Done"] | None: + """ + Prepares files for annotation by creating annotation state instances. + + Retrieves files marked "ToAnnotate", creates corresponding FileAnnotationState instances, + and updates file tags to indicate processing has started. Can also reset files if configured. + + Args: + None + + Returns: + "Done" if no more files need preparation, None if processing should continue. + + Raises: + CogniteAPIError: If query timeout or other API errors occur (408 errors are handled gracefully). + ValueError: If annotation state view instance space is not configured. 
+ """ + self.logger.info( + message=f"Starting Prepare Function", + section="START", + ) + try: + if self.reset_files: + file_nodes_to_reset: NodeList | None = self.data_model_service.get_files_for_annotation_reset() + if not file_nodes_to_reset: + self.logger.info( + "No files found with the getFilesForAnnotationReset query provided in the config file" + ) + else: + self.logger.info(f"Resetting {len(file_nodes_to_reset)} files") + reset_node_apply: list[NodeApply] = [] + for file_node in file_nodes_to_reset: + file_node_apply: NodeApply = file_node.as_write() + tags_property: list[str] = cast(list[str], file_node_apply.sources[0].properties["tags"]) + if "AnnotationInProcess" in tags_property: + tags_property.remove("AnnotationInProcess") + if "Annotated" in tags_property: + tags_property.remove("Annotated") + if "AnnotationFailed" in tags_property: + tags_property.remove("AnnotationFailed") + + reset_node_apply.append(file_node_apply) + update_results = self.data_model_service.update_annotation_state(reset_node_apply) + self.logger.info( + f"Removed the AnnotationInProcess/Annotated/AnnotationFailed tag of {len(update_results)} files" + ) + self.reset_files = False + except CogniteAPIError as e: + # NOTE: Reliant on the CogniteAPI message to stay the same across new releases. If unexpected changes were to occur please refer to this section of the code and check if error message is now different. + if ( + e.code == 408 + and e.message == "Graph query timed out. Reduce load or contention, or optimise your query." + ): + # NOTE: 408 indicates a timeout error. Keep retrying the query if a timeout occurs. 
+ self.logger.error(message=f"Ran into the following error:\n{str(e)}") + return + else: + raise e + + try: + file_nodes: NodeList | None = self.data_model_service.get_files_to_annotate() + if not file_nodes: + self.logger.info( + message=f"No files found to prepare", + section="END", + ) + return "Done" + self.logger.info(f"Preparing {len(file_nodes)} files") + except CogniteAPIError as e: + # NOTE: Reliant on the CogniteAPI message to stay the same across new releases. If unexpected changes were to occur please refer to this section of the code and check if error message is now different. + if ( + e.code == 408 + and e.message == "Graph query timed out. Reduce load or contention, or optimise your query." + ): + # NOTE: 408 indicates a timeout error. Keep retrying the query if a timeout occurs. + self.logger.error(message=f"Ran into the following error:\n{str(e)}") + return + else: + raise e + + annotation_state_instances: list[NodeApply] = [] + file_apply_instances: list[NodeApply] = [] + for file_node in file_nodes: + node_id = {"space": file_node.space, "externalId": file_node.external_id} + annotation_instance = AnnotationState( + annotationStatus=AnnotationStatus.NEW, + linkedFile=node_id, + ) + if not self.annotation_state_view.instance_space: + msg = ( + "Need an instance space in DataModelViews/AnnotationStateView config to store the annotation state" + ) + self.logger.error(msg) + raise ValueError(msg) + annotation_instance_space: str = self.annotation_state_view.instance_space + + annotation_node_apply: NodeApply = annotation_instance.to_node_apply( + node_space=annotation_instance_space, + annotation_state_view=self.annotation_state_view.as_view_id(), + ) + annotation_state_instances.append(annotation_node_apply) + + file_node_apply: NodeApply = file_node.as_write() + tags_property: list[str] = cast(list[str], file_node_apply.sources[0].properties["tags"]) + if "AnnotationInProcess" not in tags_property: + tags_property.append("AnnotationInProcess") + 
file_apply_instances.append(file_node_apply) + + try: + create_results = self.data_model_service.create_annotation_state(annotation_state_instances) + self.logger.info(message=f"Created {len(create_results)} annotation state instances") + update_results = self.data_model_service.update_annotation_state(file_apply_instances) + self.logger.info( + message=f"Added 'AnnotationInProcess' to the tag property for {len(update_results)} files", + section="END", + ) + except Exception as e: + self.logger.error(message=f"Ran into the following error:\n{str(e)}", section="END") + raise + + self.tracker.add_files(success=len(file_nodes)) + return + + +class LocalPrepareService(GeneralPrepareService): + """ + Prepare service variant for local development and debugging. + + Extends GeneralPrepareService with any local-specific behavior if needed. + """ + + pass diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_prepare/utils/DataStructures.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_prepare/utils/DataStructures.py new file mode 100644 index 00000000..e6da5d06 --- /dev/null +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_prepare/utils/DataStructures.py @@ -0,0 +1,331 @@ +from dataclasses import dataclass, asdict, field +from typing import Literal, cast +from enum import Enum +from datetime import datetime, timezone, timedelta + +from cognite.client.data_classes.data_modeling import ( + Node, + NodeId, + NodeApply, + NodeOrEdgeData, + ViewId, +) +from cognite.client.data_classes.contextualization import ( + FileReference, +) + + +@dataclass +class EnvConfig: + """ + Data structure holding the configs to connect to CDF client locally + """ + + cdf_project: str + cdf_cluster: str + tenant_id: str + client_id: str + client_secret: str + + +class DiagramAnnotationStatus(str, Enum): + SUGGESTED = "Suggested" + APPROVED = "Approved" + REJECTED = "Rejected" + + +class 
AnnotationStatus(str, Enum): + """ + Defines the types of values that the annotationStatus property can be for the Annotation State Instances. + Inherits from 'str' so that the enum members are also string instances, + making them directly usable where a string is expected (e.g., serialization). + Holds the different values that the annotationStatus property can be for the Annotation State Instances. + """ + + NEW = "New" + RETRY = "Retry" + PROCESSING = "Processing" + FINALIZING = "Finalizing" + ANNOTATED = "Annotated" + FAILED = "Failed" + + +class FilterOperator(str, Enum): + """ + Defines the types of filter operations that can be specified in the configuration. + Inherits from 'str' so that the enum members are also string instances, + making them directly usable where a string is expected (e.g., serialization). + """ + + EQUALS = "Equals" # Checks for equality against a single value. + EXISTS = "Exists" # Checks if a property exists (is not null). + CONTAINSALL = "ContainsAll" # Checks if an item contains all specified values for a given property + IN = "In" # Checks if a value is within a list of specified values. Not implementing CONTAINSANY b/c IN is usually more suitable + SEARCH = "Search" # Performs full text search on a specified property + + +@dataclass +class AnnotationState: + """ + Data structure holding the mpcAnnotationState view properties. Time will convert to Timestamp when ingested into CDF. 
+ """ + + annotationStatus: AnnotationStatus + linkedFile: dict[str, str] = field(default_factory=dict) + attemptCount: int = 0 + annotationMessage: str | None = None + diagramDetectJobId: int | None = None + sourceCreatedTime: str = field( + default_factory=lambda: datetime.now(timezone.utc).replace(microsecond=0).isoformat() + ) + sourceUpdatedTime: str = field( + default_factory=lambda: datetime.now(timezone.utc).replace(microsecond=0).isoformat() + ) + sourceCreatedUser: str = "fn_dm_context_annotation_prepare" + sourceUpdatedUser: str = "fn_dm_context_annotation_prepare" + + def _create_external_id(self) -> str: + """ + Create a deterministic external ID so that we can replace mpcAnnotationState of files that have been updated and aren't new + """ + prefix = "an_state" + linked_file_space = self.linkedFile["space"] + linked_file_id = self.linkedFile["externalId"] + return f"{prefix}_{linked_file_space}_{linked_file_id}" + + def to_dict(self) -> dict: + return asdict(self) + + def to_node_apply(self, node_space: str, annotation_state_view: ViewId) -> NodeApply: + external_id: str = self._create_external_id() + + return NodeApply( + space=node_space, + external_id=external_id, + sources=[ + NodeOrEdgeData( + source=annotation_state_view, + properties=self.to_dict(), + ) + ], + ) + + +@dataclass +class FileProcessingBatch: + primary_scope_value: str + secondary_scope_value: str | None + files: list[Node] + + +@dataclass +class entity: + """ + data structure for the 'entities' fed into diagram detect, + { + "external_id": file.external_id, + "name": file.properties[job_config.file_view.as_view_id()]["name"], + "space": file.space, + "annotation_type": job_config.file_view.type, + "resource_type": file.properties[job_config.file_view.as_view_id()][{resource_type}], + "search_property": file.properties[job_config.file_view.as_view_id()][{search_property}], + } + """ + + external_id: str + name: str + space: str + annotation_type: Literal["diagrams.FileLink", 
"diagrams.AssetLink"] | None + resource_type: str + search_property: list[str] = field(default_factory=list) + + def to_dict(self): + return asdict(self) + + +@dataclass +class BatchOfNodes: + nodes: list[Node] = field(default_factory=list) + ids: list[NodeId] = field(default_factory=list) + apply: list[NodeApply] = field(default_factory=list) + + def add(self, node: Node): + self.nodes.append(node) + node_id = node.as_id() + self.ids.append(node_id) + return + + def clear(self): + self.nodes.clear() + self.ids.clear() + self.apply.clear() + return + + def update_node_properties(self, new_properties: dict, view_id: ViewId): + for node in self.nodes: + node_apply = NodeApply( + space=node.space, + external_id=node.external_id, + existing_version=None, + sources=[ + NodeOrEdgeData( + source=view_id, + properties=new_properties, + ) + ], + ) + self.apply.append(node_apply) + return + + +@dataclass +class BatchOfPairedNodes: + """ + Where nodeA is an instance of the file view and nodeB is an instance of the annotation state view + """ + + file_to_state_map: dict[NodeId, Node] + batch_files: BatchOfNodes = field(default_factory=BatchOfNodes) + batch_states: BatchOfNodes = field(default_factory=BatchOfNodes) + file_references: list[FileReference] = field(default_factory=list) + + def add_pair(self, file_node: Node, file_reference: FileReference): + self.file_references.append(file_reference) + self.batch_files.add(file_node) + file_node_id: NodeId = file_node.as_id() + state_node: Node = self.file_to_state_map[file_node_id] + self.batch_states.add(state_node) + + def create_file_reference( + self, + file_node_id: NodeId, + page_range: int, + annotation_state_view_id: ViewId, + ) -> FileReference: + """ + Create a file reference that has a page range for annotation. + The current implementation of the detect api 20230101-beta only allows annotation of files up to 50 pages. + Thus, this is my idea of how we can enables annotating files that are more than 50 pages long. 
+ + The annotatedPageCount and pageCount properties won't be set in the initial creation of the annotation state nodes. + That's because we don't know how many pages are in the pdf until we run the diagram detect job where the page count gets returned from the results of the job. + Thus, annotatedPageCount and pageCount get set in the finalize function. + The finalize function will set the page count properties based on the page count that returned from diagram detect job results. + - If the pdf has less than 50 pages, say 3 pages, then... + - annotationStatus property will get set to 'complete' + - annotatedPageCount and pageCount properties will be set to 3. + - Elif the pdf has more than 50 pages, say 80, then... + - annotationStatus property will get set to 'new' + - annotatedPageCount set to 50 + - pageCount set to 80 + - attemptCount doesn't get incremented + + NOTE: Chose to create the file_reference here b/c I already have access to the file node and state node. + If I chose to have this logic in the launchService then we'd have to iterate on all of the nodes that have already been added. + Thus -> O(N) + O(N) to create the BatchOfPairedNodes and then to create the file references + Instead, this approach makes it just O(N) + """ + annotation_state_node: Node = self.file_to_state_map[file_node_id] + annotated_page_count: int | None = cast( + int, + annotation_state_node.properties[annotation_state_view_id].get("annotatedPageCount"), + ) + page_count: int | None = cast( + int, + annotation_state_node.properties[annotation_state_view_id].get("pageCount"), + ) + if not annotated_page_count or not page_count: + file_reference: FileReference = FileReference( + file_instance_id=file_node_id, + first_page=1, + last_page=page_range, + ) + else: + # NOTE: adding 1 here since that annotated_page_count variable holds the last page that was annotated. Thus we want to annotate the following page + # e.g.) 
first run annotates pages 1-50 second run would annotate 51-100 + first_page = annotated_page_count + 1 + last_page = annotated_page_count + page_range + if page_count <= last_page: + last_page = page_count + file_reference: FileReference = FileReference( + file_instance_id=file_node_id, + first_page=first_page, + last_page=last_page, + ) + + return file_reference + + def clear_pair(self): + self.batch_files.clear() + self.batch_states.clear() + self.file_references.clear() + + def size(self) -> int: + return len(self.file_references) + + def is_empty(self) -> bool: + if self.file_references: + return False + return True + + +@dataclass +class PerformanceTracker: + """ + Keeps track of metrics + """ + + files_success: int = 0 + files_failed: int = 0 + total_runs: int = 0 + total_time_delta: timedelta = timedelta(0) + latest_run_time: datetime = field(default_factory=lambda: datetime.now(timezone.utc)) + + def _run_time(self) -> timedelta: + time_delta = datetime.now(timezone.utc) - self.latest_run_time + return time_delta + + def _average_run_time(self) -> timedelta: + if self.total_runs == 0: + return timedelta(0) + return self.total_time_delta / self.total_runs + + def add_files(self, success: int, failed: int = 0): + self.files_success += success + self.files_failed += failed + + def generate_local_report(self) -> str: + self.total_runs += 1 + time_delta = self._run_time() + self.total_time_delta += time_delta + self.latest_run_time = datetime.now(timezone.utc) + + report = f"run time: {time_delta}" + return report + + def generate_overall_report(self) -> str: + report = f" Run started {datetime.now(timezone.utc)}\n- total runs: {self.total_runs}\n- total files processed: {self.files_success+self.files_failed}\n- successful files: {self.files_success}\n- failed files: {self.files_failed}\n- total run time: {self.total_time_delta}\n- average run time: {self._average_run_time()}" + return report + + def generate_ep_run( + self, + caller: Literal["Launch", 
"Finalize"], + function_id: str | None, + call_id: str | None, + ) -> str: + """Generates the report string for the extraction pipeline run.""" + report = ( + f"(caller:{caller}, function_id:{function_id}, call_id:{call_id}) - " + f"total files processed: {self.files_success + self.files_failed} - " + f"successful files: {self.files_success} - " + f"failed files: {self.files_failed}" + ) + return report + + def reset(self) -> None: + self.files_success = 0 + self.files_failed = 0 + self.total_runs: int = 0 + self.total_time_delta = timedelta(0) + self.latest_run_time = datetime.now(timezone.utc) + print("PerformanceTracker state has been reset") From ef26cbe80096f92208e60b88a407778f566e5319 Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Sun, 19 Oct 2025 10:23:18 -0500 Subject: [PATCH 118/128] created workflow version and trigger for the prepare function --- .../cdf_file_annotation/default.config.yaml | 30 +++++++++++++------ .../functions/functions.Function.yaml | 10 +++++++ .../wf_file_annotation.WorkflowTrigger.yaml | 10 +++++++ .../wf_file_annotation.WorkflowVersion.yaml | 26 ++++++++++++++-- 4 files changed, 65 insertions(+), 11 deletions(-) diff --git a/modules/contextualization/cdf_file_annotation/default.config.yaml b/modules/contextualization/cdf_file_annotation/default.config.yaml index 43989983..10812f4a 100644 --- a/modules/contextualization/cdf_file_annotation/default.config.yaml +++ b/modules/contextualization/cdf_file_annotation/default.config.yaml @@ -28,25 +28,37 @@ targetEntityInstanceSpace: targetEntityExternalId: targetEntityVersion: -# used in /functions and /workflows +# used in /functions +functionClientId: ${IDP_CLIENT_ID} +functionClientSecret: ${IDP_CLIENT_SECRET} + +# used in prepare function +prepareFunctionExternalId: fn_file_annotation_prepare #NOTE: if this is changed, then the folder holding the prepare function must be named the same as the new external ID +prepareFunctionVersion: v1.0.0 +prepareWorkflowVersion: v1_prepare 
+prepareWorkflowTrigger: wf_prepare_trigger + +# used in launch function launchFunctionExternalId: fn_file_annotation_launch #NOTE: if this is changed, then the folder holding the launch function must be named the same as the new external ID launchFunctionVersion: v1.0.0 +launchWorkflowVersion: v1_launch +launchWorkflowTrigger: wf_launch_trigger + +# used in finalize function finalizeFunctionExternalId: fn_file_annotation_finalize #NOTE: if this is changed, then the folder holding the finalize function must be named the same as the new external ID finalizeFunctionVersion: v1.0.0 +finalizeWorkflowVersion: v1_finalize +finalizeWorkflowTrigger: wf_finalize_trigger + +# used in promote function promoteFunctionExternalId: fn_file_annotation_promote #NOTE: if this is changed, then the folder holding the promote function must be named the same as the new external ID promoteFunctionVersion: v1.0.0 -functionClientId: ${IDP_CLIENT_ID} -functionClientSecret: ${IDP_CLIENT_SECRET} +promoteWorkflowVersion: v1_promote +promoteWorkflowTrigger: wf_promote_trigger # used in /workflows workflowSchedule: "3-59/10 * * * *" # NOTE: runs every 10 minutes with a 3 minute offset workflowExternalId: wf_file_annotation -launchWorkflowVersion: v1_launch -launchWorkflowTrigger: wf_launch_trigger -finalizeWorkflowVersion: v1_finalize -finalizeWorkflowTrigger: wf_finalize_trigger -promoteWorkflowVersion: v1_promote -promoteWorkflowTrigger: wf_promote_trigger # used in /auth groupSourceId: # source ID from Azure AD for the corresponding groups \ No newline at end of file diff --git a/modules/contextualization/cdf_file_annotation/functions/functions.Function.yaml b/modules/contextualization/cdf_file_annotation/functions/functions.Function.yaml index 7d95801b..f6b00037 100644 --- a/modules/contextualization/cdf_file_annotation/functions/functions.Function.yaml +++ b/modules/contextualization/cdf_file_annotation/functions/functions.Function.yaml @@ -1,3 +1,13 @@ +- name: Prepare File Annotations + 
externalId: {{ prepareFunctionExternalId }} + owner: "Anonymous" + description: "Create annotation state instances for files marked ToAnnotate." + metadata: + version: {{ prepareFunctionVersion }} + + runtime: "py311" + functionPath: "handler.py" + - name: Launch File Annotations externalId: {{ launchFunctionExternalId }} owner: "Anonymous" diff --git a/modules/contextualization/cdf_file_annotation/workflows/wf_file_annotation.WorkflowTrigger.yaml b/modules/contextualization/cdf_file_annotation/workflows/wf_file_annotation.WorkflowTrigger.yaml index 55981dbe..b513cdd7 100644 --- a/modules/contextualization/cdf_file_annotation/workflows/wf_file_annotation.WorkflowTrigger.yaml +++ b/modules/contextualization/cdf_file_annotation/workflows/wf_file_annotation.WorkflowTrigger.yaml @@ -1,3 +1,13 @@ +- externalId: {{prepareWorkflowTrigger}} + triggerRule: + triggerType: schedule + cronExpression: "{{workflowSchedule}}" + workflowExternalId: {{workflowExternalId}} + workflowVersion: {{prepareWorkflowVersion}} + authentication: + clientId: {{functionClientId}} + clientSecret: {{functionClientSecret}} + - externalId: {{launchWorkflowTrigger}} triggerRule: triggerType: schedule diff --git a/modules/contextualization/cdf_file_annotation/workflows/wf_file_annotation.WorkflowVersion.yaml b/modules/contextualization/cdf_file_annotation/workflows/wf_file_annotation.WorkflowVersion.yaml index 0449c353..163aaabf 100644 --- a/modules/contextualization/cdf_file_annotation/workflows/wf_file_annotation.WorkflowVersion.yaml +++ b/modules/contextualization/cdf_file_annotation/workflows/wf_file_annotation.WorkflowVersion.yaml @@ -1,7 +1,29 @@ +- workflowExternalId: {{ workflowExternalId }} + version: {{ prepareWorkflowVersion }} + workflowDefinition: + description: "Create annotation state instances for files marked to annotate." 
+ tasks: + - externalId: fn_prepare + type: "function" + parameters: + function: + externalId: {{ prepareFunctionExternalId }} + data: + { + "ExtractionPipelineExtId": {{ extractionPipelineExternalId }}, + "logLevel": "INFO", + } + isAsyncComplete: false + name: Prepare File Annotations + description: Prepare + retries: 0 + timeout: 600 + onFailure: "abortWorkflow" + - workflowExternalId: {{ workflowExternalId }} version: {{ launchWorkflowVersion }} workflowDefinition: - description: "Create diagram detect jobs for files marked to annotate." + description: "Create diagram detect jobs for annotation state instances marked new or retry." tasks: - externalId: fn_launch type: "function" @@ -113,7 +135,7 @@ - workflowExternalId: {{ workflowExternalId }} version: {{ promoteWorkflowVersion }} workflowDefinition: - description: "Attempt to automatically promote annotation edges created from the pattern mode results in the finalize workflow" + description: "Attempt to automatically promote annotation edges created from the pattern mode results in the finalize workflow." 
tasks: - externalId: fn_promote type: "function" From 16c3deea408350fdc621c9b700f64dfca5bc0ab3 Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Sun, 19 Oct 2025 11:24:32 -0500 Subject: [PATCH 119/128] dm workflow trigger for prepare, launch, and finalize --- .../workflows/TRIGGER_ARCHITECTURE.md | 280 ++++++++++++++++++ .../wf_file_annotation.WorkflowTrigger.yaml | 117 ++++++++ .../wf_file_annotation.WorkflowVersion.yaml | 162 ++++++++++ 3 files changed, 559 insertions(+) create mode 100644 modules/contextualization/cdf_file_annotation/workflows/TRIGGER_ARCHITECTURE.md diff --git a/modules/contextualization/cdf_file_annotation/workflows/TRIGGER_ARCHITECTURE.md b/modules/contextualization/cdf_file_annotation/workflows/TRIGGER_ARCHITECTURE.md new file mode 100644 index 00000000..3feefbfc --- /dev/null +++ b/modules/contextualization/cdf_file_annotation/workflows/TRIGGER_ARCHITECTURE.md @@ -0,0 +1,280 @@ +# Data Modeling Event Triggers Architecture + +## Overview + +The file annotation workflows now use **data modeling event triggers** instead of scheduled triggers to eliminate wasteful serverless function executions. Triggers fire only when there's actual work to process, dramatically improving cost efficiency and responsiveness. + +## Architecture + +### Trigger Flow + +``` +Files uploaded with "ToAnnotate" tag + ↓ (triggers v1_prepare) +Prepare Function creates AnnotationState with status="New" + ↓ (triggers v1_launch) +Launch Function creates diagram detect jobs, sets status="Processing" + ↓ (triggers v1_finalize) +Finalize Function processes results, sets status="Annotated"/"Failed" +``` + +### Trigger Configurations + +#### 1. 
Prepare Trigger (`wf_prepare_trigger`) + +**Fires when:** Files have `tags` containing "ToAnnotate" WITHOUT ["AnnotationInProcess", "Annotated", "AnnotationFailed"] + +**Batch Config:** + +- Size: 100 files +- Timeout: 60 seconds + +**Query:** + +```yaml +with: + files_to_prepare: + nodes: + filter: + And: + - In: + property: [fileSchemaSpace, "fileExternalId/version", "tags"] + values: ["ToAnnotate"] + - Not: + In: + property: [fileSchemaSpace, "fileExternalId/version", "tags"] + values: ["AnnotationInProcess", "Annotated", "AnnotationFailed"] +``` + +**Function Input:** `${workflow.input.items}` - Array of file instances + +**Loop Prevention:** Once processed, files get "AnnotationInProcess" tag, preventing re-triggering + +--- + +#### 2. Launch Trigger (`wf_launch_trigger`) + +**Fires when:** AnnotationState instances have `annotationStatus` IN ["New", "Retry"] + +**Batch Config:** + +- Size: 50 instances +- Timeout: 30 seconds + +**Query:** + +```yaml +with: + states_to_launch: + nodes: + filter: + In: + property: + [ + annotationStateSchemaSpace, + "annotationStateExternalId/version", + "annotationStatus", + ] + values: ["New", "Retry"] +``` + +**Function Input:** `${workflow.input.items}` - Array of AnnotationState instances + +**Loop Prevention:** Function updates `annotationStatus="Processing"`, preventing re-triggering + +--- + +#### 3. 
Finalize Trigger (`wf_finalize_trigger`) + +**Fires when:** AnnotationState instances have `annotationStatus="Processing"` AND `diagramDetectJobId` exists + +**Batch Config:** + +- Size: 20 instances +- Timeout: 60 seconds + +**Query:** + +```yaml +with: + jobs_to_finalize: + nodes: + filter: + And: + - Equals: + property: + [ + annotationStateSchemaSpace, + "annotationStateExternalId/version", + "annotationStatus", + ] + value: "Processing" + - Exists: + property: + [ + annotationStateSchemaSpace, + "annotationStateExternalId/version", + "diagramDetectJobId", + ] +``` + +**Function Input:** `${workflow.input.items}` - Array of AnnotationState instances with job IDs + +**Loop Prevention:** Function updates `annotationStatus="Annotated"/"Failed"`, preventing re-triggering + +--- + +## How Triggers Work + +According to the [Cognite documentation](https://docs.cognite.com/cdf/data_workflows/triggers/), data modeling triggers use a **change-based polling mechanism**: + +1. **Polling**: System periodically checks for instances matching filter criteria +2. **Change Detection**: Triggers detect changes based on `lastUpdatedTime` of instances +3. **Batching**: Multiple matching instances are collected into batches +4. 
**Execution**: When batch criteria are met (size or timeout), workflow starts with collected instances as input + +### Trigger Input Format + +The trigger passes data to the workflow via `${workflow.input.items}`: + +```json +{ + "version": "v1_prepare", + "items": [ + { + "instanceType": "node", + "externalId": "file123", + "space": "mySpace", + "properties": { + "mySpace": { + "FileView/v1": { + "name": "diagram.pdf", + "tags": ["ToAnnotate"], + "externalId": "file123" + } + } + } + } + ] +} +``` + +## Benefits + +| Benefit | Impact | +| ------------------- | ------------------------------------------------------ | +| **Cost Efficiency** | 50-90% reduction in wasted function executions | +| **Responsiveness** | <2 min latency (vs 0-15 min with scheduled triggers) | +| **Scalability** | Automatic batching handles bursts of files efficiently | +| **Architecture** | Clean separation of prepare/launch/finalize phases | +| **Observability** | Built-in trigger run history for monitoring | + +### Cost Comparison + +**Before (Scheduled):** + +- 96 function executions per day (6 × 4/hour × 24h) +- 60-90% exit early with no work done +- **Wasted: ~60-85 executions/day** + +**After (Event-Driven):** + +- Functions only execute when data is ready +- Zero wasted cold starts +- **Savings: 50-90% reduction** + +## State Machine & Re-triggering Prevention + +The architecture prevents infinite loops through careful state management: + +``` +Prepare Trigger: + Fires on → files.tags contains "ToAnnotate" without "AnnotationInProcess" + Function → adds "AnnotationInProcess" tag + Result → ✅ Won't re-trigger (tags changed) + +Launch Trigger: + Fires on → AnnotationState.status IN ["New", "Retry"] + Function → updates status="Processing" + Result → ✅ Won't re-trigger (status changed) + +Finalize Trigger: + Fires on → AnnotationState.status="Processing" + Function → updates status="Annotated"/"Failed" + Result → ✅ Won't re-trigger (status changed) +``` + +**No additional flags needed** 
- existing `annotationStatus` property and file `tags` handle state transitions perfectly. + +## Function Behavior + +### Current Implementation + +Functions currently **poll for data internally** using the same queries that the triggers use. This means: + +1. **Trigger fires** when data matches criteria (e.g., files with "ToAnnotate" tag) +2. **Function receives** `triggerInput` parameter with matching instances +3. **Function can use** the trigger input OR continue polling (flexible approach) + +### Migration Path + +**Phase 1 (Current):** Functions receive `triggerInput` but continue internal polling + +- Zero code changes required in function logic +- Triggers ensure functions only run when work exists +- Already eliminates 50-90% of wasteful executions + +**Phase 2 (Future Optimization):** Update functions to process only `triggerInput` + +- Remove internal polling/querying logic +- Process only the instances provided by trigger +- Further improve efficiency and reduce query costs + +## Monitoring + +Track trigger performance using the trigger run history API: + +- **Fire time**: When the trigger executed +- **Status**: Success or failure +- **Workflow execution ID**: Link to workflow run +- **Failure reason**: Debugging information + +Example query: + +```python +trigger_runs = client.workflows.triggers.runs.list( + external_id="wf_prepare_trigger", + limit=100 +) +``` + +## Configuration Variables + +The following variables in `default.config.yaml` control trigger behavior: + +```yaml +# Workflow versions +prepareWorkflowVersion: v1_prepare +launchWorkflowVersion: v1_launch +finalizeWorkflowVersion: v1_finalize + +# Trigger external IDs +prepareWorkflowTrigger: wf_prepare_trigger +launchWorkflowTrigger: wf_launch_trigger +finalizeWorkflowTrigger: wf_finalize_trigger + +# Data model configuration +fileSchemaSpace: +fileExternalId: +fileVersion: + +annotationStateSchemaSpace: sp_hdm +annotationStateExternalId: FileAnnotationState +annotationStateVersion: 
v1.0.0 +``` + +## References + +- [Cognite Workflows Triggers Documentation](https://docs.cognite.com/cdf/data_workflows/triggers/) +- [Data Modeling Queries](https://docs.cognite.com/cdf/data_workflows/triggers/#trigger-on-data-modeling-events) +- [Prevent Excessive Trigger Runs](https://docs.cognite.com/cdf/data_workflows/triggers/#prevent-excessive-data-modeling-trigger-runs) diff --git a/modules/contextualization/cdf_file_annotation/workflows/wf_file_annotation.WorkflowTrigger.yaml b/modules/contextualization/cdf_file_annotation/workflows/wf_file_annotation.WorkflowTrigger.yaml index b513cdd7..b5796e9a 100644 --- a/modules/contextualization/cdf_file_annotation/workflows/wf_file_annotation.WorkflowTrigger.yaml +++ b/modules/contextualization/cdf_file_annotation/workflows/wf_file_annotation.WorkflowTrigger.yaml @@ -1,33 +1,146 @@ +<<<<<<< Updated upstream - externalId: {{prepareWorkflowTrigger}} triggerRule: triggerType: schedule cronExpression: "{{workflowSchedule}}" +======= +<<<<<<< Updated upstream +externalId: {{workflowExternalId}} +triggerRule: + triggerType: schedule + cronExpression: "{{workflowSchedule}}" +workflowExternalId: {{workflowExternalId}} +workflowVersion: {{workflowVersion}} +authentication: + clientId: {{functionClientId}} + clientSecret: {{functionClientSecret}} +======= +# Prepare Trigger: Fires when files have "ToAnnotate" tag without annotation processing tags +- externalId: {{prepareWorkflowTrigger}} + triggerRule: + triggerType: dataModeling + dataModelingQuery: + with: + files_to_prepare: + nodes: + filter: + and: + - in: + property: [{{fileSchemaSpace}}, '{{fileExternalId}}/{{fileVersion}}', 'tags'] + values: ['ToAnnotate'] + - not: + in: + property: [{{fileSchemaSpace}}, '{{fileExternalId}}/{{fileVersion}}', 'tags'] + values: ['AnnotationInProcess', 'Annotated', 'AnnotationFailed'] + limit: 100 + select: + files_to_prepare: + sources: + - source: + type: view + space: {{fileSchemaSpace}} + externalId: {{fileExternalId}} + version: 
{{fileVersion}} + properties: + - name + - tags + batchSize: 100 + batchTimeout: 60 +>>>>>>> Stashed changes workflowExternalId: {{workflowExternalId}} workflowVersion: {{prepareWorkflowVersion}} authentication: clientId: {{functionClientId}} clientSecret: {{functionClientSecret}} +<<<<<<< Updated upstream - externalId: {{launchWorkflowTrigger}} triggerRule: triggerType: schedule cronExpression: "{{workflowSchedule}}" +======= +# Launch Trigger: Fires when AnnotationState instances have status "New" or "Retry" +- externalId: {{launchWorkflowTrigger}} + triggerRule: + triggerType: dataModeling + dataModelingQuery: + with: + states_to_launch: + nodes: + filter: + in: + property: [{{annotationStateSchemaSpace}}, '{{annotationStateExternalId}}/{{annotationStateVersion}}', 'annotationStatus'] + values: ['New', 'Retry'] + limit: 50 + select: + states_to_launch: + sources: + - source: + type: view + space: {{annotationStateSchemaSpace}} + externalId: {{annotationStateExternalId}} + version: {{annotationStateVersion}} + properties: + - annotationStatus + - linkedFile + - attemptCount + batchSize: 50 + batchTimeout: 30 +>>>>>>> Stashed changes workflowExternalId: {{workflowExternalId}} workflowVersion: {{launchWorkflowVersion}} authentication: clientId: {{functionClientId}} clientSecret: {{functionClientSecret}} +<<<<<<< Updated upstream - externalId: {{finalizeWorkflowTrigger}} triggerRule: triggerType: schedule cronExpression: "{{workflowSchedule}}" +======= +# Finalize Trigger: Fires when AnnotationState instances have status "Processing" +- externalId: {{finalizeWorkflowTrigger}} + triggerRule: + triggerType: dataModeling + dataModelingQuery: + with: + jobs_to_finalize: + nodes: + filter: + and: + - equals: + property: [{{annotationStateSchemaSpace}}, '{{annotationStateExternalId}}/{{annotationStateVersion}}', 'annotationStatus'] + value: 'Processing' + - exists: + property: [{{annotationStateSchemaSpace}}, '{{annotationStateExternalId}}/{{annotationStateVersion}}', 
'diagramDetectJobId'] + limit: 20 + select: + jobs_to_finalize: + sources: + - source: + type: view + space: {{annotationStateSchemaSpace}} + externalId: {{annotationStateExternalId}} + version: {{annotationStateVersion}} + properties: + - annotationStatus + - diagramDetectJobId + - patternModeJobId + - linkedFile + batchSize: 20 + batchTimeout: 60 +>>>>>>> Stashed changes workflowExternalId: {{workflowExternalId}} workflowVersion: {{finalizeWorkflowVersion}} authentication: clientId: {{functionClientId}} clientSecret: {{functionClientSecret}} +<<<<<<< Updated upstream +======= +# Promote Trigger: Keep as scheduled for now (runs every 10 minutes) +>>>>>>> Stashed changes - externalId: {{promoteWorkflowTrigger}} triggerRule: triggerType: schedule @@ -37,3 +150,7 @@ authentication: clientId: {{functionClientId}} clientSecret: {{functionClientSecret}} +<<<<<<< Updated upstream +======= +>>>>>>> Stashed changes +>>>>>>> Stashed changes diff --git a/modules/contextualization/cdf_file_annotation/workflows/wf_file_annotation.WorkflowVersion.yaml b/modules/contextualization/cdf_file_annotation/workflows/wf_file_annotation.WorkflowVersion.yaml index 163aaabf..0a83b03b 100644 --- a/modules/contextualization/cdf_file_annotation/workflows/wf_file_annotation.WorkflowVersion.yaml +++ b/modules/contextualization/cdf_file_annotation/workflows/wf_file_annotation.WorkflowVersion.yaml @@ -1,3 +1,4 @@ +<<<<<<< Updated upstream - workflowExternalId: {{ workflowExternalId }} version: {{ prepareWorkflowVersion }} workflowDefinition: @@ -19,6 +20,30 @@ retries: 0 timeout: 600 onFailure: "abortWorkflow" +======= +<<<<<<< Updated upstream +workflowExternalId: {{ workflowExternalId }} +version: "v1" +workflowDefinition: + description: "A workflow for annotating P&ID and documents." 
+ tasks: + - externalId: fn_launch + type: "function" + parameters: + function: + externalId: {{ launchFunctionExternalId }} + data: + { + "ExtractionPipelineExtId": {{ extractionPipelineExternalId }}, + "logLevel": "INFO", + } + isAsyncComplete: false + name: Launch File Annotations + description: Launch + retries: 0 + timeout: 600 + onFailure: "abortWorkflow" +>>>>>>> Stashed changes - workflowExternalId: {{ workflowExternalId }} version: {{ launchWorkflowVersion }} @@ -98,6 +123,131 @@ timeout: 600 onFailure: "abortWorkflow" +<<<<<<< Updated upstream +======= + - externalId: fn_finalize_thread_5 + type: "function" + parameters: + function: + externalId: {{ finalizeFunctionExternalId }} + data: + { + "ExtractionPipelineExtId": {{ extractionPipelineExternalId }}, + "logLevel": "INFO", + } + isAsyncComplete: false + name: Finalize File Annotations - Thread 5 + description: Finalize + retries: 0 + timeout: 600 + onFailure: "abortWorkflow" +======= +- workflowExternalId: {{ workflowExternalId }} + version: {{ prepareWorkflowVersion }} + workflowDefinition: + description: "Create annotation state instances for files marked to annotate." + tasks: + - externalId: fn_prepare + type: "function" + parameters: + function: + externalId: {{ prepareFunctionExternalId }} + data: + { + "ExtractionPipelineExtId": {{ extractionPipelineExternalId }}, + "logLevel": "INFO", + "triggerInput": "${workflow.input.items}" + } + isAsyncComplete: false + name: Prepare File Annotations + description: Prepare + retries: 0 + timeout: 600 + onFailure: "abortWorkflow" + +- workflowExternalId: {{ workflowExternalId }} + version: {{ launchWorkflowVersion }} + workflowDefinition: + description: "Create diagram detect jobs for annotation state instances marked new or retry." 
+ tasks: + - externalId: fn_launch + type: "function" + parameters: + function: + externalId: {{ launchFunctionExternalId }} + data: + { + "ExtractionPipelineExtId": {{ extractionPipelineExternalId }}, + "logLevel": "INFO", + "triggerInput": "${workflow.input.items}" + } + isAsyncComplete: false + name: Launch File Annotations + description: Launch + retries: 0 + timeout: 600 + onFailure: "abortWorkflow" + +- workflowExternalId: {{ workflowExternalId }} + version: {{ finalizeWorkflowVersion }} + workflowDefinition: + description: "Process the diagram detect jobs created by the launch workflow" + tasks: + - externalId: fn_finalize_thread_1 + type: "function" + parameters: + function: + externalId: {{ finalizeFunctionExternalId }} + data: + { + "ExtractionPipelineExtId": {{ extractionPipelineExternalId }}, + "logLevel": "INFO", + "triggerInput": "${workflow.input.items}" + } + isAsyncComplete: false + name: Finalize File Annotations - Thread 1 + description: Finalize + retries: 0 + timeout: 600 + onFailure: "abortWorkflow" + + - externalId: fn_finalize_thread_2 + type: "function" + parameters: + function: + externalId: {{ finalizeFunctionExternalId }} + data: + { + "ExtractionPipelineExtId": {{ extractionPipelineExternalId }}, + "logLevel": "INFO", + "triggerInput": "${workflow.input.items}" + } + isAsyncComplete: false + name: Finalize File Annotations - Thread 2 + description: Finalize + retries: 0 + timeout: 600 + onFailure: "abortWorkflow" + + - externalId: fn_finalize_thread_3 + type: "function" + parameters: + function: + externalId: {{ finalizeFunctionExternalId }} + data: + { + "ExtractionPipelineExtId": {{ extractionPipelineExternalId }}, + "logLevel": "INFO", + "triggerInput": "${workflow.input.items}" + } + isAsyncComplete: false + name: Finalize File Annotations - Thread 3 + description: Finalize + retries: 0 + timeout: 600 + onFailure: "abortWorkflow" + +>>>>>>> Stashed changes - externalId: fn_finalize_thread_4 type: "function" parameters: @@ -107,6 
+257,10 @@ { "ExtractionPipelineExtId": {{ extractionPipelineExternalId }}, "logLevel": "INFO", +<<<<<<< Updated upstream +======= + "triggerInput": "${workflow.input.items}" +>>>>>>> Stashed changes } isAsyncComplete: false name: Finalize File Annotations - Thread 4 @@ -124,6 +278,10 @@ { "ExtractionPipelineExtId": {{ extractionPipelineExternalId }}, "logLevel": "INFO", +<<<<<<< Updated upstream +======= + "triggerInput": "${workflow.input.items}" +>>>>>>> Stashed changes } isAsyncComplete: false name: Finalize File Annotations - Thread 5 @@ -131,6 +289,10 @@ retries: 0 timeout: 600 onFailure: "abortWorkflow" +<<<<<<< Updated upstream +======= +>>>>>>> Stashed changes +>>>>>>> Stashed changes - workflowExternalId: {{ workflowExternalId }} version: {{ promoteWorkflowVersion }} From 072eef867d5069ebf09286a9bf28a54259dceb5e Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Sun, 19 Oct 2025 11:25:49 -0500 Subject: [PATCH 120/128] resolved conflicts --- .../wf_file_annotation.WorkflowTrigger.yaml | 39 ----- .../wf_file_annotation.WorkflowVersion.yaml | 155 ------------------ 2 files changed, 194 deletions(-) diff --git a/modules/contextualization/cdf_file_annotation/workflows/wf_file_annotation.WorkflowTrigger.yaml b/modules/contextualization/cdf_file_annotation/workflows/wf_file_annotation.WorkflowTrigger.yaml index b5796e9a..c7a26989 100644 --- a/modules/contextualization/cdf_file_annotation/workflows/wf_file_annotation.WorkflowTrigger.yaml +++ b/modules/contextualization/cdf_file_annotation/workflows/wf_file_annotation.WorkflowTrigger.yaml @@ -1,20 +1,3 @@ -<<<<<<< Updated upstream -- externalId: {{prepareWorkflowTrigger}} - triggerRule: - triggerType: schedule - cronExpression: "{{workflowSchedule}}" -======= -<<<<<<< Updated upstream -externalId: {{workflowExternalId}} -triggerRule: - triggerType: schedule - cronExpression: "{{workflowSchedule}}" -workflowExternalId: {{workflowExternalId}} -workflowVersion: {{workflowVersion}} -authentication: - clientId: 
{{functionClientId}} - clientSecret: {{functionClientSecret}} -======= # Prepare Trigger: Fires when files have "ToAnnotate" tag without annotation processing tags - externalId: {{prepareWorkflowTrigger}} triggerRule: @@ -46,19 +29,12 @@ authentication: - tags batchSize: 100 batchTimeout: 60 ->>>>>>> Stashed changes workflowExternalId: {{workflowExternalId}} workflowVersion: {{prepareWorkflowVersion}} authentication: clientId: {{functionClientId}} clientSecret: {{functionClientSecret}} -<<<<<<< Updated upstream -- externalId: {{launchWorkflowTrigger}} - triggerRule: - triggerType: schedule - cronExpression: "{{workflowSchedule}}" -======= # Launch Trigger: Fires when AnnotationState instances have status "New" or "Retry" - externalId: {{launchWorkflowTrigger}} triggerRule: @@ -86,19 +62,12 @@ authentication: - attemptCount batchSize: 50 batchTimeout: 30 ->>>>>>> Stashed changes workflowExternalId: {{workflowExternalId}} workflowVersion: {{launchWorkflowVersion}} authentication: clientId: {{functionClientId}} clientSecret: {{functionClientSecret}} -<<<<<<< Updated upstream -- externalId: {{finalizeWorkflowTrigger}} - triggerRule: - triggerType: schedule - cronExpression: "{{workflowSchedule}}" -======= # Finalize Trigger: Fires when AnnotationState instances have status "Processing" - externalId: {{finalizeWorkflowTrigger}} triggerRule: @@ -130,17 +99,13 @@ authentication: - linkedFile batchSize: 20 batchTimeout: 60 ->>>>>>> Stashed changes workflowExternalId: {{workflowExternalId}} workflowVersion: {{finalizeWorkflowVersion}} authentication: clientId: {{functionClientId}} clientSecret: {{functionClientSecret}} -<<<<<<< Updated upstream -======= # Promote Trigger: Keep as scheduled for now (runs every 10 minutes) ->>>>>>> Stashed changes - externalId: {{promoteWorkflowTrigger}} triggerRule: triggerType: schedule @@ -150,7 +115,3 @@ authentication: authentication: clientId: {{functionClientId}} clientSecret: {{functionClientSecret}} -<<<<<<< Updated upstream -======= 
->>>>>>> Stashed changes ->>>>>>> Stashed changes diff --git a/modules/contextualization/cdf_file_annotation/workflows/wf_file_annotation.WorkflowVersion.yaml b/modules/contextualization/cdf_file_annotation/workflows/wf_file_annotation.WorkflowVersion.yaml index 0a83b03b..78c3dcd8 100644 --- a/modules/contextualization/cdf_file_annotation/workflows/wf_file_annotation.WorkflowVersion.yaml +++ b/modules/contextualization/cdf_file_annotation/workflows/wf_file_annotation.WorkflowVersion.yaml @@ -1,147 +1,3 @@ -<<<<<<< Updated upstream -- workflowExternalId: {{ workflowExternalId }} - version: {{ prepareWorkflowVersion }} - workflowDefinition: - description: "Create annotation state instances for files marked to annotate." - tasks: - - externalId: fn_prepare - type: "function" - parameters: - function: - externalId: {{ prepareFunctionExternalId }} - data: - { - "ExtractionPipelineExtId": {{ extractionPipelineExternalId }}, - "logLevel": "INFO", - } - isAsyncComplete: false - name: Prepare File Annotations - description: Prepare - retries: 0 - timeout: 600 - onFailure: "abortWorkflow" -======= -<<<<<<< Updated upstream -workflowExternalId: {{ workflowExternalId }} -version: "v1" -workflowDefinition: - description: "A workflow for annotating P&ID and documents." - tasks: - - externalId: fn_launch - type: "function" - parameters: - function: - externalId: {{ launchFunctionExternalId }} - data: - { - "ExtractionPipelineExtId": {{ extractionPipelineExternalId }}, - "logLevel": "INFO", - } - isAsyncComplete: false - name: Launch File Annotations - description: Launch - retries: 0 - timeout: 600 - onFailure: "abortWorkflow" ->>>>>>> Stashed changes - -- workflowExternalId: {{ workflowExternalId }} - version: {{ launchWorkflowVersion }} - workflowDefinition: - description: "Create diagram detect jobs for annotation state instances marked new or retry." 
- tasks: - - externalId: fn_launch - type: "function" - parameters: - function: - externalId: {{ launchFunctionExternalId }} - data: - { - "ExtractionPipelineExtId": {{ extractionPipelineExternalId }}, - "logLevel": "INFO", - } - isAsyncComplete: false - name: Launch File Annotations - description: Launch - retries: 0 - timeout: 600 - onFailure: "abortWorkflow" - -- workflowExternalId: {{ workflowExternalId }} - version: {{ finalizeWorkflowVersion }} - workflowDefinition: - description: "Process the diagram detect jobs created by the launch workflow" - tasks: - - externalId: fn_finalize_thread_1 - type: "function" - parameters: - function: - externalId: {{ finalizeFunctionExternalId }} - data: - { - "ExtractionPipelineExtId": {{ extractionPipelineExternalId }}, - "logLevel": "INFO", - } - isAsyncComplete: false - name: Finalize File Annotations - Thread 1 - description: Finalize - retries: 0 - timeout: 600 - onFailure: "abortWorkflow" - - - externalId: fn_finalize_thread_2 - type: "function" - parameters: - function: - externalId: {{ finalizeFunctionExternalId }} - data: - { - "ExtractionPipelineExtId": {{ extractionPipelineExternalId }}, - "logLevel": "INFO", - } - isAsyncComplete: false - name: Finalize File Annotations - Thread 2 - description: Finalize - retries: 0 - timeout: 600 - onFailure: "abortWorkflow" - - - externalId: fn_finalize_thread_3 - type: "function" - parameters: - function: - externalId: {{ finalizeFunctionExternalId }} - data: - { - "ExtractionPipelineExtId": {{ extractionPipelineExternalId }}, - "logLevel": "INFO", - } - isAsyncComplete: false - name: Finalize File Annotations - Thread 3 - description: Finalize - retries: 0 - timeout: 600 - onFailure: "abortWorkflow" - -<<<<<<< Updated upstream -======= - - externalId: fn_finalize_thread_5 - type: "function" - parameters: - function: - externalId: {{ finalizeFunctionExternalId }} - data: - { - "ExtractionPipelineExtId": {{ extractionPipelineExternalId }}, - "logLevel": "INFO", - } - 
isAsyncComplete: false - name: Finalize File Annotations - Thread 5 - description: Finalize - retries: 0 - timeout: 600 - onFailure: "abortWorkflow" -======= - workflowExternalId: {{ workflowExternalId }} version: {{ prepareWorkflowVersion }} workflowDefinition: @@ -247,7 +103,6 @@ workflowDefinition: timeout: 600 onFailure: "abortWorkflow" ->>>>>>> Stashed changes - externalId: fn_finalize_thread_4 type: "function" parameters: @@ -257,10 +112,7 @@ workflowDefinition: { "ExtractionPipelineExtId": {{ extractionPipelineExternalId }}, "logLevel": "INFO", -<<<<<<< Updated upstream -======= "triggerInput": "${workflow.input.items}" ->>>>>>> Stashed changes } isAsyncComplete: false name: Finalize File Annotations - Thread 4 @@ -278,10 +130,7 @@ workflowDefinition: { "ExtractionPipelineExtId": {{ extractionPipelineExternalId }}, "logLevel": "INFO", -<<<<<<< Updated upstream -======= "triggerInput": "${workflow.input.items}" ->>>>>>> Stashed changes } isAsyncComplete: false name: Finalize File Annotations - Thread 5 @@ -289,10 +138,6 @@ workflowDefinition: retries: 0 timeout: 600 onFailure: "abortWorkflow" -<<<<<<< Updated upstream -======= ->>>>>>> Stashed changes ->>>>>>> Stashed changes - workflowExternalId: {{ workflowExternalId }} version: {{ promoteWorkflowVersion }} From 1cf5b03bb2bc529976785a01504430ce4172e30e Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Sun, 19 Oct 2025 11:39:00 -0500 Subject: [PATCH 121/128] only pick up annotation state instances with a linkedFile --- .../ep_file_annotation.config.yaml | 5 +++- .../workflows/TRIGGER_ARCHITECTURE.md | 28 ++++++++++++------- .../wf_file_annotation.WorkflowTrigger.yaml | 11 +++++--- 3 files changed, 29 insertions(+), 15 deletions(-) diff --git a/modules/contextualization/cdf_file_annotation/extraction_pipelines/ep_file_annotation.config.yaml b/modules/contextualization/cdf_file_annotation/extraction_pipelines/ep_file_annotation.config.yaml index d1196466..82691e89 100644 --- 
a/modules/contextualization/cdf_file_annotation/extraction_pipelines/ep_file_annotation.config.yaml +++ b/modules/contextualization/cdf_file_annotation/extraction_pipelines/ep_file_annotation.config.yaml @@ -68,6 +68,9 @@ config: negate: False operator: In targetProperty: annotationStatus + - negate: False + operator: Exists + targetProperty: linkedFile limit: 1000 getTargetEntitiesQuery: targetView: @@ -116,7 +119,7 @@ config: negate: False operator: Equals targetProperty: annotationStatus - - negate: False # # NOTE: Do not change unless there's a good reason + - negate: False # NOTE: Do not change unless there's a good reason operator: Exists targetProperty: diagramDetectJobId applyService: diff --git a/modules/contextualization/cdf_file_annotation/workflows/TRIGGER_ARCHITECTURE.md b/modules/contextualization/cdf_file_annotation/workflows/TRIGGER_ARCHITECTURE.md index 3feefbfc..a2ab16e7 100644 --- a/modules/contextualization/cdf_file_annotation/workflows/TRIGGER_ARCHITECTURE.md +++ b/modules/contextualization/cdf_file_annotation/workflows/TRIGGER_ARCHITECTURE.md @@ -54,7 +54,7 @@ with: #### 2. 
Launch Trigger (`wf_launch_trigger`) -**Fires when:** AnnotationState instances have `annotationStatus` IN ["New", "Retry"] +**Fires when:** AnnotationState instances have `annotationStatus` IN ["New", "Retry"] AND `linkedFile` exists **Batch Config:** @@ -68,14 +68,22 @@ with: states_to_launch: nodes: filter: - In: - property: - [ - annotationStateSchemaSpace, - "annotationStateExternalId/version", - "annotationStatus", - ] - values: ["New", "Retry"] + and: + - in: + property: + [ + annotationStateSchemaSpace, + "annotationStateExternalId/version", + "annotationStatus", + ] + values: ["New", "Retry"] + - exists: + property: + [ + annotationStateSchemaSpace, + "annotationStateExternalId/version", + "linkedFile", + ] ``` **Function Input:** `${workflow.input.items}` - Array of AnnotationState instances @@ -194,7 +202,7 @@ Prepare Trigger: Result → ✅ Won't re-trigger (tags changed) Launch Trigger: - Fires on → AnnotationState.status IN ["New", "Retry"] + Fires on → AnnotationState.status IN ["New", "Retry"] AND linkedFile exists Function → updates status="Processing" Result → ✅ Won't re-trigger (status changed) diff --git a/modules/contextualization/cdf_file_annotation/workflows/wf_file_annotation.WorkflowTrigger.yaml b/modules/contextualization/cdf_file_annotation/workflows/wf_file_annotation.WorkflowTrigger.yaml index c7a26989..f5497b1f 100644 --- a/modules/contextualization/cdf_file_annotation/workflows/wf_file_annotation.WorkflowTrigger.yaml +++ b/modules/contextualization/cdf_file_annotation/workflows/wf_file_annotation.WorkflowTrigger.yaml @@ -35,7 +35,7 @@ clientId: {{functionClientId}} clientSecret: {{functionClientSecret}} -# Launch Trigger: Fires when AnnotationState instances have status "New" or "Retry" +# Launch Trigger: Fires when AnnotationState instances have status "New" or "Retry" AND linkedFile exists - externalId: {{launchWorkflowTrigger}} triggerRule: triggerType: dataModeling @@ -44,9 +44,12 @@ states_to_launch: nodes: filter: - in: - property: 
[{{annotationStateSchemaSpace}}, '{{annotationStateExternalId}}/{{annotationStateVersion}}', 'annotationStatus'] - values: ['New', 'Retry'] + and: + - in: + property: [{{annotationStateSchemaSpace}}, '{{annotationStateExternalId}}/{{annotationStateVersion}}', 'annotationStatus'] + values: ['New', 'Retry'] + - exists: + property: [{{annotationStateSchemaSpace}}, '{{annotationStateExternalId}}/{{annotationStateVersion}}', 'linkedFile'] limit: 50 select: states_to_launch: From 6179bc616adde2119da697821ed86d80a10edc40 Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Sun, 19 Oct 2025 11:53:12 -0500 Subject: [PATCH 122/128] no longer need to worry about cluttering extraction pipeline run history with new dm even trigger --- .../fn_file_annotation_finalize/handler.py | 15 ++++++--------- .../utils/DataStructures.py | 6 +++--- .../fn_file_annotation_launch/handler.py | 15 ++++++--------- .../utils/DataStructures.py | 6 +++--- .../fn_file_annotation_prepare/handler.py | 13 ++++++------- .../utils/DataStructures.py | 2 +- 6 files changed, 25 insertions(+), 32 deletions(-) diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/handler.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/handler.py index de68d7c7..a44b4ab9 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/handler.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/handler.py @@ -65,15 +65,12 @@ def handle(data: dict, function_call_info: dict, client: CogniteClient) -> dict: return {"status": run_status, "message": msg} finally: logger_instance.info(tracker_instance.generate_overall_report(), "BOTH") - # only want to report on the count of successful and failed files in ep_logs if there were files that were processed or an error occured - # else run log will be too messy - if tracker_instance.files_failed != 0 or tracker_instance.files_success != 0 
or run_status == "failure": - function_id = function_call_info.get("function_id") - call_id = function_call_info.get("call_id") - pipeline_instance.update_extraction_pipeline( - msg=tracker_instance.generate_ep_run("Finalize", function_id, call_id) - ) - pipeline_instance.upload_extraction_pipeline(status=run_status) + function_id = function_call_info.get("function_id") + call_id = function_call_info.get("call_id") + pipeline_instance.update_extraction_pipeline( + msg=tracker_instance.generate_ep_run("Finalize", function_id, call_id) + ) + pipeline_instance.upload_extraction_pipeline(status=run_status) def run_locally(config_file: dict[str, str], log_path: str | None = None): diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/utils/DataStructures.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/utils/DataStructures.py index f1db22d9..7bc3c435 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/utils/DataStructures.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/utils/DataStructures.py @@ -81,8 +81,8 @@ class AnnotationState: sourceUpdatedTime: str = field( default_factory=lambda: datetime.now(timezone.utc).replace(microsecond=0).isoformat() ) - sourceCreatedUser: str = "fn_dm_context_annotation_launch" - sourceUpdatedUser: str = "fn_dm_context_annotation_launch" + sourceCreatedUser: str = "fn_dm_context_annotation_finalize" + sourceUpdatedUser: str = "fn_dm_context_annotation_finalize" def _create_external_id(self) -> str: """ @@ -309,7 +309,7 @@ def generate_overall_report(self) -> str: def generate_ep_run( self, - caller: Literal["Launch", "Finalize"], + caller: Literal["Prepare", "Launch", "Finalize"], function_id: str | None, call_id: str | None, ) -> str: diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/handler.py 
b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/handler.py index 8b3f5439..f5c03ed7 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/handler.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/handler.py @@ -64,15 +64,12 @@ def handle(data: dict, function_call_info: dict, client: CogniteClient) -> dict: return {"status": run_status, "message": msg} finally: logger_instance.info(tracker_instance.generate_overall_report(), "BOTH") - # only want to report on the count of successful and failed files in ep_logs if there were files that were processed or an error occured - # else run log will be too messy. - if tracker_instance.files_failed != 0 or tracker_instance.files_success != 0 or run_status == "failure": - function_id = function_call_info.get("function_id") - call_id = function_call_info.get("call_id") - pipeline_instance.update_extraction_pipeline( - msg=tracker_instance.generate_ep_run("Launch", function_id, call_id) - ) - pipeline_instance.upload_extraction_pipeline(status=run_status) + function_id = function_call_info.get("function_id") + call_id = function_call_info.get("call_id") + pipeline_instance.update_extraction_pipeline( + msg=tracker_instance.generate_ep_run("Launch", function_id, call_id) + ) + pipeline_instance.upload_extraction_pipeline(status=run_status) def run_locally(config_file: dict[str, str], log_path: str | None = None): diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/utils/DataStructures.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/utils/DataStructures.py index f1db22d9..8ef6675d 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/utils/DataStructures.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/utils/DataStructures.py @@ -81,8 +81,8 @@ class 
AnnotationState: sourceUpdatedTime: str = field( default_factory=lambda: datetime.now(timezone.utc).replace(microsecond=0).isoformat() ) - sourceCreatedUser: str = "fn_dm_context_annotation_launch" - sourceUpdatedUser: str = "fn_dm_context_annotation_launch" + sourceCreatedUser: str = "fn_dm_context_annotation_prepare" + sourceUpdatedUser: str = "fn_dm_context_annotation_prepare" def _create_external_id(self) -> str: """ @@ -309,7 +309,7 @@ def generate_overall_report(self) -> str: def generate_ep_run( self, - caller: Literal["Launch", "Finalize"], + caller: Literal["Prepare", "Launch", "Finalize"], function_id: str | None, call_id: str | None, ) -> str: diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_prepare/handler.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_prepare/handler.py index e8c71e95..0a3a562e 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_prepare/handler.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_prepare/handler.py @@ -62,13 +62,12 @@ def handle(data: dict, function_call_info: dict, client: CogniteClient) -> dict: logger_instance.info(tracker_instance.generate_overall_report(), "BOTH") # only want to report on the count of successful and failed files in ep_logs if there were files that were processed or an error occured # else run log will be too messy. 
- if tracker_instance.files_failed != 0 or tracker_instance.files_success != 0 or run_status == "failure": - function_id = function_call_info.get("function_id") - call_id = function_call_info.get("call_id") - pipeline_instance.update_extraction_pipeline( - msg=tracker_instance.generate_ep_run("Prepare", function_id, call_id) - ) - pipeline_instance.upload_extraction_pipeline(status=run_status) + function_id = function_call_info.get("function_id") + call_id = function_call_info.get("call_id") + pipeline_instance.update_extraction_pipeline( + msg=tracker_instance.generate_ep_run("Prepare", function_id, call_id) + ) + pipeline_instance.upload_extraction_pipeline(status=run_status) def run_locally(config_file: dict[str, str], log_path: str | None = None): diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_prepare/utils/DataStructures.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_prepare/utils/DataStructures.py index e6da5d06..8ef6675d 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_prepare/utils/DataStructures.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_prepare/utils/DataStructures.py @@ -309,7 +309,7 @@ def generate_overall_report(self) -> str: def generate_ep_run( self, - caller: Literal["Launch", "Finalize"], + caller: Literal["Prepare", "Launch", "Finalize"], function_id: str | None, call_id: str | None, ) -> str: From fc29a0afd81b53a0888bdffa6f8e45393c7c92f9 Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Sun, 19 Oct 2025 11:55:31 -0500 Subject: [PATCH 123/128] logging indentation change --- .../fn_file_annotation_finalize/services/FinalizeService.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/FinalizeService.py 
b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/FinalizeService.py index 506318f3..9388c336 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/FinalizeService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/FinalizeService.py @@ -212,7 +212,8 @@ def run(self) -> Literal["Done"] | None: ) is None, ) - self.logger.info(f"\t- {annotation_msg}\n\t- {pattern_msg}") + self.logger.info(f"\t- {annotation_msg}") + self.logger.info(f"\t- {pattern_msg}") # Logic to handle multi-page files page_count = results.get("regular", {}).get("pageCount", 1) From 7715567e8e6fa207fcfb031fff1cc1d032360161 Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Sun, 19 Oct 2025 11:57:25 -0500 Subject: [PATCH 124/128] small ep variable value adjustment --- .../ep_file_annotation.config.yaml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/modules/contextualization/cdf_file_annotation/extraction_pipelines/ep_file_annotation.config.yaml b/modules/contextualization/cdf_file_annotation/extraction_pipelines/ep_file_annotation.config.yaml index 82691e89..4a14d99e 100644 --- a/modules/contextualization/cdf_file_annotation/extraction_pipelines/ep_file_annotation.config.yaml +++ b/modules/contextualization/cdf_file_annotation/extraction_pipelines/ep_file_annotation.config.yaml @@ -152,16 +152,16 @@ config: rawTableDocPattern: {{ rawTableDocPattern }} rawTableDocTag: {{ rawTableDocTag }} rawTableDocDoc: {{ rawTableDocDoc }} - deleteRejectedEdges: true - deleteSuggestedEdges: true + deleteRejectedEdges: True + deleteSuggestedEdges: True entitySearchService: - enableExistingAnnotationsSearch: true # Primary: Query annotation edges (fast, checks existing annotation edges) - enableGlobalEntitySearch: true # Fallback: Global entity search - (slow, unstable as instance count grows) + enableExistingAnnotationsSearch: True # 
Primary: Query annotation edges (fast, checks existing annotation edges) + enableGlobalEntitySearch: True # Fallback: Global entity search - (slow, unstable as instance count grows) maxEntitySearchLimit: 1000 # Max entities to fetch in global search textNormalization: - removeSpecialCharacters: true # Remove non-alphanumeric characters (e.g., "V-0912" → "V0912") - convertToLowercase: true # Convert to lowercase (e.g., "V0912" → "v0912") - stripLeadingZeros: true # Remove leading zeros (e.g., "v0912" → "v912") + removeSpecialCharacters: True # Remove non-alphanumeric characters (e.g., "V-0912" → "V0912") + convertToLowercase: False # Convert to lowercase (e.g., "V0912" → "v0912") + stripLeadingZeros: True # Remove leading zeros (e.g., "v0912" → "v912") cacheService: cacheTableName: {{ rawTablePromoteCache }} From de646e10aeec503266f4f5fb33bccfc579e3ad66 Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Sun, 19 Oct 2025 14:52:41 -0500 Subject: [PATCH 125/128] added instance space to the data model triggers --- .../workflows/TRIGGER_ARCHITECTURE.md | 66 +++++++++++++++++-- .../wf_file_annotation.WorkflowTrigger.yaml | 9 +++ 2 files changed, 68 insertions(+), 7 deletions(-) diff --git a/modules/contextualization/cdf_file_annotation/workflows/TRIGGER_ARCHITECTURE.md b/modules/contextualization/cdf_file_annotation/workflows/TRIGGER_ARCHITECTURE.md index a2ab16e7..42b7c4db 100644 --- a/modules/contextualization/cdf_file_annotation/workflows/TRIGGER_ARCHITECTURE.md +++ b/modules/contextualization/cdf_file_annotation/workflows/TRIGGER_ARCHITECTURE.md @@ -36,12 +36,15 @@ with: files_to_prepare: nodes: filter: - And: - - In: + and: + - equals: + property: ["node", "space"] + value: fileInstanceSpace + - in: property: [fileSchemaSpace, "fileExternalId/version", "tags"] values: ["ToAnnotate"] - - Not: - In: + - not: + in: property: [fileSchemaSpace, "fileExternalId/version", "tags"] values: ["AnnotationInProcess", "Annotated", "AnnotationFailed"] ``` @@ -69,6 +72,9 @@ with: nodes: 
filter: and: + - equals: + property: ["node", "space"] + value: fileInstanceSpace - in: property: [ @@ -108,8 +114,11 @@ with: jobs_to_finalize: nodes: filter: - And: - - Equals: + and: + - equals: + property: ["node", "space"] + value: fileInstanceSpace + - equals: property: [ annotationStateSchemaSpace, @@ -117,7 +126,7 @@ with: "annotationStatus", ] value: "Processing" - - Exists: + - exists: property: [ annotationStateSchemaSpace, @@ -132,6 +141,46 @@ with: --- +## Instance Space Filtering + +**All triggers include instance space filtering** to ensure they only fire for instances in the configured `{{fileInstanceSpace}}`. This is achieved by filtering on the node's space property: + +```yaml +- equals: + property: ["node", "space"] + value: { { fileInstanceSpace } } +``` + +**Example from Prepare Trigger:** + +```yaml +filter: + and: + - equals: + property: ["node", "space"] + value: { { fileInstanceSpace } } + - in: + property: + [ + { { fileSchemaSpace } }, + "{{fileExternalId}}/{{fileVersion}}", + "tags", + ] + values: ["ToAnnotate"] + - # ... other filters +``` + +This approach ensures: + +- ✅ **Isolation**: Triggers only fire for instances in the configured instance space +- ✅ **Consistency**: Matches the behavior of scheduled functions using the extraction pipeline config +- ✅ **Multi-tenancy**: Supports multiple isolated environments using the same data model +- ✅ **Performance**: Reduces query scope to only relevant instances + +The `fileInstanceSpace` variable is configured in `default.config.yaml` and used in both the triggers and the extraction pipeline config for consistent instance space filtering across the entire workflow. 
+ +--- + ## How Triggers Work According to the [Cognite documentation](https://docs.cognite.com/cdf/data_workflows/triggers/), data modeling triggers use a **change-based polling mechanism**: @@ -273,6 +322,7 @@ finalizeWorkflowTrigger: wf_finalize_trigger # Data model configuration fileSchemaSpace: +fileInstanceSpace: # IMPORTANT: Filters trigger scope fileExternalId: fileVersion: @@ -281,6 +331,8 @@ annotationStateExternalId: FileAnnotationState annotationStateVersion: v1.0.0 ``` +**Note:** The `fileInstanceSpace` variable is critical for ensuring triggers only fire for instances in your configured space. This must match the instance space used in your extraction pipeline configuration. + ## References - [Cognite Workflows Triggers Documentation](https://docs.cognite.com/cdf/data_workflows/triggers/) diff --git a/modules/contextualization/cdf_file_annotation/workflows/wf_file_annotation.WorkflowTrigger.yaml b/modules/contextualization/cdf_file_annotation/workflows/wf_file_annotation.WorkflowTrigger.yaml index f5497b1f..fc251a91 100644 --- a/modules/contextualization/cdf_file_annotation/workflows/wf_file_annotation.WorkflowTrigger.yaml +++ b/modules/contextualization/cdf_file_annotation/workflows/wf_file_annotation.WorkflowTrigger.yaml @@ -8,6 +8,9 @@ nodes: filter: and: + - equals: + property: ["node", "space"] + value: {{fileInstanceSpace}} - in: property: [{{fileSchemaSpace}}, '{{fileExternalId}}/{{fileVersion}}', 'tags'] values: ['ToAnnotate'] @@ -45,6 +48,9 @@ nodes: filter: and: + - equals: + property: ["node", "space"] + value: {{fileInstanceSpace}} - in: property: [{{annotationStateSchemaSpace}}, '{{annotationStateExternalId}}/{{annotationStateVersion}}', 'annotationStatus'] values: ['New', 'Retry'] @@ -81,6 +87,9 @@ nodes: filter: and: + - equals: + property: ["node", "space"] + value: {{fileInstanceSpace}} - equals: property: [{{annotationStateSchemaSpace}}, '{{annotationStateExternalId}}/{{annotationStateVersion}}', 'annotationStatus'] value: 
'Processing' From b805ba0ed317f7462c0664fa89a2ec710c446e60 Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Sun, 19 Oct 2025 15:15:50 -0500 Subject: [PATCH 126/128] added a promote function trigger --- .../services/PromoteService.py | 8 +-- .../workflows/TRIGGER_ARCHITECTURE.md | 65 ++++++++++++++++++- .../wf_file_annotation.WorkflowTrigger.yaml | 36 +++++++++- .../wf_file_annotation.WorkflowVersion.yaml | 1 + 4 files changed, 99 insertions(+), 11 deletions(-) diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/PromoteService.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/PromoteService.py index 5aa7e7eb..19771b14 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/PromoteService.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/PromoteService.py @@ -278,12 +278,7 @@ def _get_promote_candidates(self) -> EdgeList | None: # Backward compatibility: hardcoded filter query_filter = { "and": [ - { - "equals": { - "property": ["edge", "endNode"], - "value": {"space": self.sink_node_ref.space, "externalId": self.sink_node_ref.external_id}, - } - }, + {"equals": {"property": ["edge", "space"], "value": self.sink_node_ref.space}}, {"equals": {"property": self.core_annotation_view.as_property_ref("status"), "value": "Suggested"}}, { "not": { @@ -302,6 +297,7 @@ def _get_promote_candidates(self) -> EdgeList | None: sources=[self.core_annotation_view.as_view_id()], filter=query_filter, limit=limit, + space=self.sink_node_ref.space ) def _find_entity_with_cache(self, text: str, annotation_type: str, entity_space: str) -> list[Node] | list: diff --git a/modules/contextualization/cdf_file_annotation/workflows/TRIGGER_ARCHITECTURE.md b/modules/contextualization/cdf_file_annotation/workflows/TRIGGER_ARCHITECTURE.md index 42b7c4db..8ec6ee7e 100644 --- 
a/modules/contextualization/cdf_file_annotation/workflows/TRIGGER_ARCHITECTURE.md +++ b/modules/contextualization/cdf_file_annotation/workflows/TRIGGER_ARCHITECTURE.md @@ -16,6 +16,9 @@ Prepare Function creates AnnotationState with status="New" Launch Function creates diagram detect jobs, sets status="Processing" ↓ (triggers v1_finalize) Finalize Function processes results, sets status="Annotated"/"Failed" + └─ (if pattern-mode enabled) Creates annotation edges with status="Suggested" + ↓ (triggers v1_promote) +Promote Function attempts to resolve pattern-mode annotations to actual entities ``` ### Trigger Configurations @@ -141,9 +144,50 @@ with: --- +#### 4. Promote Trigger (`wf_promote_trigger`) + +**Fires when:** Annotation edges have `status="Suggested"` AND `tags` does NOT contain `"PromoteAttempted"` + +**Batch Config:** + +- Size: 100 edges +- Timeout: 300 seconds (5 minutes) + +**Query:** + +```yaml +with: + edges_to_promote: + edges: + filter: + and: + - equals: + property: ["edge", "space"] + value: patternModeInstanceSpace + - equals: + property: [cdf_cdm, "CogniteDiagramAnnotation/v1", "status"] + value: "Suggested" + - not: + in: + property: [cdf_cdm, "CogniteDiagramAnnotation/v1", "tags"] + values: ["PromoteAttempted"] +``` + +**Function Input:** `${workflow.input.items}` - Array of annotation edges (pattern-mode annotations) + +**Loop Prevention:** Function adds `"PromoteAttempted"` tag to edges, preventing re-triggering + +**Note:** This trigger queries **edges** (not nodes) since promote processes annotation relationships. The trigger fires when the finalize function creates pattern-mode annotations (edges pointing to the sink node with `status="Suggested"`). + +--- + ## Instance Space Filtering -**All triggers include instance space filtering** to ensure they only fire for instances in the configured `{{fileInstanceSpace}}`. 
This is achieved by filtering on the node's space property: +**All triggers include instance space filtering** to ensure they only fire for instances in the configured `{{fileInstanceSpace}}`. + +### Node-based Triggers (Prepare, Launch, Finalize) + +For triggers that query nodes, filtering is achieved by checking the node's space property: ```yaml - equals: @@ -170,6 +214,20 @@ filter: - # ... other filters ``` +### Edge-based Triggers (Promote) + +For the promote trigger that queries edges, filtering is achieved by checking the edge's own space property: + +```yaml +- equals: + property: ["edge", "space"] + value: { { patternModeInstanceSpace } } +``` + +This ensures only pattern-mode annotation edges stored in your configured pattern mode results instance space trigger the promote workflow. Pattern-mode edges are created by the finalize function and stored in a dedicated instance space (`patternModeInstanceSpace`, typically `sp_dat_pattern_mode_results`). + +### Benefits + This approach ensures: - ✅ **Isolation**: Triggers only fire for instances in the configured instance space @@ -177,7 +235,10 @@ This approach ensures: - ✅ **Multi-tenancy**: Supports multiple isolated environments using the same data model - ✅ **Performance**: Reduces query scope to only relevant instances -The `fileInstanceSpace` variable is configured in `default.config.yaml` and used in both the triggers and the extraction pipeline config for consistent instance space filtering across the entire workflow. 
+The `fileInstanceSpace` and `patternModeInstanceSpace` variables are configured in `default.config.yaml`: + +- `fileInstanceSpace`: Used for node-based triggers (prepare, launch, finalize) to filter files and annotation states +- `patternModeInstanceSpace`: Used for edge-based triggers (promote) to filter pattern-mode annotation edges --- diff --git a/modules/contextualization/cdf_file_annotation/workflows/wf_file_annotation.WorkflowTrigger.yaml b/modules/contextualization/cdf_file_annotation/workflows/wf_file_annotation.WorkflowTrigger.yaml index fc251a91..3b8b0dbd 100644 --- a/modules/contextualization/cdf_file_annotation/workflows/wf_file_annotation.WorkflowTrigger.yaml +++ b/modules/contextualization/cdf_file_annotation/workflows/wf_file_annotation.WorkflowTrigger.yaml @@ -117,11 +117,41 @@ clientId: {{functionClientId}} clientSecret: {{functionClientSecret}} -# Promote Trigger: Keep as scheduled for now (runs every 10 minutes) +# Promote Trigger: Fires when annotation edges have status "Suggested" and haven't been promoted yet - externalId: {{promoteWorkflowTrigger}} triggerRule: - triggerType: schedule - cronExpression: "{{workflowSchedule}}" + triggerType: dataModeling + dataModelingQuery: + with: + edges_to_promote: + edges: + filter: + and: + - equals: + property: ["edge", "space"] + value: {{patternModeInstanceSpace}} + - equals: + property: [cdf_cdm, 'CogniteDiagramAnnotation/v1', 'status'] + value: 'Suggested' + - not: + in: + property: [cdf_cdm, 'CogniteDiagramAnnotation/v1', 'tags'] + values: ['PromoteAttempted'] + limit: 100 + select: + edges_to_promote: + sources: + - source: + type: view + space: cdf_cdm + externalId: CogniteDiagramAnnotation + version: v1 + properties: + - status + - tags + - startNodeText + batchSize: 100 + batchTimeout: 300 workflowExternalId: {{workflowExternalId}} workflowVersion: {{promoteWorkflowVersion}} authentication: diff --git 
a/modules/contextualization/cdf_file_annotation/workflows/wf_file_annotation.WorkflowVersion.yaml b/modules/contextualization/cdf_file_annotation/workflows/wf_file_annotation.WorkflowVersion.yaml index 78c3dcd8..94e366e8 100644 --- a/modules/contextualization/cdf_file_annotation/workflows/wf_file_annotation.WorkflowVersion.yaml +++ b/modules/contextualization/cdf_file_annotation/workflows/wf_file_annotation.WorkflowVersion.yaml @@ -153,6 +153,7 @@ { "ExtractionPipelineExtId": {{ extractionPipelineExternalId }}, "logLevel": "INFO", + "triggerInput": "${workflow.input.items}" } isAsyncComplete: false name: Promote File Annotations From a5dfaa7354eca148dc390423b8b0a8fbfe9300da Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Sun, 19 Oct 2025 16:18:32 -0500 Subject: [PATCH 127/128] misc changes needed for promote function --- .../fn_file_annotation_promote/handler.py | 2 +- .../requirements.txt | 24 +++++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) create mode 100644 modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/requirements.txt diff --git a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/handler.py b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/handler.py index 74383783..273134b0 100644 --- a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/handler.py +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/handler.py @@ -51,7 +51,7 @@ def handle(data: dict, function_call_info: dict, client: CogniteClient) -> dict[ start_time: datetime = datetime.now(timezone.utc) config: Config - config, client = create_config_service(function_data=data) + config, client = create_config_service(function_data=data, client=client) logger: CogniteFunctionLogger = create_logger_service(data.get("logLevel", "DEBUG"), data.get("logPath")) tracker: PromoteTracker = PromoteTracker() diff --git 
a/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/requirements.txt b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/requirements.txt new file mode 100644 index 00000000..bd7f2bc3 --- /dev/null +++ b/modules/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/requirements.txt @@ -0,0 +1,24 @@ +annotated-types==0.7.0 +certifi==2025.4.26 +cffi==1.17.1 +charset-normalizer==3.4.2 +cognite-sdk==7.76.0 +cryptography==44.0.3 +dotenv==0.9.9 +idna==3.10 +msal==1.32.3 +oauthlib==3.2.2 +packaging==25.0 +protobuf==6.30.2 +pycparser==2.22 +pydantic==2.11.4 +pydantic_core==2.33.2 +PyJWT==2.10.1 +python-dotenv==1.1.0 +PyYAML==6.0.2 +requests==2.32.3 +requests-oauthlib==1.3.1 +typing-inspection==0.4.0 +typing_extensions==4.13.2 +urllib3==2.5.0 + From 34a16ba13417c56b04f6aa6addb7add833cdfe68 Mon Sep 17 00:00:00 2001 From: Jack Zhao Date: Sun, 19 Oct 2025 17:05:23 -0500 Subject: [PATCH 128/128] consider reducing finalize tasks to 1 --- .../workflows/wf_file_annotation.WorkflowTrigger.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/contextualization/cdf_file_annotation/workflows/wf_file_annotation.WorkflowTrigger.yaml b/modules/contextualization/cdf_file_annotation/workflows/wf_file_annotation.WorkflowTrigger.yaml index 3b8b0dbd..681b84d3 100644 --- a/modules/contextualization/cdf_file_annotation/workflows/wf_file_annotation.WorkflowTrigger.yaml +++ b/modules/contextualization/cdf_file_annotation/workflows/wf_file_annotation.WorkflowTrigger.yaml @@ -77,7 +77,7 @@ clientId: {{functionClientId}} clientSecret: {{functionClientSecret}} -# Finalize Trigger: Fires when AnnotationState instances have status "Processing" +# Finalize Trigger: Fires when AnnotationState instances have status "Processing" TODO: may need to only make one thread - externalId: {{finalizeWorkflowTrigger}} triggerRule: triggerType: dataModeling