@@ -204,12 +204,21 @@ def _annotate_with_smart_cascade(
204204 Annotations from the first engine that finds sufficient entities
205205 """
206206 # Stage 1: Try regex first (fastest)
207- regex_result = self .regex_annotator .annotate (text )
208- if self ._cascade_should_stop ("regex" , regex_result ):
209- if structured :
210- _ , result = self .regex_annotator .annotate_with_spans (text )
207+ if structured :
208+ # For structured output, use annotate_with_spans directly to avoid double processing
209+ _ , result = self .regex_annotator .annotate_with_spans (text )
210+ regex_result = {}
211+ for span in result .spans :
212+ if span .label not in regex_result :
213+ regex_result [span .label ] = []
214+ regex_result [span .label ].append (span .text )
215+
216+ if self ._cascade_should_stop ("regex" , regex_result ):
211217 return result .spans
212- return regex_result
218+ else :
219+ regex_result = self .regex_annotator .annotate (text )
220+ if self ._cascade_should_stop ("regex" , regex_result ):
221+ return regex_result
213222
214223 # Stage 2: Try GLiNER (balanced speed/accuracy)
215224 if self .gliner_annotator is not None :
@@ -232,6 +241,97 @@ def _annotate_with_smart_cascade(
232241 return result .spans
233242 return regex_result
234243
244+ def _annotate_single_chunk (
245+ self , text : str , structured : bool = False
246+ ) -> Union [Dict [str , List [str ]], List ["Span" ]]:
247+ """Annotate a single chunk of text based on the engine type."""
248+ if self .engine == "regex" :
249+ if structured :
250+ _ , result = self .regex_annotator .annotate_with_spans (text )
251+ return result .spans
252+ return self .regex_annotator .annotate (text )
253+ elif self .engine == "spacy" :
254+ if self .spacy_annotator is None :
255+ raise ImportError (
256+ "SpaCy engine not available. Install with: pip install datafog[nlp]"
257+ )
258+ return self .spacy_annotator .annotate (text )
259+ elif self .engine == "gliner" :
260+ if self .gliner_annotator is None :
261+ raise ImportError (
262+ "GLiNER engine not available. Install with: pip install datafog[nlp-advanced]"
263+ )
264+ return self .gliner_annotator .annotate (text )
265+ elif self .engine == "smart" :
266+ return self ._annotate_with_smart_cascade (text , structured )
267+ elif self .engine == "auto" :
268+ return self ._annotate_with_auto_engine (text , structured )
269+
270+ def _annotate_with_auto_engine (
271+ self , text : str , structured : bool = False
272+ ) -> Union [Dict [str , List [str ]], List ["Span" ]]:
273+ """Handle auto engine annotation with regex fallback to spacy."""
274+ # Try regex first
275+ if structured :
276+ # For structured output, use annotate_with_spans directly to avoid double processing
277+ _ , result = self .regex_annotator .annotate_with_spans (text )
278+ regex_result = {}
279+ for span in result .spans :
280+ if span .label not in regex_result :
281+ regex_result [span .label ] = []
282+ regex_result [span .label ].append (span .text )
283+
284+ # Check if regex found any entities
285+ if any (entities for entities in regex_result .values ()):
286+ return result .spans
287+ else :
288+ regex_result = self .regex_annotator .annotate (text )
289+
290+ # Check if regex found any entities
291+ if any (entities for entities in regex_result .values ()):
292+ return regex_result
293+
294+ # Fall back to spacy if available
295+ if self .spacy_annotator is not None :
296+ return self .spacy_annotator .annotate (text )
297+
298+ # Return regex result even if empty
299+ if structured :
300+ # We already have the result from above in structured mode
301+ return result .spans
302+ return regex_result
303+
304+ def _annotate_multiple_chunks_structured (self , chunks : List [str ]) -> List ["Span" ]:
305+ """Handle structured annotation across multiple chunks."""
306+ all_spans = []
307+ current_offset = 0
308+
309+ # Get Span class once outside the loop for efficiency
310+ SpanClass = _get_span_class ()
311+
312+ for chunk in chunks :
313+ chunk_spans = self ._annotate_single_chunk (chunk , structured = True )
314+ # Adjust span positions to account for chunk offset
315+ for span in chunk_spans :
316+ adjusted_span = SpanClass (
317+ start = span .start + current_offset ,
318+ end = span .end + current_offset ,
319+ text = span .text ,
320+ label = span .label ,
321+ )
322+ all_spans .append (adjusted_span )
323+ current_offset += len (chunk )
324+
325+ return all_spans
326+
327+ def _annotate_multiple_chunks_dict (self , chunks : List [str ]) -> Dict [str , List [str ]]:
328+ """Handle dictionary annotation across multiple chunks."""
329+ chunk_annotations = []
330+ for chunk in chunks :
331+ chunk_result = self ._annotate_single_chunk (chunk , structured = False )
332+ chunk_annotations .append (chunk_result )
333+ return self ._combine_annotations (chunk_annotations )
334+
235335 def annotate_text_sync (
236336 self , text : str , structured : bool = False
237337 ) -> Union [Dict [str , List [str ]], List ["Span" ]]:
@@ -247,76 +347,15 @@ def annotate_text_sync(
247347 """
248348 if len (text ) <= self .text_chunk_length :
249349 # Single chunk processing
250- if self .engine == "regex" :
251- if structured :
252- _ , result = self .regex_annotator .annotate_with_spans (text )
253- return result .spans
254- return self .regex_annotator .annotate (text )
255- elif self .engine == "spacy" :
256- if self .spacy_annotator is None :
257- raise ImportError (
258- "SpaCy engine not available. Install with: pip install datafog[nlp]"
259- )
260- return self .spacy_annotator .annotate (text )
261- elif self .engine == "gliner" :
262- if self .gliner_annotator is None :
263- raise ImportError (
264- "GLiNER engine not available. Install with: pip install datafog[nlp-advanced]"
265- )
266- return self .gliner_annotator .annotate (text )
267- elif self .engine == "smart" :
268- return self ._annotate_with_smart_cascade (text , structured )
269- elif self .engine == "auto" :
270- # Try regex first
271- regex_result = self .regex_annotator .annotate (text )
272-
273- # Check if regex found any entities
274- if any (entities for entities in regex_result .values ()):
275- if structured :
276- _ , result = self .regex_annotator .annotate_with_spans (text )
277- return result .spans
278- return regex_result
279-
280- # Fall back to spacy if available
281- if self .spacy_annotator is not None :
282- return self .spacy_annotator .annotate (text )
283-
284- # Return regex result even if empty
285- if structured :
286- _ , result = self .regex_annotator .annotate_with_spans (text )
287- return result .spans
288- return regex_result
350+ return self ._annotate_single_chunk (text , structured )
289351 else :
290352 # Multi-chunk processing
291353 chunks = self ._chunk_text (text )
292354
293355 if structured :
294- # For structured output, we need to handle span positions across chunks
295- all_spans = []
296- current_offset = 0
297-
298- for chunk in chunks :
299- chunk_spans = self .annotate_text_sync (chunk , structured = True )
300- # Adjust span positions to account for chunk offset
301- for span in chunk_spans :
302- SpanClass = _get_span_class ()
303- adjusted_span = SpanClass (
304- start = span .start + current_offset ,
305- end = span .end + current_offset ,
306- text = span .text ,
307- label = span .label ,
308- )
309- all_spans .append (adjusted_span )
310- current_offset += len (chunk )
311-
312- return all_spans
356+ return self ._annotate_multiple_chunks_structured (chunks )
313357 else :
314- # Dictionary format - combine annotations
315- chunk_annotations = []
316- for chunk in chunks :
317- chunk_result = self .annotate_text_sync (chunk , structured = False )
318- chunk_annotations .append (chunk_result )
319- return self ._combine_annotations (chunk_annotations )
358+ return self ._annotate_multiple_chunks_dict (chunks )
320359
321360 async def annotate_text_async (
322361 self , text : str , structured : bool = False
0 commit comments