From 0ccb297bd83c69a355b4c363923e4139ea552a89 Mon Sep 17 00:00:00 2001 From: elronbandel Date: Thu, 26 Jun 2025 17:10:43 +0300 Subject: [PATCH 1/9] Allow using python functions instead of operators (e.g in pre-processing pipeline) Signed-off-by: elronbandel --- docs/catalog.py | 2 + docs/docs/adding_dataset.rst | 23 +++++- docs/docs/adding_operator.rst | 30 ++++++- prepare/cards/xlam_function_calling.py | 13 ++++ src/unitxt/api.py | 18 ++++- src/unitxt/artifact.py | 35 +++++++-- src/unitxt/catalog.py | 3 +- src/unitxt/inference.py | 12 +-- src/unitxt/llm_as_judge_from_template.py | 5 +- src/unitxt/operators.py | 58 ++++++++++++++ src/unitxt/test_utils/artifact.py | 13 ++-- src/unitxt/test_utils/card.py | 5 +- src/unitxt/text_utils.py | 31 ++++++++ src/unitxt/utils.py | 56 +++++++++++++- tests/library/test_function_operators.py | 99 +++++++++++++++++++++++- 15 files changed, 366 insertions(+), 37 deletions(-) diff --git a/docs/catalog.py b/docs/catalog.py index 0d06d5d54b..657f212a2c 100644 --- a/docs/catalog.py +++ b/docs/catalog.py @@ -1,6 +1,7 @@ import json import os import re +import types from collections import defaultdict from functools import lru_cache from pathlib import Path @@ -110,6 +111,7 @@ def all_subtypes_of_artifact(artifact): or isinstance(artifact, bool) or isinstance(artifact, int) or isinstance(artifact, float) + or isinstance(artifact, types.FunctionType) ): return [] if isinstance(artifact, list): diff --git a/docs/docs/adding_dataset.rst b/docs/docs/adding_dataset.rst index 0f6af4ef02..7094668241 100644 --- a/docs/docs/adding_dataset.rst +++ b/docs/docs/adding_dataset.rst @@ -5,7 +5,7 @@ To use this tutorial, you need to :ref:`install Unitxt `. ================= -Datasets +Datasets ================= This guide will assist you in adding or using your new dataset in Unitxt. @@ -105,6 +105,27 @@ Most data can be normalized to the task schema using built-in operators, ensuring For custom operators, refer to the :ref:`Operators Tutorial `. +.. tip:: + + If you cannot find an operator that fits your needs, simply use an instance function operator: + + .. code-block:: python + + def my_function(instance, stream_name=None): + instance["x"] += 42 + return instance + + Or a stream function operator: + + .. code-block:: python + + def my_other_function(stream, stream_name=None): + for instance in stream: + instance["x"] += 42 + yield instance + + Both functions can be plugged in anywhere Unitxt requires an operator, e.g. in a pre-processing pipeline. + The Template ---------------- diff --git a/docs/docs/adding_operator.rst b/docs/docs/adding_operator.rst index ecb3c2d54b..49c68359ed 100644 --- a/docs/docs/adding_operator.rst +++ b/docs/docs/adding_operator.rst @@ -5,11 +5,33 @@ To use this tutorial, you need to :ref:`install unitxt `. ===================================== -Operators +Operators ===================================== Operators are specialized functions designed to process data. +.. tip:: + + If you cannot find an operator that fits your needs, simply use an instance function operator: + + .. code-block:: python + + def my_function(instance, stream_name=None): + instance["x"] += 42 + return instance + + Or a stream function operator: + + .. code-block:: python + + def my_other_function(stream, stream_name=None): + for instance in stream: + instance["x"] += 42 + yield instance + + Both functions can be plugged in anywhere Unitxt requires an operator, e.g. in a pre-processing pipeline.
+ + They are used in the TaskCard for preparing data for specific tasks and by Post Processors to process the textual output of the model to the expect input of the metrics. @@ -18,11 +40,11 @@ There are several types of operators. 1. Field Operators - Operators that modify individual fields of the instances in the input streams. Example of such operators are operators that cast field values, uppercase string fields, or translate text between languages. -2. Instance Operators - Operators that modify individual instances in the input streams. For example, operators that add or remove fields. +1. Instance Operators - Operators that modify individual instances in the input streams. For example, operators that add or remove fields. -3. Stream Operators - Operators that perform operations on full streams. For example, operators that remove instances based on some condition. +2. Stream Operators - Operators that perform operations on full streams. For example, operators that remove instances based on some condition. -4. MultiStream Operators - Operator that perform operations on multiple streams. For example, operators that repartition the instances between train and test splits. +3. MultiStream Operators - Operator that perform operations on multiple streams. For example, operators that repartition the instances between train and test splits. Unitxt comes with a large collection of built in operators - that were design to cover most common requirements of dataset processing. diff --git a/prepare/cards/xlam_function_calling.py b/prepare/cards/xlam_function_calling.py index cdc761575e..4ac5c473c2 100644 --- a/prepare/cards/xlam_function_calling.py +++ b/prepare/cards/xlam_function_calling.py @@ -12,6 +12,19 @@ from unitxt.struct_data_operators import LoadJson from unitxt.test_utils.card import test_card + +def extract_required_parameters(instance, stream_name=None): + result = [] + for tool in instance["tools"]: + required_params = [] + for param_name, param_info in tool["parameters"]["properties"].items(): + if "optional" not in param_info["type"]: + required_params.append(param_name) + result.append(required_params) + instance["required"] = result + return instance + + card = TaskCard( loader=LoadHF( path="Salesforce/xlam-function-calling-60k", diff --git a/src/unitxt/api.py b/src/unitxt/api.py index 23de331bd4..88e61defa6 100644 --- a/src/unitxt/api.py +++ b/src/unitxt/api.py @@ -180,6 +180,20 @@ class MyClass: return obj_str +def _remove_id_keys(obj): + if isinstance(obj, dict): + return {k: _remove_id_keys(v) for k, v in obj.items() if k != "__id__"} + if isinstance(obj, list): + return [_remove_id_keys(item) for item in obj] + return obj + + +def _artifact_string_repr(artifact): + artifact_dict = to_dict(artifact, object_to_str_without_addresses) + artifact_dict_without_ids = _remove_id_keys(artifact_dict) + return json_dump(artifact_dict_without_ids) + + def _source_to_dataset( source: SourceOperator, split=None, @@ -189,9 +203,7 @@ def _source_to_dataset( from .dataset import Dataset as UnitxtDataset # Generate a unique signature for the source - source_signature = json.dumps( - to_dict(source, object_to_str_without_addresses), sort_keys=True - ) + source_signature = _artifact_string_repr(source) config_name = "recipe-" + short_hex_hash(source_signature) # Obtain data stream from the source stream = source() diff --git a/src/unitxt/artifact.py b/src/unitxt/artifact.py index e1ccae320e..b2bd693ec8 100644 --- a/src/unitxt/artifact.py +++ b/src/unitxt/artifact.py @@ -4,6 +4,7 @@ import os 
import pkgutil import re +import types import warnings from abc import abstractmethod from typing import Any, Dict, List, Optional, Tuple, Union, final @@ -27,6 +28,8 @@ from .utils import ( artifacts_json_cache, json_dump, + json_load, + load_json, save_to_file, shallow_copy, ) @@ -119,16 +122,32 @@ def reset(self): self.catalogs = [] +def maybe_recover_function_operator(func): + sig = inspect.signature(func) + param_names = tuple(sorted(sig.parameters)) + if param_names == ("stream", "stream_name") or param_names == ( + "instance", + "stream_name", + ): + from .operators import FunctionOperator + + return FunctionOperator(function=func) + return func + + def maybe_recover_artifacts_structure(obj): + if isinstance(obj, types.FunctionType): + obj = maybe_recover_function_operator(obj) + if Artifact.is_possible_identifier(obj): return verbosed_fetch_artifact(obj) if isinstance(obj, dict): for key, value in obj.items(): - obj[key] = maybe_recover_artifact(value) + obj[key] = maybe_recover_artifacts_structure(value) return obj if isinstance(obj, list): for i in range(len(obj)): - obj[i] = maybe_recover_artifact(obj[i]) + obj[i] = maybe_recover_artifacts_structure(obj[i]) return obj return obj @@ -237,8 +256,7 @@ def __init_subclass__(cls, **kwargs): def is_artifact_file(cls, path): if not os.path.exists(path) or not os.path.isfile(path): return False - with open(path) as f: - d = json.load(f) + d = load_json(path) return cls.is_artifact_dict(d) @classmethod @@ -384,14 +402,15 @@ def serialize(self): return self.to_json() def save(self, path): - original_args = Artifact.from_dict(self.to_dict()).get_repr_dict() + data = self.to_dict() + original_args = Artifact.from_dict(data).get_repr_dict() current_args = self.get_repr_dict() diffs = dict_diff_string(original_args, current_args) if diffs: raise UnitxtError( f"Cannot save catalog artifacts that have changed since initialization. 
Detected differences in the following fields:\n{diffs}" ) - save_to_file(path, self.to_json()) + save_to_file(path, json_dump(data)) def verify_instance( self, instance: Dict[str, Any], name: Optional[str] = None @@ -581,7 +600,7 @@ def fetch_artifact( # If Json string, first load into dictionary if isinstance(artifact_rep, str): - artifact_rep = json.loads(artifact_rep) + artifact_rep = json_load(artifact_rep) # Load from dictionary (fails if not valid dictionary) return Artifact.from_dict(artifact_rep), None @@ -657,7 +676,7 @@ def get_artifacts_data_classification(artifact: str) -> Optional[List[str]]: ) try: - data_classification = json.loads(data_classification) + data_classification = json_load(data_classification) except json.decoder.JSONDecodeError as e: raise RuntimeError(error_msg) from e diff --git a/src/unitxt/catalog.py b/src/unitxt/catalog.py index 3221c3ee0d..e1a6a114fb 100644 --- a/src/unitxt/catalog.py +++ b/src/unitxt/catalog.py @@ -18,6 +18,7 @@ from .logging_utils import get_logger from .settings_utils import get_constants from .text_utils import print_dict +from .utils import json_load from .version import version logger = get_logger() @@ -228,7 +229,7 @@ def _get_tags_from_file(file_path): result = Counter() with open(file_path) as f: - data = json.load(f) + data = json_load(f) if "__tags__" in data and isinstance(data["__tags__"], dict): tags = data["__tags__"] for key, value in tags.items(): diff --git a/src/unitxt/inference.py b/src/unitxt/inference.py index 87488d1da7..809cbc7604 100644 --- a/src/unitxt/inference.py +++ b/src/unitxt/inference.py @@ -51,7 +51,7 @@ from .operators import ArtifactFetcherMixin from .settings_utils import get_constants, get_settings from .type_utils import isoftype -from .utils import retry_connection_with_exponential_backoff +from .utils import json_load, retry_connection_with_exponential_backoff constants = get_constants() settings = get_settings() @@ -403,7 +403,7 @@ def to_tools(self, instance): if task_data is None: return None if isinstance(task_data, str): - task_data = json.loads(task_data) + task_data = json_load(task_data) if "__tools__" in task_data: return task_data["__tools__"] return None @@ -2562,7 +2562,7 @@ class WMLInferenceEngineChat(WMLInferenceEngineBase, WMLChatParamsMixin): def _extract_queries(instance: Dict[str, Any]) -> Tuple[Optional[str], List]: task_data = instance["task_data"] if isinstance(task_data, str): - task_data = json.loads(task_data) + task_data = json_load(task_data) question = task_data.get("question") images = [None] @@ -2682,7 +2682,7 @@ def to_tools( return {"tools": None, "tool_choice": None} if isinstance(task_data, str): - task_data = json.loads(task_data) + task_data = json_load(task_data) if "__tools__" in task_data: tools: List[Dict[str, str]] = task_data["__tools__"] tool_choice: Optional[Dict[str, str]] = task_data.get("__tool_choice__") @@ -2980,7 +2980,7 @@ def _infer( task_data = instance["task_data"] if isinstance(task_data, str): - task_data = json.loads(task_data) + task_data = json_load(task_data) for option in task_data["options"]: requests.append( @@ -3691,7 +3691,7 @@ def _infer( return_meta_data: bool = False, ) -> Union[List[str], List[TextGenerationInferenceOutput]]: task_data = [ - json.loads(instance["task_data"]) if "task_data" in instance else {} + json_load(instance["task_data"]) if "task_data" in instance else {} for instance in dataset ] predictions = ( diff --git a/src/unitxt/llm_as_judge_from_template.py b/src/unitxt/llm_as_judge_from_template.py index 
df2d5abab8..fcb924ce02 100644 --- a/src/unitxt/llm_as_judge_from_template.py +++ b/src/unitxt/llm_as_judge_from_template.py @@ -12,16 +12,15 @@ from .settings_utils import get_settings from .system_prompts import EmptySystemPrompt, SystemPrompt from .templates import Template +from .utils import json_load settings = get_settings() def get_task_data_dict(task_data): - import json - # seems like the task data sometimes comes as a string, not a dict # this fixes it - return json.loads(task_data) if isinstance(task_data, str) else task_data + return json_load(task_data) if isinstance(task_data, str) else task_data class LLMAsJudgeBase(BulkInstanceMetric, ArtifactFetcherMixin): diff --git a/src/unitxt/operators.py b/src/unitxt/operators.py index 410b940df1..39deaf2d16 100644 --- a/src/unitxt/operators.py +++ b/src/unitxt/operators.py @@ -3,6 +3,28 @@ Operators: Building Blocks of Unitxt Processing Pipelines ============================================================== +.. tip:: + + If you cannot find an operator that fits your needs, simply use an instance function operator: + + .. code-block:: python + + def my_function(instance, stream_name=None): + instance["x"] += 42 + return instance + + Or a stream function operator: + + .. code-block:: python + + def my_other_function(stream, stream_name=None): + for instance in stream: + instance["x"] += 42 + yield instance + + Both functions can be plugged in anywhere Unitxt requires an operator, e.g. in a pre-processing pipeline. + + Within the Unitxt framework, operators serve as the foundational elements used to assemble processing pipelines. Each operator is designed to perform specific manipulations on dictionary structures within a stream. These operators are callable entities that receive a MultiStream as input. @@ -39,6 +61,7 @@ ------------------------ """ +import inspect import operator import re import uuid @@ -2714,3 +2737,38 @@ def process( ), f"field '{self.main_field}' must reside in instance in order to verify its jsonschema correctness. got {instance}" self.recursive_trace_for_type_fields(instance[self.main_field]) return instance +class FunctionOperator(StreamOperator): + function: Callable + + def verify(self): + super().verify() + + if not callable(self.function): + raise ValueError("Function must be callable.") + sig = inspect.signature(self.function) + param_names = set(sig.parameters) + + if "stream_name" not in param_names: + raise TypeError( + "The provided function must have a 'stream_name' parameter." + ) + + if "stream" not in param_names and "instance" not in param_names: + raise TypeError( + "The provided function must have a 'stream' parameter or 'instance' parameter." + ) + + if len(param_names) != 2: + raise TypeError("The provided function must have only 2 parameters") + + if "stream" in param_names: + self._mode = "stream" + if "instance" in param_names: + self._mode = "instance" + + def process(self, stream: Stream, stream_name: Optional[str] = None): + if self._mode == "stream": + yield from self.function(stream, stream_name) + if self._mode == "instance": + for instance in stream: + yield self.function(instance, stream_name) diff --git a/src/unitxt/test_utils/artifact.py b/src/unitxt/test_utils/artifact.py index 79b46ba04e..5294f79c99 100644 --- a/src/unitxt/test_utils/artifact.py +++ b/src/unitxt/test_utils/artifact.py @@ -1,7 +1,7 @@ -import json import tempfile from .. 
import add_to_catalog, register_local_catalog +from ..api import _artifact_string_repr from ..artifact import fetch_artifact from ..logging_utils import get_logger from ..text_utils import print_dict @@ -19,11 +19,14 @@ def test_artfifact_saving_and_loading(artifact, tester=None): loaded_artifact, _ = fetch_artifact(TEMP_NAME) if tester is not None: with tester.subTest(artifact=artifact, loaded_artifact=loaded_artifact): - tester.assertDictEqual(loaded_artifact.to_dict(), artifact.to_dict()) + tester.assertEqual( + _artifact_string_repr(loaded_artifact), + _artifact_string_repr(artifact), + ) else: - if not json.dumps( - loaded_artifact.to_dict(), sort_keys=True, ensure_ascii=False - ) == json.dumps(artifact.to_dict(), sort_keys=True): + if not _artifact_string_repr(loaded_artifact) == _artifact_string_repr( + artifact + ): logger.info("Artifact loaded is not equal to artifact stored") print_dict(loaded_artifact.to_dict()) print_dict(artifact.to_dict()) diff --git a/src/unitxt/test_utils/card.py b/src/unitxt/test_utils/card.py index a02c811a12..c6ef53693e 100644 --- a/src/unitxt/test_utils/card.py +++ b/src/unitxt/test_utils/card.py @@ -4,6 +4,7 @@ import tempfile from .. import add_to_catalog, register_local_catalog +from ..api import _artifact_string_repr from ..artifact import fetch_artifact from ..collections import Collection from ..logging_utils import get_logger @@ -41,8 +42,8 @@ def test_loading_from_catalog(card): ) register_local_catalog(tmp_dir) card_, _ = fetch_artifact(TEMP_NAME) - assert json.dumps(card_.to_dict(), sort_keys=True) == json.dumps( - card.to_dict(), sort_keys=True + assert _artifact_string_repr(card_) == _artifact_string_repr( + card ), "Card loaded is not equal to card stored" diff --git a/src/unitxt/text_utils.py b/src/unitxt/text_utils.py index c54d3fbd72..5644cf2ca1 100644 --- a/src/unitxt/text_utils.py +++ b/src/unitxt/text_utils.py @@ -1,5 +1,6 @@ import re import shutil +import types from typing import List, Tuple import pandas as pd @@ -295,6 +296,36 @@ def construct_dict_as_python_lines(d, indent_delta=4) -> List[str]: return [f'"{d}"'] if d is None or isinstance(d, (int, float, bool)): return [f"{d}"] + + if isinstance(d, types.FunctionType): + from .utils import get_function_source + + try: + source = get_function_source(d) + source_lines = source.splitlines() + + # Find the base indentation of the function definition + base_indent = len(source_lines[0]) - len(source_lines[0].lstrip()) + + # Remove only the base indentation from each line + result_lines = [] + for line in source_lines: + # Preserve empty lines + if line.strip() == "": + result_lines.append("") + else: + # Remove base indent while preserving internal indentation + if line.startswith(" " * base_indent): + result_lines.append(line[base_indent:]) + else: + result_lines.append(line.lstrip()) + + return result_lines + + except (OSError, TypeError): + # If source is not available + return [f"<function {d.__name__}>"] + raise RuntimeError(f"unrecognized value to print as python: {d}") diff --git a/src/unitxt/utils.py b/src/unitxt/utils.py index 2bfd7b1522..70fb31a0fb 100644 --- a/src/unitxt/utils.py +++ b/src/unitxt/utils.py @@ -1,11 +1,13 @@ import copy import functools import importlib.util +import inspect import json import os import random import re import time +import types from collections import OrderedDict from contextvars import ContextVar from functools import wraps @@ -221,7 +223,7 @@ def flatten_dict( def load_json(path): with open(path) as f: try: - return json.load(f, 
object_hook=decode_function) except json.decoder.JSONDecodeError as e: with open(path) as f: file_content = "\n".join(f.readlines()) @@ -236,8 +238,56 @@ def save_to_file(path, data): f.write("\n") -def json_dump(data): - return json.dumps(data, indent=4, ensure_ascii=False) +def encode_function(obj): + # Allow only plain (module-level) functions + if isinstance(obj, types.FunctionType): + try: + return {"__function__": obj.__name__, "source": get_function_source(obj)} + except Exception as e: + raise TypeError(f"Failed to serialize function {obj.__name__}") from e + elif isinstance(obj, types.MethodType): + raise TypeError( + f"Method {obj.__func__.__name__} of class {obj.__self__.__class__.__name__} is not JSON serializable" + ) + raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable") + + +def json_dump(data, sort_keys=False): + return json.dumps( + data, indent=4, default=encode_function, ensure_ascii=False, sort_keys=sort_keys + ) + + +def get_function_source(func): + if hasattr(func, "__exec_source__"): + return func.__exec_source__ + return inspect.getsource(func) + + +def decode_function(obj): + # Detect our special function marker + if "__function__" in obj and "source" in obj: + namespace = {} + func_name = obj["__function__"] + try: + exec(obj["source"], namespace) + func = namespace.get(func_name) + func.__exec_source__ = obj["source"] + if not callable(func): + raise ValueError( + f"Source did not define a callable named {func_name!r}" + ) + return func + except Exception as e: + raise ValueError( + f"Failed to load function {func_name!r} from source:\n{obj['source']}" + ) from e + + return obj + + +def json_load(s): + return json.loads(s, object_hook=decode_function) def is_package_installed(package_name): diff --git a/tests/library/test_function_operators.py b/tests/library/test_function_operators.py index f101727e05..522ced9322 100644 --- a/tests/library/test_function_operators.py +++ b/tests/library/test_function_operators.py @@ -1,13 +1,110 @@ import json +import os +import tempfile +import types +from unitxt.artifact import Artifact from unitxt.operator import SequentialOperator -from unitxt.operators import Apply, CopyFields +from unitxt.operators import Apply, CopyFields, FunctionOperator from unitxt.test_utils.operators import check_operator from tests.utils import UnitxtTestCase +def process_stream(stream, stream_name=None): + for instance in stream: + instance["x"] += 1 + yield instance + + +def process_instance(instance, stream_name=None): + instance["x"] += 1 + return instance + + +def wrong_function(instance): + ... 
+ + class TestFunctionOperators(UnitxtTestCase): + def test_saving_and_loading_operator_holding_function_operator(self): + with tempfile.TemporaryDirectory() as temp_dir: + artifact_path = os.path.join(temp_dir, "temp_func.json") + SequentialOperator(steps=[process_stream]).save(artifact_path) + + loaded = Artifact.load(artifact_path) + self.assertIsInstance(loaded, SequentialOperator) + if isinstance(loaded, SequentialOperator): + self.assertIsInstance(loaded.steps[0], FunctionOperator) + if isinstance(loaded.steps[0], FunctionOperator): + self.assertIsInstance(loaded.steps[0].function, types.FunctionType) + + def test_saving_and_loading_function_operator(self): + with tempfile.TemporaryDirectory() as temp_dir: + artifact_path = os.path.join(temp_dir, "temp_func.json") + FunctionOperator(function=process_stream).save(artifact_path) + + loaded = Artifact.load(artifact_path) + self.assertIsInstance(loaded, FunctionOperator) + if isinstance(loaded, FunctionOperator): + self.assertIsInstance(loaded.function, types.FunctionType) + + def test_saving_and_loading_operator_with_regular_function(self): + with tempfile.TemporaryDirectory() as temp_dir: + artifact_path = os.path.join(temp_dir, "temp_func.json") + SequentialOperator(steps=[wrong_function]).save(artifact_path) + + loaded = Artifact.load(artifact_path) + self.assertIsInstance(loaded, SequentialOperator) + if isinstance(loaded, SequentialOperator): + self.assertIsInstance(loaded.steps[0], types.FunctionType) + + def test_stream_function_operators(self): + operator = FunctionOperator(function=process_stream) + + inputs = [ + {"x": 1, "b": "2"}, + {"x": 2, "b": "3"}, + ] + + targets = [ + {"x": 2, "b": "2"}, + {"x": 3, "b": "3"}, + ] + + check_operator( + operator=operator, + inputs=inputs, + targets=targets, + tester=self, + ) + + def test_instance_function_operators(self): + operator = FunctionOperator(function=process_instance) + + inputs = [ + {"x": 1, "b": "2"}, + {"x": 2, "b": "3"}, + ] + + targets = [ + {"x": 2, "b": "2"}, + {"x": 3, "b": "3"}, + ] + + check_operator( + operator=operator, + inputs=inputs, + targets=targets, + tester=self, + ) + + def test_function_operator_with_wrong_function(self): + with self.assertRaises(ValueError): + FunctionOperator(function=[]) + with self.assertRaises(TypeError): + FunctionOperator(function=wrong_function) + def test_apply_function_operator(self): operator = Apply("a", function=str.upper, to_field="b") From ed3a7e43bbb5e19d2272d8c190aedf016de42363 Mon Sep 17 00:00:00 2001 From: elronbandel Date: Fri, 27 Jun 2025 10:05:54 +0300 Subject: [PATCH 2/9] format Signed-off-by: elronbandel --- src/unitxt/api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/unitxt/api.py b/src/unitxt/api.py index 88e61defa6..0bf36b98f2 100644 --- a/src/unitxt/api.py +++ b/src/unitxt/api.py @@ -26,7 +26,7 @@ from .settings_utils import get_constants, get_settings from .standard import DatasetRecipe from .task import Task -from .utils import lru_cache_decorator +from .utils import json_dump, lru_cache_decorator logger = get_logger() constants = get_constants() From 9e330754f6350f0a93c4837bd901d7f9ea0bdc9d Mon Sep 17 00:00:00 2001 From: dafnapension Date: Fri, 18 Jul 2025 21:00:22 +0300 Subject: [PATCH 3/9] normalize to_dict of simple tyes, like re.DOTALL, and speed test_preparation Signed-off-by: dafnapension --- .github/workflows/catalog_preparation.yml | 4 ++-- src/unitxt/dataclass.py | 5 +++++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git 
a/.github/workflows/catalog_preparation.yml b/.github/workflows/catalog_preparation.yml index b420165116..a01f84d61b 100644 --- a/.github/workflows/catalog_preparation.yml +++ b/.github/workflows/catalog_preparation.yml @@ -27,7 +27,7 @@ jobs: strategy: matrix: - modulo: [0,1,2,3,4,5,6,7] + modulo: [0,1,2,3,4,5,6,7,8,9,10,11] steps: - uses: actions/checkout@v5 @@ -53,7 +53,7 @@ jobs: run: | modulo="${{ matrix.modulo }}" echo "modulo=${modulo}" >> $GITHUB_STEP_SUMMARY - echo "sed -i 's/^num_par = 1 /num_par = 8 /' tests/catalog/test_preparation.py" > sedit.sh + echo "sed -i 's/^num_par = 1 /num_par = 12 /' tests/catalog/test_preparation.py" > sedit.sh echo "sed -i 's/^modulo = 0/modulo = ${modulo}/' tests/catalog/test_preparation.py" >> sedit.sh sh sedit.sh python -m unittest tests.catalog.test_preparation diff --git a/src/unitxt/dataclass.py b/src/unitxt/dataclass.py index afb92bcf0e..49332a9ed4 100644 --- a/src/unitxt/dataclass.py +++ b/src/unitxt/dataclass.py @@ -2,6 +2,7 @@ import dataclasses import functools import inspect +import json from abc import ABCMeta from inspect import Parameter, Signature from typing import Any, Dict, List, Optional, final @@ -321,6 +322,10 @@ def to_dict(obj, func=copy.deepcopy, _visited=None): # Get object ID to track visited objects obj_id = id(obj) + if isinstance(obj, (int, float, bool)): + # normalize constants like re.DOTALL + obj = json.loads(json.dumps(obj)) + # If we've seen this object before, return a placeholder to avoid infinite recursion if obj_id in _visited: return func(obj) From f407b94b10ab21af367c69e2768b3ce4936da79b Mon Sep 17 00:00:00 2001 From: dafnapension Date: Sun, 20 Jul 2025 11:55:49 +0300 Subject: [PATCH 4/9] fixed jsonschema programatically, with python string operations. can now read through the whole recipe output Signed-off-by: dafnapension --- prepare/cards/xlam_function_calling.py | 6 +- .../cards/xlam_function_calling_60k.json | 6 +- src/unitxt/operators.py | 81 +++++++++++++++++++ 3 files changed, 85 insertions(+), 8 deletions(-) diff --git a/prepare/cards/xlam_function_calling.py b/prepare/cards/xlam_function_calling.py index 4ac5c473c2..55925fd2cd 100644 --- a/prepare/cards/xlam_function_calling.py +++ b/prepare/cards/xlam_function_calling.py @@ -14,14 +14,12 @@ def extract_required_parameters(instance, stream_name=None): - result = [] for tool in instance["tools"]: required_params = [] for param_name, param_info in tool["parameters"]["properties"].items(): - if "optional" not in param_info["type"]: + if "optional" not in param_info["type"].lower(): required_params.append(param_name) - result.append(required_params) - instance["required"] = result + tool["parameters"]["required"] = required_params return instance diff --git a/src/unitxt/catalog/cards/xlam_function_calling_60k.json b/src/unitxt/catalog/cards/xlam_function_calling_60k.json index a7f65e3694..9fcd8f763c 100644 --- a/src/unitxt/catalog/cards/xlam_function_calling_60k.json +++ b/src/unitxt/catalog/cards/xlam_function_calling_60k.json @@ -64,10 +64,8 @@ "expression": "[[p for p, c in tool['parameters']['properties'].items() if 'optional' not in c['type'].lower()] for tool in tools]" }, { - "__type__": "copy", - "field": "required", - "to_field": "tools/*/parameters/required", - "set_every_value": true + "__function__": "extract_required_parameters", + "source": "def extract_required_parameters(instance, stream_name=None):\n for tool in instance[\"tools\"]:\n required_params = []\n for param_name, param_info in tool[\"parameters\"][\"properties\"].items():\n if 
\"optional\" not in param_info[\"type\"].lower():\n required_params.append(param_name)\n tool[\"parameters\"][\"required\"] = required_params\n return instance\n" }, { "__type__": "fix_json_schema_of_parameter_types", diff --git a/src/unitxt/operators.py b/src/unitxt/operators.py index 39deaf2d16..1c5157a9f4 100644 --- a/src/unitxt/operators.py +++ b/src/unitxt/operators.py @@ -2772,3 +2772,84 @@ def process(self, stream: Stream, stream_name: Optional[str] = None): if self._mode == "instance": for instance in stream: yield self.function(instance, stream_name) + + +class FixJsonSchemaOfToolParameterType(InstanceOperator): + path_to_parameter: str # the dict(s) that contain(s) the 'type' to be fixed + + def prepare(self): + self.simple_mapping = { + "str": "string", + "int": "integer", + "List": "array", + "list": "array", + "set": "array", + "Set": "array", + "float": "number", + "bool": "boolean", + "dict": "object", + "Dict": "object", + } + + def dict_type_of(self, type_str: str) -> dict: + return {"type": type_str} + + def type_str_to_jsonschema_dict(self, type_str: str) -> dict: + if type_str in self.simple_mapping: + return self.dict_type_of(self.simple_mapping[type_str]) + m = re.match(r"^(List|Tuple)\[(.*?)\]$", type_str) + if m: + basic_type = self.dict_type_of("array") + basic_type["items"] = self.type_str_to_jsonschema_dict( + m.group(2) if m.group(1) == "List" else m.group(2).split(",")[0].strip() + ) + return basic_type + + m = re.match(r"^(Union)\[(.*?)\]$", type_str) + if m: + args = m.group(2).split(",") + for i in range(len(args)): + args[i] = args[i].strip() + return {"anyOf": [self.type_str_to_jsonschema_dict(arg) for arg in args]} + if "," in type_str: + sub_types = type_str.split(",") + for i in range(len(sub_types)): + sub_types[i] = sub_types[i].strip() + assert len(sub_types) in [ + 2, + 3, + ], f"num of subtypes should be 2 or 3, got {type_str}" + basic_type = self.type_str_to_jsonschema_dict(sub_types[0]) + for sub_type in sub_types[1:]: + if sub_type.lower().startswith("default"): + basic_type["default"] = re.split(r"[= ]", sub_type, maxsplit=1)[1] + for sub_type in sub_types[1:]: + if sub_type.lower().startswith("optional"): + return {"anyOf": [basic_type, self.dict_type_of("null")]} + return basic_type + + return self.dict_type_of("object") # otherwise - fall back to a safe zone + + def process( + self, instance: Dict[str, Any], stream_name: Optional[str] = None + ) -> Dict[str, Any]: + # get a list of sub_dicts to fix (if self.path_to_parameter contains *) or a single sub_dict (if not) + parameters_to_fix = dict_get(instance, self.path_to_parameter) + if not isinstance(parameters_to_fix, list): + parameters_to_fix = [parameters_to_fix] + for parameter_to_fix in parameters_to_fix: + if not isinstance(parameter_to_fix, list): + parameter_to_fix = [parameter_to_fix] + for property_to_fix in parameter_to_fix: + assert isinstance( + property_to_fix, dict + ), f"property to fix should be a dict, got {property_to_fix}" + assert ( + "type" in property_to_fix + ), f"field 'type' should be in property to fix, got {property_to_fix}" + jsonschema_dict = self.type_str_to_jsonschema_dict( + property_to_fix["type"] + ) + property_to_fix.pop("type") + property_to_fix.update(jsonschema_dict) + return instance From a5fdd0ce3bad36ba7592a2fdd5555e2e0af8c31a Mon Sep 17 00:00:00 2001 From: dafnapension Date: Sun, 20 Jul 2025 23:24:54 +0300 Subject: [PATCH 5/9] combine the fix of jsonschema with that needed for bfcl Signed-off-by: dafnapension --- prepare/cards/bfcl.py | 93 
++++++++++--------- .../cards/bfcl/multi_turn/java_v3.json | 4 +- .../cards/bfcl/multi_turn/javascript_v3.json | 4 +- .../bfcl/multi_turn/live_irrelevance_v3.json | 4 +- .../bfcl/multi_turn/live_multiple_v3.json | 4 +- .../multi_turn/live_parallel_multiple_v3.json | 4 +- .../bfcl/multi_turn/live_parallel_v3.json | 4 +- .../bfcl/multi_turn/live_relevance_v3.json | 4 +- .../cards/bfcl/multi_turn/live_simple_v3.json | 4 +- .../cards/bfcl/multi_turn/multiple_v3.json | 4 +- .../bfcl/multi_turn/parallel_multiple_v3.json | 4 +- .../cards/bfcl/multi_turn/parallel_v3.json | 4 +- .../cards/bfcl/multi_turn/simple_v3.json | 4 +- src/unitxt/catalog/cards/bfcl/simple_v3.json | 4 +- src/unitxt/operators.py | 79 ++++++++++------ 15 files changed, 136 insertions(+), 88 deletions(-) diff --git a/prepare/cards/bfcl.py b/prepare/cards/bfcl.py index 1b5159937c..6cdf692e30 100644 --- a/prepare/cards/bfcl.py +++ b/prepare/cards/bfcl.py @@ -5,6 +5,7 @@ from unitxt.operators import ( Copy, ExecuteExpression, + FixJsonSchemaOfToolParameterTypes, Set, ) from unitxt.stream_operators import JoinStreams @@ -33,7 +34,7 @@ ), Copy(field="question/0/0/content", to_field="query"), Copy(field="function", to_field="tools"), - "operators.fix_json_schema", + FixJsonSchemaOfToolParameterTypes(), # Process ground truth data in this dataset, which is a provided as a list of options per field, # and convert it into a list of explicit tool calls # @@ -102,7 +103,7 @@ ), Copy(field="question/*/0", to_field="dialog"), Copy(field="function", to_field="tools"), - "operators.fix_json_schema", + FixJsonSchemaOfToolParameterTypes(), ExecuteExpression( expression='[{"name": k, "arguments": dict(zip(v.keys(), vals))} for d in ground_truth for k, v in d.items() for vals in itertools.product(*v.values())]', to_field="reference_calls", @@ -138,52 +139,52 @@ test_card(card, strict=False) add_to_catalog(card, f"cards.bfcl.multi_turn.{subset}_v3", overwrite=True) - for subset in [ - "live_relevance", - "live_irrelevance", - ]: - card = TaskCard( - loader=LoadJsonFile( - files={ - "test": base_path + f"BFCL_v3_{subset}.json", - }, - lines=True, - data_classification_policy=["public"], - ), - preprocess_steps=[ - Copy(field="question/*/0", to_field="dialog"), - Copy(field="function", to_field="tools"), - "operators.fix_json_schema", - Set(fields={"reference_calls": []}), - ], - task="tasks.tool_calling.multi_turn", - templates=["templates.tool_calling.multi_turn"], - __description__=( - """The Berkeley function calling leaderboard is a live leaderboard to evaluate the ability of different LLMs to call functions (also referred to as tools). We built this dataset from our learnings to be representative of most users' function calling use-cases, for example, in agents, as a part of enterprise workflows, etc. 
To this end, our evaluation dataset spans diverse categories, and across multiple languages.""" - ), - __title__=f"""Berkeley Function Calling Leaderboard (Multi Turn Setup) - {subset.replace("_", " ").title()} V3""", - __tags__={ - "annotations_creators": "expert-generated", - "language": ["en"], - "license": "apache-2.0", - "size_categories": ["10K dict: return {"type": type_str} + def recursive_trace_for_type_fields(self, containing_element): + if isinstance(containing_element, dict): + keys = list(containing_element.keys()) + for key in keys: + if key == "type" and isinstance(containing_element["type"], str): + jsonschema_dict = self.type_str_to_jsonschema_dict( + containing_element["type"] + ) + containing_element.pop("type") + containing_element.update(jsonschema_dict) + else: + self.recursive_trace_for_type_fields(containing_element[key]) + elif isinstance(containing_element, list): + for list_element in containing_element: + self.recursive_trace_for_type_fields(list_element) + def type_str_to_jsonschema_dict(self, type_str: str) -> dict: if type_str in self.simple_mapping: return self.dict_type_of(self.simple_mapping[type_str]) @@ -2833,23 +2869,8 @@ def type_str_to_jsonschema_dict(self, type_str: str) -> dict: def process( self, instance: Dict[str, Any], stream_name: Optional[str] = None ) -> Dict[str, Any]: - # get a list of sub_dicts to fix (if self.path_to_parameter contains *) or a single sub_dict (if not) - parameters_to_fix = dict_get(instance, self.path_to_parameter) - if not isinstance(parameters_to_fix, list): - parameters_to_fix = [parameters_to_fix] - for parameter_to_fix in parameters_to_fix: - if not isinstance(parameter_to_fix, list): - parameter_to_fix = [parameter_to_fix] - for property_to_fix in parameter_to_fix: - assert isinstance( - property_to_fix, dict - ), f"property to fix should be a dict, got {property_to_fix}" - assert ( - "type" in property_to_fix - ), f"field 'type' should be in property to fix, got {property_to_fix}" - jsonschema_dict = self.type_str_to_jsonschema_dict( - property_to_fix["type"] - ) - property_to_fix.pop("type") - property_to_fix.update(jsonschema_dict) + assert ( + "tools" in instance + ), f"field 'tools' must reside in instance in order to verify its jsonschema correctness. 
got {instance}" + self.recursive_trace_for_type_fields(instance["tools"]) return instance From fc81dc82dcc705215c19092bfbecde6e1718dada Mon Sep 17 00:00:00 2001 From: dafnapension Date: Mon, 21 Jul 2025 16:44:37 +0300 Subject: [PATCH 6/9] last touches in bfcl, reviewing whole datasets Signed-off-by: dafnapension --- docs/docs/adding_operator.rst | 6 +++--- prepare/cards/bfcl.py | 6 +++--- src/unitxt/catalog/cards/bfcl/multi_turn/java_v3.json | 2 +- .../catalog/cards/bfcl/multi_turn/javascript_v3.json | 2 +- .../catalog/cards/bfcl/multi_turn/live_multiple_v3.json | 2 +- .../cards/bfcl/multi_turn/live_parallel_multiple_v3.json | 2 +- .../catalog/cards/bfcl/multi_turn/live_parallel_v3.json | 2 +- .../catalog/cards/bfcl/multi_turn/live_simple_v3.json | 2 +- src/unitxt/catalog/cards/bfcl/multi_turn/multiple_v3.json | 2 +- .../cards/bfcl/multi_turn/parallel_multiple_v3.json | 2 +- src/unitxt/catalog/cards/bfcl/multi_turn/parallel_v3.json | 2 +- src/unitxt/catalog/cards/bfcl/multi_turn/simple_v3.json | 2 +- src/unitxt/operators.py | 8 +++----- 13 files changed, 19 insertions(+), 21 deletions(-) diff --git a/docs/docs/adding_operator.rst b/docs/docs/adding_operator.rst index 49c68359ed..c258ad9139 100644 --- a/docs/docs/adding_operator.rst +++ b/docs/docs/adding_operator.rst @@ -40,11 +40,11 @@ There are several types of operators. 1. Field Operators - Operators that modify individual fields of the instances in the input streams. Example of such operators are operators that cast field values, uppercase string fields, or translate text between languages. -1. Instance Operators - Operators that modify individual instances in the input streams. For example, operators that add or remove fields. +2. Instance Operators - Operators that modify individual instances in the input streams. For example, operators that add or remove fields. -2. Stream Operators - Operators that perform operations on full streams. For example, operators that remove instances based on some condition. +3. Stream Operators - Operators that perform operations on full streams. For example, operators that remove instances based on some condition. -3. MultiStream Operators - Operator that perform operations on multiple streams. For example, operators that repartition the instances between train and test splits. +4. MultiStream Operators - Operator that perform operations on multiple streams. For example, operators that repartition the instances between train and test splits. Unitxt comes with a large collection of built in operators - that were design to cover most common requirements of dataset processing. 
diff --git a/prepare/cards/bfcl.py b/prepare/cards/bfcl.py index 6cdf692e30..8b1c67a82b 100644 --- a/prepare/cards/bfcl.py +++ b/prepare/cards/bfcl.py @@ -75,12 +75,12 @@ for subset in [ "simple", "multiple", - "live_multiple", + "live_multiple", # instances above 900 reach size of hundreds of MBs "live_simple", "java", "javascript", "parallel", - "parallel_multiple", + "parallel_multiple", # error caused by instance 179, hence expression now constrains: if isinstance(v, dict) "live_parallel", "live_parallel_multiple", ]: @@ -105,7 +105,7 @@ Copy(field="function", to_field="tools"), FixJsonSchemaOfToolParameterTypes(), ExecuteExpression( - expression='[{"name": k, "arguments": dict(zip(v.keys(), vals))} for d in ground_truth for k, v in d.items() for vals in itertools.product(*v.values())]', + expression='[{"name": k, "arguments": dict(zip(v.keys(), vals))} for d in ground_truth for k, v in d.items() if isinstance(v, dict) for vals in itertools.product(*v.values())]', to_field="reference_calls", imports_list=["itertools"], ), diff --git a/src/unitxt/catalog/cards/bfcl/multi_turn/java_v3.json b/src/unitxt/catalog/cards/bfcl/multi_turn/java_v3.json index cb6f420f9f..5884e73362 100644 --- a/src/unitxt/catalog/cards/bfcl/multi_turn/java_v3.json +++ b/src/unitxt/catalog/cards/bfcl/multi_turn/java_v3.json @@ -35,7 +35,7 @@ }, { "__type__": "execute_expression", - "expression": "[{\"name\": k, \"arguments\": dict(zip(v.keys(), vals))} for d in ground_truth for k, v in d.items() for vals in itertools.product(*v.values())]", + "expression": "[{\"name\": k, \"arguments\": dict(zip(v.keys(), vals))} for d in ground_truth for k, v in d.items() if isinstance(v, dict) for vals in itertools.product(*v.values())]", "to_field": "reference_calls", "imports_list": [ "itertools" diff --git a/src/unitxt/catalog/cards/bfcl/multi_turn/javascript_v3.json b/src/unitxt/catalog/cards/bfcl/multi_turn/javascript_v3.json index 2a259af79c..9f3fc92841 100644 --- a/src/unitxt/catalog/cards/bfcl/multi_turn/javascript_v3.json +++ b/src/unitxt/catalog/cards/bfcl/multi_turn/javascript_v3.json @@ -35,7 +35,7 @@ }, { "__type__": "execute_expression", - "expression": "[{\"name\": k, \"arguments\": dict(zip(v.keys(), vals))} for d in ground_truth for k, v in d.items() for vals in itertools.product(*v.values())]", + "expression": "[{\"name\": k, \"arguments\": dict(zip(v.keys(), vals))} for d in ground_truth for k, v in d.items() if isinstance(v, dict) for vals in itertools.product(*v.values())]", "to_field": "reference_calls", "imports_list": [ "itertools" diff --git a/src/unitxt/catalog/cards/bfcl/multi_turn/live_multiple_v3.json b/src/unitxt/catalog/cards/bfcl/multi_turn/live_multiple_v3.json index e6338d77b0..e166caec6a 100644 --- a/src/unitxt/catalog/cards/bfcl/multi_turn/live_multiple_v3.json +++ b/src/unitxt/catalog/cards/bfcl/multi_turn/live_multiple_v3.json @@ -35,7 +35,7 @@ }, { "__type__": "execute_expression", - "expression": "[{\"name\": k, \"arguments\": dict(zip(v.keys(), vals))} for d in ground_truth for k, v in d.items() for vals in itertools.product(*v.values())]", + "expression": "[{\"name\": k, \"arguments\": dict(zip(v.keys(), vals))} for d in ground_truth for k, v in d.items() if isinstance(v, dict) for vals in itertools.product(*v.values())]", "to_field": "reference_calls", "imports_list": [ "itertools" diff --git a/src/unitxt/catalog/cards/bfcl/multi_turn/live_parallel_multiple_v3.json b/src/unitxt/catalog/cards/bfcl/multi_turn/live_parallel_multiple_v3.json index c67b9bcb26..62a4057715 100644 --- 
a/src/unitxt/catalog/cards/bfcl/multi_turn/live_parallel_multiple_v3.json +++ b/src/unitxt/catalog/cards/bfcl/multi_turn/live_parallel_multiple_v3.json @@ -35,7 +35,7 @@ }, { "__type__": "execute_expression", - "expression": "[{\"name\": k, \"arguments\": dict(zip(v.keys(), vals))} for d in ground_truth for k, v in d.items() for vals in itertools.product(*v.values())]", + "expression": "[{\"name\": k, \"arguments\": dict(zip(v.keys(), vals))} for d in ground_truth for k, v in d.items() if isinstance(v, dict) for vals in itertools.product(*v.values())]", "to_field": "reference_calls", "imports_list": [ "itertools" diff --git a/src/unitxt/catalog/cards/bfcl/multi_turn/live_parallel_v3.json b/src/unitxt/catalog/cards/bfcl/multi_turn/live_parallel_v3.json index ba4b8792a1..9fb28a5847 100644 --- a/src/unitxt/catalog/cards/bfcl/multi_turn/live_parallel_v3.json +++ b/src/unitxt/catalog/cards/bfcl/multi_turn/live_parallel_v3.json @@ -35,7 +35,7 @@ }, { "__type__": "execute_expression", - "expression": "[{\"name\": k, \"arguments\": dict(zip(v.keys(), vals))} for d in ground_truth for k, v in d.items() for vals in itertools.product(*v.values())]", + "expression": "[{\"name\": k, \"arguments\": dict(zip(v.keys(), vals))} for d in ground_truth for k, v in d.items() if isinstance(v, dict) for vals in itertools.product(*v.values())]", "to_field": "reference_calls", "imports_list": [ "itertools" diff --git a/src/unitxt/catalog/cards/bfcl/multi_turn/live_simple_v3.json b/src/unitxt/catalog/cards/bfcl/multi_turn/live_simple_v3.json index e21fb37e3f..669796edc1 100644 --- a/src/unitxt/catalog/cards/bfcl/multi_turn/live_simple_v3.json +++ b/src/unitxt/catalog/cards/bfcl/multi_turn/live_simple_v3.json @@ -35,7 +35,7 @@ }, { "__type__": "execute_expression", - "expression": "[{\"name\": k, \"arguments\": dict(zip(v.keys(), vals))} for d in ground_truth for k, v in d.items() for vals in itertools.product(*v.values())]", + "expression": "[{\"name\": k, \"arguments\": dict(zip(v.keys(), vals))} for d in ground_truth for k, v in d.items() if isinstance(v, dict) for vals in itertools.product(*v.values())]", "to_field": "reference_calls", "imports_list": [ "itertools" diff --git a/src/unitxt/catalog/cards/bfcl/multi_turn/multiple_v3.json b/src/unitxt/catalog/cards/bfcl/multi_turn/multiple_v3.json index e2d952852c..51178a0a83 100644 --- a/src/unitxt/catalog/cards/bfcl/multi_turn/multiple_v3.json +++ b/src/unitxt/catalog/cards/bfcl/multi_turn/multiple_v3.json @@ -35,7 +35,7 @@ }, { "__type__": "execute_expression", - "expression": "[{\"name\": k, \"arguments\": dict(zip(v.keys(), vals))} for d in ground_truth for k, v in d.items() for vals in itertools.product(*v.values())]", + "expression": "[{\"name\": k, \"arguments\": dict(zip(v.keys(), vals))} for d in ground_truth for k, v in d.items() if isinstance(v, dict) for vals in itertools.product(*v.values())]", "to_field": "reference_calls", "imports_list": [ "itertools" diff --git a/src/unitxt/catalog/cards/bfcl/multi_turn/parallel_multiple_v3.json b/src/unitxt/catalog/cards/bfcl/multi_turn/parallel_multiple_v3.json index 3e7dece737..2d8fcd4f1c 100644 --- a/src/unitxt/catalog/cards/bfcl/multi_turn/parallel_multiple_v3.json +++ b/src/unitxt/catalog/cards/bfcl/multi_turn/parallel_multiple_v3.json @@ -35,7 +35,7 @@ }, { "__type__": "execute_expression", - "expression": "[{\"name\": k, \"arguments\": dict(zip(v.keys(), vals))} for d in ground_truth for k, v in d.items() for vals in itertools.product(*v.values())]", + "expression": "[{\"name\": k, \"arguments\": 
dict(zip(v.keys(), vals))} for d in ground_truth for k, v in d.items() if isinstance(v, dict) for vals in itertools.product(*v.values())]", "to_field": "reference_calls", "imports_list": [ "itertools" diff --git a/src/unitxt/catalog/cards/bfcl/multi_turn/parallel_v3.json b/src/unitxt/catalog/cards/bfcl/multi_turn/parallel_v3.json index f961deda0b..c3da658808 100644 --- a/src/unitxt/catalog/cards/bfcl/multi_turn/parallel_v3.json +++ b/src/unitxt/catalog/cards/bfcl/multi_turn/parallel_v3.json @@ -35,7 +35,7 @@ }, { "__type__": "execute_expression", - "expression": "[{\"name\": k, \"arguments\": dict(zip(v.keys(), vals))} for d in ground_truth for k, v in d.items() for vals in itertools.product(*v.values())]", + "expression": "[{\"name\": k, \"arguments\": dict(zip(v.keys(), vals))} for d in ground_truth for k, v in d.items() if isinstance(v, dict) for vals in itertools.product(*v.values())]", "to_field": "reference_calls", "imports_list": [ "itertools" diff --git a/src/unitxt/catalog/cards/bfcl/multi_turn/simple_v3.json b/src/unitxt/catalog/cards/bfcl/multi_turn/simple_v3.json index ad51f352b3..2b7311103d 100644 --- a/src/unitxt/catalog/cards/bfcl/multi_turn/simple_v3.json +++ b/src/unitxt/catalog/cards/bfcl/multi_turn/simple_v3.json @@ -35,7 +35,7 @@ }, { "__type__": "execute_expression", - "expression": "[{\"name\": k, \"arguments\": dict(zip(v.keys(), vals))} for d in ground_truth for k, v in d.items() for vals in itertools.product(*v.values())]", + "expression": "[{\"name\": k, \"arguments\": dict(zip(v.keys(), vals))} for d in ground_truth for k, v in d.items() if isinstance(v, dict) for vals in itertools.product(*v.values())]", "to_field": "reference_calls", "imports_list": [ "itertools" diff --git a/src/unitxt/operators.py b/src/unitxt/operators.py index 8cf3a6f033..35db9dcc6f 100644 --- a/src/unitxt/operators.py +++ b/src/unitxt/operators.py @@ -2781,7 +2781,6 @@ def prepare(self): "any": "object", "Any": "object", "Array": "array", - "array": "array", "ArrayList": "array", "Bigint": "integer", "bool": "boolean", @@ -2795,11 +2794,9 @@ def prepare(self): "HashMap": "object", "Hashtable": "object", "int": "integer", - "integer": "integer", "list": "array", "List": "array", "long": "integer", - "number": "number", "Queue": "array", "short": "integer", "Stack": "array", @@ -2808,7 +2805,6 @@ def prepare(self): "set": "array", "str": "string", "String": "string", - "string": "string", } def dict_type_of(self, type_str: str) -> dict: @@ -2847,6 +2843,8 @@ def type_str_to_jsonschema_dict(self, type_str: str) -> dict: for i in range(len(args)): args[i] = args[i].strip() return {"anyOf": [self.type_str_to_jsonschema_dict(arg) for arg in args]} + if re.match(r"^(Callable)\[(.*?)\]$", type_str): + return self.dict_type_of("object") if "," in type_str: sub_types = type_str.split(",") for i in range(len(sub_types)): @@ -2864,7 +2862,7 @@ def type_str_to_jsonschema_dict(self, type_str: str) -> dict: return {"anyOf": [basic_type, self.dict_type_of("null")]} return basic_type - return self.dict_type_of("object") # otherwise - fall back to a safe zone + return self.dict_type_of(type_str) # otherwise - return what arrived def process( self, instance: Dict[str, Any], stream_name: Optional[str] = None From 73ec1e4cc7e49becf871f5e5169ea818af8dc24e Mon Sep 17 00:00:00 2001 From: dafnapension Date: Sun, 3 Aug 2025 14:35:28 +0300 Subject: [PATCH 7/9] revert to num-parallel = 8 in test_preparation Signed-off-by: dafnapension --- .github/workflows/catalog_preparation.yml | 4 ++-- 1 file changed, 2 
insertions(+), 2 deletions(-)

diff --git a/.github/workflows/catalog_preparation.yml b/.github/workflows/catalog_preparation.yml
index a01f84d61b..b420165116 100644
--- a/.github/workflows/catalog_preparation.yml
+++ b/.github/workflows/catalog_preparation.yml
@@ -27,7 +27,7 @@ jobs:
     strategy:
       matrix:
-        modulo: [0,1,2,3,4,5,6,7,8,9,10,11]
+        modulo: [0,1,2,3,4,5,6,7]

     steps:
       - uses: actions/checkout@v5
@@ -53,7 +53,7 @@ jobs:
         run: |
           modulo="${{ matrix.modulo }}"
           echo "modulo=${modulo}" >> $GITHUB_STEP_SUMMARY
-          echo "sed -i 's/^num_par = 1 /num_par = 12 /' tests/catalog/test_preparation.py" > sedit.sh
+          echo "sed -i 's/^num_par = 1 /num_par = 8 /' tests/catalog/test_preparation.py" > sedit.sh
           echo "sed -i 's/^modulo = 0/modulo = ${modulo}/' tests/catalog/test_preparation.py" >> sedit.sh
           sh sedit.sh
           python -m unittest tests.catalog.test_preparation

From 4aa1e22f5c54be0c6bc3b53a76640a3dd8976edb Mon Sep 17 00:00:00 2001
From: dafnapension
Date: Sun, 3 Aug 2025 20:27:54 +0300
Subject: [PATCH 8/9] fix no extra problematic subset here, leave for a
 separate PR

Signed-off-by: dafnapension
---
 prepare/cards/bfcl.py                                         | 4 ++--
 src/unitxt/catalog/cards/bfcl/multi_turn/java_v3.json         | 2 +-
 src/unitxt/catalog/cards/bfcl/multi_turn/javascript_v3.json   | 2 +-
 .../catalog/cards/bfcl/multi_turn/live_multiple_v3.json       | 2 +-
 .../cards/bfcl/multi_turn/live_parallel_multiple_v3.json      | 2 +-
 .../catalog/cards/bfcl/multi_turn/live_parallel_v3.json       | 2 +-
 src/unitxt/catalog/cards/bfcl/multi_turn/live_simple_v3.json  | 2 +-
 src/unitxt/catalog/cards/bfcl/multi_turn/multiple_v3.json     | 2 +-
 .../catalog/cards/bfcl/multi_turn/parallel_multiple_v3.json   | 2 +-
 src/unitxt/catalog/cards/bfcl/multi_turn/parallel_v3.json     | 2 +-
 src/unitxt/catalog/cards/bfcl/multi_turn/simple_v3.json       | 2 +-
 11 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/prepare/cards/bfcl.py b/prepare/cards/bfcl.py
index 8b1c67a82b..8551bfb42d 100644
--- a/prepare/cards/bfcl.py
+++ b/prepare/cards/bfcl.py
@@ -80,7 +80,7 @@
         "java",
         "javascript",
         "parallel",
-        "parallel_multiple",  # error caused by instance 179, hence expression now constrains: if isinstance(v, dict)
+        "parallel_multiple",  # error caused by instance 179
         "live_parallel",
         "live_parallel_multiple",
     ]:
@@ -105,7 +105,7 @@
             Copy(field="function", to_field="tools"),
             FixJsonSchemaOfToolParameterTypes(),
             ExecuteExpression(
-                expression='[{"name": k, "arguments": dict(zip(v.keys(), vals))} for d in ground_truth for k, v in d.items() if isinstance(v, dict) for vals in itertools.product(*v.values())]',
+                expression='[{"name": k, "arguments": dict(zip(v.keys(), vals))} for d in ground_truth for k, v in d.items() for vals in itertools.product(*v.values())]',
                 to_field="reference_calls",
                 imports_list=["itertools"],
             ),
diff --git a/src/unitxt/catalog/cards/bfcl/multi_turn/java_v3.json b/src/unitxt/catalog/cards/bfcl/multi_turn/java_v3.json
index 5884e73362..cb6f420f9f 100644
--- a/src/unitxt/catalog/cards/bfcl/multi_turn/java_v3.json
+++ b/src/unitxt/catalog/cards/bfcl/multi_turn/java_v3.json
@@ -35,7 +35,7 @@
         },
         {
             "__type__": "execute_expression",
-            "expression": "[{\"name\": k, \"arguments\": dict(zip(v.keys(), vals))} for d in ground_truth for k, v in d.items() if isinstance(v, dict) for vals in itertools.product(*v.values())]",
+            "expression": "[{\"name\": k, \"arguments\": dict(zip(v.keys(), vals))} for d in ground_truth for k, v in d.items() for vals in itertools.product(*v.values())]",
             "to_field": "reference_calls",
             "imports_list": [
                 "itertools"
diff --git a/src/unitxt/catalog/cards/bfcl/multi_turn/javascript_v3.json b/src/unitxt/catalog/cards/bfcl/multi_turn/javascript_v3.json
index 9f3fc92841..2a259af79c 100644
--- a/src/unitxt/catalog/cards/bfcl/multi_turn/javascript_v3.json
+++ b/src/unitxt/catalog/cards/bfcl/multi_turn/javascript_v3.json
@@ -35,7 +35,7 @@
         },
         {
             "__type__": "execute_expression",
-            "expression": "[{\"name\": k, \"arguments\": dict(zip(v.keys(), vals))} for d in ground_truth for k, v in d.items() if isinstance(v, dict) for vals in itertools.product(*v.values())]",
+            "expression": "[{\"name\": k, \"arguments\": dict(zip(v.keys(), vals))} for d in ground_truth for k, v in d.items() for vals in itertools.product(*v.values())]",
             "to_field": "reference_calls",
             "imports_list": [
                 "itertools"
diff --git a/src/unitxt/catalog/cards/bfcl/multi_turn/live_multiple_v3.json b/src/unitxt/catalog/cards/bfcl/multi_turn/live_multiple_v3.json
index e166caec6a..e6338d77b0 100644
--- a/src/unitxt/catalog/cards/bfcl/multi_turn/live_multiple_v3.json
+++ b/src/unitxt/catalog/cards/bfcl/multi_turn/live_multiple_v3.json
@@ -35,7 +35,7 @@
         },
         {
             "__type__": "execute_expression",
-            "expression": "[{\"name\": k, \"arguments\": dict(zip(v.keys(), vals))} for d in ground_truth for k, v in d.items() if isinstance(v, dict) for vals in itertools.product(*v.values())]",
+            "expression": "[{\"name\": k, \"arguments\": dict(zip(v.keys(), vals))} for d in ground_truth for k, v in d.items() for vals in itertools.product(*v.values())]",
             "to_field": "reference_calls",
             "imports_list": [
                 "itertools"
diff --git a/src/unitxt/catalog/cards/bfcl/multi_turn/live_parallel_multiple_v3.json b/src/unitxt/catalog/cards/bfcl/multi_turn/live_parallel_multiple_v3.json
index 62a4057715..c67b9bcb26 100644
--- a/src/unitxt/catalog/cards/bfcl/multi_turn/live_parallel_multiple_v3.json
+++ b/src/unitxt/catalog/cards/bfcl/multi_turn/live_parallel_multiple_v3.json
@@ -35,7 +35,7 @@
         },
         {
             "__type__": "execute_expression",
-            "expression": "[{\"name\": k, \"arguments\": dict(zip(v.keys(), vals))} for d in ground_truth for k, v in d.items() if isinstance(v, dict) for vals in itertools.product(*v.values())]",
+            "expression": "[{\"name\": k, \"arguments\": dict(zip(v.keys(), vals))} for d in ground_truth for k, v in d.items() for vals in itertools.product(*v.values())]",
             "to_field": "reference_calls",
             "imports_list": [
                 "itertools"
diff --git a/src/unitxt/catalog/cards/bfcl/multi_turn/live_parallel_v3.json b/src/unitxt/catalog/cards/bfcl/multi_turn/live_parallel_v3.json
index 9fb28a5847..ba4b8792a1 100644
--- a/src/unitxt/catalog/cards/bfcl/multi_turn/live_parallel_v3.json
+++ b/src/unitxt/catalog/cards/bfcl/multi_turn/live_parallel_v3.json
@@ -35,7 +35,7 @@
         },
         {
             "__type__": "execute_expression",
-            "expression": "[{\"name\": k, \"arguments\": dict(zip(v.keys(), vals))} for d in ground_truth for k, v in d.items() if isinstance(v, dict) for vals in itertools.product(*v.values())]",
+            "expression": "[{\"name\": k, \"arguments\": dict(zip(v.keys(), vals))} for d in ground_truth for k, v in d.items() for vals in itertools.product(*v.values())]",
             "to_field": "reference_calls",
             "imports_list": [
                 "itertools"
diff --git a/src/unitxt/catalog/cards/bfcl/multi_turn/live_simple_v3.json b/src/unitxt/catalog/cards/bfcl/multi_turn/live_simple_v3.json
index 669796edc1..e21fb37e3f 100644
--- a/src/unitxt/catalog/cards/bfcl/multi_turn/live_simple_v3.json
+++ b/src/unitxt/catalog/cards/bfcl/multi_turn/live_simple_v3.json
@@ -35,7 +35,7 @@
         },
         {
             "__type__": "execute_expression",
-            "expression": "[{\"name\": k, \"arguments\": dict(zip(v.keys(), vals))} for d in ground_truth for k, v in d.items() if isinstance(v, dict) for vals in itertools.product(*v.values())]",
+            "expression": "[{\"name\": k, \"arguments\": dict(zip(v.keys(), vals))} for d in ground_truth for k, v in d.items() for vals in itertools.product(*v.values())]",
             "to_field": "reference_calls",
             "imports_list": [
                 "itertools"
diff --git a/src/unitxt/catalog/cards/bfcl/multi_turn/multiple_v3.json b/src/unitxt/catalog/cards/bfcl/multi_turn/multiple_v3.json
index 51178a0a83..e2d952852c 100644
--- a/src/unitxt/catalog/cards/bfcl/multi_turn/multiple_v3.json
+++ b/src/unitxt/catalog/cards/bfcl/multi_turn/multiple_v3.json
@@ -35,7 +35,7 @@
         },
         {
             "__type__": "execute_expression",
-            "expression": "[{\"name\": k, \"arguments\": dict(zip(v.keys(), vals))} for d in ground_truth for k, v in d.items() if isinstance(v, dict) for vals in itertools.product(*v.values())]",
+            "expression": "[{\"name\": k, \"arguments\": dict(zip(v.keys(), vals))} for d in ground_truth for k, v in d.items() for vals in itertools.product(*v.values())]",
             "to_field": "reference_calls",
             "imports_list": [
                 "itertools"
diff --git a/src/unitxt/catalog/cards/bfcl/multi_turn/parallel_multiple_v3.json b/src/unitxt/catalog/cards/bfcl/multi_turn/parallel_multiple_v3.json
index 2d8fcd4f1c..3e7dece737 100644
--- a/src/unitxt/catalog/cards/bfcl/multi_turn/parallel_multiple_v3.json
+++ b/src/unitxt/catalog/cards/bfcl/multi_turn/parallel_multiple_v3.json
@@ -35,7 +35,7 @@
         },
         {
             "__type__": "execute_expression",
-            "expression": "[{\"name\": k, \"arguments\": dict(zip(v.keys(), vals))} for d in ground_truth for k, v in d.items() if isinstance(v, dict) for vals in itertools.product(*v.values())]",
+            "expression": "[{\"name\": k, \"arguments\": dict(zip(v.keys(), vals))} for d in ground_truth for k, v in d.items() for vals in itertools.product(*v.values())]",
             "to_field": "reference_calls",
             "imports_list": [
                 "itertools"
diff --git a/src/unitxt/catalog/cards/bfcl/multi_turn/parallel_v3.json b/src/unitxt/catalog/cards/bfcl/multi_turn/parallel_v3.json
index c3da658808..f961deda0b 100644
--- a/src/unitxt/catalog/cards/bfcl/multi_turn/parallel_v3.json
+++ b/src/unitxt/catalog/cards/bfcl/multi_turn/parallel_v3.json
@@ -35,7 +35,7 @@
         },
         {
             "__type__": "execute_expression",
-            "expression": "[{\"name\": k, \"arguments\": dict(zip(v.keys(), vals))} for d in ground_truth for k, v in d.items() if isinstance(v, dict) for vals in itertools.product(*v.values())]",
+            "expression": "[{\"name\": k, \"arguments\": dict(zip(v.keys(), vals))} for d in ground_truth for k, v in d.items() for vals in itertools.product(*v.values())]",
             "to_field": "reference_calls",
             "imports_list": [
                 "itertools"
diff --git a/src/unitxt/catalog/cards/bfcl/multi_turn/simple_v3.json b/src/unitxt/catalog/cards/bfcl/multi_turn/simple_v3.json
index 2b7311103d..ad51f352b3 100644
--- a/src/unitxt/catalog/cards/bfcl/multi_turn/simple_v3.json
+++ b/src/unitxt/catalog/cards/bfcl/multi_turn/simple_v3.json
@@ -35,7 +35,7 @@
         },
         {
             "__type__": "execute_expression",
-            "expression": "[{\"name\": k, \"arguments\": dict(zip(v.keys(), vals))} for d in ground_truth for k, v in d.items() if isinstance(v, dict) for vals in itertools.product(*v.values())]",
+            "expression": "[{\"name\": k, \"arguments\": dict(zip(v.keys(), vals))} for d in ground_truth for k, v in d.items() for vals in itertools.product(*v.values())]",
             "to_field": "reference_calls",
             "imports_list": [
                 "itertools"
From c6a16c622713d4cdafc1cbdaae4a2093cb284e01 Mon Sep 17 00:00:00 2001
From: dafnapension
Date: Thu, 14 Aug 2025 15:48:05 +0300
Subject: [PATCH 9/9] revert bfcl to main, to be fixed in another PR; revert
 changes to xlam -- they were already committed to main and now live with the
 general schema fixer operator, so the operator tailored for tools is removed

Signed-off-by: dafnapension
---
 prepare/cards/bfcl.py                              | 97 +++++++++--------
 prepare/cards/xlam_function_calling.py             | 11 --
 .../cards/bfcl/multi_turn/java_v3.json             |  4 +-
 .../cards/bfcl/multi_turn/javascript_v3.json       |  4 +-
 .../bfcl/multi_turn/live_irrelevance_v3.json       |  4 +-
 .../bfcl/multi_turn/live_multiple_v3.json          |  4 +-
 .../multi_turn/live_parallel_multiple_v3.json      |  4 +-
 .../bfcl/multi_turn/live_parallel_v3.json          |  4 +-
 .../bfcl/multi_turn/live_relevance_v3.json         |  4 +-
 .../cards/bfcl/multi_turn/live_simple_v3.json      |  4 +-
 .../cards/bfcl/multi_turn/multiple_v3.json         |  4 +-
 .../bfcl/multi_turn/parallel_multiple_v3.json      |  4 +-
 .../cards/bfcl/multi_turn/parallel_v3.json         |  4 +-
 .../cards/bfcl/multi_turn/simple_v3.json           |  4 +-
 src/unitxt/catalog/cards/bfcl/simple_v3.json       |  4 +-
 .../cards/xlam_function_calling_60k.json           |  6 +-
 src/unitxt/operators.py                            | 102 +-----------------
 17 files changed, 67 insertions(+), 201 deletions(-)

diff --git a/prepare/cards/bfcl.py b/prepare/cards/bfcl.py
index 8551bfb42d..1b5159937c 100644
--- a/prepare/cards/bfcl.py
+++ b/prepare/cards/bfcl.py
@@ -5,7 +5,6 @@
 from unitxt.operators import (
     Copy,
     ExecuteExpression,
-    FixJsonSchemaOfToolParameterTypes,
     Set,
 )
 from unitxt.stream_operators import JoinStreams
@@ -34,7 +33,7 @@
         ),
         Copy(field="question/0/0/content", to_field="query"),
         Copy(field="function", to_field="tools"),
-        FixJsonSchemaOfToolParameterTypes(),
+        "operators.fix_json_schema",
         # Process ground truth data in this dataset, which is provided as a list of options per field,
         # and convert it into a list of explicit tool calls
         #
@@ -75,12 +74,12 @@
 for subset in [
     "simple",
     "multiple",
-    "live_multiple",  # instances above 900 reach size of hundreds of MBs
+    "live_multiple",
     "live_simple",
     "java",
     "javascript",
     "parallel",
-    "parallel_multiple",  # error caused by instance 179
+    "parallel_multiple",
     "live_parallel",
     "live_parallel_multiple",
 ]:
@@ -103,7 +102,7 @@
             ),
             Copy(field="question/*/0", to_field="dialog"),
             Copy(field="function", to_field="tools"),
-            FixJsonSchemaOfToolParameterTypes(),
+            "operators.fix_json_schema",
             ExecuteExpression(
                 expression='[{"name": k, "arguments": dict(zip(v.keys(), vals))} for d in ground_truth for k, v in d.items() for vals in itertools.product(*v.values())]',
                 to_field="reference_calls",
                 imports_list=["itertools"],
             ),
@@ -139,52 +138,52 @@
     test_card(card, strict=False)
     add_to_catalog(card, f"cards.bfcl.multi_turn.{subset}_v3", overwrite=True)

-    for subset in [
-        "live_relevance",
-        "live_irrelevance",
-    ]:
-        card = TaskCard(
-            loader=LoadJsonFile(
-                files={
-                    "test": base_path + f"BFCL_v3_{subset}.json",
-                },
-                lines=True,
-                data_classification_policy=["public"],
-            ),
-            preprocess_steps=[
-                Copy(field="question/*/0", to_field="dialog"),
-                Copy(field="function", to_field="tools"),
-                FixJsonSchemaOfToolParameterTypes(),
-                Set(fields={"reference_calls": []}),
-            ],
-            task="tasks.tool_calling.multi_turn",
-            templates=["templates.tool_calling.multi_turn"],
-            __description__=(
-                """The Berkeley function calling leaderboard is a live leaderboard to evaluate the ability of different LLMs to call functions (also referred to as tools). We built this dataset from our learnings to be representative of most users' function calling use-cases, for example, in agents, as a part of enterprise workflows, etc. To this end, our evaluation dataset spans diverse categories, and across multiple languages."""
-            ),
-            __title__=f"""Berkeley Function Calling Leaderboard (Multi Turn Setup) - {subset.replace("_", " ").title()} V3""",
-            __tags__={
-                "annotations_creators": "expert-generated",
-                "language": ["en"],
-                "license": "apache-2.0",
-                "size_categories": ["10K
-    def dict_type_of(self, type_str: str) -> dict:
-        return {"type": type_str}
-
-    def recursive_trace_for_type_fields(self, containing_element):
-        if isinstance(containing_element, dict):
-            keys = list(containing_element.keys())
-            for key in keys:
-                if key == "type" and isinstance(containing_element["type"], str):
-                    jsonschema_dict = self.type_str_to_jsonschema_dict(
-                        containing_element["type"]
-                    )
-                    containing_element.pop("type")
-                    containing_element.update(jsonschema_dict)
-                else:
-                    self.recursive_trace_for_type_fields(containing_element[key])
-        elif isinstance(containing_element, list):
-            for list_element in containing_element:
-                self.recursive_trace_for_type_fields(list_element)
-
-    def type_str_to_jsonschema_dict(self, type_str: str) -> dict:
-        if type_str in self.simple_mapping:
-            return self.dict_type_of(self.simple_mapping[type_str])
-        m = re.match(r"^(List|Tuple)\[(.*?)\]$", type_str)
-        if m:
-            basic_type = self.dict_type_of("array")
-            basic_type["items"] = self.type_str_to_jsonschema_dict(
-                m.group(2) if m.group(1) == "List" else m.group(2).split(",")[0].strip()
-            )
-            return basic_type
-
-        m = re.match(r"^(Union)\[(.*?)\]$", type_str)
-        if m:
-            args = m.group(2).split(",")
-            for i in range(len(args)):
-                args[i] = args[i].strip()
-            return {"anyOf": [self.type_str_to_jsonschema_dict(arg) for arg in args]}
-        if re.match(r"^(Callable)\[(.*?)\]$", type_str):
-            return self.dict_type_of("object")
-        if "," in type_str:
-            sub_types = type_str.split(",")
-            for i in range(len(sub_types)):
-                sub_types[i] = sub_types[i].strip()
-            assert len(sub_types) in [
-                2,
-                3,
-            ], f"num of subtypes should be 2 or 3, got {type_str}"
-            basic_type = self.type_str_to_jsonschema_dict(sub_types[0])
-            for sub_type in sub_types[1:]:
-                if sub_type.lower().startswith("default"):
-                    basic_type["default"] = re.split(r"[= ]", sub_type, maxsplit=1)[1]
-            for sub_type in sub_types[1:]:
-                if sub_type.lower().startswith("optional"):
-                    return {"anyOf": [basic_type, self.dict_type_of("null")]}
-            return basic_type
-
-        return self.dict_type_of(type_str)  # otherwise - return what arrived
-
-    def process(
-        self, instance: Dict[str, Any], stream_name: Optional[str] = None
-    ) -> Dict[str, Any]:
-        assert (
-            "tools" in instance
-        ), f"field 'tools' must reside in instance in order to verify its jsonschema correctness. got {instance}"
-        self.recursive_trace_for_type_fields(instance["tools"])
-        return instance
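For context on the removal above: the job of `FixJsonSchemaOfToolParameterTypes` was to rewrite free-form type strings in tool parameter schemas into valid JSON Schema fragments, a role the commit message says is now played by the catalog operator `operators.fix_json_schema`. A rough, simplified sketch of the idea; the mapping entries and the handled patterns here are assumptions for illustration, not the shipped implementation:

    import re

    # Assumed mapping from dataset type strings to JSON Schema type names;
    # the removed operator kept a similar table (exact entries not shown here).
    SIMPLE_MAPPING = {
        "str": "string",
        "int": "integer",
        "float": "number",
        "bool": "boolean",
        "dict": "object",
        "list": "array",
    }

    def type_str_to_jsonschema(type_str: str) -> dict:
        """Best-effort translation of a Python-style type string to JSON Schema."""
        type_str = type_str.strip()
        if type_str in SIMPLE_MAPPING:
            return {"type": SIMPLE_MAPPING[type_str]}
        m = re.match(r"^List\[(.*)\]$", type_str)
        if m:  # e.g. "List[int]" -> array of integers
            return {"type": "array", "items": type_str_to_jsonschema(m.group(1))}
        if type_str.endswith(", optional"):  # e.g. "int, optional" -> nullable int
            base = type_str_to_jsonschema(type_str[: -len(", optional")])
            return {"anyOf": [base, {"type": "null"}]}
        return {"type": type_str}  # fall through: keep whatever arrived

    print(type_str_to_jsonschema("List[int]"))
    # {'type': 'array', 'items': {'type': 'integer'}}
    print(type_str_to_jsonschema("str, optional"))
    # {'anyOf': [{'type': 'string'}, {'type': 'null'}]}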