diff --git a/prepare/metrics/llm_as_judge/pairwise_rating/llama_3_arena_hard_template.py b/prepare/metrics/llm_as_judge/pairwise_rating/llama_3_arena_hard_template.py
index a5853628b5..c0ac04542d 100644
--- a/prepare/metrics/llm_as_judge/pairwise_rating/llama_3_arena_hard_template.py
+++ b/prepare/metrics/llm_as_judge/pairwise_rating/llama_3_arena_hard_template.py
@@ -6,7 +6,11 @@
 )
 from unitxt.llm_as_judge import LLMAsJudge
 
-model_list = ["meta-llama/llama-3-8b-instruct", "meta-llama/llama-3-70b-instruct"]
+model_list = [
+    "meta-llama/llama-3-8b-instruct",
+    "meta-llama/llama-3-70b-instruct",
+    "meta-llama/llama-3-3-70b-instruct",
+]
 format = "formats.llama3_instruct"
 templates = [
     "templates.response_assessment.pairwise_comparative_rating.arena_hard",
diff --git a/src/unitxt/catalog/metrics/llm_as_judge/pairwise_comparative_rating/llama_3_3_70b_instruct_generic_engine_template_arena_hard.json b/src/unitxt/catalog/metrics/llm_as_judge/pairwise_comparative_rating/llama_3_3_70b_instruct_generic_engine_template_arena_hard.json
new file mode 100644
index 0000000000..706e4ff03a
--- /dev/null
+++ b/src/unitxt/catalog/metrics/llm_as_judge/pairwise_comparative_rating/llama_3_3_70b_instruct_generic_engine_template_arena_hard.json
@@ -0,0 +1,11 @@
+{
+    "__type__": "llm_as_judge",
+    "inference_model": {
+        "__type__": "generic_inference_engine",
+        "default": "engines.ibm_gen_ai.llama_3_70b_instruct"
+    },
+    "template": "templates.response_assessment.pairwise_comparative_rating.arena_hard",
+    "task": "pairwise_comparative_rating.single_turn",
+    "format": "formats.llama3_instruct",
+    "main_score": "llama_3_3_70b_instruct_generic_engine_template_arena_hard"
+}
diff --git a/src/unitxt/catalog/metrics/llm_as_judge/pairwise_comparative_rating/llama_3_3_70b_instruct_generic_engine_template_arena_hard_with_shuffling.json b/src/unitxt/catalog/metrics/llm_as_judge/pairwise_comparative_rating/llama_3_3_70b_instruct_generic_engine_template_arena_hard_with_shuffling.json
new file mode 100644
index 0000000000..954f0d59bd
--- /dev/null
+++ b/src/unitxt/catalog/metrics/llm_as_judge/pairwise_comparative_rating/llama_3_3_70b_instruct_generic_engine_template_arena_hard_with_shuffling.json
@@ -0,0 +1,11 @@
+{
+    "__type__": "llm_as_judge",
+    "inference_model": {
+        "__type__": "generic_inference_engine",
+        "default": "engines.ibm_gen_ai.llama_3_70b_instruct"
+    },
+    "template": "templates.response_assessment.pairwise_comparative_rating.arena_hard_with_shuffling",
+    "task": "pairwise_comparative_rating.single_turn",
+    "format": "formats.llama3_instruct",
+    "main_score": "llama_3_3_70b_instruct_generic_engine_template_arena_hard_with_shuffling"
+}
diff --git a/src/unitxt/catalog/metrics/llm_as_judge/pairwise_comparative_rating/llama_3_3_70b_instruct_ibm_wml_template_arena_hard.json b/src/unitxt/catalog/metrics/llm_as_judge/pairwise_comparative_rating/llama_3_3_70b_instruct_ibm_wml_template_arena_hard.json
new file mode 100644
index 0000000000..844d0e90c0
--- /dev/null
+++ b/src/unitxt/catalog/metrics/llm_as_judge/pairwise_comparative_rating/llama_3_3_70b_instruct_ibm_wml_template_arena_hard.json
@@ -0,0 +1,13 @@
+{
+    "__type__": "llm_as_judge",
+    "inference_model": {
+        "__type__": "wml_inference_engine",
+        "model_name": "meta-llama/llama-3-3-70b-instruct",
+        "max_new_tokens": 2048,
+        "random_seed": 42
+    },
+    "template": "templates.response_assessment.pairwise_comparative_rating.arena_hard",
+    "task": "pairwise_comparative_rating.single_turn",
+    "format": "formats.llama3_instruct",
+    "main_score": "llama_3_3_70b_instruct_ibm_wml_template_arena_hard"
+}
diff --git a/src/unitxt/catalog/metrics/llm_as_judge/pairwise_comparative_rating/llama_3_3_70b_instruct_ibm_wml_template_arena_hard_with_shuffling.json b/src/unitxt/catalog/metrics/llm_as_judge/pairwise_comparative_rating/llama_3_3_70b_instruct_ibm_wml_template_arena_hard_with_shuffling.json
new file mode 100644
index 0000000000..2e824cd1e2
--- /dev/null
+++ b/src/unitxt/catalog/metrics/llm_as_judge/pairwise_comparative_rating/llama_3_3_70b_instruct_ibm_wml_template_arena_hard_with_shuffling.json
@@ -0,0 +1,13 @@
+{
+    "__type__": "llm_as_judge",
+    "inference_model": {
+        "__type__": "wml_inference_engine",
+        "model_name": "meta-llama/llama-3-3-70b-instruct",
+        "max_new_tokens": 2048,
+        "random_seed": 42
+    },
+    "template": "templates.response_assessment.pairwise_comparative_rating.arena_hard_with_shuffling",
+    "task": "pairwise_comparative_rating.single_turn",
+    "format": "formats.llama3_instruct",
+    "main_score": "llama_3_3_70b_instruct_ibm_wml_template_arena_hard_with_shuffling"
+}