-
Notifications
You must be signed in to change notification settings - Fork 5
Expand file tree
/
Copy pathrun_ablation_experiments.py
More file actions
115 lines (102 loc) · 3.7 KB
/
run_ablation_experiments.py
File metadata and controls
115 lines (102 loc) · 3.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import swanlab
from spectrumlab.models import Qwen_2_5_VL_32B
from spectrumlab.benchmark.signal_group import SignalGroup
from spectrumlab.benchmark.generation_group import GenerationGroup
from spectrumlab.benchmark.perception_group import PerceptionGroup
from spectrumlab.benchmark.semantic_group import SemanticGroup
from spectrumlab.evaluator.choice_evaluator import ChoiceEvaluator
from spectrumlab.evaluator.open_evaluator import OpenEvaluator
# ABLATION_CONFIGS = [
# {
# "model_class": Qwen_2_5_VL_72B,
# "model_name": "Qwen-2.5-VL-72B",
# "temperature": 0,
# "top_p": 1.0,
# "save_dir": "./ablation_qwen_2_5_vl_72b_temp_0_evaluation_results",
# },
# ]
# Ablation runs to execute, one dict per run. The run loop below reads every
# key listed here, and the whole dict is also handed to swanlab.init as the
# experiment config.
ABLATION_CONFIGS = [
    {
        # Model class; the run loop instantiates it with no arguments.
        "model_class": Qwen_2_5_VL_32B,
        # Label used in console output and in the SwanLab experiment name.
        "model_name": "Qwen-2.5-VL-32B",
        # Sampling parameters forwarded to every generate() call of this run.
        "temperature": 1,
        "top_p": 1,
        # Passed to the evaluators as save_path.
        "save_dir": "./ablation_qwen_2_5_vl_32b_baselines_evaluation_results",
    },
]
# Benchmark groups to evaluate, each paired with the evaluator that scores it.
# Signal is listed first. Every group object is constructed with the argument
# "data" (presumably the dataset root directory - confirm against the
# spectrumlab benchmark classes). A "subcategories" value of None means all
# subcategories of that group.
GROUPS = [
    {
        "name": label,
        "group": group_cls("data"),
        "evaluator": evaluator_cls(),  # a fresh evaluator instance per group
        "subcategories": None,  # None means all
    }
    for label, group_cls, evaluator_cls in (
        ("Signal", SignalGroup, ChoiceEvaluator),
        ("Perception", PerceptionGroup, ChoiceEvaluator),
        ("Semantic", SemanticGroup, ChoiceEvaluator),
        ("Generation", GenerationGroup, OpenEvaluator),
    )
]
class ModelWithSamplingParams:
    """Adapter that pins sampling parameters on every ``generate`` call.

    The evaluators call ``model.generate(prompt, max_tokens=...)`` without
    sampling arguments; this wrapper injects the ``temperature`` and ``top_p``
    chosen for the current ablation run.

    Defined once at module level: the class does not depend on any loop
    variable, so there is no reason to re-create it for every group.
    """

    def __init__(self, base_model, temperature, top_p):
        """Store the wrapped model and the sampling parameters to apply.

        Args:
            base_model: Object exposing ``model_name`` and
                ``generate(prompt, max_tokens=..., temperature=..., top_p=...)``.
            temperature: Value forwarded as ``temperature`` on each call.
            top_p: Value forwarded as ``top_p`` on each call.
        """
        self.base_model = base_model
        self.temperature = temperature
        self.top_p = top_p
        # Expose the same name as the wrapped model so code that reads
        # ``model.model_name`` keeps working with the wrapper.
        self.model_name = base_model.model_name

    def generate(self, prompt, max_tokens=512):
        """Return the base model's output for *prompt* with the pinned sampling parameters."""
        return self.base_model.generate(
            prompt,
            max_tokens=max_tokens,
            temperature=self.temperature,
            top_p=self.top_p,
        )


# Run every ablation config: one SwanLab experiment per config, one evaluation
# (and one logged accuracy value) per benchmark group.
for config in ABLATION_CONFIGS:
    print(f"\n{'='*60}")
    print(
        f"开始消融实验: {config['model_name']} (temperature={config['temperature']}, top_p={config['top_p']})"
    )
    print(f"{'='*60}")

    model = config["model_class"]()
    # The sampling parameters are fixed for the whole config, so the wrapper
    # is built once here instead of once per group.
    wrapped_model = ModelWithSamplingParams(
        model, config["temperature"], config["top_p"]
    )

    # Initialize SwanLab
    swanlab.init(
        workspace="SpectrumLab",
        project="spectrumlab-ablation",
        experiment_name=f"{config['model_name']}_temp_{config['temperature']}_top_p_{config['top_p']}",
        config=config,
    )

    # Iterate over each evaluation group
    for group_info in GROUPS:
        name = group_info["name"]
        group = group_info["group"]
        evaluator = group_info["evaluator"]
        subcategories = group_info["subcategories"]

        print(f"\n===== Evaluating {name} Group =====")
        # A falsy ``subcategories`` (None) is replaced by the string "all".
        data = group.get_data_by_subcategories(subcategories or "all")

        results = evaluator.evaluate(
            data_items=data, model=wrapped_model, save_path=config["save_dir"]
        )
        accuracy = results["metrics"]["overall"]["accuracy"]
        print(f"{name} Group evaluation completed! Overall accuracy: {accuracy:.2f}%\n")
        swanlab.log({f"{name}_accuracy": accuracy})

    # Close this config's SwanLab run before the next config starts a new one.
    swanlab.finish()
    print(f"\n消融实验 {config['model_name']} 完成!")
    print(f"结果保存在: {config['save_dir']}")
# use nohup in the terminal to start the evaluation
# nohup python run_ablation_experiments.py > run_ablation.log 2>&1 &