-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathconfig.yaml
More file actions
185 lines (167 loc) · 6.33 KB
/
config.yaml
File metadata and controls
185 lines (167 loc) · 6.33 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
# VLLM Host Configuration
# Set the active_model to the model you want to run
# The run.sh script will automatically submit the correct SLURM job

# Active model selection
# Options: gpt-oss-20b, gpt-oss-120b, glm-4.7-flash, glm-4.7, glm-4.6v,
#          glm-4.6v-fp16, qwen3-vl-235b, llama-4-scout, llama-4-maverick
active_model: glm-4.6v
# Model configurations
# Each entry describes one deployable model: which SLURM job script launches
# it, which port the vLLM server listens on, and its GPU/node footprint.
models:
  gpt-oss-20b:
    name: "GPT-OSS 20B"
    huggingface_id: "openai/gpt-oss-20b"
    job_script: "jobs/gpt_oss_20b_single_h100.sh"
    query_script: "scripts/query_openai_compatible.py"  # Optional: script to query the model
    port: 8000
    gpus: 1
    nodes: 1
    quantization: "mxfp4"  # Built-in MXFP4 quantization
    precision: "8bit"  # Options: "8bit" (default), "16bit" (2x VRAM)
    vision: false
    description: "24GB VRAM, fast reasoning model with tool use"
  gpt-oss-120b:
    name: "GPT-OSS 120B"
    huggingface_id: "openai/gpt-oss-120b"
    job_script: "jobs/gpt_oss_120b_single_h100.sh"
    query_script: "scripts/query_openai_compatible.py"
    port: 8000
    gpus: 1
    nodes: 1
    quantization: "mxfp4"
    precision: "8bit"
    vision: false
    description: "80GB VRAM, high-reasoning model for production"
  glm-4.7-flash:
    name: "GLM-4.7-Flash"
    huggingface_id: "zai-org/GLM-4.7-Flash"
    job_script: "jobs/glm4_7_flash_2gpus.sh"
    query_script: "scripts/query_openai_compatible.py"
    port: 8001
    gpus: 2
    nodes: 1
    quantization: "fp8"
    precision: "8bit"
    vision: false
    description: "30B-A3B MoE, 128K context, fast inference on 2 GPUs"
  glm-4.7:
    name: "GLM-4.7"
    huggingface_id: "zai-org/GLM-4.7"
    job_script: "jobs/glm4_7_2nodes_4gpus.sh"
    query_script: "scripts/query_openai_compatible.py"
    port: 8002
    gpus: 8
    nodes: 2
    quantization: "fp8"
    precision: "8bit"
    vision: false
    description: "358B MoE, 131K context, 8 GPUs (2 nodes)"
  glm-4.6v:
    name: "GLM-4.6V"
    huggingface_id: "zai-org/GLM-4.6V-FP8"
    job_script: "jobs/glm4_6v_2gpus_fp8.sh"
    query_script: "scripts/query_openai_compatible.py"
    port: 8004
    gpus: 2
    nodes: 1
    quantization: "fp8"
    precision: "8bit"  # 8-bit uses 2 GPUs, 16-bit would need 4 GPUs
    vision: true
    description: "108B vision-language MoE, 128K context, 2 GPUs with FP8"
  # 16-bit variant of glm-4.6v; shares port 8004 since only one runs at a time
  glm-4.6v-fp16:
    name: "GLM-4.6V (16-bit)"
    huggingface_id: "zai-org/GLM-4.6V"
    job_script: "jobs/glm4_6v_4gpus_fp16.sh"
    query_script: "scripts/query_openai_compatible.py"
    port: 8004
    gpus: 4
    nodes: 1
    quantization: "bf16"
    precision: "16bit"
    vision: true
    description: "108B vision-language MoE, FULL PRECISION, requires 4 GPUs"
  qwen3-vl-235b:
    name: "Qwen3-VL-235B-A22B-Thinking"
    huggingface_id: "Qwen/Qwen3-VL-235B-A22B-Thinking-FP8"
    job_script: "jobs/qwen3_vl_235b_1node.sh"
    query_script: "scripts/query_openai_compatible.py"
    port: 8003
    gpus: 8
    nodes: 2
    quantization: "fp8"
    precision: "8bit"
    vision: true
    time: "24:00:00"  # Job time limit (overrides slurm.time_default)
    # partition: "long"  # Optional: override SLURM partition
    description: "235B MoE VL, 256K context, visual agent capabilities, 8 GPUs (2 nodes)"
  llama-4-scout:
    name: "Llama-4-Scout-17B-16E-Instruct"
    huggingface_id: "meta-llama/Llama-4-Scout-17B-16E-Instruct"
    job_script: "jobs/llama4_scout_2gpus.sh"
    query_script: "scripts/query_openai_compatible.py"
    port: 8005
    gpus: 2
    nodes: 1
    quantization: "fp8"
    precision: "8bit"
    vision: true
    description: "17Bx16E MoE, 10M context, multimodal VLM, 2 GPUs with FP8"
  llama-4-maverick:
    name: "Llama-4-Maverick-17B-128E-Instruct"
    huggingface_id: "meta-llama/Llama-4-Maverick-17B-128E-Instruct"
    job_script: "jobs/llama4_maverick_2nodes_8gpus.sh"
    query_script: "scripts/query_openai_compatible.py"
    port: 8006
    gpus: 8
    nodes: 2
    quantization: "fp8"
    precision: "8bit"
    vision: true
    description: "17Bx128E MoE, 1M context, multimodal VLM, 8 GPUs (2 nodes) with FP8"
# SLURM cluster configuration
slurm:
  partition: "capella"
  account: "p_scads_pathology"  # SLURM account for job submission
  time_default: "12:00:00"  # Default job time limit (HH:MM:SS)
  cpus_per_task: 16  # Number of CPU cores per GPU task
  mem_per_gpu: "64G"  # Memory allocation per GPU
# Cache directories (shared across jobs)
# Set these to your preferred cache locations to avoid exposing paths in job scripts
cache:
  xdg_cache_home: "/data/horse/ws/s1787956-Cache"  # Leave empty to use system default, or set to: /path/to/cache
  triton_cache_dir: "/data/horse/ws/s1787956-Cache/triton"  # Leave empty to use $XDG_CACHE_HOME/triton, or set to: /path/to/triton_cache
  huggingface_cache: "/data/horse/ws/s1787956-Cache/huggingface"  # Optional: HuggingFace hub cache directory
# Environment paths
paths:
  venv_dir: "/data/horse/ws/s1787956-host_vllm/code/host_vllm/.venv"  # Python virtual environment directory (e.g., $HOME/host_vllm/.venv)
  # Leave empty to auto-detect from current environment
# vLLM server settings
vllm:
  gpu_memory_utilization: 0.95  # Fraction of GPU memory to use (0.0-1.0)
  max_model_len_default: 32768  # Default maximum sequence length
  max_num_seqs: 8  # Maximum number of sequences to process in parallel
  tensor_parallel_size: null  # Auto-calculated from model.gpus * model.nodes
  dtype: "auto"  # Data type: auto, half, float16, bfloat16
# Multi-node distributed configuration
distributed:
  rdzv_backend: "c10d"  # PyTorch rendezvous backend (c10d, gloo, mpi)
  rdzv_id: "vllm_job"  # Unique job identifier for rendezvous
  rdzv_timeout: 1200  # Rendezvous timeout in seconds
  master_port_base: 29500  # Base port for communication
# NCCL configuration for multi-GPU/multi-node
nccl:
  socket_ifname: ""  # Network interface for NCCL (e.g., eth0, ib0). Leave empty for auto-detection
  blocksize: 1048576  # NCCL block size for communication
  ib_timeout: 22  # InfiniBand timeout
  debug: "WARN"  # NCCL debug level: TRACE, INFO, WARN, ERROR
# Quantization settings
quantization:
  fp8:  # FP8 quantization settings
    enabled: false  # Enable FP8 if model supports it
    # calibration_size: 512  # Calibration size for FP8 activation scaling
  mxfp4:  # MXFP4 quantization settings
    enabled: false  # Enable MXFP4 if model uses it
  bf16:  # BFloat16 settings
    enabled: false  # Use BFloat16 instead of FP8/INT8
# Logging
logging:
  log_dir: "logs"
  verbose: true