-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathconfig.yaml
More file actions
185 lines (167 loc) · 6.33 KB
/
config.yaml
File metadata and controls
185 lines (167 loc) · 6.33 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
# VLLM Host Configuration
# Set the active_model to the model you want to run
# The run.sh script will automatically submit the correct SLURM job

# Active model selection
# Options: gpt-oss-20b, gpt-oss-120b, glm-4.7-flash, glm-4.7, glm-4.6v,
#          glm-4.6v-fp16, qwen3-vl-235b, llama-4-scout, llama-4-maverick
active_model: glm-4.6v
# Model configurations
# Each entry describes one deployable model: which SLURM job script launches
# it, which port the vLLM server listens on, and its GPU/node footprint.
models:
  gpt-oss-20b:
    name: "GPT-OSS 20B"
    huggingface_id: "openai/gpt-oss-20b"
    job_script: "jobs/gpt_oss_20b_single_h100.sh"
    query_script: "scripts/query_openai_compatible.py"  # Optional: script to query the model
    port: 8000
    gpus: 1
    nodes: 1
    quantization: "mxfp4"  # Built-in MXFP4 quantization
    precision: "8bit"  # Options: "8bit" (default), "16bit" (2x VRAM)
    vision: false
    description: "24GB VRAM, fast reasoning model with tool use"
  gpt-oss-120b:
    name: "GPT-OSS 120B"
    huggingface_id: "openai/gpt-oss-120b"
    job_script: "jobs/gpt_oss_120b_single_h100.sh"
    query_script: "scripts/query_openai_compatible.py"
    port: 8000
    gpus: 1
    nodes: 1
    quantization: "mxfp4"
    precision: "8bit"
    vision: false
    description: "80GB VRAM, high-reasoning model for production"
  glm-4.7-flash:
    name: "GLM-4.7-Flash"
    huggingface_id: "zai-org/GLM-4.7-Flash"
    job_script: "jobs/glm4_7_flash_2gpus.sh"
    query_script: "scripts/query_openai_compatible.py"
    port: 8001
    gpus: 2
    nodes: 1
    quantization: "fp8"
    precision: "8bit"
    vision: false
    description: "30B-A3B MoE, 128K context, fast inference on 2 GPUs"
  glm-4.7:
    name: "GLM-4.7"
    huggingface_id: "zai-org/GLM-4.7"
    job_script: "jobs/glm4_7_2nodes_4gpus.sh"
    query_script: "scripts/query_openai_compatible.py"
    port: 8002
    gpus: 8
    nodes: 2
    quantization: "fp8"
    precision: "8bit"
    vision: false
    description: "358B MoE, 131K context, 8 GPUs (2 nodes)"
  glm-4.6v:
    name: "GLM-4.6V"
    huggingface_id: "zai-org/GLM-4.6V-FP8"
    job_script: "jobs/glm4_6v_2gpus_fp8.sh"
    query_script: "scripts/query_openai_compatible.py"
    port: 8004
    gpus: 2
    nodes: 1
    quantization: "fp8"
    precision: "8bit"  # 8-bit uses 2 GPUs, 16-bit would need 4 GPUs
    vision: true
    description: "108B vision-language MoE, 128K context, 2 GPUs with FP8"
  # 16-bit variant of glm-4.6v; shares port 8004 since only one runs at a time
  glm-4.6v-fp16:
    name: "GLM-4.6V (16-bit)"
    huggingface_id: "zai-org/GLM-4.6V"
    job_script: "jobs/glm4_6v_4gpus_fp16.sh"
    query_script: "scripts/query_openai_compatible.py"
    port: 8004
    gpus: 4
    nodes: 1
    quantization: "bf16"
    precision: "16bit"
    vision: true
    description: "108B vision-language MoE, FULL PRECISION, requires 4 GPUs"
  qwen3-vl-235b:
    name: "Qwen3-VL-235B-A22B-Thinking"
    huggingface_id: "Qwen/Qwen3-VL-235B-A22B-Thinking-FP8"
    job_script: "jobs/qwen3_vl_235b_1node.sh"
    query_script: "scripts/query_openai_compatible.py"
    port: 8003
    gpus: 8
    nodes: 2
    quantization: "fp8"
    precision: "8bit"
    vision: true
    time: "24:00:00"  # Job time limit (overrides slurm.time_default)
    # partition: "long"  # Optional: override SLURM partition
    description: "235B MoE VL, 256K context, visual agent capabilities, 8 GPUs (2 nodes)"
  llama-4-scout:
    name: "Llama-4-Scout-17B-16E-Instruct"
    huggingface_id: "meta-llama/Llama-4-Scout-17B-16E-Instruct"
    job_script: "jobs/llama4_scout_2gpus.sh"
    query_script: "scripts/query_openai_compatible.py"
    port: 8005
    gpus: 2
    nodes: 1
    quantization: "fp8"
    precision: "8bit"
    vision: true
    description: "17Bx16E MoE, 10M context, multimodal VLM, 2 GPUs with FP8"
  llama-4-maverick:
    name: "Llama-4-Maverick-17B-128E-Instruct"
    huggingface_id: "meta-llama/Llama-4-Maverick-17B-128E-Instruct"
    job_script: "jobs/llama4_maverick_2nodes_8gpus.sh"
    query_script: "scripts/query_openai_compatible.py"
    port: 8006
    gpus: 8
    nodes: 2
    quantization: "fp8"
    precision: "8bit"
    vision: true
    description: "17Bx128E MoE, 1M context, multimodal VLM, 8 GPUs (2 nodes) with FP8"
# SLURM cluster configuration
slurm:
  partition: "capella"
  account: "p_scads_pathology"  # SLURM account for job submission
  time_default: "12:00:00"  # Default job time limit (HH:MM:SS)
  cpus_per_task: 16  # Number of CPU cores per GPU task
  mem_per_gpu: "64G"  # Memory allocation per GPU
# Cache directories (shared across jobs)
# Set these to your preferred cache locations to avoid exposing paths in job scripts
cache:
  xdg_cache_home: "/data/horse/ws/s1787956-Cache"  # Leave empty to use system default, or set to: /path/to/cache
  triton_cache_dir: "/data/horse/ws/s1787956-Cache/triton"  # Leave empty to use $XDG_CACHE_HOME/triton, or set to: /path/to/triton_cache
  huggingface_cache: "/data/horse/ws/s1787956-Cache/huggingface"  # Optional: HuggingFace hub cache directory
# Environment paths
paths:
  venv_dir: "/data/horse/ws/s1787956-host_vllm/code/host_vllm/.venv"  # Python virtual environment directory (e.g., $HOME/host_vllm/.venv)
  # Leave empty to auto-detect from current environment
# vLLM server settings
vllm:
  gpu_memory_utilization: 0.95  # Fraction of GPU memory to use (0.0-1.0)
  max_model_len_default: 32768  # Default maximum sequence length
  max_num_seqs: 8  # Maximum number of sequences to process in parallel
  tensor_parallel_size: null  # Auto-calculated from model.gpus * model.nodes
  dtype: "auto"  # Data type: auto, half, float16, bfloat16
# Multi-node distributed configuration
distributed:
  rdzv_backend: "c10d"  # PyTorch rendezvous backend (c10d, gloo, mpi)
  rdzv_id: "vllm_job"  # Unique job identifier for rendezvous
  rdzv_timeout: 1200  # Rendezvous timeout in seconds
  master_port_base: 29500  # Base port for communication
# NCCL configuration for multi-GPU/multi-node
nccl:
  socket_ifname: ""  # Network interface for NCCL (e.g., eth0, ib0). Leave empty for auto-detection
  blocksize: 1048576  # NCCL block size for communication
  ib_timeout: 22  # InfiniBand timeout
  debug: "WARN"  # NCCL debug level: TRACE, INFO, WARN, ERROR
# Quantization settings
quantization:
  fp8:  # FP8 quantization settings
    enabled: false  # Enable FP8 if model supports it
    # calibration_size: 512  # Calibration size for FP8 activation scaling
  mxfp4:  # MXFP4 quantization settings
    enabled: false  # Enable MXFP4 if model uses it
  bf16:  # BFloat16 settings
    enabled: false  # Use BFloat16 instead of FP8/INT8
# Logging
logging:
  log_dir: "logs"
  verbose: true