From f61ed44147db719f286ca1476b969fc5d191060c Mon Sep 17 00:00:00 2001 From: mmngays <146910567+mmngays@users.noreply.github.com> Date: Sat, 15 Nov 2025 22:56:34 +0900 Subject: [PATCH 01/16] plamo3 --- convert_hf_to_gguf.py | 188 ++++++++++++++++++++++++++++++++- gguf-py/gguf/constants.py | 20 ++++ gguf-py/gguf/gguf_writer.py | 3 + gguf-py/gguf/tensor_mapping.py | 2 + src/CMakeLists.txt | 1 + src/llama-arch.cpp | 22 ++++ src/llama-arch.h | 2 + src/llama-model-saver.cpp | 2 +- src/llama-model.cpp | 72 ++++++++++++- src/models/models.h | 4 + src/models/plamo3.cpp | 106 +++++++++++++++++++ 11 files changed, 415 insertions(+), 7 deletions(-) create mode 100644 src/models/plamo3.cpp diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index cc77a3db273e4..f58390ce95e9b 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -493,7 +493,6 @@ def set_gguf_parameters(self): raise NotImplementedError("set_gguf_parameters() must be implemented in subclasses") def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused return [(self.map_tensor_name(name), data_torch)] @@ -1674,7 +1673,6 @@ def set_gguf_parameters(self): self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"]) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads")) n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed")) @@ -4812,6 +4810,192 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [(new_name, data_torch)] +class PlamoTokenizerMixin: + def _set_plamo_vocab(self) -> None: + # PLaMo models use a custom tokenizer with a .jsonl file + tokenizer_jsonl_path = self.dir_model / "tokenizer.jsonl" + tokenizer_config_path = self.dir_model / "tokenizer_config.json" + + if not tokenizer_jsonl_path.is_file(): + raise FileNotFoundError(f"PLaMo tokenizer file not found: {tokenizer_jsonl_path}") + + # Load tokenizer config + with open(tokenizer_config_path, "r", encoding="utf-8") as f: + tokenizer_config = json.load(f) + + # Load tokens from JSONL file (actually a list format) + tokens = [] + scores = [] + toktypes = [] + + with open(tokenizer_jsonl_path, "r", encoding="utf-8") as f: + for line_num, line in enumerate(f): + if line.strip(): + token_data = json.loads(line) + # Format: [token, score, type, ?, ?, ?, ?] 
+ token = token_data[0].encode("utf-8") + score = float(token_data[1]) + token_type_str = token_data[2] if len(token_data) > 2 else "NORMAL" + + tokens.append(token) + scores.append(score) + + if token_type_str == "UNKNOWN": + toktypes.append(gguf.TokenType.UNKNOWN) + elif token_type_str == "CONTROL": + toktypes.append(gguf.TokenType.CONTROL) + elif token_type_str == "BYTE": + toktypes.append(gguf.TokenType.BYTE) + else: + token_str = token_data[0] + if token_str.startswith("<|plamo:") and token_str.endswith("|>"): + toktypes.append(gguf.TokenType.CONTROL) + else: + toktypes.append(gguf.TokenType.NORMAL) + + vocab_size = self.hparams["vocab_size"] + if vocab_size > len(tokens): + pad_count = vocab_size - len(tokens) + logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]") + for i in range(1, pad_count + 1): + tokens.append(bytes(f"[PAD{i}]", encoding="utf-8")) + scores.append(-1000.0) + toktypes.append(gguf.TokenType.UNUSED) + + self.gguf_writer.add_tokenizer_model("plamo2") + self.gguf_writer.add_tokenizer_pre("default") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_scores(scores) + self.gguf_writer.add_token_types(toktypes) + + if "bos_token" in tokenizer_config and tokenizer_config["bos_token"] is not None: + token_id = tokens.index(tokenizer_config["bos_token"].encode("utf-8")) + self.gguf_writer.add_bos_token_id(token_id) + if "eos_token" in tokenizer_config and tokenizer_config["eos_token"] is not None: + token_id = tokens.index(tokenizer_config["eos_token"].encode("utf-8")) + self.gguf_writer.add_eos_token_id(token_id) + if "pad_token" in tokenizer_config and tokenizer_config["pad_token"] is not None: + token_id = tokens.index(tokenizer_config["pad_token"].encode("utf-8")) + self.gguf_writer.add_pad_token_id(token_id) + if "sep_token" in tokenizer_config and tokenizer_config["sep_token"] is not None: + token_id = tokens.index(tokenizer_config["sep_token"].encode("utf-8")) + self.gguf_writer.add_sep_token_id(token_id) + if "unk_token" in tokenizer_config and tokenizer_config["unk_token"] is not None: + token_id = tokens.index(tokenizer_config["unk_token"].encode("utf-8")) + self.gguf_writer.add_unk_token_id(token_id) + + # Add <|plamo:op|> as EOT to ensure appropriate end of generation + self.gguf_writer.add_eot_token_id(4) + + self.gguf_writer.add_add_space_prefix(False) + + +@ModelBase.register("Plamo3ForCausalLM", "PLaMo3ForCausalLM") +class Plamo3Model(PlamoTokenizerMixin, TextModel): + model_arch = gguf.MODEL_ARCH.PLAMO3 + + def set_vocab(self): + self._set_plamo_vocab() + + def _sliding_window_pattern(self, block_count: int) -> list[bool]: + layer_types = self.hparams.get("layer_types") + if isinstance(layer_types, list) and len(layer_types) == block_count: + return [t == "sliding_attention" for t in layer_types] + + pattern = self.hparams.get("sliding_window_pattern") + if isinstance(pattern, int) and pattern > 0: + return [((i + 1) % pattern) != 0 for i in range(block_count)] + + return [] + + def set_gguf_parameters(self): + hparams = self.hparams + block_count = hparams["num_hidden_layers"] + + self.gguf_writer.add_vocab_size(hparams["vocab_size"]) + self.gguf_writer.add_context_length(hparams["max_position_embeddings"]) + self.gguf_writer.add_embedding_length(hparams["hidden_size"]) + self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_head_count(hparams["num_attention_heads"]) + 
self.gguf_writer.add_head_count_kv(hparams["num_key_value_heads"]) + head_dim = hparams["head_dim"] + self.gguf_writer.add_key_length(head_dim) + self.gguf_writer.add_value_length(head_dim) + self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"]) + self.gguf_writer.add_rope_freq_base(hparams["rope_theta"]) + rope_local = hparams.get("rope_local_theta") + if rope_local is not None: + self.gguf_writer.add_rope_freq_base_swa(rope_local) + + window_size = hparams.get("window_size") or hparams.get("sliding_window") or 0 + self.gguf_writer.add_sliding_window(window_size) + + pattern = self._sliding_window_pattern(block_count) + if len(pattern) == block_count and any(pattern): + self.gguf_writer.add_sliding_window_pattern(pattern) + + self.gguf_writer.add_file_type(self.ftype) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + + if name.endswith(".pre_mixer_norm.weight"): + data_torch = data_torch + 1.0 + elif name.endswith(".post_mixer_norm.weight"): + data_torch = data_torch + 1.0 / 5 + elif name.endswith(".pre_mlp_norm.weight"): + data_torch = data_torch + 1.0 + elif name.endswith(".post_mlp_norm.weight"): + data_torch = data_torch + 1.0 / (5**1.5) + elif name.endswith(".norm.weight"): + data_torch = data_torch + 1.0 + + results: list[tuple[str, Tensor]] = [] + + if "gate_up_proj.weight" in name: + name_up = name.replace("gate_up_proj.weight", "up_proj.weight") + name_gate = name.replace("gate_up_proj.weight", "gate_proj.weight") + + n_embd = self.hparams["hidden_size"] + n_ff = self.hparams["intermediate_size"] + two_ff = 2 * n_ff + + if data_torch.shape == (two_ff, n_embd): + chunks = torch.chunk(data_torch, 2, dim=0) + elif data_torch.shape == (n_embd, two_ff): + chunks = torch.chunk(data_torch, 2, dim=1) + else: + raise ValueError(f"Unexpected gate_up_proj shape {tuple(data_torch.shape)}") + + processed: list[Tensor] = [] + + for chunk in chunks: + if chunk.shape == (n_ff, n_embd): + chunk = chunk.transpose(0, 1) + elif chunk.shape != (n_embd, n_ff): + raise ValueError(f"Unexpected gate/up chunk shape {tuple(chunk.shape)}") + # processed.append(chunk.contiguous()) + processed.append(chunk.contiguous().transpose(0, 1)) + + gate_proj_weight, up_proj_weight = processed + + results.append((self.map_tensor_name(name_gate), gate_proj_weight)) + results.append((self.map_tensor_name(name_up), up_proj_weight)) + else: + mapped = self.map_tensor_name(name) + if mapped.endswith(("ffn_gate.weight", "ffn_up.weight")): + n_embd = self.hparams["hidden_size"] + n_ff = self.hparams["intermediate_size"] + if bid is None or bid == 0: + logger.info("plamo3 map %s -> %s raw shape %s", name, mapped, tuple(data_torch.shape)) + if data_torch.shape == (n_ff, n_embd): + data_torch = data_torch.transpose(0, 1).contiguous() + elif data_torch.shape != (n_embd, n_ff): + raise ValueError(f"Unexpected FFN tensor shape {mapped}: {tuple(data_torch.shape)}") + results.append((mapped, data_torch)) + + return results + @ModelBase.register("CodeShellForCausalLM") class CodeShellModel(TextModel): diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 6b4b6c5ab075d..a9fc4d0fb834c 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -166,6 +166,7 @@ class Rope: DIMENSION_COUNT = "{arch}.rope.dimension_count" DIMENSION_SECTIONS = "{arch}.rope.dimension_sections" FREQ_BASE = "{arch}.rope.freq_base" + FREQ_BASE_SWA = "{arch}.rope.freq_base_swa" SCALING_TYPE = "{arch}.rope.scaling.type" SCALING_FACTOR = 
"{arch}.rope.scaling.factor" SCALING_ATTN_FACTOR = "{arch}.rope.scaling.attn_factor" @@ -359,6 +360,7 @@ class MODEL_ARCH(IntEnum): PHIMOE = auto() PLAMO = auto() PLAMO2 = auto() + PLAMO3 = auto() CODESHELL = auto() ORION = auto() INTERNLM2 = auto() @@ -726,6 +728,7 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.PHIMOE: "phimoe", MODEL_ARCH.PLAMO: "plamo", MODEL_ARCH.PLAMO2: "plamo2", + MODEL_ARCH.PLAMO3: "plamo3", MODEL_ARCH.CODESHELL: "codeshell", MODEL_ARCH.ORION: "orion", MODEL_ARCH.INTERNLM2: "internlm2", @@ -1628,6 +1631,23 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.SSM_B_NORM, MODEL_TENSOR.SSM_C_NORM, ], + MODEL_ARCH.PLAMO3: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_QKV, + MODEL_TENSOR.ATTN_Q_NORM, + MODEL_TENSOR.ATTN_K_NORM, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.ATTN_POST_NORM, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.FFN_POST_NORM, + ], MODEL_ARCH.GPT2: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.POS_EMBD, diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index a051daeeb1341..68440f0d668e4 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -872,6 +872,9 @@ def add_rope_dimension_sections(self, dims: Sequence[int]) -> None: def add_rope_freq_base(self, value: float) -> None: self.add_float32(Keys.Rope.FREQ_BASE.format(arch=self.arch), value) + def add_rope_freq_base_swa(self, value: float) -> None: + self.add_float32(Keys.Rope.FREQ_BASE_SWA.format(arch=self.arch), value) + def add_rope_scaling_type(self, value: RopeScalingType) -> None: self.add_string(Keys.Rope.SCALING_TYPE.format(arch=self.arch), value.value) diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 929406687610c..7738c40862a41 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -567,6 +567,7 @@ class TensorNameMap: "encoder.layer.{bid}.attention.self.layer_norm_q", # jina-bert-v2 "transformer.layers.{bid}.attn.q_norm", # openelm "model.layers.layers.{bid}.mixer.q", # plamo2 + "model.layers.layers.{bid}.mixer.q_norm", # plamo3 "layers.{bid}.self_attn.q_norm", # qwen3-embedding "model.layers.{bid}.attention.query_layernorm", # apertus ), @@ -582,6 +583,7 @@ class TensorNameMap: "encoder.layer.{bid}.attention.self.layer_norm_k", # jina-bert-v2 "transformer.layers.{bid}.attn.k_norm", # openelm "model.layers.layers.{bid}.mixer.k", # plamo2 + "model.layers.layers.{bid}.mixer.k_norm", # plamo3 "layers.{bid}.self_attn.k_norm", # qwen3-embedding "model.layers.{bid}.attention.key_layernorm", # apertus ), diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 6fc5b00101058..004e7c7e61f87 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -104,6 +104,7 @@ add_library(llama models/phi3.cpp models/plamo.cpp models/plamo2.cpp + models/plamo3.cpp models/plm.cpp models/qwen.cpp models/qwen2.cpp diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index b7642b568dffb..05dda0e207808 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -39,6 +39,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_PHIMOE, "phimoe" }, { LLM_ARCH_PLAMO, "plamo" }, { LLM_ARCH_PLAMO2, "plamo2" }, + { LLM_ARCH_PLAMO3, "plamo3" }, { LLM_ARCH_CODESHELL, "codeshell" }, { LLM_ARCH_ORION, "orion" }, { LLM_ARCH_INTERNLM2, "internlm2" }, @@ -195,6 +196,7 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" }, { 
LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" }, { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" }, + { LLM_KV_ROPE_FREQ_BASE_SWA, "%s.rope.freq_base_swa" }, { LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" }, { LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" }, { LLM_KV_ROPE_SCALING_FACTOR, "%s.rope.scaling.factor" }, @@ -928,6 +930,26 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" }, }, }, + { + LLM_ARCH_PLAMO3, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, + { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" }, + { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + }, + }, { LLM_ARCH_CODESHELL, { diff --git a/src/llama-arch.h b/src/llama-arch.h index a769dd1e85741..19d0fd9703825 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -43,6 +43,7 @@ enum llm_arch { LLM_ARCH_PHIMOE, LLM_ARCH_PLAMO, LLM_ARCH_PLAMO2, + LLM_ARCH_PLAMO3, LLM_ARCH_CODESHELL, LLM_ARCH_ORION, LLM_ARCH_INTERNLM2, @@ -199,6 +200,7 @@ enum llm_kv { LLM_KV_ROPE_DIMENSION_COUNT, LLM_KV_ROPE_DIMENSION_SECTIONS, LLM_KV_ROPE_FREQ_BASE, + LLM_KV_ROPE_FREQ_BASE_SWA, LLM_KV_ROPE_SCALE_LINEAR, LLM_KV_ROPE_SCALING_TYPE, LLM_KV_ROPE_SCALING_FACTOR, diff --git a/src/llama-model-saver.cpp b/src/llama-model-saver.cpp index 563823dc35d8e..3b66e698acee7 100644 --- a/src/llama-model-saver.cpp +++ b/src/llama-model-saver.cpp @@ -188,6 +188,7 @@ void llama_model_saver::add_kv_from_model() { add_kv(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot); add_kv(LLM_KV_ROPE_FREQ_BASE, hparams.rope_freq_base_train); + add_kv(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa); // add_kv(LLM_KV_ROPE_SCALE_LINEAR, rope_scaling_factor); // old name add_kv(LLM_KV_ROPE_SCALING_TYPE, llama_rope_scaling_type_name(hparams.rope_scaling_type_train)); add_kv(LLM_KV_ROPE_SCALING_FACTOR, rope_scaling_factor); @@ -279,4 +280,3 @@ void llama_model_saver::add_tensors_from_model() { void llama_model_saver::save(const std::string & path_model) { gguf_write_to_file(gguf_ctx, path_model.c_str(), false); } - diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 829f1e3c14f82..0bdeeb49c042a 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -565,6 +565,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { // rope_freq_base (optional) hparams.rope_freq_base_train = 10000.0f; ml.get_key(LLM_KV_ROPE_FREQ_BASE, hparams.rope_freq_base_train, false); + ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false); std::string rope_scaling("linear"); ml.get_key(LLM_KV_ROPE_SCALING_TYPE, rope_scaling, false); @@ -1146,6 +1147,21 @@ void llama_model::load_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k, false); ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v, false); } break; + case LLM_ARCH_PLAMO3: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + hparams.swa_type = LLAMA_SWA_TYPE_STANDARD; + 
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa); + if (hparams.n_swa == 0) { + hparams.n_swa = 2048; + } + hparams.set_swa_pattern(8); + + switch (hparams.n_layer) { + case 24: type = LLM_TYPE_2B; break; + default: type = LLM_TYPE_UNKNOWN; + } + } break; case LLM_ARCH_GPT2: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); @@ -2243,16 +2259,20 @@ bool llama_model::load_tensors(llama_model_loader & ml) { split_sum += splits[i]; splits[i] = split_sum; } - for (size_t i = 0; i < n_devices(); ++i) { - splits[i] /= split_sum; + if (split_sum > 0.0f) { + for (size_t i = 0; i < n_devices(); ++i) { + splits[i] /= split_sum; + } + } else { + LLAMA_LOG_WARN("load_tensors: no available GPU memory detected, falling back to CPU\n"); } - ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); if (cpu_dev == nullptr) { throw std::runtime_error(format("%s: no CPU backend found", __func__)); } const int i_gpu_start = std::max((int) hparams.n_layer - n_gpu_layers, (int) 0); - const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, (int)n_layer + 1); + const bool enable_gpu = !devices.empty() && split_sum > 0.0f; + const int act_gpu_layers = enable_gpu ? std::min(n_gpu_layers, (int)n_layer + 1) : 0; auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev { const bool is_swa = il < (int) hparams.n_layer && hparams.is_swa(il); if (il < i_gpu_start || (il - i_gpu_start) >= act_gpu_layers) { @@ -3618,6 +3638,45 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, i), {n_embd}, 0); } } break; + case LLM_ARCH_PLAMO3: + { + const int64_t head_dim_q = hparams.n_embd_head_k; + const int64_t head_dim_v = hparams.n_embd_head_v; + + tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); + + output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); + output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED); + if (output == NULL) { + output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); + } + + for (int i = 0; i < n_layer; ++i) { + auto & layer = layers[i]; + + const int64_t num_attention_heads = hparams.n_head(i); + const int64_t num_key_value_heads = hparams.n_head_kv(i); + const int64_t q_proj_dim = num_attention_heads * head_dim_q; + const int64_t k_proj_dim = num_key_value_heads * head_dim_q; + const int64_t v_proj_dim = num_key_value_heads * head_dim_v; + const int64_t n_ff_cur = hparams.n_ff(i); + + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); + layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), + {n_embd,q_proj_dim + k_proj_dim + v_proj_dim}, 0); + layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {head_dim_q}, 0); + layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {head_dim_q}, 0); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {num_attention_heads * head_dim_v, n_embd}, 0); + layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, i), {n_embd}, 0); + + layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); + layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, i), {n_embd}, 0); + + layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff_cur}, 0); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, 
n_ff_cur}, 0); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff_cur, n_embd}, 0); + } + } break; case LLM_ARCH_GPT2: { tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); @@ -7021,6 +7080,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const { { llm = std::make_unique(*this, params); } break; + case LLM_ARCH_PLAMO3: + { + llm = std::make_unique(*this, params); + } break; case LLM_ARCH_GPT2: { llm = std::make_unique(*this, params); @@ -7499,6 +7562,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_PHIMOE: case LLM_ARCH_PLAMO: case LLM_ARCH_PLAMO2: + case LLM_ARCH_PLAMO3: case LLM_ARCH_GEMMA: case LLM_ARCH_GEMMA2: case LLM_ARCH_GEMMA3: diff --git a/src/models/models.h b/src/models/models.h index 2fffb382df2e5..3c2417e1e67ff 100644 --- a/src/models/models.h +++ b/src/models/models.h @@ -386,6 +386,10 @@ struct llm_build_plamo : public llm_graph_context { llm_build_plamo(const llama_model & model, const llm_graph_params & params); }; +struct llm_build_plamo3 : public llm_graph_context { + llm_build_plamo3(const llama_model & model, const llm_graph_params & params); +}; + struct llm_build_plm : public llm_graph_context { llm_build_plm(const llama_model & model, const llm_graph_params & params); }; diff --git a/src/models/plamo3.cpp b/src/models/plamo3.cpp new file mode 100644 index 0000000000000..95b095aef24a3 --- /dev/null +++ b/src/models/plamo3.cpp @@ -0,0 +1,106 @@ +#include "models.h" + +llm_build_plamo3::llm_build_plamo3(const llama_model & model, const llm_graph_params & params) : + llm_graph_context(params) { + const int64_t head_dim_q = hparams.n_embd_head_k; + const int64_t head_dim_v = hparams.n_embd_head_v; + + ggml_tensor * inpL = build_inp_embd(model.tok_embd); + ggml_tensor * inp_pos = build_inp_pos(); + + llm_graph_input_attn_kv_iswa * inp_attn_iswa = nullptr; + llm_graph_input_attn_kv * inp_attn = nullptr; + + if (hparams.is_swa_any()) { + inp_attn_iswa = build_attn_inp_kv_iswa(); + } else { + inp_attn = build_attn_inp_kv(); + } + + ggml_tensor * inp_out_ids = build_inp_out_ids(); + + for (int il = 0; il < n_layer; ++il) { + ggml_tensor * residual = inpL; + + const float freq_base_l = model.get_rope_freq_base (cparams, il); + const float freq_scale_l = model.get_rope_freq_scale(cparams, il); + ggml_tensor * rope_factors = model.get_rope_factors(cparams, il); + + ggml_tensor * cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il); + + ggml_tensor * qkv = build_lora_mm(model.layers[il].wqkv, cur); + + const int32_t n_head = hparams.n_head(il); + const int32_t n_head_kv = hparams.n_head_kv(il); + + const int64_t q_offset = 0; + const int64_t k_offset = head_dim_q * n_head; + const int64_t v_offset = k_offset + head_dim_q * n_head_kv; + + ggml_tensor * Qcur = ggml_view_3d(ctx0, qkv, head_dim_q, n_head, n_tokens, + head_dim_q * sizeof(float), qkv->nb[1], q_offset * ggml_element_size(qkv)); + ggml_tensor * Kcur = ggml_view_3d(ctx0, qkv, head_dim_q, n_head_kv, n_tokens, + head_dim_q * sizeof(float), qkv->nb[1], k_offset * ggml_element_size(qkv)); + ggml_tensor * Vcur = ggml_view_3d(ctx0, qkv, head_dim_v, n_head_kv, n_tokens, + head_dim_v * sizeof(float), qkv->nb[1], v_offset * ggml_element_size(qkv)); + + Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il); + Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il); + + Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, rope_factors, + n_rot, 
rope_type, n_ctx_orig, freq_base_l, freq_scale_l, + ext_factor, attn_factor, beta_fast, beta_slow); + Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, rope_factors, + n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l, + ext_factor, attn_factor, beta_fast, beta_slow); + + const float attn_scale = 1.0f / sqrtf(float(head_dim_q)); + + if (inp_attn_iswa) { + cur = build_attn(inp_attn_iswa, + model.layers[il].wo, NULL, + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, attn_scale, il); + } else { + cur = build_attn(inp_attn, + model.layers[il].wo, NULL, + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, attn_scale, il); + } + + if (il == n_layer - 1 && inp_out_ids) { + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + residual = ggml_get_rows(ctx0, residual, inp_out_ids); + } + + cur = build_norm(cur, model.layers[il].attn_post_norm, NULL, LLM_NORM_RMS, il); + cur = ggml_add(ctx0, cur, residual); + residual = cur; + + cur = build_norm(cur, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il); + + ggml_tensor * ffn_up = build_lora_mm(model.layers[il].ffn_up, cur); + ggml_tensor * ffn_gate = build_lora_mm(model.layers[il].ffn_gate, cur); + ggml_tensor * ffn_act = ggml_swiglu_split(ctx0, ffn_gate, ffn_up); + + cur = build_lora_mm(model.layers[il].ffn_down, ffn_act); + cur = build_norm(cur, model.layers[il].ffn_post_norm, NULL, LLM_NORM_RMS, il); + + if (il == n_layer - 1 && inp_out_ids) { + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + residual = ggml_get_rows(ctx0, residual, inp_out_ids); + } + + cur = ggml_add(ctx0, cur, residual); + cur = build_cvec(cur, il); + inpL = cur; + } + + ggml_tensor * cur = inpL; + + cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1); + res->t_embd = cur; + + cur = build_lora_mm(model.output, cur); + res->t_logits = cur; + + ggml_build_forward_expand(gf, cur); +} From c3b61343cae4701e7e7b87064d472f81dbd6ed22 Mon Sep 17 00:00:00 2001 From: mmngays <146910567+mmngays@users.noreply.github.com> Date: Sun, 16 Nov 2025 05:49:40 +0900 Subject: [PATCH 02/16] fix plamo3 --- convert_hf_to_gguf.py | 4 +++- src/llama-model.cpp | 8 ++++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index f58390ce95e9b..820130d945181 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -4947,6 +4947,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter data_torch = data_torch + 1.0 elif name.endswith(".post_mlp_norm.weight"): data_torch = data_torch + 1.0 / (5**1.5) + elif name.endswith((".mixer.q_norm.weight", ".mixer.k_norm.weight")): + data_torch = data_torch + 1.0 elif name.endswith(".norm.weight"): data_torch = data_torch + 1.0 @@ -4975,7 +4977,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter elif chunk.shape != (n_embd, n_ff): raise ValueError(f"Unexpected gate/up chunk shape {tuple(chunk.shape)}") # processed.append(chunk.contiguous()) - processed.append(chunk.contiguous().transpose(0, 1)) + processed.append(chunk.transpose(0, 1).contiguous()) gate_proj_weight, up_proj_weight = processed diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 0bdeeb49c042a..d81b51ee3545f 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -565,7 +565,8 @@ void llama_model::load_hparams(llama_model_loader & ml) { // rope_freq_base (optional) hparams.rope_freq_base_train = 10000.0f; ml.get_key(LLM_KV_ROPE_FREQ_BASE, hparams.rope_freq_base_train, false); - ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false); + const bool 
has_rope_freq_base_swa = + ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false); std::string rope_scaling("linear"); ml.get_key(LLM_KV_ROPE_SCALING_TYPE, rope_scaling, false); @@ -581,7 +582,9 @@ void llama_model::load_hparams(llama_model_loader & ml) { hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale; // by default assume that the sliding-window layers use the same scaling type as the non-sliding-window layers - hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train; + if (!has_rope_freq_base_swa) { + hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train; + } hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train; ml.get_key(LLM_KV_ROPE_SCALING_ATTN_FACTOR, hparams.rope_attn_factor, false); @@ -6608,6 +6611,7 @@ void llama_model::print_info() const { LLAMA_LOG_INFO("%s: rope type = %d\n", __func__, hparams.rope_type); LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type.c_str()); LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train); + LLAMA_LOG_INFO("%s: freq_base_train_swa = %.1f\n", __func__, hparams.rope_freq_base_train_swa); LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train); LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn); LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown"); From ce7a9220cedf619293bd859e2d86b5e98e58e9d7 Mon Sep 17 00:00:00 2001 From: mmngays <146910567+mmngays@users.noreply.github.com> Date: Sun, 16 Nov 2025 07:10:05 +0900 Subject: [PATCH 03/16] clean code --- convert_hf_to_gguf.py | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 820130d945181..547ecb322f9c9 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -4984,17 +4984,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter results.append((self.map_tensor_name(name_gate), gate_proj_weight)) results.append((self.map_tensor_name(name_up), up_proj_weight)) else: - mapped = self.map_tensor_name(name) - if mapped.endswith(("ffn_gate.weight", "ffn_up.weight")): - n_embd = self.hparams["hidden_size"] - n_ff = self.hparams["intermediate_size"] - if bid is None or bid == 0: - logger.info("plamo3 map %s -> %s raw shape %s", name, mapped, tuple(data_torch.shape)) - if data_torch.shape == (n_ff, n_embd): - data_torch = data_torch.transpose(0, 1).contiguous() - elif data_torch.shape != (n_embd, n_ff): - raise ValueError(f"Unexpected FFN tensor shape {mapped}: {tuple(data_torch.shape)}") - results.append((mapped, data_torch)) + results.append((self.map_tensor_name(name), data_torch)) return results From 1ab3bba24321e19f7fe568aaa15c6e91a64efbb3 Mon Sep 17 00:00:00 2001 From: mmngays <146910567+mmngays@users.noreply.github.com> Date: Sun, 16 Nov 2025 07:14:48 +0900 Subject: [PATCH 04/16] clean up the code --- convert_hf_to_gguf.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 547ecb322f9c9..3989c5892fd6a 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -493,6 +493,7 @@ def set_gguf_parameters(self): raise NotImplementedError("set_gguf_parameters() must be implemented in subclasses") def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused return [(self.map_tensor_name(name), data_torch)] @@ -1673,6 +1674,7 
@@ def set_gguf_parameters(self): self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"]) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads")) n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed")) @@ -4976,7 +4978,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter chunk = chunk.transpose(0, 1) elif chunk.shape != (n_embd, n_ff): raise ValueError(f"Unexpected gate/up chunk shape {tuple(chunk.shape)}") - # processed.append(chunk.contiguous()) processed.append(chunk.transpose(0, 1).contiguous()) gate_proj_weight, up_proj_weight = processed From d9854cc575a2e00e07edab199d0489692673671a Mon Sep 17 00:00:00 2001 From: mmngays <146910567+mmngays@users.noreply.github.com> Date: Sun, 16 Nov 2025 07:19:01 +0900 Subject: [PATCH 05/16] fix diff --- convert_hf_to_gguf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 3989c5892fd6a..a280cb57f4053 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -493,7 +493,7 @@ def set_gguf_parameters(self): raise NotImplementedError("set_gguf_parameters() must be implemented in subclasses") def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused + del bid # unused return [(self.map_tensor_name(name), data_torch)] @@ -1674,7 +1674,7 @@ def set_gguf_parameters(self): self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"]) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused + del bid # unused n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads")) n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed")) From 967810de6bfc9f6a66a31605b190524f4932f66e Mon Sep 17 00:00:00 2001 From: mmngays <146910567+mmngays@users.noreply.github.com> Date: Sun, 16 Nov 2025 08:54:20 +0900 Subject: [PATCH 06/16] clean up the code --- src/llama-model.cpp | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index bb960750c4b1a..36e5a649f1e3d 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -2294,13 +2294,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) { split_sum += splits[i]; splits[i] = split_sum; } - if (split_sum > 0.0f) { - for (size_t i = 0; i < n_devices(); ++i) { - splits[i] /= split_sum; - } - } else { - LLAMA_LOG_WARN("load_tensors: no available GPU memory detected, falling back to CPU\n"); + for (size_t i = 0; i < n_devices(); ++i) { + splits[i] /= split_sum; } + ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); if (cpu_dev == nullptr) { throw std::runtime_error(format("%s: no CPU backend found", __func__)); From 74fa9d6884eb8fb025683b0a659d5a6d622d8ebc Mon Sep 17 00:00:00 2001 From: mmngays <146910567+mmngays@users.noreply.github.com> Date: Sun, 16 Nov 2025 08:57:43 +0900 Subject: [PATCH 07/16] clean up the code --- src/llama-model.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 36e5a649f1e3d..67fdaf5e95d69 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -6705,7 +6705,7 @@ void llama_model::print_info() const { LLAMA_LOG_INFO("%s: rope type = %d\n", __func__, hparams.rope_type); 
LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type.c_str()); LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train); - LLAMA_LOG_INFO("%s: freq_base_train_swa = %.1f\n", __func__, hparams.rope_freq_base_train_swa); + LLAMA_LOG_INFO("%s: freq_base_train_swa = %.1f\n",__func__, hparams.rope_freq_base_train_swa); LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train); LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn); LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown"); From 3391080885bb17a55d0c0d9b121dcdbd5b580b29 Mon Sep 17 00:00:00 2001 From: mmngays <146910567+mmngays@users.noreply.github.com> Date: Sun, 16 Nov 2025 08:58:52 +0900 Subject: [PATCH 08/16] clean up the code --- src/llama-model.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 67fdaf5e95d69..95cb14ccdf336 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -2303,8 +2303,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { throw std::runtime_error(format("%s: no CPU backend found", __func__)); } const int i_gpu_start = std::max((int) hparams.n_layer - n_gpu_layers, (int) 0); - const bool enable_gpu = !devices.empty() && split_sum > 0.0f; - const int act_gpu_layers = enable_gpu ? std::min(n_gpu_layers, (int)n_layer + 1) : 0; + const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, (int)n_layer + 1); auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev { const bool is_swa = il < (int) hparams.n_layer && hparams.is_swa(il); if (il < i_gpu_start || (il - i_gpu_start) >= act_gpu_layers) { From 037d8316d0bf910129e5ccd6638bd6433b474bb2 Mon Sep 17 00:00:00 2001 From: mmngays <146910567+mmngays@users.noreply.github.com> Date: Mon, 17 Nov 2025 01:06:39 +0900 Subject: [PATCH 09/16] clean up the code --- convert_hf_to_gguf.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 386f533338195..faeb089561b3d 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -4891,8 +4891,12 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [(new_name, data_torch)] -class PlamoTokenizerMixin: - def _set_plamo_vocab(self) -> None: + +@ModelBase.register("Plamo3ForCausalLM", "PLaMo3ForCausalLM") +class Plamo3Model(TextModel): + model_arch = gguf.MODEL_ARCH.PLAMO3 + + def set_vocab(self): # PLaMo models use a custom tokenizer with a .jsonl file tokenizer_jsonl_path = self.dir_model / "tokenizer.jsonl" tokenizer_config_path = self.dir_model / "tokenizer_config.json" @@ -4970,14 +4974,6 @@ def _set_plamo_vocab(self) -> None: self.gguf_writer.add_add_space_prefix(False) - -@ModelBase.register("Plamo3ForCausalLM", "PLaMo3ForCausalLM") -class Plamo3Model(PlamoTokenizerMixin, TextModel): - model_arch = gguf.MODEL_ARCH.PLAMO3 - - def set_vocab(self): - self._set_plamo_vocab() - def _sliding_window_pattern(self, block_count: int) -> list[bool]: layer_types = self.hparams.get("layer_types") if isinstance(layer_types, list) and len(layer_types) == block_count: From 80c341810c39c884360b0ace3a29fb7ed5fd7b79 Mon Sep 17 00:00:00 2001 From: mmngays <146910567+mmngays@users.noreply.github.com> Date: Mon, 17 Nov 2025 01:34:42 +0900 Subject: [PATCH 10/16] clean up the code --- convert_hf_to_gguf.py | 3 --- gguf-py/gguf/gguf_writer.py | 3 --- src/llama-arch.cpp | 
1 - src/llama-arch.h | 1 - src/llama-model-saver.cpp | 1 - src/llama-model.cpp | 7 +------ 6 files changed, 1 insertion(+), 15 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index faeb089561b3d..33362660cab2d 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -5001,9 +5001,6 @@ def set_gguf_parameters(self): self.gguf_writer.add_value_length(head_dim) self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"]) self.gguf_writer.add_rope_freq_base(hparams["rope_theta"]) - rope_local = hparams.get("rope_local_theta") - if rope_local is not None: - self.gguf_writer.add_rope_freq_base_swa(rope_local) window_size = hparams.get("window_size") or hparams.get("sliding_window") or 0 self.gguf_writer.add_sliding_window(window_size) diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 68440f0d668e4..a051daeeb1341 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -872,9 +872,6 @@ def add_rope_dimension_sections(self, dims: Sequence[int]) -> None: def add_rope_freq_base(self, value: float) -> None: self.add_float32(Keys.Rope.FREQ_BASE.format(arch=self.arch), value) - def add_rope_freq_base_swa(self, value: float) -> None: - self.add_float32(Keys.Rope.FREQ_BASE_SWA.format(arch=self.arch), value) - def add_rope_scaling_type(self, value: RopeScalingType) -> None: self.add_string(Keys.Rope.SCALING_TYPE.format(arch=self.arch), value.value) diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index f276d032500be..b50306d2cd3bf 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -197,7 +197,6 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" }, { LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" }, { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" }, - { LLM_KV_ROPE_FREQ_BASE_SWA, "%s.rope.freq_base_swa" }, { LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" }, { LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" }, { LLM_KV_ROPE_SCALING_FACTOR, "%s.rope.scaling.factor" }, diff --git a/src/llama-arch.h b/src/llama-arch.h index 8f4fd9f12fa23..448419536a0fb 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -201,7 +201,6 @@ enum llm_kv { LLM_KV_ROPE_DIMENSION_COUNT, LLM_KV_ROPE_DIMENSION_SECTIONS, LLM_KV_ROPE_FREQ_BASE, - LLM_KV_ROPE_FREQ_BASE_SWA, LLM_KV_ROPE_SCALE_LINEAR, LLM_KV_ROPE_SCALING_TYPE, LLM_KV_ROPE_SCALING_FACTOR, diff --git a/src/llama-model-saver.cpp b/src/llama-model-saver.cpp index 3b66e698acee7..f32d6a4915718 100644 --- a/src/llama-model-saver.cpp +++ b/src/llama-model-saver.cpp @@ -188,7 +188,6 @@ void llama_model_saver::add_kv_from_model() { add_kv(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot); add_kv(LLM_KV_ROPE_FREQ_BASE, hparams.rope_freq_base_train); - add_kv(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa); // add_kv(LLM_KV_ROPE_SCALE_LINEAR, rope_scaling_factor); // old name add_kv(LLM_KV_ROPE_SCALING_TYPE, llama_rope_scaling_type_name(hparams.rope_scaling_type_train)); add_kv(LLM_KV_ROPE_SCALING_FACTOR, rope_scaling_factor); diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 95cb14ccdf336..b2e3afa27dff5 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -566,8 +566,6 @@ void llama_model::load_hparams(llama_model_loader & ml) { // rope_freq_base (optional) hparams.rope_freq_base_train = 10000.0f; ml.get_key(LLM_KV_ROPE_FREQ_BASE, hparams.rope_freq_base_train, false); - const bool has_rope_freq_base_swa = - ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false); std::string 
rope_scaling("linear"); ml.get_key(LLM_KV_ROPE_SCALING_TYPE, rope_scaling, false); @@ -583,9 +581,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale; // by default assume that the sliding-window layers use the same scaling type as the non-sliding-window layers - if (!has_rope_freq_base_swa) { - hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train; - } + hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train; hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train; ml.get_key(LLM_KV_ROPE_SCALING_ATTN_FACTOR, hparams.rope_attn_factor, false); @@ -6704,7 +6700,6 @@ void llama_model::print_info() const { LLAMA_LOG_INFO("%s: rope type = %d\n", __func__, hparams.rope_type); LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type.c_str()); LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train); - LLAMA_LOG_INFO("%s: freq_base_train_swa = %.1f\n",__func__, hparams.rope_freq_base_train_swa); LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train); LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn); LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown"); From 9cecb26d5b78a9141232c6325680c120b37e850d Mon Sep 17 00:00:00 2001 From: mmngays <146910567+mmngays@users.noreply.github.com> Date: Mon, 17 Nov 2025 01:37:26 +0900 Subject: [PATCH 11/16] clean up the code --- gguf-py/gguf/constants.py | 1 - 1 file changed, 1 deletion(-) diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 83bf234f5c04e..448d4aa7a7dc0 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -166,7 +166,6 @@ class Rope: DIMENSION_COUNT = "{arch}.rope.dimension_count" DIMENSION_SECTIONS = "{arch}.rope.dimension_sections" FREQ_BASE = "{arch}.rope.freq_base" - FREQ_BASE_SWA = "{arch}.rope.freq_base_swa" SCALING_TYPE = "{arch}.rope.scaling.type" SCALING_FACTOR = "{arch}.rope.scaling.factor" SCALING_ATTN_FACTOR = "{arch}.rope.scaling.attn_factor" From 0df52967ba6f46d765a0f95738c7b318cf8f0cdf Mon Sep 17 00:00:00 2001 From: mmngays <146910567+mmngays@users.noreply.github.com> Date: Mon, 17 Nov 2025 01:48:53 +0900 Subject: [PATCH 12/16] clean up the code --- src/llama-model.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index b2e3afa27dff5..d10d1b808a256 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -581,7 +581,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { hparams.rope_freq_scale_train = ropescale == 0.0f ? 
1.0f : 1.0f/ropescale; // by default assume that the sliding-window layers use the same scaling type as the non-sliding-window layers - hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train; + hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train; hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train; ml.get_key(LLM_KV_ROPE_SCALING_ATTN_FACTOR, hparams.rope_attn_factor, false); From cdb1d2c130abde7ed0e1813a4dcaf0847bdd78cf Mon Sep 17 00:00:00 2001 From: mmngays <146910567+mmngays@users.noreply.github.com> Date: Mon, 17 Nov 2025 02:31:18 +0900 Subject: [PATCH 13/16] add chat_template if exist --- convert_hf_to_gguf.py | 3 +++ src/llama-model-saver.cpp | 1 + 2 files changed, 4 insertions(+) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 33362660cab2d..6345f31416861 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -4974,6 +4974,9 @@ def set_vocab(self): self.gguf_writer.add_add_space_prefix(False) + if "chat_template" in tokenizer_config and tokenizer_config["chat_template"] is not None: + self.gguf_writer.add_chat_template(tokenizer_config["chat_template"]) + def _sliding_window_pattern(self, block_count: int) -> list[bool]: layer_types = self.hparams.get("layer_types") if isinstance(layer_types, list) and len(layer_types) == block_count: diff --git a/src/llama-model-saver.cpp b/src/llama-model-saver.cpp index f32d6a4915718..563823dc35d8e 100644 --- a/src/llama-model-saver.cpp +++ b/src/llama-model-saver.cpp @@ -279,3 +279,4 @@ void llama_model_saver::add_tensors_from_model() { void llama_model_saver::save(const std::string & path_model) { gguf_write_to_file(gguf_ctx, path_model.c_str(), false); } + From 527c65a91cca6f98a1a3b031c1f12b527e605eb0 Mon Sep 17 00:00:00 2001 From: mmngays <146910567+mmngays@users.noreply.github.com> Date: Mon, 17 Nov 2025 18:39:53 +0900 Subject: [PATCH 14/16] clean up the code --- convert_hf_to_gguf.py | 24 +----------------------- 1 file changed, 1 insertion(+), 23 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 6345f31416861..94c912def3d01 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -5034,29 +5034,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if "gate_up_proj.weight" in name: name_up = name.replace("gate_up_proj.weight", "up_proj.weight") name_gate = name.replace("gate_up_proj.weight", "gate_proj.weight") - - n_embd = self.hparams["hidden_size"] - n_ff = self.hparams["intermediate_size"] - two_ff = 2 * n_ff - - if data_torch.shape == (two_ff, n_embd): - chunks = torch.chunk(data_torch, 2, dim=0) - elif data_torch.shape == (n_embd, two_ff): - chunks = torch.chunk(data_torch, 2, dim=1) - else: - raise ValueError(f"Unexpected gate_up_proj shape {tuple(data_torch.shape)}") - - processed: list[Tensor] = [] - - for chunk in chunks: - if chunk.shape == (n_ff, n_embd): - chunk = chunk.transpose(0, 1) - elif chunk.shape != (n_embd, n_ff): - raise ValueError(f"Unexpected gate/up chunk shape {tuple(chunk.shape)}") - processed.append(chunk.transpose(0, 1).contiguous()) - - gate_proj_weight, up_proj_weight = processed - + gate_proj_weight, up_proj_weight = torch.chunk(data_torch, 2, dim=0) results.append((self.map_tensor_name(name_gate), gate_proj_weight)) results.append((self.map_tensor_name(name_up), up_proj_weight)) else: From 5d52fe6be9336eb8899899e52e4de050f68b1a83 Mon Sep 17 00:00:00 2001 From: mmngays <146910567+mmngays@users.noreply.github.com> Date: Tue, 18 Nov 2025 01:24:31 +0900 Subject: [PATCH 15/16] fix 
cpu-backend --- src/models/plamo3.cpp | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/src/models/plamo3.cpp b/src/models/plamo3.cpp index 95b095aef24a3..09331cf697250 100644 --- a/src/models/plamo3.cpp +++ b/src/models/plamo3.cpp @@ -27,8 +27,10 @@ llm_build_plamo3::llm_build_plamo3(const llama_model & model, const llm_graph_pa ggml_tensor * rope_factors = model.get_rope_factors(cparams, il); ggml_tensor * cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il); + cb(cur, "attn_post_norm", il); ggml_tensor * qkv = build_lora_mm(model.layers[il].wqkv, cur); + cb(cur, "wqkv", il); const int32_t n_head = hparams.n_head(il); const int32_t n_head_kv = hparams.n_head_kv(il); @@ -44,8 +46,14 @@ llm_build_plamo3::llm_build_plamo3(const llama_model & model, const llm_graph_pa ggml_tensor * Vcur = ggml_view_3d(ctx0, qkv, head_dim_v, n_head_kv, n_tokens, head_dim_v * sizeof(float), qkv->nb[1], v_offset * ggml_element_size(qkv)); + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il); + cb(Qcur, "attn_q_norm", il); Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il); + cb(Kcur, "attn_k_norm", il); Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l, @@ -65,6 +73,7 @@ llm_build_plamo3::llm_build_plamo3(const llama_model & model, const llm_graph_pa model.layers[il].wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, attn_scale, il); } + cb(cur, "attn_out", il); if (il == n_layer - 1 && inp_out_ids) { cur = ggml_get_rows(ctx0, cur, inp_out_ids); @@ -72,25 +81,36 @@ llm_build_plamo3::llm_build_plamo3(const llama_model & model, const llm_graph_pa } cur = build_norm(cur, model.layers[il].attn_post_norm, NULL, LLM_NORM_RMS, il); + cb(cur, "attn_post_norm", il); + cur = ggml_add(ctx0, cur, residual); + cb(cur, "attn_residual", il); + residual = cur; cur = build_norm(cur, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); ggml_tensor * ffn_up = build_lora_mm(model.layers[il].ffn_up, cur); + cb(ffn_up, "ffn_up", il); + ggml_tensor * ffn_gate = build_lora_mm(model.layers[il].ffn_gate, cur); + cb(ffn_gate, "ffn_gate", il); + ggml_tensor * ffn_act = ggml_swiglu_split(ctx0, ffn_gate, ffn_up); + cb(ffn_act, "ffn_act", il); cur = build_lora_mm(model.layers[il].ffn_down, ffn_act); - cur = build_norm(cur, model.layers[il].ffn_post_norm, NULL, LLM_NORM_RMS, il); + cb(cur, "ffn_down", il); - if (il == n_layer - 1 && inp_out_ids) { - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - residual = ggml_get_rows(ctx0, residual, inp_out_ids); - } + cur = build_norm(cur, model.layers[il].ffn_post_norm, NULL, LLM_NORM_RMS, il); + cb(cur, "ffn_post_norm", il); cur = ggml_add(ctx0, cur, residual); + cb(cur, "ffn_residual", il); + cur = build_cvec(cur, il); + cb(cur, "l_out", il); inpL = cur; } From 9bd33d070227f7cb12fc2155efd6fd207382ae85 Mon Sep 17 00:00:00 2001 From: mmngays <146910567+mmngays@users.noreply.github.com> Date: Tue, 18 Nov 2025 16:53:07 +0900 Subject: [PATCH 16/16] chore: whitespace trim fix + typo fix --- src/models/plamo3.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/models/plamo3.cpp b/src/models/plamo3.cpp index 09331cf697250..1593f5fb878dd 100644 --- a/src/models/plamo3.cpp +++ b/src/models/plamo3.cpp @@ -27,7 +27,7 @@ llm_build_plamo3::llm_build_plamo3(const llama_model & model, const llm_graph_pa 
ggml_tensor * rope_factors = model.get_rope_factors(cparams, il); ggml_tensor * cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il); - cb(cur, "attn_post_norm", il); + cb(cur, "attn_norm", il); ggml_tensor * qkv = build_lora_mm(model.layers[il].wqkv, cur); cb(cur, "wqkv", il); @@ -49,7 +49,7 @@ llm_build_plamo3::llm_build_plamo3(const llama_model & model, const llm_graph_pa cb(Qcur, "Qcur", il); cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - + Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il); cb(Qcur, "attn_q_norm", il); Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
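
For reference, the converter's final gate_up_proj handling (patch 14/16) and the _sliding_window_pattern helper (patch 01/16) can be exercised in isolation. The sketch below is illustrative only and uses made-up dimensions (n_embd=4, n_ff=8, 24 layers), not real PLaMo 3 hyperparameters:

    # Illustrative sketch only, not part of the patches above.
    import torch

    n_embd, n_ff = 4, 8
    # Fused [gate; up] projection as stored in the HF checkpoint: shape (2 * n_ff, n_embd)
    gate_up = torch.arange(2 * n_ff * n_embd, dtype=torch.float32).reshape(2 * n_ff, n_embd)

    # Patch 14/16 splits it along dim 0 into the gate half and the up half
    gate_proj_weight, up_proj_weight = torch.chunk(gate_up, 2, dim=0)
    assert gate_proj_weight.shape == (n_ff, n_embd)
    assert up_proj_weight.shape == (n_ff, n_embd)

    # Sliding-window pattern: with sliding_window_pattern = 8, every 8th layer
    # uses full attention and the rest use the sliding window
    block_count, pattern = 24, 8
    layer_is_swa = [((i + 1) % pattern) != 0 for i in range(block_count)]
    assert layer_is_swa.count(False) == block_count // pattern  # 3 full-attention layers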