From 6494d9efcc57ad69bdbb9759e30bf1cce731637c Mon Sep 17 00:00:00 2001 From: yaofengchen Date: Fri, 7 Nov 2025 01:40:54 +0000 Subject: [PATCH 1/8] fix build err and run err --- CMakeLists.txt | 3 +++ dlinfer/framework/lmdeploy_ext/__init__.py | 2 +- dlinfer/vendor/maca/CMakeLists.txt | 1 + dlinfer/vendor/maca/maca_ops.py | 19 ++++++++++--------- 4 files changed, 15 insertions(+), 10 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ae2d9dc5..6ac8fe76 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -24,4 +24,7 @@ elseif(NOT DEVICE IN_LIST SUPPORTED_DEVICE) message(FATAL_ERROR "Device ${DEVICE} is not supported! Supported devices: ${SUPPORTED_DEVICE}") endif() +add_subdirectory(dlinfer/vendor/${DEVICE}) add_subdirectory(dlinfer/graph/dicp/vendor) + +install(CODE "message(STATUS \"Install completed for device: ${DEVICE}\")") diff --git a/dlinfer/framework/lmdeploy_ext/__init__.py b/dlinfer/framework/lmdeploy_ext/__init__.py index 2bb07e28..6ea0f201 100644 --- a/dlinfer/framework/lmdeploy_ext/__init__.py +++ b/dlinfer/framework/lmdeploy_ext/__init__.py @@ -1,6 +1,6 @@ # Copyright (c) 2024, DeepLink. All rights reserved. import dlinfer.framework.transformers_ext -import dlinfer.framework.torch_npu_ext +# import dlinfer.framework.torch_npu_ext from . import quants from . import cudagraph from . import device diff --git a/dlinfer/vendor/maca/CMakeLists.txt b/dlinfer/vendor/maca/CMakeLists.txt index 26d58f54..de03e3ed 100644 --- a/dlinfer/vendor/maca/CMakeLists.txt +++ b/dlinfer/vendor/maca/CMakeLists.txt @@ -35,6 +35,7 @@ ExternalProject_Add(${MACA_SUB_MODULE} BUILD_ALWAYS ON USES_TERMINAL_BUILD ON USES_TERMINAL_INSTALL ON + INSTALL_COMMAND "" CMAKE_ARGS "-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}" "-DCMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH}" diff --git a/dlinfer/vendor/maca/maca_ops.py b/dlinfer/vendor/maca/maca_ops.py index 15bfa7bb..041fb736 100644 --- a/dlinfer/vendor/maca/maca_ops.py +++ b/dlinfer/vendor/maca/maca_ops.py @@ -150,16 +150,17 @@ def prefill_attention( return attn_output # for cogvlm vl part. 
- if query.size(-2) != num_q_heads: + if q_start_loc.size(0) == q_seq_len.size(0): causal = False - head_dim = query.size(-1) // num_q_heads - query = query.view(-1, num_q_heads, head_dim) - key = key.view(-1, num_kv_heads, head_dim) - value = value.view(-1, num_kv_heads, head_dim) - q_start_loc = torch.tensor( - [0, q_seq_len], dtype=torch.int32, device=query.device - ) - softmax_scale = float(1 / math.sqrt(head_dim)) + #head_dim = query.size(-1) // num_q_heads + #query = query.view(-1, num_q_heads, head_dim) + #key = key.view(-1, num_kv_heads, head_dim) + #value = value.view(-1, num_kv_heads, head_dim) + #q_start_loc = torch.tensor( + # [0, q_seq_len.size(0) + 1], dtype=torch.int32, device=query.device + #) + q_start_loc = torch.cat((torch.tensor([0], dtype=torch.int32, device=query.device), q_seq_len.cumsum(0).to(torch.int32)), dim=0) + #softmax_scale = float(1 / math.sqrt(head_dim)) output = flash_attn_varlen_func( query, From fc3375ae2798834110732fd67c1ac5ac7d4ce9b0 Mon Sep 17 00:00:00 2001 From: WangQing <2917021186@qq.com> Date: Mon, 24 Nov 2025 12:56:21 +0800 Subject: [PATCH 2/8] fix qwenvl precision --- .../lmdeploy_ext/cudagraph/maca_cudagraph.py | 4 +- dlinfer/vendor/maca/maca_ops.py | 129 ++++++++++++------ 2 files changed, 90 insertions(+), 43 deletions(-) diff --git a/dlinfer/framework/lmdeploy_ext/cudagraph/maca_cudagraph.py b/dlinfer/framework/lmdeploy_ext/cudagraph/maca_cudagraph.py index 09ec0d0c..ef3c20b9 100644 --- a/dlinfer/framework/lmdeploy_ext/cudagraph/maca_cudagraph.py +++ b/dlinfer/framework/lmdeploy_ext/cudagraph/maca_cudagraph.py @@ -19,11 +19,11 @@ def MacaCudaGraphMixin_make_buffers_cudagraph( num_blocks = graph_meta.num_blocks device = graph_meta.device input_buffers: BuffType = dict() - input_buffers["input_ids"] = torch.empty( + input_buffers["input_ids"] = torch.zeros( 1, max_tokens, dtype=torch.int32, device=device ) - input_buffers["position_ids"] = torch.empty( + input_buffers["position_ids"] = torch.zeros( (1, max_tokens), dtype=torch.int32, device=device ) diff --git a/dlinfer/vendor/maca/maca_ops.py b/dlinfer/vendor/maca/maca_ops.py index 041fb736..4934e36c 100644 --- a/dlinfer/vendor/maca/maca_ops.py +++ b/dlinfer/vendor/maca/maca_ops.py @@ -12,6 +12,26 @@ from .fused_moe import fused_experts from .maca_extension import ops as maca_ext_ops +from mcoplib import lmdeploy as mcoplib_ops +from mcoplib import op as op_origin +import mcoplib._C +import mcoplib._moe_C + +env_value = os.getenv("MACA_LMDEPLOY_MCOPLIB_OPS", "true") +USE_MCOPLIB_OPS = env_value.lower() in ("true", "1", "yes", "on") + +# Select the ops library based on environment variable +if USE_MCOPLIB_OPS: + print(f"====>{USE_MCOPLIB_OPS}") + ops = mcoplib_ops + ops_name = "mcoplib_ops" +else: + ops = maca_ext_ops + ops_name = "maca_ext_ops" + +# Print environment variable value and selected ops library +print(f"[DLInfer] MACA_LMDEPLOY_MCOPLIB_OPS environment variable: {env_value} USE_MCOPLIB_OPS:{USE_MCOPLIB_OPS}") +print(f"[DLInfer] Using ops library: {ops_name}") __all__ = [ "add_rms_norm", @@ -58,7 +78,10 @@ def add_rms_norm( weight: Tensor, epsilon: float, ) -> Tuple[Tensor, Tensor]: - maca_ext_ops.fused_add_rms_norm(hidden_states, residual, weight, epsilon) + if USE_MCOPLIB_OPS: + torch.ops._C.fused_add_rms_norm(hidden_states, residual, weight, epsilon) + else: + ops.fused_add_rms_norm(hidden_states, residual, weight, epsilon) return hidden_states, residual @@ -76,16 +99,27 @@ def apply_rotary_pos_emb( query = query.flatten(-2, -1) key = key.flatten(-2, -1) rot_dim = cos.size(-1) + 
if USE_MCOPLIB_OPS: + ops.lmdeploy_rotary_embedding( + position_ids_1d, + query, + key, + head_size, + cos.view(-1, rot_dim), + sin.view(-1, rot_dim), + True, + ) + else: + ops.rotary_embedding( + position_ids_1d, + query, + key, + head_size, + cos.view(-1, rot_dim), + sin.view(-1, rot_dim), + True, + ) - maca_ext_ops.rotary_embedding( - position_ids_1d, - query, - key, - head_size, - cos.view(-1, rot_dim), - sin.view(-1, rot_dim), - True, - ) return query, key @@ -150,18 +184,22 @@ def prefill_attention( return attn_output # for cogvlm vl part. - if q_start_loc.size(0) == q_seq_len.size(0): + if query.size(-2) != num_q_heads: causal = False - #head_dim = query.size(-1) // num_q_heads - #query = query.view(-1, num_q_heads, head_dim) - #key = key.view(-1, num_kv_heads, head_dim) - #value = value.view(-1, num_kv_heads, head_dim) - #q_start_loc = torch.tensor( - # [0, q_seq_len.size(0) + 1], dtype=torch.int32, device=query.device - #) - q_start_loc = torch.cat((torch.tensor([0], dtype=torch.int32, device=query.device), q_seq_len.cumsum(0).to(torch.int32)), dim=0) - #softmax_scale = float(1 / math.sqrt(head_dim)) + head_dim = query.size(-1) // num_q_heads + query = query.view(-1, num_q_heads, head_dim) + key = key.view(-1, num_kv_heads, head_dim) + value = value.view(-1, num_kv_heads, head_dim) + q_start_loc = torch.tensor( + [0, q_seq_len], dtype=torch.int32, device=query.device + ) + softmax_scale = float(1 / math.sqrt(head_dim)) + # for qwen vl part. + if q_start_loc.shape[0] == q_seq_len.shape[0]: + causal = False + q_start_loc = torch.cat([q_start_loc, q_seq_len.sum().to(torch.int32).unsqueeze(0)]) + output = flash_attn_varlen_func( query, key, @@ -174,6 +212,7 @@ def prefill_attention( causal=causal, window_size=(-1, -1), ) + attn_output.copy_(output) return output @@ -201,16 +240,15 @@ def fill_kv_cache( quant_bits: int, ) -> Tuple[Tensor, Tensor]: kv_indices = kv_indices.squeeze(-1) - maca_ext_ops.reshape_and_cache_flash( - key, - value, - key_cache, - value_cache, - kv_indices, - "auto", - torch.tensor(1.0), - torch.tensor(1.0), - ) + k_scale = torch.tensor(1.0) + v_scale = torch.tensor(1.0) + + if USE_MCOPLIB_OPS: + torch.ops._C_cache_ops.reshape_and_cache_flash(key, value, key_cache, value_cache, kv_indices, "auto", k_scale, v_scale) + else: + ops.reshape_and_cache_flash( + key, value, key_cache, value_cache, kv_indices, "auto", k_scale, v_scale + ) return key_cache, value_cache @@ -239,8 +277,6 @@ def paged_decode_attention( num_kv_heads = value_cache.size(1) block_size = value_cache.size(-2) - output = torch.empty_like(query) - is_mla = query.size(-1) == 576 if is_mla: @@ -348,8 +384,10 @@ def rms_norm( hidden_states = hidden_states.to(torch.float32) weight = weight.to(torch.float32) output = torch.empty_like(hidden_states) - maca_ext_ops.rms_norm(output, hidden_states, weight, epsilon) - + if USE_MCOPLIB_OPS: + op_origin.rms_norm(output, hidden_states, weight, epsilon, None, None,False) + else: + ops.rms_norm(output, hidden_states, weight, epsilon) return output.to(input_dtype) @@ -367,12 +405,20 @@ def moe_gating_topk_softmax( token_expert_indicies = torch.empty_like(topk_ids) - maca_ext_ops.topk_softmax( - topk_weights, - topk_ids, - token_expert_indicies, - router_logits.float(), - ) + if USE_MCOPLIB_OPS: + torch.ops._moe_C.topk_softmax( + topk_weights, + topk_ids, + token_expert_indicies, + router_logits.float() + ) + else: + ops.topk_softmax( + topk_weights, + topk_ids, + token_expert_indicies, + router_logits.float(), + ) del token_expert_indicies # Not used. 
Will be used in the future. @@ -389,7 +435,8 @@ def silu_and_mul(x: Tensor, dim: int = -1) -> Tensor: d = x.shape[-1] // 2 output_shape = x.shape[:-1] + (d,) out = torch.empty(output_shape, dtype=x.dtype, device=x.device) - maca_ext_ops.silu_and_mul(out, x) + torch.ops._C.silu_and_mul(out, x) + #ops.silu_and_mul(out, x) return out From 2f1ffe3d8a79eaa832c14f9a82255d6d906a5499 Mon Sep 17 00:00:00 2001 From: WangQing <2917021186@qq.com> Date: Thu, 27 Nov 2025 11:18:54 +0800 Subject: [PATCH 3/8] [maca] refactor moe ops --- dlinfer/vendor/maca/fused_moe.py | 60 ++++++++++++++++++++++---------- 1 file changed, 41 insertions(+), 19 deletions(-) diff --git a/dlinfer/vendor/maca/fused_moe.py b/dlinfer/vendor/maca/fused_moe.py index 44fae5d2..d129aa57 100644 --- a/dlinfer/vendor/maca/fused_moe.py +++ b/dlinfer/vendor/maca/fused_moe.py @@ -4,7 +4,6 @@ import json import os from typing import Any, Callable, Dict, List, Optional, Tuple - import torch import triton import triton.language as tl @@ -13,6 +12,9 @@ import logging +env_value = os.getenv("MACA_LMDEPLOY_MCOPLIB_OPS", "yes") +USE_MCOPLIB_OPS = env_value.lower() in ("true", "1", "yes", "on") + logger = logging.getLogger(__name__) @@ -251,10 +253,14 @@ def moe_align_block_size( (max_num_m_blocks,), dtype=torch.int32, device=topk_ids.device ) num_tokens_post_pad = torch.empty((1), dtype=torch.int32, device=topk_ids.device) - - maca_ext_ops.moe_align_block_size( - topk_ids, num_experts, block_size, sorted_ids, expert_ids, num_tokens_post_pad - ) + if USE_MCOPLIB_OPS: + torch.ops._moe_C.moe_align_block_size( + topk_ids, num_experts, block_size, sorted_ids, expert_ids, num_tokens_post_pad + ) + else: + maca_ext_ops.moe_align_block_size( + topk_ids, num_experts, block_size, sorted_ids, expert_ids, num_tokens_post_pad + ) return sorted_ids, expert_ids, num_tokens_post_pad @@ -460,13 +466,20 @@ def fused_topk( token_expert_indicies = torch.empty( M, topk, dtype=torch.int32, device=hidden_states.device ) - - maca_ext_ops.topk_softmax( - topk_weights, - topk_ids, - token_expert_indicies, - gating_output.float(), # TODO(woosuk): Optimize this. - ) + if USE_MCOPLIB_OPS: + torch.ops._moe_C.topk_softmax( + topk_weights, + topk_ids, + token_expert_indicies, + gating_output.float(), # TODO(woosuk): Optimize this. + ) + else: + maca_ext_ops.topk_softmax( + topk_weights, + topk_ids, + token_expert_indicies, + gating_output.float(), # TODO(woosuk): Optimize this. + ) del token_expert_indicies # Not used. Will be used in the future. 
if renormalize: @@ -796,8 +809,12 @@ def fused_experts_impl( use_int8_w8a16=use_int8_w8a16, block_shape=block_shape, ) - - maca_ext_ops.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, N)) + if USE_MCOPLIB_OPS: + torch.ops._C.silu_and_mul( + intermediate_cache2, intermediate_cache1.view(-1, N) + ) + else: + maca_ext_ops.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, N)) invoke_fused_moe_kernel( intermediate_cache2, @@ -818,11 +835,16 @@ def fused_experts_impl( use_int8_w8a16=use_int8_w8a16, block_shape=block_shape, ) - - maca_ext_ops.moe_sum( - intermediate_cache3.view(*intermediate_cache3.shape), - out_hidden_states[begin_chunk_idx:end_chunk_idx], - ) + if USE_MCOPLIB_OPS: + torch.ops._moe_C.moe_sum( + intermediate_cache3.view(*intermediate_cache3.shape), + out_hidden_states[begin_chunk_idx:end_chunk_idx], + ) + else: + maca_ext_ops.moe_sum( + intermediate_cache3.view(*intermediate_cache3.shape), + out_hidden_states[begin_chunk_idx:end_chunk_idx], + ) return out_hidden_states From da2ef657ffc5e88ff396e3e76a0981794ab4fee0 Mon Sep 17 00:00:00 2001 From: WangQing <2917021186@qq.com> Date: Thu, 27 Nov 2025 11:20:47 +0800 Subject: [PATCH 4/8] [maca] fix paged prefill attention --- dlinfer/vendor/maca/maca_ops.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dlinfer/vendor/maca/maca_ops.py b/dlinfer/vendor/maca/maca_ops.py index 4934e36c..3ac99409 100644 --- a/dlinfer/vendor/maca/maca_ops.py +++ b/dlinfer/vendor/maca/maca_ops.py @@ -356,7 +356,8 @@ def paged_prefill_attention( ) return output[..., :512] - value_cache = value_cache.permute(0, 1, 3, 2) + value_cache = value_cache.permute(0, 2, 3, 1) + key_cache = key_cache.permute(0, 2, 3, 1) context_attention_fwd( query, key, @@ -436,7 +437,6 @@ def silu_and_mul(x: Tensor, dim: int = -1) -> Tensor: output_shape = x.shape[:-1] + (d,) out = torch.empty(output_shape, dtype=x.dtype, device=x.device) torch.ops._C.silu_and_mul(out, x) - #ops.silu_and_mul(out, x) return out From e80f7b52aef62eb9f460b1c37d7abb170c2d5cc6 Mon Sep 17 00:00:00 2001 From: WangQing <2917021186@qq.com> Date: Fri, 28 Nov 2025 14:36:14 +0800 Subject: [PATCH 5/8] format code --- dlinfer/framework/lmdeploy_ext/__init__.py | 1 + dlinfer/vendor/maca/fused_moe.py | 18 ++++++++++--- dlinfer/vendor/maca/maca_ops.py | 31 ++++++++++++---------- 3 files changed, 33 insertions(+), 17 deletions(-) diff --git a/dlinfer/framework/lmdeploy_ext/__init__.py b/dlinfer/framework/lmdeploy_ext/__init__.py index 6ea0f201..6363584f 100644 --- a/dlinfer/framework/lmdeploy_ext/__init__.py +++ b/dlinfer/framework/lmdeploy_ext/__init__.py @@ -1,5 +1,6 @@ # Copyright (c) 2024, DeepLink. All rights reserved. import dlinfer.framework.transformers_ext + # import dlinfer.framework.torch_npu_ext from . import quants from . 
import cudagraph diff --git a/dlinfer/vendor/maca/fused_moe.py b/dlinfer/vendor/maca/fused_moe.py index d129aa57..393fada3 100644 --- a/dlinfer/vendor/maca/fused_moe.py +++ b/dlinfer/vendor/maca/fused_moe.py @@ -255,11 +255,21 @@ def moe_align_block_size( num_tokens_post_pad = torch.empty((1), dtype=torch.int32, device=topk_ids.device) if USE_MCOPLIB_OPS: torch.ops._moe_C.moe_align_block_size( - topk_ids, num_experts, block_size, sorted_ids, expert_ids, num_tokens_post_pad + topk_ids, + num_experts, + block_size, + sorted_ids, + expert_ids, + num_tokens_post_pad, ) else: maca_ext_ops.moe_align_block_size( - topk_ids, num_experts, block_size, sorted_ids, expert_ids, num_tokens_post_pad + topk_ids, + num_experts, + block_size, + sorted_ids, + expert_ids, + num_tokens_post_pad, ) return sorted_ids, expert_ids, num_tokens_post_pad @@ -814,7 +824,9 @@ def fused_experts_impl( intermediate_cache2, intermediate_cache1.view(-1, N) ) else: - maca_ext_ops.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, N)) + maca_ext_ops.silu_and_mul( + intermediate_cache2, intermediate_cache1.view(-1, N) + ) invoke_fused_moe_kernel( intermediate_cache2, diff --git a/dlinfer/vendor/maca/maca_ops.py b/dlinfer/vendor/maca/maca_ops.py index 3ac99409..037a3079 100644 --- a/dlinfer/vendor/maca/maca_ops.py +++ b/dlinfer/vendor/maca/maca_ops.py @@ -30,7 +30,9 @@ ops_name = "maca_ext_ops" # Print environment variable value and selected ops library -print(f"[DLInfer] MACA_LMDEPLOY_MCOPLIB_OPS environment variable: {env_value} USE_MCOPLIB_OPS:{USE_MCOPLIB_OPS}") +print( + f"[DLInfer] MACA_LMDEPLOY_MCOPLIB_OPS environment variable: {env_value} USE_MCOPLIB_OPS:{USE_MCOPLIB_OPS}" +) print(f"[DLInfer] Using ops library: {ops_name}") __all__ = [ @@ -100,7 +102,7 @@ def apply_rotary_pos_emb( key = key.flatten(-2, -1) rot_dim = cos.size(-1) if USE_MCOPLIB_OPS: - ops.lmdeploy_rotary_embedding( + ops.lmdeploy_rotary_embedding( position_ids_1d, query, key, @@ -108,7 +110,7 @@ def apply_rotary_pos_emb( cos.view(-1, rot_dim), sin.view(-1, rot_dim), True, - ) + ) else: ops.rotary_embedding( position_ids_1d, @@ -118,7 +120,7 @@ def apply_rotary_pos_emb( cos.view(-1, rot_dim), sin.view(-1, rot_dim), True, - ) + ) return query, key @@ -198,8 +200,10 @@ def prefill_attention( # for qwen vl part. 
if q_start_loc.shape[0] == q_seq_len.shape[0]: causal = False - q_start_loc = torch.cat([q_start_loc, q_seq_len.sum().to(torch.int32).unsqueeze(0)]) - + q_start_loc = torch.cat( + [q_start_loc, q_seq_len.sum().to(torch.int32).unsqueeze(0)] + ) + output = flash_attn_varlen_func( query, key, @@ -242,9 +246,11 @@ def fill_kv_cache( kv_indices = kv_indices.squeeze(-1) k_scale = torch.tensor(1.0) v_scale = torch.tensor(1.0) - + if USE_MCOPLIB_OPS: - torch.ops._C_cache_ops.reshape_and_cache_flash(key, value, key_cache, value_cache, kv_indices, "auto", k_scale, v_scale) + torch.ops._C_cache_ops.reshape_and_cache_flash( + key, value, key_cache, value_cache, kv_indices, "auto", k_scale, v_scale + ) else: ops.reshape_and_cache_flash( key, value, key_cache, value_cache, kv_indices, "auto", k_scale, v_scale @@ -386,7 +392,7 @@ def rms_norm( weight = weight.to(torch.float32) output = torch.empty_like(hidden_states) if USE_MCOPLIB_OPS: - op_origin.rms_norm(output, hidden_states, weight, epsilon, None, None,False) + op_origin.rms_norm(output, hidden_states, weight, epsilon, None, None, False) else: ops.rms_norm(output, hidden_states, weight, epsilon) return output.to(input_dtype) @@ -408,11 +414,8 @@ def moe_gating_topk_softmax( if USE_MCOPLIB_OPS: torch.ops._moe_C.topk_softmax( - topk_weights, - topk_ids, - token_expert_indicies, - router_logits.float() - ) + topk_weights, topk_ids, token_expert_indicies, router_logits.float() + ) else: ops.topk_softmax( topk_weights, From 03fdbdcca8a4ef394549aba6762b43811e8aacdf Mon Sep 17 00:00:00 2001 From: WangQing <2917021186@qq.com> Date: Tue, 2 Dec 2025 14:26:10 +0800 Subject: [PATCH 6/8] format code --- CMakeLists.txt | 2 +- dlinfer/framework/lmdeploy_ext/__init__.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6ac8fe76..e2db297b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -27,4 +27,4 @@ endif() add_subdirectory(dlinfer/vendor/${DEVICE}) add_subdirectory(dlinfer/graph/dicp/vendor) -install(CODE "message(STATUS \"Install completed for device: ${DEVICE}\")") +install(CODE "message(STATUS \"Install completed for device: ${DEVICE}\")") diff --git a/dlinfer/framework/lmdeploy_ext/__init__.py b/dlinfer/framework/lmdeploy_ext/__init__.py index 6363584f..37372ac0 100644 --- a/dlinfer/framework/lmdeploy_ext/__init__.py +++ b/dlinfer/framework/lmdeploy_ext/__init__.py @@ -1,7 +1,6 @@ # Copyright (c) 2024, DeepLink. All rights reserved. import dlinfer.framework.transformers_ext -# import dlinfer.framework.torch_npu_ext from . import quants from . import cudagraph from . import device From d52cd805db443d65e0ba8962728a2c759be9f501 Mon Sep 17 00:00:00 2001 From: WangQing <2917021186@qq.com> Date: Wed, 3 Dec 2025 17:24:15 +0800 Subject: [PATCH 7/8] fix code --- dlinfer/framework/lmdeploy_ext/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dlinfer/framework/lmdeploy_ext/__init__.py b/dlinfer/framework/lmdeploy_ext/__init__.py index 37372ac0..2bb07e28 100644 --- a/dlinfer/framework/lmdeploy_ext/__init__.py +++ b/dlinfer/framework/lmdeploy_ext/__init__.py @@ -1,6 +1,6 @@ # Copyright (c) 2024, DeepLink. All rights reserved. import dlinfer.framework.transformers_ext - +import dlinfer.framework.torch_npu_ext from . import quants from . import cudagraph from . 
import device From c910b35f149f741d2dc9cf9e339ce03656f6e110 Mon Sep 17 00:00:00 2001 From: WangQing <2917021186@qq.com> Date: Fri, 5 Dec 2025 13:35:02 +0800 Subject: [PATCH 8/8] remove if USE_MCOPLIB_OPS --- dlinfer/vendor/maca/fused_moe.py | 74 +++++++-------------------- dlinfer/vendor/maca/maca_ops.py | 86 +++++++------------------------- 2 files changed, 37 insertions(+), 123 deletions(-) diff --git a/dlinfer/vendor/maca/fused_moe.py b/dlinfer/vendor/maca/fused_moe.py index 393fada3..f7b58aad 100644 --- a/dlinfer/vendor/maca/fused_moe.py +++ b/dlinfer/vendor/maca/fused_moe.py @@ -8,13 +8,8 @@ import triton import triton.language as tl -from .maca_extension import ops as maca_ext_ops - import logging -env_value = os.getenv("MACA_LMDEPLOY_MCOPLIB_OPS", "yes") -USE_MCOPLIB_OPS = env_value.lower() in ("true", "1", "yes", "on") - logger = logging.getLogger(__name__) @@ -253,24 +248,14 @@ def moe_align_block_size( (max_num_m_blocks,), dtype=torch.int32, device=topk_ids.device ) num_tokens_post_pad = torch.empty((1), dtype=torch.int32, device=topk_ids.device) - if USE_MCOPLIB_OPS: - torch.ops._moe_C.moe_align_block_size( - topk_ids, - num_experts, - block_size, - sorted_ids, - expert_ids, - num_tokens_post_pad, - ) - else: - maca_ext_ops.moe_align_block_size( - topk_ids, - num_experts, - block_size, - sorted_ids, - expert_ids, - num_tokens_post_pad, - ) + torch.ops._moe_C.moe_align_block_size( + topk_ids, + num_experts, + block_size, + sorted_ids, + expert_ids, + num_tokens_post_pad, + ) return sorted_ids, expert_ids, num_tokens_post_pad @@ -476,20 +461,12 @@ def fused_topk( token_expert_indicies = torch.empty( M, topk, dtype=torch.int32, device=hidden_states.device ) - if USE_MCOPLIB_OPS: - torch.ops._moe_C.topk_softmax( - topk_weights, - topk_ids, - token_expert_indicies, - gating_output.float(), # TODO(woosuk): Optimize this. - ) - else: - maca_ext_ops.topk_softmax( - topk_weights, - topk_ids, - token_expert_indicies, - gating_output.float(), # TODO(woosuk): Optimize this. - ) + torch.ops._moe_C.topk_softmax( + topk_weights, + topk_ids, + token_expert_indicies, + gating_output.float(), # TODO(woosuk): Optimize this. + ) del token_expert_indicies # Not used. Will be used in the future. 
if renormalize: @@ -819,14 +796,7 @@ def fused_experts_impl( use_int8_w8a16=use_int8_w8a16, block_shape=block_shape, ) - if USE_MCOPLIB_OPS: - torch.ops._C.silu_and_mul( - intermediate_cache2, intermediate_cache1.view(-1, N) - ) - else: - maca_ext_ops.silu_and_mul( - intermediate_cache2, intermediate_cache1.view(-1, N) - ) + torch.ops._C.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, N)) invoke_fused_moe_kernel( intermediate_cache2, @@ -847,16 +817,10 @@ def fused_experts_impl( use_int8_w8a16=use_int8_w8a16, block_shape=block_shape, ) - if USE_MCOPLIB_OPS: - torch.ops._moe_C.moe_sum( - intermediate_cache3.view(*intermediate_cache3.shape), - out_hidden_states[begin_chunk_idx:end_chunk_idx], - ) - else: - maca_ext_ops.moe_sum( - intermediate_cache3.view(*intermediate_cache3.shape), - out_hidden_states[begin_chunk_idx:end_chunk_idx], - ) + torch.ops._moe_C.moe_sum( + intermediate_cache3.view(*intermediate_cache3.shape), + out_hidden_states[begin_chunk_idx:end_chunk_idx], + ) return out_hidden_states diff --git a/dlinfer/vendor/maca/maca_ops.py b/dlinfer/vendor/maca/maca_ops.py index 037a3079..e3941896 100644 --- a/dlinfer/vendor/maca/maca_ops.py +++ b/dlinfer/vendor/maca/maca_ops.py @@ -11,30 +11,11 @@ from dlinfer.utils.type_annotation import Tensor, Optional, Sequence, Tuple from .fused_moe import fused_experts -from .maca_extension import ops as maca_ext_ops -from mcoplib import lmdeploy as mcoplib_ops +from mcoplib import lmdeploy as ops from mcoplib import op as op_origin import mcoplib._C import mcoplib._moe_C -env_value = os.getenv("MACA_LMDEPLOY_MCOPLIB_OPS", "true") -USE_MCOPLIB_OPS = env_value.lower() in ("true", "1", "yes", "on") - -# Select the ops library based on environment variable -if USE_MCOPLIB_OPS: - print(f"====>{USE_MCOPLIB_OPS}") - ops = mcoplib_ops - ops_name = "mcoplib_ops" -else: - ops = maca_ext_ops - ops_name = "maca_ext_ops" - -# Print environment variable value and selected ops library -print( - f"[DLInfer] MACA_LMDEPLOY_MCOPLIB_OPS environment variable: {env_value} USE_MCOPLIB_OPS:{USE_MCOPLIB_OPS}" -) -print(f"[DLInfer] Using ops library: {ops_name}") - __all__ = [ "add_rms_norm", "apply_rotary_pos_emb", @@ -80,10 +61,7 @@ def add_rms_norm( weight: Tensor, epsilon: float, ) -> Tuple[Tensor, Tensor]: - if USE_MCOPLIB_OPS: - torch.ops._C.fused_add_rms_norm(hidden_states, residual, weight, epsilon) - else: - ops.fused_add_rms_norm(hidden_states, residual, weight, epsilon) + torch.ops._C.fused_add_rms_norm(hidden_states, residual, weight, epsilon) return hidden_states, residual @@ -101,26 +79,15 @@ def apply_rotary_pos_emb( query = query.flatten(-2, -1) key = key.flatten(-2, -1) rot_dim = cos.size(-1) - if USE_MCOPLIB_OPS: - ops.lmdeploy_rotary_embedding( - position_ids_1d, - query, - key, - head_size, - cos.view(-1, rot_dim), - sin.view(-1, rot_dim), - True, - ) - else: - ops.rotary_embedding( - position_ids_1d, - query, - key, - head_size, - cos.view(-1, rot_dim), - sin.view(-1, rot_dim), - True, - ) + ops.lmdeploy_rotary_embedding( + position_ids_1d, + query, + key, + head_size, + cos.view(-1, rot_dim), + sin.view(-1, rot_dim), + True, + ) return query, key @@ -247,14 +214,9 @@ def fill_kv_cache( k_scale = torch.tensor(1.0) v_scale = torch.tensor(1.0) - if USE_MCOPLIB_OPS: - torch.ops._C_cache_ops.reshape_and_cache_flash( - key, value, key_cache, value_cache, kv_indices, "auto", k_scale, v_scale - ) - else: - ops.reshape_and_cache_flash( - key, value, key_cache, value_cache, kv_indices, "auto", k_scale, v_scale - ) + 
torch.ops._C_cache_ops.reshape_and_cache_flash( + key, value, key_cache, value_cache, kv_indices, "auto", k_scale, v_scale + ) return key_cache, value_cache @@ -391,10 +353,7 @@ def rms_norm( hidden_states = hidden_states.to(torch.float32) weight = weight.to(torch.float32) output = torch.empty_like(hidden_states) - if USE_MCOPLIB_OPS: - op_origin.rms_norm(output, hidden_states, weight, epsilon, None, None, False) - else: - ops.rms_norm(output, hidden_states, weight, epsilon) + op_origin.rms_norm(output, hidden_states, weight, epsilon, None, None, False) return output.to(input_dtype) @@ -412,18 +371,9 @@ def moe_gating_topk_softmax( token_expert_indicies = torch.empty_like(topk_ids) - if USE_MCOPLIB_OPS: - torch.ops._moe_C.topk_softmax( - topk_weights, topk_ids, token_expert_indicies, router_logits.float() - ) - else: - ops.topk_softmax( - topk_weights, - topk_ids, - token_expert_indicies, - router_logits.float(), - ) - + torch.ops._moe_C.topk_softmax( + topk_weights, topk_ids, token_expert_indicies, router_logits.float() + ) del token_expert_indicies # Not used. Will be used in the future. if renormalize:
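
---

Usage sketch: after PATCH 8/8 the MACA backend no longer goes through maca_ext_ops; it calls the kernels that mcoplib registers in the torch.ops namespaces (_C, _moe_C, _C_cache_ops) on import. The snippet below illustrates that dispatch path for silu_and_mul only. It is a minimal, hypothetical example, not part of this series, and assumes a MACA build with the mcoplib wheel installed and the device exposed as "cuda"; the tensor shapes and dtype are illustrative.

    import torch
    import mcoplib._C  # noqa: F401 -- importing registers the custom kernels under torch.ops._C

    # silu_and_mul expects the gate/up projections packed in the last dim:
    # the input has last dim 2*d and the output has last dim d
    # (see the d = x.shape[-1] // 2 logic in maca_ops.silu_and_mul).
    x = torch.randn(4, 2 * 128, dtype=torch.float16, device="cuda")
    out = torch.empty(4, 128, dtype=torch.float16, device="cuda")
    torch.ops._C.silu_and_mul(out, x)  # same call maca_ops.silu_and_mul issues after this series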