From 6494d9efcc57ad69bdbb9759e30bf1cce731637c Mon Sep 17 00:00:00 2001 From: yaofengchen Date: Fri, 7 Nov 2025 01:40:54 +0000 Subject: [PATCH 1/8] fix build err and run err --- CMakeLists.txt | 3 +++ dlinfer/framework/lmdeploy_ext/__init__.py | 2 +- dlinfer/vendor/maca/CMakeLists.txt | 1 + dlinfer/vendor/maca/maca_ops.py | 19 ++++++++++--------- 4 files changed, 15 insertions(+), 10 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ae2d9dc5..6ac8fe76 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -24,4 +24,7 @@ elseif(NOT DEVICE IN_LIST SUPPORTED_DEVICE) message(FATAL_ERROR "Device ${DEVICE} is not supported! Supported devices: ${SUPPORTED_DEVICE}") endif() +add_subdirectory(dlinfer/vendor/${DEVICE}) add_subdirectory(dlinfer/graph/dicp/vendor) + +install(CODE "message(STATUS \"Install completed for device: ${DEVICE}\")") diff --git a/dlinfer/framework/lmdeploy_ext/__init__.py b/dlinfer/framework/lmdeploy_ext/__init__.py index 2bb07e28..6ea0f201 100644 --- a/dlinfer/framework/lmdeploy_ext/__init__.py +++ b/dlinfer/framework/lmdeploy_ext/__init__.py @@ -1,6 +1,6 @@ # Copyright (c) 2024, DeepLink. All rights reserved. import dlinfer.framework.transformers_ext -import dlinfer.framework.torch_npu_ext +# import dlinfer.framework.torch_npu_ext from . import quants from . import cudagraph from . import device diff --git a/dlinfer/vendor/maca/CMakeLists.txt b/dlinfer/vendor/maca/CMakeLists.txt index 26d58f54..de03e3ed 100644 --- a/dlinfer/vendor/maca/CMakeLists.txt +++ b/dlinfer/vendor/maca/CMakeLists.txt @@ -35,6 +35,7 @@ ExternalProject_Add(${MACA_SUB_MODULE} BUILD_ALWAYS ON USES_TERMINAL_BUILD ON USES_TERMINAL_INSTALL ON + INSTALL_COMMAND "" CMAKE_ARGS "-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}" "-DCMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH}" diff --git a/dlinfer/vendor/maca/maca_ops.py b/dlinfer/vendor/maca/maca_ops.py index 15bfa7bb..041fb736 100644 --- a/dlinfer/vendor/maca/maca_ops.py +++ b/dlinfer/vendor/maca/maca_ops.py @@ -150,16 +150,17 @@ def prefill_attention( return attn_output # for cogvlm vl part. 
- if query.size(-2) != num_q_heads: + if q_start_loc.size(0) == q_seq_len.size(0): causal = False - head_dim = query.size(-1) // num_q_heads - query = query.view(-1, num_q_heads, head_dim) - key = key.view(-1, num_kv_heads, head_dim) - value = value.view(-1, num_kv_heads, head_dim) - q_start_loc = torch.tensor( - [0, q_seq_len], dtype=torch.int32, device=query.device - ) - softmax_scale = float(1 / math.sqrt(head_dim)) + #head_dim = query.size(-1) // num_q_heads + #query = query.view(-1, num_q_heads, head_dim) + #key = key.view(-1, num_kv_heads, head_dim) + #value = value.view(-1, num_kv_heads, head_dim) + #q_start_loc = torch.tensor( + # [0, q_seq_len.size(0) + 1], dtype=torch.int32, device=query.device + #) + q_start_loc = torch.cat((torch.tensor([0], dtype=torch.int32, device=query.device), q_seq_len.cumsum(0).to(torch.int32)), dim=0) + #softmax_scale = float(1 / math.sqrt(head_dim)) output = flash_attn_varlen_func( query, From fc3375ae2798834110732fd67c1ac5ac7d4ce9b0 Mon Sep 17 00:00:00 2001 From: WangQing <2917021186@qq.com> Date: Mon, 24 Nov 2025 12:56:21 +0800 Subject: [PATCH 2/8] fix qwenvl precision --- .../lmdeploy_ext/cudagraph/maca_cudagraph.py | 4 +- dlinfer/vendor/maca/maca_ops.py | 129 ++++++++++++------ 2 files changed, 90 insertions(+), 43 deletions(-) diff --git a/dlinfer/framework/lmdeploy_ext/cudagraph/maca_cudagraph.py b/dlinfer/framework/lmdeploy_ext/cudagraph/maca_cudagraph.py index 09ec0d0c..ef3c20b9 100644 --- a/dlinfer/framework/lmdeploy_ext/cudagraph/maca_cudagraph.py +++ b/dlinfer/framework/lmdeploy_ext/cudagraph/maca_cudagraph.py @@ -19,11 +19,11 @@ def MacaCudaGraphMixin_make_buffers_cudagraph( num_blocks = graph_meta.num_blocks device = graph_meta.device input_buffers: BuffType = dict() - input_buffers["input_ids"] = torch.empty( + input_buffers["input_ids"] = torch.zeros( 1, max_tokens, dtype=torch.int32, device=device ) - input_buffers["position_ids"] = torch.empty( + input_buffers["position_ids"] = torch.zeros( (1, max_tokens), dtype=torch.int32, device=device ) diff --git a/dlinfer/vendor/maca/maca_ops.py b/dlinfer/vendor/maca/maca_ops.py index 041fb736..4934e36c 100644 --- a/dlinfer/vendor/maca/maca_ops.py +++ b/dlinfer/vendor/maca/maca_ops.py @@ -12,6 +12,26 @@ from .fused_moe import fused_experts from .maca_extension import ops as maca_ext_ops +from mcoplib import lmdeploy as mcoplib_ops +from mcoplib import op as op_origin +import mcoplib._C +import mcoplib._moe_C + +env_value = os.getenv("MACA_LMDEPLOY_MCOPLIB_OPS", "true") +USE_MCOPLIB_OPS = env_value.lower() in ("true", "1", "yes", "on") + +# Select the ops library based on environment variable +if USE_MCOPLIB_OPS: + print(f"====>{USE_MCOPLIB_OPS}") + ops = mcoplib_ops + ops_name = "mcoplib_ops" +else: + ops = maca_ext_ops + ops_name = "maca_ext_ops" + +# Print environment variable value and selected ops library +print(f"[DLInfer] MACA_LMDEPLOY_MCOPLIB_OPS environment variable: {env_value} USE_MCOPLIB_OPS:{USE_MCOPLIB_OPS}") +print(f"[DLInfer] Using ops library: {ops_name}") __all__ = [ "add_rms_norm", @@ -58,7 +78,10 @@ def add_rms_norm( weight: Tensor, epsilon: float, ) -> Tuple[Tensor, Tensor]: - maca_ext_ops.fused_add_rms_norm(hidden_states, residual, weight, epsilon) + if USE_MCOPLIB_OPS: + torch.ops._C.fused_add_rms_norm(hidden_states, residual, weight, epsilon) + else: + ops.fused_add_rms_norm(hidden_states, residual, weight, epsilon) return hidden_states, residual @@ -76,16 +99,27 @@ def apply_rotary_pos_emb( query = query.flatten(-2, -1) key = key.flatten(-2, -1) rot_dim = cos.size(-1) + 
if USE_MCOPLIB_OPS: + ops.lmdeploy_rotary_embedding( + position_ids_1d, + query, + key, + head_size, + cos.view(-1, rot_dim), + sin.view(-1, rot_dim), + True, + ) + else: + ops.rotary_embedding( + position_ids_1d, + query, + key, + head_size, + cos.view(-1, rot_dim), + sin.view(-1, rot_dim), + True, + ) - maca_ext_ops.rotary_embedding( - position_ids_1d, - query, - key, - head_size, - cos.view(-1, rot_dim), - sin.view(-1, rot_dim), - True, - ) return query, key @@ -150,18 +184,22 @@ def prefill_attention( return attn_output # for cogvlm vl part. - if q_start_loc.size(0) == q_seq_len.size(0): + if query.size(-2) != num_q_heads: causal = False - #head_dim = query.size(-1) // num_q_heads - #query = query.view(-1, num_q_heads, head_dim) - #key = key.view(-1, num_kv_heads, head_dim) - #value = value.view(-1, num_kv_heads, head_dim) - #q_start_loc = torch.tensor( - # [0, q_seq_len.size(0) + 1], dtype=torch.int32, device=query.device - #) - q_start_loc = torch.cat((torch.tensor([0], dtype=torch.int32, device=query.device), q_seq_len.cumsum(0).to(torch.int32)), dim=0) - #softmax_scale = float(1 / math.sqrt(head_dim)) + head_dim = query.size(-1) // num_q_heads + query = query.view(-1, num_q_heads, head_dim) + key = key.view(-1, num_kv_heads, head_dim) + value = value.view(-1, num_kv_heads, head_dim) + q_start_loc = torch.tensor( + [0, q_seq_len], dtype=torch.int32, device=query.device + ) + softmax_scale = float(1 / math.sqrt(head_dim)) + # for qwen vl part. + if q_start_loc.shape[0] == q_seq_len.shape[0]: + causal = False + q_start_loc = torch.cat([q_start_loc, q_seq_len.sum().to(torch.int32).unsqueeze(0)]) + output = flash_attn_varlen_func( query, key, @@ -174,6 +212,7 @@ def prefill_attention( causal=causal, window_size=(-1, -1), ) + attn_output.copy_(output) return output @@ -201,16 +240,15 @@ def fill_kv_cache( quant_bits: int, ) -> Tuple[Tensor, Tensor]: kv_indices = kv_indices.squeeze(-1) - maca_ext_ops.reshape_and_cache_flash( - key, - value, - key_cache, - value_cache, - kv_indices, - "auto", - torch.tensor(1.0), - torch.tensor(1.0), - ) + k_scale = torch.tensor(1.0) + v_scale = torch.tensor(1.0) + + if USE_MCOPLIB_OPS: + torch.ops._C_cache_ops.reshape_and_cache_flash(key, value, key_cache, value_cache, kv_indices, "auto", k_scale, v_scale) + else: + ops.reshape_and_cache_flash( + key, value, key_cache, value_cache, kv_indices, "auto", k_scale, v_scale + ) return key_cache, value_cache @@ -239,8 +277,6 @@ def paged_decode_attention( num_kv_heads = value_cache.size(1) block_size = value_cache.size(-2) - output = torch.empty_like(query) - is_mla = query.size(-1) == 576 if is_mla: @@ -348,8 +384,10 @@ def rms_norm( hidden_states = hidden_states.to(torch.float32) weight = weight.to(torch.float32) output = torch.empty_like(hidden_states) - maca_ext_ops.rms_norm(output, hidden_states, weight, epsilon) - + if USE_MCOPLIB_OPS: + op_origin.rms_norm(output, hidden_states, weight, epsilon, None, None,False) + else: + ops.rms_norm(output, hidden_states, weight, epsilon) return output.to(input_dtype) @@ -367,12 +405,20 @@ def moe_gating_topk_softmax( token_expert_indicies = torch.empty_like(topk_ids) - maca_ext_ops.topk_softmax( - topk_weights, - topk_ids, - token_expert_indicies, - router_logits.float(), - ) + if USE_MCOPLIB_OPS: + torch.ops._moe_C.topk_softmax( + topk_weights, + topk_ids, + token_expert_indicies, + router_logits.float() + ) + else: + ops.topk_softmax( + topk_weights, + topk_ids, + token_expert_indicies, + router_logits.float(), + ) del token_expert_indicies # Not used. 
Will be used in the future. @@ -389,7 +435,8 @@ def silu_and_mul(x: Tensor, dim: int = -1) -> Tensor: d = x.shape[-1] // 2 output_shape = x.shape[:-1] + (d,) out = torch.empty(output_shape, dtype=x.dtype, device=x.device) - maca_ext_ops.silu_and_mul(out, x) + torch.ops._C.silu_and_mul(out, x) + #ops.silu_and_mul(out, x) return out From 2f1ffe3d8a79eaa832c14f9a82255d6d906a5499 Mon Sep 17 00:00:00 2001 From: WangQing <2917021186@qq.com> Date: Thu, 27 Nov 2025 11:18:54 +0800 Subject: [PATCH 3/8] [maca] refactor moe ops --- dlinfer/vendor/maca/fused_moe.py | 60 ++++++++++++++++++++++---------- 1 file changed, 41 insertions(+), 19 deletions(-) diff --git a/dlinfer/vendor/maca/fused_moe.py b/dlinfer/vendor/maca/fused_moe.py index 44fae5d2..d129aa57 100644 --- a/dlinfer/vendor/maca/fused_moe.py +++ b/dlinfer/vendor/maca/fused_moe.py @@ -4,7 +4,6 @@ import json import os from typing import Any, Callable, Dict, List, Optional, Tuple - import torch import triton import triton.language as tl @@ -13,6 +12,9 @@ import logging +env_value = os.getenv("MACA_LMDEPLOY_MCOPLIB_OPS", "yes") +USE_MCOPLIB_OPS = env_value.lower() in ("true", "1", "yes", "on") + logger = logging.getLogger(__name__) @@ -251,10 +253,14 @@ def moe_align_block_size( (max_num_m_blocks,), dtype=torch.int32, device=topk_ids.device ) num_tokens_post_pad = torch.empty((1), dtype=torch.int32, device=topk_ids.device) - - maca_ext_ops.moe_align_block_size( - topk_ids, num_experts, block_size, sorted_ids, expert_ids, num_tokens_post_pad - ) + if USE_MCOPLIB_OPS: + torch.ops._moe_C.moe_align_block_size( + topk_ids, num_experts, block_size, sorted_ids, expert_ids, num_tokens_post_pad + ) + else: + maca_ext_ops.moe_align_block_size( + topk_ids, num_experts, block_size, sorted_ids, expert_ids, num_tokens_post_pad + ) return sorted_ids, expert_ids, num_tokens_post_pad @@ -460,13 +466,20 @@ def fused_topk( token_expert_indicies = torch.empty( M, topk, dtype=torch.int32, device=hidden_states.device ) - - maca_ext_ops.topk_softmax( - topk_weights, - topk_ids, - token_expert_indicies, - gating_output.float(), # TODO(woosuk): Optimize this. - ) + if USE_MCOPLIB_OPS: + torch.ops._moe_C.topk_softmax( + topk_weights, + topk_ids, + token_expert_indicies, + gating_output.float(), # TODO(woosuk): Optimize this. + ) + else: + maca_ext_ops.topk_softmax( + topk_weights, + topk_ids, + token_expert_indicies, + gating_output.float(), # TODO(woosuk): Optimize this. + ) del token_expert_indicies # Not used. Will be used in the future. 
if renormalize: @@ -796,8 +809,12 @@ def fused_experts_impl( use_int8_w8a16=use_int8_w8a16, block_shape=block_shape, ) - - maca_ext_ops.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, N)) + if USE_MCOPLIB_OPS: + torch.ops._C.silu_and_mul( + intermediate_cache2, intermediate_cache1.view(-1, N) + ) + else: + maca_ext_ops.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, N)) invoke_fused_moe_kernel( intermediate_cache2, @@ -818,11 +835,16 @@ def fused_experts_impl( use_int8_w8a16=use_int8_w8a16, block_shape=block_shape, ) - - maca_ext_ops.moe_sum( - intermediate_cache3.view(*intermediate_cache3.shape), - out_hidden_states[begin_chunk_idx:end_chunk_idx], - ) + if USE_MCOPLIB_OPS: + torch.ops._moe_C.moe_sum( + intermediate_cache3.view(*intermediate_cache3.shape), + out_hidden_states[begin_chunk_idx:end_chunk_idx], + ) + else: + maca_ext_ops.moe_sum( + intermediate_cache3.view(*intermediate_cache3.shape), + out_hidden_states[begin_chunk_idx:end_chunk_idx], + ) return out_hidden_states From da2ef657ffc5e88ff396e3e76a0981794ab4fee0 Mon Sep 17 00:00:00 2001 From: WangQing <2917021186@qq.com> Date: Thu, 27 Nov 2025 11:20:47 +0800 Subject: [PATCH 4/8] [maca] fix paged prefill attention --- dlinfer/vendor/maca/maca_ops.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dlinfer/vendor/maca/maca_ops.py b/dlinfer/vendor/maca/maca_ops.py index 4934e36c..3ac99409 100644 --- a/dlinfer/vendor/maca/maca_ops.py +++ b/dlinfer/vendor/maca/maca_ops.py @@ -356,7 +356,8 @@ def paged_prefill_attention( ) return output[..., :512] - value_cache = value_cache.permute(0, 1, 3, 2) + value_cache = value_cache.permute(0, 2, 3, 1) + key_cache = key_cache.permute(0, 2, 3, 1) context_attention_fwd( query, key, @@ -436,7 +437,6 @@ def silu_and_mul(x: Tensor, dim: int = -1) -> Tensor: output_shape = x.shape[:-1] + (d,) out = torch.empty(output_shape, dtype=x.dtype, device=x.device) torch.ops._C.silu_and_mul(out, x) - #ops.silu_and_mul(out, x) return out From e80f7b52aef62eb9f460b1c37d7abb170c2d5cc6 Mon Sep 17 00:00:00 2001 From: WangQing <2917021186@qq.com> Date: Fri, 28 Nov 2025 14:36:14 +0800 Subject: [PATCH 5/8] format code --- dlinfer/framework/lmdeploy_ext/__init__.py | 1 + dlinfer/vendor/maca/fused_moe.py | 18 ++++++++++--- dlinfer/vendor/maca/maca_ops.py | 31 ++++++++++++---------- 3 files changed, 33 insertions(+), 17 deletions(-) diff --git a/dlinfer/framework/lmdeploy_ext/__init__.py b/dlinfer/framework/lmdeploy_ext/__init__.py index 6ea0f201..6363584f 100644 --- a/dlinfer/framework/lmdeploy_ext/__init__.py +++ b/dlinfer/framework/lmdeploy_ext/__init__.py @@ -1,5 +1,6 @@ # Copyright (c) 2024, DeepLink. All rights reserved. import dlinfer.framework.transformers_ext + # import dlinfer.framework.torch_npu_ext from . import quants from . 
import cudagraph diff --git a/dlinfer/vendor/maca/fused_moe.py b/dlinfer/vendor/maca/fused_moe.py index d129aa57..393fada3 100644 --- a/dlinfer/vendor/maca/fused_moe.py +++ b/dlinfer/vendor/maca/fused_moe.py @@ -255,11 +255,21 @@ def moe_align_block_size( num_tokens_post_pad = torch.empty((1), dtype=torch.int32, device=topk_ids.device) if USE_MCOPLIB_OPS: torch.ops._moe_C.moe_align_block_size( - topk_ids, num_experts, block_size, sorted_ids, expert_ids, num_tokens_post_pad + topk_ids, + num_experts, + block_size, + sorted_ids, + expert_ids, + num_tokens_post_pad, ) else: maca_ext_ops.moe_align_block_size( - topk_ids, num_experts, block_size, sorted_ids, expert_ids, num_tokens_post_pad + topk_ids, + num_experts, + block_size, + sorted_ids, + expert_ids, + num_tokens_post_pad, ) return sorted_ids, expert_ids, num_tokens_post_pad @@ -814,7 +824,9 @@ def fused_experts_impl( intermediate_cache2, intermediate_cache1.view(-1, N) ) else: - maca_ext_ops.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, N)) + maca_ext_ops.silu_and_mul( + intermediate_cache2, intermediate_cache1.view(-1, N) + ) invoke_fused_moe_kernel( intermediate_cache2, diff --git a/dlinfer/vendor/maca/maca_ops.py b/dlinfer/vendor/maca/maca_ops.py index 3ac99409..037a3079 100644 --- a/dlinfer/vendor/maca/maca_ops.py +++ b/dlinfer/vendor/maca/maca_ops.py @@ -30,7 +30,9 @@ ops_name = "maca_ext_ops" # Print environment variable value and selected ops library -print(f"[DLInfer] MACA_LMDEPLOY_MCOPLIB_OPS environment variable: {env_value} USE_MCOPLIB_OPS:{USE_MCOPLIB_OPS}") +print( + f"[DLInfer] MACA_LMDEPLOY_MCOPLIB_OPS environment variable: {env_value} USE_MCOPLIB_OPS:{USE_MCOPLIB_OPS}" +) print(f"[DLInfer] Using ops library: {ops_name}") __all__ = [ @@ -100,7 +102,7 @@ def apply_rotary_pos_emb( key = key.flatten(-2, -1) rot_dim = cos.size(-1) if USE_MCOPLIB_OPS: - ops.lmdeploy_rotary_embedding( + ops.lmdeploy_rotary_embedding( position_ids_1d, query, key, @@ -108,7 +110,7 @@ def apply_rotary_pos_emb( cos.view(-1, rot_dim), sin.view(-1, rot_dim), True, - ) + ) else: ops.rotary_embedding( position_ids_1d, @@ -118,7 +120,7 @@ def apply_rotary_pos_emb( cos.view(-1, rot_dim), sin.view(-1, rot_dim), True, - ) + ) return query, key @@ -198,8 +200,10 @@ def prefill_attention( # for qwen vl part. 
if q_start_loc.shape[0] == q_seq_len.shape[0]: causal = False - q_start_loc = torch.cat([q_start_loc, q_seq_len.sum().to(torch.int32).unsqueeze(0)]) - + q_start_loc = torch.cat( + [q_start_loc, q_seq_len.sum().to(torch.int32).unsqueeze(0)] + ) + output = flash_attn_varlen_func( query, key, @@ -242,9 +246,11 @@ def fill_kv_cache( kv_indices = kv_indices.squeeze(-1) k_scale = torch.tensor(1.0) v_scale = torch.tensor(1.0) - + if USE_MCOPLIB_OPS: - torch.ops._C_cache_ops.reshape_and_cache_flash(key, value, key_cache, value_cache, kv_indices, "auto", k_scale, v_scale) + torch.ops._C_cache_ops.reshape_and_cache_flash( + key, value, key_cache, value_cache, kv_indices, "auto", k_scale, v_scale + ) else: ops.reshape_and_cache_flash( key, value, key_cache, value_cache, kv_indices, "auto", k_scale, v_scale @@ -386,7 +392,7 @@ def rms_norm( weight = weight.to(torch.float32) output = torch.empty_like(hidden_states) if USE_MCOPLIB_OPS: - op_origin.rms_norm(output, hidden_states, weight, epsilon, None, None,False) + op_origin.rms_norm(output, hidden_states, weight, epsilon, None, None, False) else: ops.rms_norm(output, hidden_states, weight, epsilon) return output.to(input_dtype) @@ -408,11 +414,8 @@ def moe_gating_topk_softmax( if USE_MCOPLIB_OPS: torch.ops._moe_C.topk_softmax( - topk_weights, - topk_ids, - token_expert_indicies, - router_logits.float() - ) + topk_weights, topk_ids, token_expert_indicies, router_logits.float() + ) else: ops.topk_softmax( topk_weights, From 03fdbdcca8a4ef394549aba6762b43811e8aacdf Mon Sep 17 00:00:00 2001 From: WangQing <2917021186@qq.com> Date: Tue, 2 Dec 2025 14:26:10 +0800 Subject: [PATCH 6/8] format code --- CMakeLists.txt | 2 +- dlinfer/framework/lmdeploy_ext/__init__.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6ac8fe76..e2db297b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -27,4 +27,4 @@ endif() add_subdirectory(dlinfer/vendor/${DEVICE}) add_subdirectory(dlinfer/graph/dicp/vendor) -install(CODE "message(STATUS \"Install completed for device: ${DEVICE}\")") +install(CODE "message(STATUS \"Install completed for device: ${DEVICE}\")") diff --git a/dlinfer/framework/lmdeploy_ext/__init__.py b/dlinfer/framework/lmdeploy_ext/__init__.py index 6363584f..37372ac0 100644 --- a/dlinfer/framework/lmdeploy_ext/__init__.py +++ b/dlinfer/framework/lmdeploy_ext/__init__.py @@ -1,7 +1,6 @@ # Copyright (c) 2024, DeepLink. All rights reserved. import dlinfer.framework.transformers_ext -# import dlinfer.framework.torch_npu_ext from . import quants from . import cudagraph from . import device From d52cd805db443d65e0ba8962728a2c759be9f501 Mon Sep 17 00:00:00 2001 From: WangQing <2917021186@qq.com> Date: Wed, 3 Dec 2025 17:24:15 +0800 Subject: [PATCH 7/8] fix code --- dlinfer/framework/lmdeploy_ext/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dlinfer/framework/lmdeploy_ext/__init__.py b/dlinfer/framework/lmdeploy_ext/__init__.py index 37372ac0..2bb07e28 100644 --- a/dlinfer/framework/lmdeploy_ext/__init__.py +++ b/dlinfer/framework/lmdeploy_ext/__init__.py @@ -1,6 +1,6 @@ # Copyright (c) 2024, DeepLink. All rights reserved. import dlinfer.framework.transformers_ext - +import dlinfer.framework.torch_npu_ext from . import quants from . import cudagraph from . 
import device From c910b35f149f741d2dc9cf9e339ce03656f6e110 Mon Sep 17 00:00:00 2001 From: WangQing <2917021186@qq.com> Date: Fri, 5 Dec 2025 13:35:02 +0800 Subject: [PATCH 8/8] remove if USE_MCOPLIB_OPS --- dlinfer/vendor/maca/fused_moe.py | 74 +++++++-------------------- dlinfer/vendor/maca/maca_ops.py | 86 +++++++------------------------- 2 files changed, 37 insertions(+), 123 deletions(-) diff --git a/dlinfer/vendor/maca/fused_moe.py b/dlinfer/vendor/maca/fused_moe.py index 393fada3..f7b58aad 100644 --- a/dlinfer/vendor/maca/fused_moe.py +++ b/dlinfer/vendor/maca/fused_moe.py @@ -8,13 +8,8 @@ import triton import triton.language as tl -from .maca_extension import ops as maca_ext_ops - import logging -env_value = os.getenv("MACA_LMDEPLOY_MCOPLIB_OPS", "yes") -USE_MCOPLIB_OPS = env_value.lower() in ("true", "1", "yes", "on") - logger = logging.getLogger(__name__) @@ -253,24 +248,14 @@ def moe_align_block_size( (max_num_m_blocks,), dtype=torch.int32, device=topk_ids.device ) num_tokens_post_pad = torch.empty((1), dtype=torch.int32, device=topk_ids.device) - if USE_MCOPLIB_OPS: - torch.ops._moe_C.moe_align_block_size( - topk_ids, - num_experts, - block_size, - sorted_ids, - expert_ids, - num_tokens_post_pad, - ) - else: - maca_ext_ops.moe_align_block_size( - topk_ids, - num_experts, - block_size, - sorted_ids, - expert_ids, - num_tokens_post_pad, - ) + torch.ops._moe_C.moe_align_block_size( + topk_ids, + num_experts, + block_size, + sorted_ids, + expert_ids, + num_tokens_post_pad, + ) return sorted_ids, expert_ids, num_tokens_post_pad @@ -476,20 +461,12 @@ def fused_topk( token_expert_indicies = torch.empty( M, topk, dtype=torch.int32, device=hidden_states.device ) - if USE_MCOPLIB_OPS: - torch.ops._moe_C.topk_softmax( - topk_weights, - topk_ids, - token_expert_indicies, - gating_output.float(), # TODO(woosuk): Optimize this. - ) - else: - maca_ext_ops.topk_softmax( - topk_weights, - topk_ids, - token_expert_indicies, - gating_output.float(), # TODO(woosuk): Optimize this. - ) + torch.ops._moe_C.topk_softmax( + topk_weights, + topk_ids, + token_expert_indicies, + gating_output.float(), # TODO(woosuk): Optimize this. + ) del token_expert_indicies # Not used. Will be used in the future. 
if renormalize: @@ -819,14 +796,7 @@ def fused_experts_impl( use_int8_w8a16=use_int8_w8a16, block_shape=block_shape, ) - if USE_MCOPLIB_OPS: - torch.ops._C.silu_and_mul( - intermediate_cache2, intermediate_cache1.view(-1, N) - ) - else: - maca_ext_ops.silu_and_mul( - intermediate_cache2, intermediate_cache1.view(-1, N) - ) + torch.ops._C.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, N)) invoke_fused_moe_kernel( intermediate_cache2, @@ -847,16 +817,10 @@ def fused_experts_impl( use_int8_w8a16=use_int8_w8a16, block_shape=block_shape, ) - if USE_MCOPLIB_OPS: - torch.ops._moe_C.moe_sum( - intermediate_cache3.view(*intermediate_cache3.shape), - out_hidden_states[begin_chunk_idx:end_chunk_idx], - ) - else: - maca_ext_ops.moe_sum( - intermediate_cache3.view(*intermediate_cache3.shape), - out_hidden_states[begin_chunk_idx:end_chunk_idx], - ) + torch.ops._moe_C.moe_sum( + intermediate_cache3.view(*intermediate_cache3.shape), + out_hidden_states[begin_chunk_idx:end_chunk_idx], + ) return out_hidden_states diff --git a/dlinfer/vendor/maca/maca_ops.py b/dlinfer/vendor/maca/maca_ops.py index 037a3079..e3941896 100644 --- a/dlinfer/vendor/maca/maca_ops.py +++ b/dlinfer/vendor/maca/maca_ops.py @@ -11,30 +11,11 @@ from dlinfer.utils.type_annotation import Tensor, Optional, Sequence, Tuple from .fused_moe import fused_experts -from .maca_extension import ops as maca_ext_ops -from mcoplib import lmdeploy as mcoplib_ops +from mcoplib import lmdeploy as ops from mcoplib import op as op_origin import mcoplib._C import mcoplib._moe_C -env_value = os.getenv("MACA_LMDEPLOY_MCOPLIB_OPS", "true") -USE_MCOPLIB_OPS = env_value.lower() in ("true", "1", "yes", "on") - -# Select the ops library based on environment variable -if USE_MCOPLIB_OPS: - print(f"====>{USE_MCOPLIB_OPS}") - ops = mcoplib_ops - ops_name = "mcoplib_ops" -else: - ops = maca_ext_ops - ops_name = "maca_ext_ops" - -# Print environment variable value and selected ops library -print( - f"[DLInfer] MACA_LMDEPLOY_MCOPLIB_OPS environment variable: {env_value} USE_MCOPLIB_OPS:{USE_MCOPLIB_OPS}" -) -print(f"[DLInfer] Using ops library: {ops_name}") - __all__ = [ "add_rms_norm", "apply_rotary_pos_emb", @@ -80,10 +61,7 @@ def add_rms_norm( weight: Tensor, epsilon: float, ) -> Tuple[Tensor, Tensor]: - if USE_MCOPLIB_OPS: - torch.ops._C.fused_add_rms_norm(hidden_states, residual, weight, epsilon) - else: - ops.fused_add_rms_norm(hidden_states, residual, weight, epsilon) + torch.ops._C.fused_add_rms_norm(hidden_states, residual, weight, epsilon) return hidden_states, residual @@ -101,26 +79,15 @@ def apply_rotary_pos_emb( query = query.flatten(-2, -1) key = key.flatten(-2, -1) rot_dim = cos.size(-1) - if USE_MCOPLIB_OPS: - ops.lmdeploy_rotary_embedding( - position_ids_1d, - query, - key, - head_size, - cos.view(-1, rot_dim), - sin.view(-1, rot_dim), - True, - ) - else: - ops.rotary_embedding( - position_ids_1d, - query, - key, - head_size, - cos.view(-1, rot_dim), - sin.view(-1, rot_dim), - True, - ) + ops.lmdeploy_rotary_embedding( + position_ids_1d, + query, + key, + head_size, + cos.view(-1, rot_dim), + sin.view(-1, rot_dim), + True, + ) return query, key @@ -247,14 +214,9 @@ def fill_kv_cache( k_scale = torch.tensor(1.0) v_scale = torch.tensor(1.0) - if USE_MCOPLIB_OPS: - torch.ops._C_cache_ops.reshape_and_cache_flash( - key, value, key_cache, value_cache, kv_indices, "auto", k_scale, v_scale - ) - else: - ops.reshape_and_cache_flash( - key, value, key_cache, value_cache, kv_indices, "auto", k_scale, v_scale - ) + 
torch.ops._C_cache_ops.reshape_and_cache_flash( + key, value, key_cache, value_cache, kv_indices, "auto", k_scale, v_scale + ) return key_cache, value_cache @@ -391,10 +353,7 @@ def rms_norm( hidden_states = hidden_states.to(torch.float32) weight = weight.to(torch.float32) output = torch.empty_like(hidden_states) - if USE_MCOPLIB_OPS: - op_origin.rms_norm(output, hidden_states, weight, epsilon, None, None, False) - else: - ops.rms_norm(output, hidden_states, weight, epsilon) + op_origin.rms_norm(output, hidden_states, weight, epsilon, None, None, False) return output.to(input_dtype) @@ -412,18 +371,9 @@ def moe_gating_topk_softmax( token_expert_indicies = torch.empty_like(topk_ids) - if USE_MCOPLIB_OPS: - torch.ops._moe_C.topk_softmax( - topk_weights, topk_ids, token_expert_indicies, router_logits.float() - ) - else: - ops.topk_softmax( - topk_weights, - topk_ids, - token_expert_indicies, - router_logits.float(), - ) - + torch.ops._moe_C.topk_softmax( + topk_weights, topk_ids, token_expert_indicies, router_logits.float() + ) del token_expert_indicies # Not used. Will be used in the future. if renormalize:
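
---

Usage sketch: after PATCH 8/8 the MACA backend no longer goes through maca_ext_ops; it calls the kernels that mcoplib registers in the torch.ops namespaces (_C, _moe_C, _C_cache_ops) on import. The snippet below illustrates that dispatch path for silu_and_mul only. It is a minimal, hypothetical example, not part of this series, and assumes a MACA build with the mcoplib wheel installed and the device exposed as "cuda"; the tensor shapes and dtype are illustrative.

    import torch
    import mcoplib._C  # noqa: F401 -- importing registers the custom kernels under torch.ops._C

    # silu_and_mul expects the gate/up projections packed in the last dim:
    # the input has last dim 2*d and the output has last dim d
    # (see the d = x.shape[-1] // 2 logic in maca_ops.silu_and_mul).
    x = torch.randn(4, 2 * 128, dtype=torch.float16, device="cuda")
    out = torch.empty(4, 128, dtype=torch.float16, device="cuda")
    torch.ops._C.silu_and_mul(out, x)  # same call maca_ops.silu_and_mul issues after this series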