Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
bb5253c
Initial draft of cursor port
djns99 Nov 18, 2025
881bfc6
Properly cache jit compiled module
djns99 Nov 19, 2025
e6a3457
Cleanup prints
djns99 Nov 19, 2025
fa69945
Combine tests run without crashing
djns99 Nov 19, 2025
3b5e0d3
Update tests with fake_moe properly
djns99 Nov 19, 2025
3a84b49
Clear MOE workspace before each run
djns99 Nov 19, 2025
f894bce
Cleanup MPI processes on test failures
djns99 Nov 20, 2025
a7c427c
More exit handling for rank failures
djns99 Nov 20, 2025
fb2b9b2
Cleaner test implementation
djns99 Nov 20, 2025
7d55b49
Update MNNVL config setup
djns99 Nov 26, 2025
a033a94
Update test to get ep size from MPI
djns99 Nov 26, 2025
e879354
Update tests with better test bounds
djns99 Nov 26, 2025
c574e36
Fix timeout logic
djns99 Nov 26, 2025
f22e6a0
Disable python steps for MPI tests
djns99 Nov 26, 2025
2baac54
Standardise API name to match existing code better
djns99 Nov 27, 2025
6ac511e
Enhance tests and add convenience APIs for more general usage
djns99 Nov 27, 2025
49e882f
Fix existing dispatch tests
djns99 Nov 27, 2025
f98530c
Tests for sanitize and combine
djns99 Nov 28, 2025
c4ee3c2
Fix logic for inplace combine workspace setup
djns99 Nov 28, 2025
febd132
Limit num tokens to allow combine to successfully run on 1 GPU
djns99 Nov 28, 2025
94df845
Unify naming
djns99 Nov 28, 2025
00d38cc
Add test for payload not in the workspace and fix coderabbit comments
djns99 Nov 28, 2025
3e04d84
Update comm.rst
djns99 Nov 28, 2025
eaa5eb8
Fix coderabbit nits
djns99 Nov 28, 2025
a51b1ea
Properly export all functions
djns99 Nov 28, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 34 additions & 6 deletions csrc/nv_internal/cpp/common/envUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -222,11 +222,6 @@ bool getEnvDisaggLayerwise() {
return disaggLayerwise;
}

// Reports the boolean setting obtained from getBoolEnv("TRTLLM_PARALLEL_CACHE_SEND").
// The lookup is performed on the first call only; later calls return the cached result.
bool getEnvParallelCacheSend() {
  static bool const cachedFlag = getBoolEnv("TRTLLM_PARALLEL_CACHE_SEND");
  return cachedFlag;
}

bool getEnvRequestKVCacheConcurrent() {
static bool const requestKVCacheConcurrent = getBoolEnv("TRTLLM_REQUEST_KV_CACHE_CONCURRENT");
return requestKVCacheConcurrent;
Expand Down Expand Up @@ -277,7 +272,7 @@ size_t getEnvAllReduceWorkspaceSize() {
return workspaceSize;
}

std::string getEnvKVCacheTransferOutputPath() {
// Returns the path obtained from getStrEnv("TRTLLM_KVCACHE_TIME_OUTPUT_PATH"), or an
// empty string when that lookup yields no value. The string is resolved on the first
// call and stored in a function-local static, so the returned reference stays valid
// for the lifetime of the program.
std::string const& getEnvKVCacheTimeOutputPath() {
  static std::string cachedPath = getStrEnv("TRTLLM_KVCACHE_TIME_OUTPUT_PATH").value_or("");
  return cachedPath;
}
Expand Down Expand Up @@ -328,4 +323,37 @@ uint16_t getEnvNixlPort() {

// Forwards to getBoolEnv("TRTLLM_DISAGG_BENCHMARK_GEN_ONLY").
// Not cached: the lookup is repeated on every call.
bool getEnvDisaggBenchmarkGenOnly() {
  bool const genOnly = getBoolEnv("TRTLLM_DISAGG_BENCHMARK_GEN_ONLY");
  return genOnly;
}

// Whether MoE A2A kernels should use one block per token.
// Driven by getIntEnv("TLLM_MOE_A2A_ONE_BLOCK_PER_TOKEN"), looked up once and cached:
//   - no value      -> true (default)
//   - value is 0    -> false
//   - anything else -> true
bool getEnvMoeA2AOneBlockPerToken() {
  static std::optional<int32_t> const cachedSetting = getIntEnv("TLLM_MOE_A2A_ONE_BLOCK_PER_TOKEN");
  return !cachedSetting.has_value() || cachedSetting.value() != 0;
}

// Normalizes a requested block size (threads per block) into a usable value.
//
//   - no value, or a value <= 0  -> 256 (the default)
//   - a value greater than 1024  -> 1024
//   - any other value            -> rounded UP to the next multiple of 32 (warp size)
//
// The result is therefore always a multiple of 32 in the range [32, 1024].
static int sanitizeBlockSize(std::optional<int32_t> const& val) {
  constexpr int kDefaultBlockSize = 256;
  constexpr int kMaxBlockSize = 1024;
  constexpr int kWarpSize = 32;

  int const requested = val.value_or(kDefaultBlockSize);
  if (requested <= 0) {
    // Invalid request: fall back to the default, which is already warp-aligned.
    return kDefaultBlockSize;
  }

  int const clamped = requested > kMaxBlockSize ? kMaxBlockSize : requested;
  // Round up (not to nearest) so the result never drops below the request.
  // clamped >= 1 here, so the rounded value is at least kWarpSize and can never be 0.
  return (clamped + kWarpSize - 1) / kWarpSize * kWarpSize;
}

// Block size for MoE A2A Dispatch kernels: the value from
// getIntEnv("TLLM_MOE_A2A_DISPATCH_BLOCK_SIZE") passed through sanitizeBlockSize.
// Computed on the first call and reused afterwards.
int getEnvMoeA2ADispatchBlockSize() {
  static int const cachedBlockSize = sanitizeBlockSize(getIntEnv("TLLM_MOE_A2A_DISPATCH_BLOCK_SIZE"));
  return cachedBlockSize;
}

// Block size for MoE A2A Combine kernels: the value from
// getIntEnv("TLLM_MOE_A2A_COMBINE_BLOCK_SIZE") passed through sanitizeBlockSize.
// Computed on the first call and reused afterwards.
int getEnvMoeA2ACombineBlockSize() {
  static int const cachedBlockSize = sanitizeBlockSize(getIntEnv("TLLM_MOE_A2A_COMBINE_BLOCK_SIZE"));
  return cachedBlockSize;
}

// Forwards to getBoolEnv("TRTLLM_EPLB_FORCE_GDRCOPY").
// Not cached: the lookup is repeated on every call.
bool getEnvEplbForceGdrcopy() {
  bool const forceGdrcopy = getBoolEnv("TRTLLM_EPLB_FORCE_GDRCOPY");
  return forceGdrcopy;
}

} // namespace tensorrt_llm::common
11 changes: 10 additions & 1 deletion csrc/nv_internal/tensorrt_llm/common/envUtils.h
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ bool getEnvDisableKVCacheTransferOverlap();

bool getEnvEnableReceiveKVCacheParallel();

std::string getEnvKVCacheTransferOutputPath();
std::string const& getEnvKVCacheTimeOutputPath();

bool getEnvTryZCopyForKVCacheTransfer();

Expand Down Expand Up @@ -92,4 +92,13 @@ size_t getEnvKVCacheSendMaxConcurrenceNum();

size_t getEnvMemSizeForKVCacheTransferBuffer();

// Whether to use one block per token for MoE A2A kernels (default true).
bool getEnvMoeA2AOneBlockPerToken();

// TODO: Temporary knobs intended for development use only.
// Block size (threads per block) for MoE A2A Dispatch kernels, read from
// TLLM_MOE_A2A_DISPATCH_BLOCK_SIZE. Defaults to 256 when unset or non-positive;
// values above 1024 are clamped to 1024 and the result is rounded up to a multiple of 32.
int getEnvMoeA2ADispatchBlockSize();
// Block size (threads per block) for MoE A2A Combine kernels, read from
// TLLM_MOE_A2A_COMBINE_BLOCK_SIZE. Defaults to 256 when unset or non-positive;
// values above 1024 are clamped to 1024 and the result is rounded up to a multiple of 32.
int getEnvMoeA2ACombineBlockSize();

} // namespace tensorrt_llm::common
Loading