8 changes: 4 additions & 4 deletions custom_ops/xpu_ops/test/test_block_attn_prefix_cache.py
@@ -308,7 +308,7 @@
), f"C16 prefix cache != No prefix cache,\n attn_out[hit_prefix_len:]: {attn_out_np},\nattn_out_prefix_cache: {attn_out_prefix_cache_np}"


print("\n-- C8 per channle prefix cache test --")
print("\n-- C8 per channel prefix cache test --")
print(
"attn_out_C8[hit_prefix_len:]'s mean:",
attn_out_C8[hit_prefix_len:].mean().item(),
@@ -318,9 +318,9 @@
attn_out_C8_np = attn_out_C8[hit_prefix_len:].astype("float32").numpy()
assert np.allclose(
attn_out_C8_prefix_cache_np, attn_out_C8_np, rtol=1e-1, atol=1e-2
), f"C8 per channle prefix cache != No prefix cache,\n attn_out_C8[hit_prefix_len:]: {attn_out_C8_np},\nattn_out_C8_prefix_cache: {attn_out_C8_prefix_cache_np}"
), f"C8 per channel prefix cache != No prefix cache,\n attn_out_C8[hit_prefix_len:]: {attn_out_C8_np},\nattn_out_C8_prefix_cache: {attn_out_C8_prefix_cache_np}"

print("\n-- C8 per channle zp prefix cache test --")
print("\n-- C8 per channel zp prefix cache test --")
print(
"attn_out_C8_zp[hit_prefix_len:]'s mean:",
attn_out_C8_zp[hit_prefix_len:].mean().item(),
@@ -333,4 +333,4 @@
attn_out_C8_zp_np = attn_out_C8_zp[hit_prefix_len:].astype("float32").numpy()
assert np.allclose(
attn_out_C8_zp_prefix_cache_np, attn_out_C8_zp_np, rtol=1e-1, atol=1e-2
), f"C8 per channle zp prefix cache != No prefix cache,\n attn_out_C8_zp[hit_prefix_len:]: {attn_out_C8_zp_np},\nattn_out_C8_zp_prefix_cache: {attn_out_C8_zp_prefix_cache_np}"
), f"C8 per channel zp prefix cache != No prefix cache,\n attn_out_C8_zp[hit_prefix_len:]: {attn_out_C8_zp_np},\nattn_out_C8_zp_prefix_cache: {attn_out_C8_zp_prefix_cache_np}"
2 changes: 1 addition & 1 deletion custom_ops/xpu_ops/test/test_moe_ep_combine.py
@@ -80,7 +80,7 @@ def create_moe_index(token_num, moe_topk, expand_token_num):
moe_index_pd.shape[1],
)

# comparation
# comparison
# print("moe_index:\n", moe_index)
# print("moe_weights:\n", moe_weights)
# print("combined_out_np:\n", combined_out_np)
2 changes: 1 addition & 1 deletion custom_ops/xpu_ops/test/test_moe_ep_dispatch.py
@@ -117,7 +117,7 @@ def create_moe_index(token_num, topk, expert_num):
"weight_only_int8",
)

# comparation
# comparison
permute_input_xpu = permute_input_xpu.astype("float32").numpy()
permute_indices_per_token_xpu = permute_indices_per_token_xpu.numpy()
recv_num_tokens_per_expert_list_cumsum_xpu = recv_num_tokens_per_expert_list_cumsum_xpu.numpy()
2 changes: 1 addition & 1 deletion custom_ops/xpu_ops/test/test_weight_only_linear.py
@@ -130,7 +130,7 @@ def batch_matmul(x, qw, wscale, algo, bias=None):
print(f"out_pd:\n{out_pd}")
print(f"out_np:\n{out_np}")

# comparation
# comparison
print(f"out_pd, mean={out_pd.mean()}, std={out_pd.std()}")
print(f"out_np, mean={out_np.mean()}, std={out_np.std()}")
sum_diff = np.sum(np.abs(out_pd.astype("float32").numpy() - out_np.astype("float32")))
4 changes: 2 additions & 2 deletions docs/features/plas_attention.md
@@ -32,7 +32,7 @@ During sparse attention computation, each query token may dynamically select dif

To optimize performance in both the prefill and decode stages, we design a special joint strategy to adapt to their respective characteristics:

* **Prefill Toke Union**: We observe that adjacent query tokens tend to select similar key blocks. Leveraging this locality, we take the union of the key blocks selected by consecutive 128 query tokens and jointly compute sparse attention for these tokens.
* **Prefill Token Union**: We observe that adjacent query tokens tend to select similar key blocks. Leveraging this locality, we take the union of the key blocks selected by consecutive 128 query tokens and jointly compute sparse attention for these tokens.
* **Decode Head Union**: Given the widespread adoption of GQA in modern models, we find that different heads within the same group often select overlapping key blocks. Thus, we combine the key blocks selected by all query heads within a group into a unified set and jointly calculate sparse attention. This way also reduces memory access overhead and further improves decoding efficiency.
* **Top-K Selection**: Conventional top-k algorithms based on sorting or direct calls to the cub library introduce significant runtime overhead. To mitigate this, we implemented an approximate top-k selection algorithm using binary search, which significantly reduces latency while maintaining accuracy, ultimately achieving significantly improved performance.
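
To make the last point concrete, below is a minimal, hypothetical sketch of threshold-based approximate top-k selection via binary search. The function name, the NumPy implementation, and the iteration count are illustrative assumptions, not the actual FastDeploy kernel; the real implementation runs on-device over per-block importance scores.

```python
# A minimal sketch (assumption, not the FastDeploy kernel) of approximate top-k
# selection: binary-search a score threshold so that roughly k items pass it,
# avoiding a full sort or a cub-based top-k call.
import numpy as np


def approx_topk_mask(scores: np.ndarray, k: int, iters: int = 16) -> np.ndarray:
    """Return a boolean mask selecting roughly the k highest scores."""
    lo, hi = float(scores.min()), float(scores.max())
    mid = hi
    for _ in range(iters):
        mid = (lo + hi) / 2.0
        count = int((scores >= mid).sum())
        if count > k:       # too many selected -> raise the threshold
            lo = mid
        elif count < k:     # too few selected -> lower the threshold
            hi = mid
        else:
            break
    return scores >= mid


# Example: pick roughly 32 key blocks out of 1024 by importance score.
block_scores = np.random.rand(1024).astype("float32")
mask = approx_topk_mask(block_scores, k=32)
```

Each iteration needs only a comparison and a reduction, which parallelizes well and keeps latency low while staying close to exact top-k accuracy.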

@@ -132,7 +132,7 @@ We selected a subset (longbook_sum_eng) from InfiniteBench as the performance ev
<td style="border: 1px solid #dcdde0; padding: 8px; text-align: center; vertical-align: middle;"><strong>QPS</strong></td>
<td style="border: 1px solid #dcdde0; padding: 8px; text-align: center; vertical-align: middle;"><strong>Decode Speed (token/s)</strong></td>
<td style="border: 1px solid #dcdde0; padding: 8px; text-align: center; vertical-align: middle;"><strong>Time to First token(s)</strong></td>
<td style="border: 1px solid #dcdde0; padding: 8px; text-align: center; vertical-align: middle;"><strong>Time per Ouput Token(ms)</strong></td>
<td style="border: 1px solid #dcdde0; padding: 8px; text-align: center; vertical-align: middle;"><strong>Time per Output Token(ms)</strong></td>
<td style="border: 1px solid #dcdde0; padding: 8px; text-align: center; vertical-align: middle;"><strong>End-to-End Latency(s)</strong></td>
<td style="border: 1px solid #dcdde0; padding: 8px; text-align: center; vertical-align: middle;"><strong>Mean Input<br>Length</strong></td>
<td style="border: 1px solid #dcdde0; padding: 8px; text-align: center; vertical-align: middle;"><strong>Mean Output Length</strong></td>
2 changes: 1 addition & 1 deletion docs/get_started/installation/iluvatar_gpu.md
@@ -33,7 +33,7 @@ docker exec -it paddle_infer bash
pip3 install paddlepaddle==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/
pip3 install paddle-iluvatar-gpu==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/ixuca/
```
For latest paddle verion on iluvatar. Refer to [PaddlePaddle Installation](https://www.paddlepaddle.org.cn/)
For latest paddle version on iluvatar. Refer to [PaddlePaddle Installation](https://www.paddlepaddle.org.cn/)

### Install or build FastDeploy
```bash
4 changes: 2 additions & 2 deletions docs/zh/features/plas_attention.md
@@ -34,7 +34,7 @@

为了优化预填充和解码阶段的性能,我们设计了一种特殊的联合策略来适应各自的特点:

* **Prefill Toke Union**: 我们观察到相邻的查询标记倾向于选择相似的关键块。利用这种局部性,我们取连续 128 个查询标记选择的关键块的并集,并联合计算这些标记的稀疏注意力机制。
* **Prefill Token Union**: 我们观察到相邻的查询标记倾向于选择相似的关键块。利用这种局部性,我们取连续 128 个查询标记选择的关键块的并集,并联合计算这些标记的稀疏注意力机制。

* **Decode Head Union**: 鉴于 GQA 在现代模型中的广泛应用,我们发现同一组内的不同查询头经常选择重叠的关键块。因此,我们将同一组内所有查询头选择的关键块合并为一个统一的集合,并联合计算稀疏注意力机制。这种方式也减少了内存访问开销,并进一步提高了解码效率。

@@ -136,7 +136,7 @@
<td style="border: 1px solid #dcdde0; padding: 8px; text-align: center; vertical-align: middle;"><strong>QPS</strong></td>
<td style="border: 1px solid #dcdde0; padding: 8px; text-align: center; vertical-align: middle;"><strong>Decode Speed (token/s)</strong></td>
<td style="border: 1px solid #dcdde0; padding: 8px; text-align: center; vertical-align: middle;"><strong>Time to First token(s)</strong></td>
<td style="border: 1px solid #dcdde0; padding: 8px; text-align: center; vertical-align: middle;"><strong>Time per Ouput Token(ms)</strong></td>
<td style="border: 1px solid #dcdde0; padding: 8px; text-align: center; vertical-align: middle;"><strong>Time per Output Token(ms)</strong></td>
<td style="border: 1px solid #dcdde0; padding: 8px; text-align: center; vertical-align: middle;"><strong>End-to-End Latency(s)</strong></td>
<td style="border: 1px solid #dcdde0; padding: 8px; text-align: center; vertical-align: middle;"><strong>Mean Input<br>Length</strong></td>
<td style="border: 1px solid #dcdde0; padding: 8px; text-align: center; vertical-align: middle;"><strong>Mean Output Length</strong></td>
4 changes: 2 additions & 2 deletions fastdeploy/engine/sched/resource_manager_v1.py
@@ -455,7 +455,7 @@ def schedule(self):
# schedule when extend block tables is needed
for req in self.running:
num_prefill_blocks = req.need_prefill_tokens // self.config.cache_config.block_size
# alocate
# allocate
if req.use_extend_tables and req.request_id not in self.using_extend_tables_req_id:
llm_logger.info(
f"req {req.request_id} at batch id {req.idx} with num_prefill_blocks {num_prefill_blocks} is going to enable extend tables"
@@ -488,7 +488,7 @@ def schedule(self):
<= self.config.cache_config.prealloc_dec_block_slot_num_threshold
):
llm_logger.info(
f"req {req.request_id} is going to alocate more extend tables because allocated_slots {self.allocated_slots(req)} and prealloc_dec_block_slot_num_threshold {self.config.cache_config.prealloc_dec_block_slot_num_threshold} req.num_total_tokens {req.num_total_tokens}"
f"req {req.request_id} is going to allocate more extend tables because allocated_slots {self.allocated_slots(req)} and prealloc_dec_block_slot_num_threshold {self.config.cache_config.prealloc_dec_block_slot_num_threshold} req.num_total_tokens {req.num_total_tokens}"
)
if self.cache_manager.can_allocate_gpu_blocks(self.config.cache_config.enc_dec_block_num):
req.extend_block_tables.extend(
2 changes: 1 addition & 1 deletion fastdeploy/plugins/model_runner/__init__.py
@@ -16,7 +16,7 @@

from fastdeploy.plugins.utils import load_plugins_by_group, plugins_loaded

# use for modle runner
# use for model runner
PLUGINS_GROUP = "fastdeploy.model_runner_plugins"


2 changes: 1 addition & 1 deletion fastdeploy/worker/worker_process.py
@@ -564,7 +564,7 @@ def parse_args():
"--moba_attention_config",
type=json.loads,
default=None,
help="Configation of moba attention.",
help="Configuration of moba attention.",
)
parser.add_argument(
"--guided_decoding_backend",
4 changes: 2 additions & 2 deletions tests/entrypoints/openai/test_max_streaming_tokens.py
@@ -181,7 +181,7 @@ async def mock_process_response_chat_single(response, stream, enable_thinking, i
chunk_dict = json.loads(json_part)
parsed_chunks.append(chunk_dict)
except json.JSONDecodeError as e:
self.fail(f"Cannot parser {i+1} chunck, JSON: {e}\n origin string: {repr(chunk_str)}")
self.fail(f"Cannot parser {i+1} chunk, JSON: {e}\n origin string: {repr(chunk_str)}")
else:
self.fail(f"{i+1} chunk is unexcepted 'data: JSON\\n\\n': {repr(chunk_str)}")
for chunk_dict in parsed_chunks:
@@ -260,7 +260,7 @@ async def test_integration_with_completion_stream_generator(self, mock_logger):
chunk_dict = json.loads(json_part)
parsed_chunks.append(chunk_dict)
except json.JSONDecodeError as e:
self.fail(f"Cannot parser {i+1} chunck, JSON: {e}\n origin string: {repr(chunk_str)}")
self.fail(f"Cannot parser {i+1} chunk, JSON: {e}\n origin string: {repr(chunk_str)}")
else:
self.fail(f"{i+1} chunk is unexcepted 'data: JSON\\n\\n': {repr(chunk_str)}")
self.assertEqual(len(parsed_chunks), 1)
@@ -89,7 +89,7 @@ def forward_correct(self, ids_remove_padding, forward_meta: ForwardMeta):


class TestModel1(paddle.nn.Layer):
"""Tast Model"""
"""Test Model"""

def __init__(self, fd_config: FDConfig, **kwargs):
super().__init__()
10 changes: 5 additions & 5 deletions tests/graph_optimization/test_cuda_graph_recapture.py
@@ -36,7 +36,7 @@ def forward_correct(self, ids_remove_padding, forward_meta: ForwardMeta):


class TestModel1(paddle.nn.Layer):
"""Tast Model"""
"""Test Model"""

def __init__(self, fd_config: FDConfig, **kwargs):
super().__init__()
@@ -123,9 +123,9 @@ def capture_and_replay(self, input_tensor1, forward_meta1):
assert (output1 == self.output_correct).all()

# Destroy
print_gpu_memory_use(0, "before destory")
print_gpu_memory_use(0, "before destroy")
self.test_model1.clear_grpah_opt_backend()
print_gpu_memory_use(0, "after destory")
print_gpu_memory_use(0, "after destroy")

def recapture_and_replay(self, input_tensor1, forward_meta1):
""" """
@@ -139,9 +139,9 @@ def recapture_and_replay(self, input_tensor1, forward_meta1):
assert (output2 == self.output_correct).all()

# Destroy
print_gpu_memory_use(0, "before destory")
print_gpu_memory_use(0, "before destroy")
self.test_model1.clear_grpah_opt_backend()
print_gpu_memory_use(0, "after destory")
print_gpu_memory_use(0, "after destroy")


if __name__ == "__main__":
2 changes: 1 addition & 1 deletion tests/graph_optimization/test_cuda_graph_spec_decode.py
@@ -51,7 +51,7 @@ def forward_correct(self, ids_remove_padding, forward_meta: ForwardMeta):


class TestModel1(paddle.nn.Layer):
"""Tast Model"""
"""Test Model"""

def __init__(self, fd_config: FDConfig, **kwargs):
super().__init__()
2 changes: 1 addition & 1 deletion tests/model_loader/test_model_cache.py
@@ -123,6 +123,6 @@ def test_model_cache(
check_tokens_id_and_text_close(
outputs_0_lst=fd_outputs_v1,
outputs_1_lst=fd_outputs_v1_with_cache,
name_0="default_v1 laoder",
name_0="default_v1 loader",
name_1="default_v1 loader using cache",
)
2 changes: 1 addition & 1 deletion tests/model_loader/utils.py
@@ -100,7 +100,7 @@ def form_model_get_output_topp0(
fd_outputs = fd_model.generate_topp0(prompts, max_tokens=max_tokens)
result_queue.put(fd_outputs)
except Exception:
print(f"Failed using {load_choices} laoder to load model from {model_path}.")
print(f"Failed using {load_choices} loader to load model from {model_path}.")
traceback.print_exc()
pytest.fail(f"Failed to initialize LLM model from {model_path}")

2 changes: 1 addition & 1 deletion tests/output/test_get_save_output_v1.py
@@ -28,7 +28,7 @@
MAX_WAIT_SECONDS = 60

os.environ["LD_LIBRARY_PATH"] = "/usr/local/nccl/"
# enbale get_save_output_v1
# enable get_save_output_v1
os.environ["FD_USE_GET_SAVE_OUTPUT_V1"] = "1"

