diff --git a/custom_ops/xpu_ops/test/test_block_attn_prefix_cache.py b/custom_ops/xpu_ops/test/test_block_attn_prefix_cache.py
index 1a607e192a..000c0a3592 100644
--- a/custom_ops/xpu_ops/test/test_block_attn_prefix_cache.py
+++ b/custom_ops/xpu_ops/test/test_block_attn_prefix_cache.py
@@ -308,7 +308,7 @@
), f"C16 prefix cache != No prefix cache,\n attn_out[hit_prefix_len:]: {attn_out_np},\nattn_out_prefix_cache: {attn_out_prefix_cache_np}"
-print("\n-- C8 per channle prefix cache test --")
+print("\n-- C8 per channel prefix cache test --")
print(
"attn_out_C8[hit_prefix_len:]'s mean:",
attn_out_C8[hit_prefix_len:].mean().item(),
@@ -318,9 +318,9 @@
attn_out_C8_np = attn_out_C8[hit_prefix_len:].astype("float32").numpy()
assert np.allclose(
attn_out_C8_prefix_cache_np, attn_out_C8_np, rtol=1e-1, atol=1e-2
-), f"C8 per channle prefix cache != No prefix cache,\n attn_out_C8[hit_prefix_len:]: {attn_out_C8_np},\nattn_out_C8_prefix_cache: {attn_out_C8_prefix_cache_np}"
+), f"C8 per channel prefix cache != No prefix cache,\n attn_out_C8[hit_prefix_len:]: {attn_out_C8_np},\nattn_out_C8_prefix_cache: {attn_out_C8_prefix_cache_np}"
-print("\n-- C8 per channle zp prefix cache test --")
+print("\n-- C8 per channel zp prefix cache test --")
print(
"attn_out_C8_zp[hit_prefix_len:]'s mean:",
attn_out_C8_zp[hit_prefix_len:].mean().item(),
@@ -333,4 +333,4 @@
attn_out_C8_zp_np = attn_out_C8_zp[hit_prefix_len:].astype("float32").numpy()
assert np.allclose(
attn_out_C8_zp_prefix_cache_np, attn_out_C8_zp_np, rtol=1e-1, atol=1e-2
-), f"C8 per channle zp prefix cache != No prefix cache,\n attn_out_C8_zp[hit_prefix_len:]: {attn_out_C8_zp_np},\nattn_out_C8_zp_prefix_cache: {attn_out_C8_zp_prefix_cache_np}"
+), f"C8 per channel zp prefix cache != No prefix cache,\n attn_out_C8_zp[hit_prefix_len:]: {attn_out_C8_zp_np},\nattn_out_C8_zp_prefix_cache: {attn_out_C8_zp_prefix_cache_np}"
diff --git a/custom_ops/xpu_ops/test/test_moe_ep_combine.py b/custom_ops/xpu_ops/test/test_moe_ep_combine.py
index b71e05dae1..535ecba678 100644
--- a/custom_ops/xpu_ops/test/test_moe_ep_combine.py
+++ b/custom_ops/xpu_ops/test/test_moe_ep_combine.py
@@ -80,7 +80,7 @@ def create_moe_index(token_num, moe_topk, expand_token_num):
moe_index_pd.shape[1],
)
-# comparation
+# comparison
# print("moe_index:\n", moe_index)
# print("moe_weights:\n", moe_weights)
# print("combined_out_np:\n", combined_out_np)
diff --git a/custom_ops/xpu_ops/test/test_moe_ep_dispatch.py b/custom_ops/xpu_ops/test/test_moe_ep_dispatch.py
index 9b38bb34ee..f6f67662dd 100644
--- a/custom_ops/xpu_ops/test/test_moe_ep_dispatch.py
+++ b/custom_ops/xpu_ops/test/test_moe_ep_dispatch.py
@@ -117,7 +117,7 @@ def create_moe_index(token_num, topk, expert_num):
"weight_only_int8",
)
-# comparation
+# comparison
permute_input_xpu = permute_input_xpu.astype("float32").numpy()
permute_indices_per_token_xpu = permute_indices_per_token_xpu.numpy()
recv_num_tokens_per_expert_list_cumsum_xpu = recv_num_tokens_per_expert_list_cumsum_xpu.numpy()
diff --git a/custom_ops/xpu_ops/test/test_weight_only_linear.py b/custom_ops/xpu_ops/test/test_weight_only_linear.py
index fe3993e12b..651f1305df 100644
--- a/custom_ops/xpu_ops/test/test_weight_only_linear.py
+++ b/custom_ops/xpu_ops/test/test_weight_only_linear.py
@@ -130,7 +130,7 @@ def batch_matmul(x, qw, wscale, algo, bias=None):
print(f"out_pd:\n{out_pd}")
print(f"out_np:\n{out_np}")
-# comparation
+# comparison
print(f"out_pd, mean={out_pd.mean()}, std={out_pd.std()}")
print(f"out_np, mean={out_np.mean()}, std={out_np.std()}")
sum_diff = np.sum(np.abs(out_pd.astype("float32").numpy() - out_np.astype("float32")))
diff --git a/docs/features/plas_attention.md b/docs/features/plas_attention.md
index 8384de3b5d..dfd85e6769 100644
--- a/docs/features/plas_attention.md
+++ b/docs/features/plas_attention.md
@@ -32,7 +32,7 @@ During sparse attention computation, each query token may dynamically select dif
To optimize performance in both the prefill and decode stages, we design a special joint strategy to adapt to their respective characteristics:
-* **Prefill Toke Union**: We observe that adjacent query tokens tend to select similar key blocks. Leveraging this locality, we take the union of the key blocks selected by consecutive 128 query tokens and jointly compute sparse attention for these tokens.
+* **Prefill Token Union**: We observe that adjacent query tokens tend to select similar key blocks. Leveraging this locality, we take the union of the key blocks selected by 128 consecutive query tokens and jointly compute sparse attention for these tokens.
* **Decode Head Union**: Given the widespread adoption of GQA in modern models, we find that different heads within the same group often select overlapping key blocks. Thus, we combine the key blocks selected by all query heads within a group into a unified set and jointly calculate sparse attention. This way also reduces memory access overhead and further improves decoding efficiency.
* **Top-K Selection**: Conventional top-k algorithms based on sorting or direct calls to the cub library introduce significant runtime overhead. To mitigate this, we implemented an approximate top-k selection algorithm using binary search, which significantly reduces latency while maintaining accuracy, ultimately achieving significantly improved performance.
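To make the approximate top-k idea above concrete, here is a minimal, self-contained NumPy sketch of selecting roughly the k highest-scoring key blocks by binary-searching a score threshold instead of sorting. It only illustrates the general technique described in the bullet; the function name `approx_topk_mask`, the fixed iteration count, and the NumPy layout are assumptions for illustration and do not correspond to the actual PLAS kernel code touched by this patch.

```python
# Minimal NumPy sketch of approximate top-k selection via binary search
# over a score threshold. Illustration only; names and the fixed iteration
# count are assumptions, not the PLAS kernel implementation.
import numpy as np


def approx_topk_mask(scores: np.ndarray, k: int, iters: int = 16) -> np.ndarray:
    """Return a boolean mask keeping roughly the k highest-scoring key blocks.

    Rather than fully sorting `scores`, binary-search a threshold t so that
    the number of scores >= t is close to k, then keep everything above t.
    """
    lo, hi = float(scores.min()), float(scores.max())
    for _ in range(iters):  # fixed iteration count -> predictable latency
        mid = 0.5 * (lo + hi)
        if np.count_nonzero(scores >= mid) > k:
            lo = mid  # too many blocks kept, raise the threshold
        else:
            hi = mid  # too few blocks kept, lower the threshold
    return scores >= lo


if __name__ == "__main__":
    rng = np.random.default_rng(0)
    block_scores = rng.standard_normal(1024).astype("float32")
    mask = approx_topk_mask(block_scores, k=64)
    # Roughly k blocks are kept, and every kept score is >= every dropped score.
    print("kept blocks:", int(mask.sum()))
```

Selecting by threshold rather than by exact rank is what avoids a full sort or a cub call while still retaining approximately the k most important key blocks.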
@@ -132,7 +132,7 @@ We selected a subset (longbook_sum_eng) from InfiniteBench as the performance ev
QPS |
Decode Speed (token/s) |
Time to First token(s) |
- Time per Ouput Token(ms) |
+ Time per Output Token(ms) |
End-to-End Latency(s) |
Mean Input Length |
Mean Output Length |
diff --git a/docs/get_started/installation/iluvatar_gpu.md b/docs/get_started/installation/iluvatar_gpu.md
index 9b4c96f00f..e5fd46d966 100644
--- a/docs/get_started/installation/iluvatar_gpu.md
+++ b/docs/get_started/installation/iluvatar_gpu.md
@@ -33,7 +33,7 @@ docker exec -it paddle_infer bash
pip3 install paddlepaddle==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/
pip3 install paddle-iluvatar-gpu==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/ixuca/
```
-For latest paddle verion on iluvatar. Refer to [PaddlePaddle Installation](https://www.paddlepaddle.org.cn/)
+For the latest Paddle version on Iluvatar, refer to [PaddlePaddle Installation](https://www.paddlepaddle.org.cn/)
### Install or build FastDeploy
```bash
diff --git a/docs/zh/features/plas_attention.md b/docs/zh/features/plas_attention.md
index 09a98e6f47..f415f49b19 100644
--- a/docs/zh/features/plas_attention.md
+++ b/docs/zh/features/plas_attention.md
@@ -34,7 +34,7 @@
为了优化预填充和解码阶段的性能,我们设计了一种特殊的联合策略来适应各自的特点:
-* **Prefill Toke Union**: 我们观察到相邻的查询标记倾向于选择相似的关键块。利用这种局部性,我们取连续 128 个查询标记选择的关键块的并集,并联合计算这些标记的稀疏注意力机制。
+* **Prefill Token Union**: 我们观察到相邻的查询标记倾向于选择相似的关键块。利用这种局部性,我们取连续 128 个查询标记选择的关键块的并集,并联合计算这些标记的稀疏注意力机制。
* **Decode Head Union**: 鉴于 GQA 在现代模型中的广泛应用,我们发现同一组内的不同查询头经常选择重叠的关键块。因此,我们将同一组内所有查询头选择的关键块合并为一个统一的集合,并联合计算稀疏注意力机制。这种方式也减少了内存访问开销,并进一步提高了解码效率。
@@ -136,7 +136,7 @@
QPS |
Decode Speed (token/s) |
Time to First token(s) |
- Time per Ouput Token(ms) |
+ Time per Output Token(ms) |
End-to-End Latency(s) |
Mean Input Length |
Mean Output Length |
diff --git a/fastdeploy/engine/sched/resource_manager_v1.py b/fastdeploy/engine/sched/resource_manager_v1.py
index a6c7f355d9..f1a663d35c 100644
--- a/fastdeploy/engine/sched/resource_manager_v1.py
+++ b/fastdeploy/engine/sched/resource_manager_v1.py
@@ -455,7 +455,7 @@ def schedule(self):
# schedule when extend block tables is needed
for req in self.running:
num_prefill_blocks = req.need_prefill_tokens // self.config.cache_config.block_size
- # alocate
+ # allocate
if req.use_extend_tables and req.request_id not in self.using_extend_tables_req_id:
llm_logger.info(
f"req {req.request_id} at batch id {req.idx} with num_prefill_blocks {num_prefill_blocks} is going to enable extend tables"
@@ -488,7 +488,7 @@ def schedule(self):
<= self.config.cache_config.prealloc_dec_block_slot_num_threshold
):
llm_logger.info(
- f"req {req.request_id} is going to alocate more extend tables because allocated_slots {self.allocated_slots(req)} and prealloc_dec_block_slot_num_threshold {self.config.cache_config.prealloc_dec_block_slot_num_threshold} req.num_total_tokens {req.num_total_tokens}"
+ f"req {req.request_id} is going to allocate more extend tables because allocated_slots {self.allocated_slots(req)} and prealloc_dec_block_slot_num_threshold {self.config.cache_config.prealloc_dec_block_slot_num_threshold} req.num_total_tokens {req.num_total_tokens}"
)
if self.cache_manager.can_allocate_gpu_blocks(self.config.cache_config.enc_dec_block_num):
req.extend_block_tables.extend(
diff --git a/fastdeploy/plugins/model_runner/__init__.py b/fastdeploy/plugins/model_runner/__init__.py
index 8897abfbc0..eac59e0704 100644
--- a/fastdeploy/plugins/model_runner/__init__.py
+++ b/fastdeploy/plugins/model_runner/__init__.py
@@ -16,7 +16,7 @@
from fastdeploy.plugins.utils import load_plugins_by_group, plugins_loaded
-# use for modle runner
+# used for the model runner
PLUGINS_GROUP = "fastdeploy.model_runner_plugins"
diff --git a/fastdeploy/worker/worker_process.py b/fastdeploy/worker/worker_process.py
index 93ae92261b..2b8118f02a 100644
--- a/fastdeploy/worker/worker_process.py
+++ b/fastdeploy/worker/worker_process.py
@@ -564,7 +564,7 @@ def parse_args():
"--moba_attention_config",
type=json.loads,
default=None,
- help="Configation of moba attention.",
+ help="Configuration of moba attention.",
)
parser.add_argument(
"--guided_decoding_backend",
diff --git a/tests/entrypoints/openai/test_max_streaming_tokens.py b/tests/entrypoints/openai/test_max_streaming_tokens.py
index fe48be8c42..0b474d332e 100644
--- a/tests/entrypoints/openai/test_max_streaming_tokens.py
+++ b/tests/entrypoints/openai/test_max_streaming_tokens.py
@@ -181,7 +181,7 @@ async def mock_process_response_chat_single(response, stream, enable_thinking, i
chunk_dict = json.loads(json_part)
parsed_chunks.append(chunk_dict)
except json.JSONDecodeError as e:
- self.fail(f"Cannot parser {i+1} chunck, JSON: {e}\n origin string: {repr(chunk_str)}")
+ self.fail(f"Cannot parse chunk {i+1}, JSON error: {e}\n original string: {repr(chunk_str)}")
else:
self.fail(f"{i+1} chunk is unexcepted 'data: JSON\\n\\n': {repr(chunk_str)}")
for chunk_dict in parsed_chunks:
@@ -260,7 +260,7 @@ async def test_integration_with_completion_stream_generator(self, mock_logger):
chunk_dict = json.loads(json_part)
parsed_chunks.append(chunk_dict)
except json.JSONDecodeError as e:
- self.fail(f"Cannot parser {i+1} chunck, JSON: {e}\n origin string: {repr(chunk_str)}")
+ self.fail(f"Cannot parse chunk {i+1}, JSON error: {e}\n original string: {repr(chunk_str)}")
else:
self.fail(f"{i+1} chunk is unexcepted 'data: JSON\\n\\n': {repr(chunk_str)}")
self.assertEqual(len(parsed_chunks), 1)
diff --git a/tests/graph_optimization/test_cuda_graph_dynamic_subgraph.py b/tests/graph_optimization/test_cuda_graph_dynamic_subgraph.py
index b6e74753b1..4143fcfd6c 100644
--- a/tests/graph_optimization/test_cuda_graph_dynamic_subgraph.py
+++ b/tests/graph_optimization/test_cuda_graph_dynamic_subgraph.py
@@ -89,7 +89,7 @@ def forward_correct(self, ids_remove_padding, forward_meta: ForwardMeta):
class TestModel1(paddle.nn.Layer):
- """Tast Model"""
+ """Test Model"""
def __init__(self, fd_config: FDConfig, **kwargs):
super().__init__()
diff --git a/tests/graph_optimization/test_cuda_graph_recapture.py b/tests/graph_optimization/test_cuda_graph_recapture.py
index 126d231fe3..6b9fb5de5c 100644
--- a/tests/graph_optimization/test_cuda_graph_recapture.py
+++ b/tests/graph_optimization/test_cuda_graph_recapture.py
@@ -36,7 +36,7 @@ def forward_correct(self, ids_remove_padding, forward_meta: ForwardMeta):
class TestModel1(paddle.nn.Layer):
- """Tast Model"""
+ """Test Model"""
def __init__(self, fd_config: FDConfig, **kwargs):
super().__init__()
@@ -123,9 +123,9 @@ def capture_and_replay(self, input_tensor1, forward_meta1):
assert (output1 == self.output_correct).all()
# Destroy
- print_gpu_memory_use(0, "before destory")
+ print_gpu_memory_use(0, "before destroy")
self.test_model1.clear_grpah_opt_backend()
- print_gpu_memory_use(0, "after destory")
+ print_gpu_memory_use(0, "after destroy")
def recapture_and_replay(self, input_tensor1, forward_meta1):
""" """
@@ -139,9 +139,9 @@ def recapture_and_replay(self, input_tensor1, forward_meta1):
assert (output2 == self.output_correct).all()
# Destroy
- print_gpu_memory_use(0, "before destory")
+ print_gpu_memory_use(0, "before destroy")
self.test_model1.clear_grpah_opt_backend()
- print_gpu_memory_use(0, "after destory")
+ print_gpu_memory_use(0, "after destroy")
if __name__ == "__main__":
diff --git a/tests/graph_optimization/test_cuda_graph_spec_decode.py b/tests/graph_optimization/test_cuda_graph_spec_decode.py
index f4a95ceada..9162d7173b 100644
--- a/tests/graph_optimization/test_cuda_graph_spec_decode.py
+++ b/tests/graph_optimization/test_cuda_graph_spec_decode.py
@@ -51,7 +51,7 @@ def forward_correct(self, ids_remove_padding, forward_meta: ForwardMeta):
class TestModel1(paddle.nn.Layer):
- """Tast Model"""
+ """Test Model"""
def __init__(self, fd_config: FDConfig, **kwargs):
super().__init__()
diff --git a/tests/model_loader/test_model_cache.py b/tests/model_loader/test_model_cache.py
index 342c901af2..e48a136b63 100644
--- a/tests/model_loader/test_model_cache.py
+++ b/tests/model_loader/test_model_cache.py
@@ -123,6 +123,6 @@ def test_model_cache(
check_tokens_id_and_text_close(
outputs_0_lst=fd_outputs_v1,
outputs_1_lst=fd_outputs_v1_with_cache,
- name_0="default_v1 laoder",
+ name_0="default_v1 loader",
name_1="default_v1 loader using cache",
)
diff --git a/tests/model_loader/utils.py b/tests/model_loader/utils.py
index 19f7c51aad..97b1e8f58e 100644
--- a/tests/model_loader/utils.py
+++ b/tests/model_loader/utils.py
@@ -100,7 +100,7 @@ def form_model_get_output_topp0(
fd_outputs = fd_model.generate_topp0(prompts, max_tokens=max_tokens)
result_queue.put(fd_outputs)
except Exception:
- print(f"Failed using {load_choices} laoder to load model from {model_path}.")
+ print(f"Failed using {load_choices} loader to load model from {model_path}.")
traceback.print_exc()
pytest.fail(f"Failed to initialize LLM model from {model_path}")
diff --git a/tests/output/test_get_save_output_v1.py b/tests/output/test_get_save_output_v1.py
index 29a47be46a..ddfd944ee0 100644
--- a/tests/output/test_get_save_output_v1.py
+++ b/tests/output/test_get_save_output_v1.py
@@ -28,7 +28,7 @@
MAX_WAIT_SECONDS = 60
os.environ["LD_LIBRARY_PATH"] = "/usr/local/nccl/"
-# enbale get_save_output_v1
+# enable get_save_output_v1
os.environ["FD_USE_GET_SAVE_OUTPUT_V1"] = "1"