From cfac59e21a7ca2196761fc8d961d972ba9c6b200 Mon Sep 17 00:00:00 2001 From: chenghuaWang <2923277184@qq.com> Date: Tue, 17 Feb 2026 09:23:05 +0000 Subject: [PATCH 01/13] feat(mllm_kernel): simplify JIT usage in README and update kernel example - Replaced the previous JIT utility functions with a streamlined `jit` decorator for kernel registration. - Updated the README.md to reflect the new recommended pattern for CPU kernel implementation. - Simplified the example for using the JIT compilation with a focus on clarity and ease of use. --- mllm-kernel/README.md | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/mllm-kernel/README.md b/mllm-kernel/README.md index 14c8118f..0a458049 100644 --- a/mllm-kernel/README.md +++ b/mllm-kernel/README.md @@ -80,31 +80,30 @@ y = add_constant(x, 8) Use the helpers in `mllm_kernel.jit_utils`: -- `load_cpu_jit` -- `load_cuda_jit` +- `jit` - `make_cpp_args` -- `cache_once` -Example pattern: +Recommended pattern (CPU example): ```python import torch -from mllm_kernel.jit_utils import cache_once, load_cpu_jit, make_cpp_args - -@cache_once -def _jit_my_kernel_module(param: int): - args = make_cpp_args(param) - return load_cpu_jit( - "my_kernel", - *args, - cpp_files=["my_kernel.cpp"], - cpp_wrappers=[("my_kernel", f"my_namespace::my_kernel<{args}>")], - ) +import mllm_kernel + +@mllm_kernel.jit( + args=16, + device="cpu", + cpp_files=["my_kernel.cpp"], + cpp_wrappers=[("my_kernel", "my_namespace::my_kernel<16>")], + func_name="my_kernel", +) +def _my_kernel_16(compiled_module, dst: torch.Tensor, src: torch.Tensor) -> None: + compiled_module.my_kernel(dst, src) def my_kernel(src: torch.Tensor, param: int) -> torch.Tensor: + if param != 16: + raise ValueError("This demo only supports param=16.") dst = torch.empty_like(src) - module = _jit_my_kernel_module(param) - module.my_kernel(dst, src) + _my_kernel_16(dst, src) return dst ``` From 8f3485a3bb7a7e8e73066d993b312128130ed8ef Mon Sep 17 
00:00:00 2001 From: chenghuaWang <2923277184@qq.com> Date: Wed, 18 Feb 2026 03:39:52 +0000 Subject: [PATCH 02/13] feat: update dependencies and refactor mobile module structure - Updated `apache-tvm-ffi` version to `0.1.8` in `pyproject.toml` and `mllm-kernel/pyproject.toml`. - Refactored mobile module imports and structure, moving scripts to `pymllm.mobile` and removing unused backends. - Introduced new classes and methods for quantization and model deployment in the Qualcomm backend. - Added new README files for mobile and Qualcomm transformer components. --- mllm-kernel/pyproject.toml | 2 +- mllm/ffi/Extension.cc | 5 +- mllm/ffi/vendors/tvm-ffi | 2 +- pymllm/__init__.py | 72 ++++++++----------- .../cuda/__init__.py => __main__.py} | 0 pymllm/backends/__init__.py | 4 -- pymllm/backends/cuda/tilelang_compile_test.py | 41 ----------- .../transformers/core => layers}/__init__.py | 0 pymllm/mobile/README.md | 3 +- pymllm/mobile/__init__.py | 45 ++++++++++++ .../spinquant => mobile/backends}/__init__.py | 2 + .../{ => mobile}/backends/qualcomm/README.md | 0 .../backends/qualcomm/__init__.py | 0 pymllm/{ => mobile}/backends/qualcomm/nn.py | 2 +- .../backends/qualcomm/qnn_aot_env.py | 4 +- .../backends/qualcomm/transformers/.gitignore | 0 .../backends/qualcomm/transformers/README.md | 0 .../qualcomm/transformers/__init__.py | 0 .../qualcomm/transformers/core}/__init__.py | 0 .../qualcomm/transformers/core/embedding.py | 0 .../qualcomm/transformers/core/observer.py | 0 .../qualcomm/transformers/core/qdq.py | 0 .../qualcomm/transformers/core/qlinear.py | 2 +- .../qualcomm/transformers/core/rms_norm.py | 0 .../transformers/llama/modeling_llama.py | 10 +-- .../qualcomm/transformers/llama/runner.py | 12 ++-- .../qualcomm/transformers/llama/train.py | 2 +- .../transformers/qwen2/modeling_qwen2.py | 10 +-- .../qualcomm/transformers/qwen2/runner.py | 12 ++-- .../qualcomm/transformers/qwen2/train.py | 2 +- .../transformers/qwen3/modeling_qwen3.py | 10 +-- 
.../qualcomm/transformers/qwen3/runner.py | 12 ++-- .../qualcomm/transformers/qwen3/train.py | 2 +- pymllm/{ => mobile}/convertor/__init__.py | 0 .../convertor/mllm_type_mapping.py | 0 .../{ => mobile}/convertor/model_file_v1.py | 0 .../{ => mobile}/convertor/model_file_v2.py | 0 pymllm/{ => mobile}/ffi/__init__.py | 0 pymllm/{ => mobile}/ffi/_ffi_api.py | 0 pymllm/{ => mobile}/ffi/base.py | 2 +- pymllm/{ => mobile}/nn/__init__.py | 0 pymllm/{ => mobile}/nn/_layers.py | 0 pymllm/{ => mobile}/nn/_module.py | 0 pymllm/{ => mobile}/nn/functional.py | 0 pymllm/{ => mobile}/quantize/__init__.py | 0 .../{ => mobile}/quantize/cast2fp32_pass.py | 0 .../quantize/gguf}/__init__.py | 0 pymllm/{ => mobile}/quantize/kai/__init__.py | 0 pymllm/{ => mobile}/quantize/kai/w4a32.py | 0 pymllm/{ => mobile}/quantize/pipeline.py | 0 pymllm/{ => mobile}/quantize/quantize_pass.py | 0 pymllm/{ => mobile}/quantize/solver.py | 0 .../quantize/spinquant}/__init__.py | 0 pymllm/{ => mobile}/service/__init__.py | 0 pymllm/{ => mobile}/service/models_hub.py | 0 pymllm/{ => mobile}/service/network.py | 0 pymllm/{ => mobile}/service/rr_process.py | 0 pymllm/{ => mobile}/service/tools.py | 0 .../tests/qualcomm/test_context_create.py | 4 +- pymllm/{ => mobile}/tests/test_nn.py | 4 +- pymllm/{ => mobile}/tests/test_tensor.py | 2 +- pymllm/{ => mobile}/utils/__init__.py | 0 pymllm/{ => mobile}/utils/adb.py | 0 pymllm/{ => mobile}/utils/error_handler.py | 0 pymllm/{ => mobile}/utils/mllm_convertor.py | 0 .../mllm_ir/trace.py => models/__init__.py} | 0 pymllm/utils/mllm_convertor_server/service.py | 2 - pyproject.toml | 8 +-- 68 files changed, 132 insertions(+), 146 deletions(-) rename pymllm/{backends/cuda/__init__.py => __main__.py} (100%) delete mode 100644 pymllm/backends/__init__.py delete mode 100644 pymllm/backends/cuda/tilelang_compile_test.py rename pymllm/{backends/qualcomm/transformers/core => layers}/__init__.py (100%) create mode 100644 pymllm/mobile/__init__.py rename 
pymllm/{quantize/spinquant => mobile/backends}/__init__.py (71%) rename pymllm/{ => mobile}/backends/qualcomm/README.md (100%) rename pymllm/{ => mobile}/backends/qualcomm/__init__.py (100%) rename pymllm/{ => mobile}/backends/qualcomm/nn.py (75%) rename pymllm/{ => mobile}/backends/qualcomm/qnn_aot_env.py (83%) rename pymllm/{ => mobile}/backends/qualcomm/transformers/.gitignore (100%) rename pymllm/{ => mobile}/backends/qualcomm/transformers/README.md (100%) rename pymllm/{ => mobile}/backends/qualcomm/transformers/__init__.py (100%) rename pymllm/{compile/mlir => mobile/backends/qualcomm/transformers/core}/__init__.py (100%) rename pymllm/{ => mobile}/backends/qualcomm/transformers/core/embedding.py (100%) rename pymllm/{ => mobile}/backends/qualcomm/transformers/core/observer.py (100%) rename pymllm/{ => mobile}/backends/qualcomm/transformers/core/qdq.py (100%) rename pymllm/{ => mobile}/backends/qualcomm/transformers/core/qlinear.py (99%) rename pymllm/{ => mobile}/backends/qualcomm/transformers/core/rms_norm.py (100%) rename pymllm/{ => mobile}/backends/qualcomm/transformers/llama/modeling_llama.py (98%) rename pymllm/{ => mobile}/backends/qualcomm/transformers/llama/runner.py (96%) rename pymllm/{ => mobile}/backends/qualcomm/transformers/llama/train.py (94%) rename pymllm/{ => mobile}/backends/qualcomm/transformers/qwen2/modeling_qwen2.py (98%) rename pymllm/{ => mobile}/backends/qualcomm/transformers/qwen2/runner.py (96%) rename pymllm/{ => mobile}/backends/qualcomm/transformers/qwen2/train.py (94%) rename pymllm/{ => mobile}/backends/qualcomm/transformers/qwen3/modeling_qwen3.py (98%) rename pymllm/{ => mobile}/backends/qualcomm/transformers/qwen3/runner.py (96%) rename pymllm/{ => mobile}/backends/qualcomm/transformers/qwen3/train.py (94%) rename pymllm/{ => mobile}/convertor/__init__.py (100%) rename pymllm/{ => mobile}/convertor/mllm_type_mapping.py (100%) rename pymllm/{ => mobile}/convertor/model_file_v1.py (100%) rename pymllm/{ => 
mobile}/convertor/model_file_v2.py (100%) rename pymllm/{ => mobile}/ffi/__init__.py (100%) rename pymllm/{ => mobile}/ffi/_ffi_api.py (100%) rename pymllm/{ => mobile}/ffi/base.py (90%) rename pymllm/{ => mobile}/nn/__init__.py (100%) rename pymllm/{ => mobile}/nn/_layers.py (100%) rename pymllm/{ => mobile}/nn/_module.py (100%) rename pymllm/{ => mobile}/nn/functional.py (100%) rename pymllm/{ => mobile}/quantize/__init__.py (100%) rename pymllm/{ => mobile}/quantize/cast2fp32_pass.py (100%) rename pymllm/{compile => mobile/quantize/gguf}/__init__.py (100%) rename pymllm/{ => mobile}/quantize/kai/__init__.py (100%) rename pymllm/{ => mobile}/quantize/kai/w4a32.py (100%) rename pymllm/{ => mobile}/quantize/pipeline.py (100%) rename pymllm/{ => mobile}/quantize/quantize_pass.py (100%) rename pymllm/{ => mobile}/quantize/solver.py (100%) rename pymllm/{quantize/gguf => mobile/quantize/spinquant}/__init__.py (100%) rename pymllm/{ => mobile}/service/__init__.py (100%) rename pymllm/{ => mobile}/service/models_hub.py (100%) rename pymllm/{ => mobile}/service/network.py (100%) rename pymllm/{ => mobile}/service/rr_process.py (100%) rename pymllm/{ => mobile}/service/tools.py (100%) rename pymllm/{ => mobile}/tests/qualcomm/test_context_create.py (89%) rename pymllm/{ => mobile}/tests/test_nn.py (83%) rename pymllm/{ => mobile}/tests/test_tensor.py (89%) rename pymllm/{ => mobile}/utils/__init__.py (100%) rename pymllm/{ => mobile}/utils/adb.py (100%) rename pymllm/{ => mobile}/utils/error_handler.py (100%) rename pymllm/{ => mobile}/utils/mllm_convertor.py (100%) rename pymllm/{compile/mllm_ir/trace.py => models/__init__.py} (100%) delete mode 100644 pymllm/utils/mllm_convertor_server/service.py diff --git a/mllm-kernel/pyproject.toml b/mllm-kernel/pyproject.toml index f64e1306..5fe07eea 100644 --- a/mllm-kernel/pyproject.toml +++ b/mllm-kernel/pyproject.toml @@ -18,7 +18,7 @@ dependencies = [ "packaging", "torch", "torch-c-dlpack-ext", - "apache-tvm-ffi", + 
"apache-tvm-ffi == 0.1.8", ] [project.optional-dependencies] diff --git a/mllm/ffi/Extension.cc b/mllm/ffi/Extension.cc index cb999191..f3f2d248 100644 --- a/mllm/ffi/Extension.cc +++ b/mllm/ffi/Extension.cc @@ -83,12 +83,12 @@ TVM_FFI_STATIC_INIT_BLOCK() { // Tensor related refl::GlobalDef().def("mllm.empty", mllm::ffi::empty); refl::GlobalDef().def("mllm.from_torch", [](const tvm::ffi::Tensor& t) -> mllm::ffi::Tensor { - auto dl_pack = t.get()->ToDLPack(); + auto dl_pack = t.ToDLPack(); return ::mllm::ffi::Tensor(mllm::ffi::__from_dlpack(dl_pack)); }); refl::GlobalDef().def("mllm.from_numpy", [](const tvm::ffi::Tensor& t) -> mllm::ffi::Tensor { - auto dl_pack = t.get()->ToDLPack(); + auto dl_pack = t.ToDLPack(); return ::mllm::ffi::Tensor(mllm::ffi::__from_dlpack(dl_pack)); }); @@ -345,6 +345,7 @@ TVM_FFI_STATIC_INIT_BLOCK() { namespace refl = tvm::ffi::reflection; refl::ObjectDef<::mllm::ffi::BaseOpObj>(); + refl::ObjectDef<::mllm::ffi::ParameterFileObj>(); refl::GlobalDef().def("mllm.BaseOp.load", [](const mllm::ffi::BaseOp& self, const mllm::ffi::ParameterFile& obj) -> void { self.get()->op_ptr_->load(obj.get()->pf_ptr_); }); diff --git a/mllm/ffi/vendors/tvm-ffi b/mllm/ffi/vendors/tvm-ffi index 46f73580..dcd07cfe 160000 --- a/mllm/ffi/vendors/tvm-ffi +++ b/mllm/ffi/vendors/tvm-ffi @@ -1 +1 @@ -Subproject commit 46f73580780f2973e6ea3afb6d3a9d6f6ffd02cc +Subproject commit dcd07cfe27465287ee5b203b742e85dcfb99606a diff --git a/pymllm/__init__.py b/pymllm/__init__.py index 1bd31cd6..3f2488d2 100644 --- a/pymllm/__init__.py +++ b/pymllm/__init__.py @@ -2,48 +2,32 @@ # Licensed under the MIT License. from __future__ import annotations +import os +import sys -from . import ffi -from . import convertor -from . import utils -from . import quantize -from . import nn -from . import compile -from . import service -from . 
import backends -from .ffi import ( - # Floating point types - float32, - float16, - bfloat16, - # Signed integer types - int8, - int16, - int32, - int64, - # Unsigned integer types - uint8, - uint16, - uint32, - uint64, - # Bool type - boolean, - # Devices - cpu, - cuda, - qnn, - # Tensor and utilities - Tensor, - empty, - echo, - device, - is_torch_available, - is_numpy_available, - from_torch, - from_numpy, - zeros, - ones, - arange, - random, -) -from .nn.functional import matmul +__all__ = [] + + +def _has_mobile_libs() -> bool: + parent_dir = os.path.dirname(os.path.realpath(__file__)) + + # Platform-specific library names + if sys.platform.startswith("win32"): + lib_name = "MllmFFIExtension.dll" + elif sys.platform.startswith("darwin"): + lib_name = "MllmFFIExtension.dylib" + else: + lib_name = "MllmFFIExtension.so" + + lib_path = os.path.join(parent_dir, "lib", lib_name) + return os.path.exists(lib_path) + + +def is_mobile_available() -> bool: + return _has_mobile_libs() + + +if _has_mobile_libs(): + from . import mobile + + __all__.append("mobile") diff --git a/pymllm/backends/cuda/__init__.py b/pymllm/__main__.py similarity index 100% rename from pymllm/backends/cuda/__init__.py rename to pymllm/__main__.py diff --git a/pymllm/backends/__init__.py b/pymllm/backends/__init__.py deleted file mode 100644 index 5e926d58..00000000 --- a/pymllm/backends/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# Copyright (c) MLLM Team. -# Licensed under the MIT License. - -from . 
import cuda, qualcomm diff --git a/pymllm/backends/cuda/tilelang_compile_test.py b/pymllm/backends/cuda/tilelang_compile_test.py deleted file mode 100644 index 65a2e007..00000000 --- a/pymllm/backends/cuda/tilelang_compile_test.py +++ /dev/null @@ -1,41 +0,0 @@ -import tilelang -import tilelang.language as T - - -@tilelang.jit( - out_idx=[-1], compile_flags=["-O3", "--use_fast_math", "--expt-relaxed-constexpr"] -) -def elementwise_add(M, N, block_M, block_N, in_dtype, out_dtype, threads): - @T.prim_func - def elem_add( - A: T.Tensor((M, N), in_dtype), - B: T.Tensor((M, N), in_dtype), - C: T.Tensor((M, N), out_dtype), - ): - with T.Kernel( - T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads - ) as (bx, by): - A_shared = T.alloc_shared((block_M, block_N), in_dtype) - B_shared = T.alloc_shared((block_M, block_N), in_dtype) - C_local = T.alloc_fragment((block_M, block_N), out_dtype) - C_shared = T.alloc_shared((block_M, block_N), out_dtype) - - T.copy(A[by * block_M, bx * block_N], A_shared) - T.copy(B[by * block_M, bx * block_N], B_shared) - for local_y, local_x in T.Parallel(block_M, block_N): - C_local[local_y, local_x] = ( - A_shared[local_y, local_x] + B_shared[local_y, local_x] - ) - T.copy(C_local, C_shared) - T.copy(C_shared, C[by * block_M, bx * block_N]) - - return elem_add - - -def compile_test(): - M = 1024 - N = 1024 - config = {"block_M": 128, "block_N": 128, "threads": 128} - kernel = elementwise_add(M, N, **config, in_dtype="float16", out_dtype="float16") - source = kernel.get_kernel_source() - print(source) diff --git a/pymllm/backends/qualcomm/transformers/core/__init__.py b/pymllm/layers/__init__.py similarity index 100% rename from pymllm/backends/qualcomm/transformers/core/__init__.py rename to pymllm/layers/__init__.py diff --git a/pymllm/mobile/README.md b/pymllm/mobile/README.md index 29877ea0..ceb71a5d 100644 --- a/pymllm/mobile/README.md +++ b/pymllm/mobile/README.md @@ -1 +1,2 @@ -We should refactor current pymllm's src to mobile 
directory. And provide more functionalities for torch based VLA. +# Pymllm mobile + diff --git a/pymllm/mobile/__init__.py b/pymllm/mobile/__init__.py new file mode 100644 index 00000000..8796bbea --- /dev/null +++ b/pymllm/mobile/__init__.py @@ -0,0 +1,45 @@ +from __future__ import annotations + +from . import ffi +from . import convertor +from . import utils +from . import quantize +from . import nn +from . import service +from . import backends +from .ffi import ( + # Floating point types + float32, + float16, + bfloat16, + # Signed integer types + int8, + int16, + int32, + int64, + # Unsigned integer types + uint8, + uint16, + uint32, + uint64, + # Bool type + boolean, + # Devices + cpu, + cuda, + qnn, + # Tensor and utilities + Tensor, + empty, + echo, + device, + is_torch_available, + is_numpy_available, + from_torch, + from_numpy, + zeros, + ones, + arange, + random, +) +from .nn.functional import matmul diff --git a/pymllm/quantize/spinquant/__init__.py b/pymllm/mobile/backends/__init__.py similarity index 71% rename from pymllm/quantize/spinquant/__init__.py rename to pymllm/mobile/backends/__init__.py index ea8e2bec..1578a0d8 100644 --- a/pymllm/quantize/spinquant/__init__.py +++ b/pymllm/mobile/backends/__init__.py @@ -1,2 +1,4 @@ # Copyright (c) MLLM Team. # Licensed under the MIT License. + +from . 
import qualcomm diff --git a/pymllm/backends/qualcomm/README.md b/pymllm/mobile/backends/qualcomm/README.md similarity index 100% rename from pymllm/backends/qualcomm/README.md rename to pymllm/mobile/backends/qualcomm/README.md diff --git a/pymllm/backends/qualcomm/__init__.py b/pymllm/mobile/backends/qualcomm/__init__.py similarity index 100% rename from pymllm/backends/qualcomm/__init__.py rename to pymllm/mobile/backends/qualcomm/__init__.py diff --git a/pymllm/backends/qualcomm/nn.py b/pymllm/mobile/backends/qualcomm/nn.py similarity index 75% rename from pymllm/backends/qualcomm/nn.py rename to pymllm/mobile/backends/qualcomm/nn.py index 0ba9aef5..e4bc91ac 100644 --- a/pymllm/backends/qualcomm/nn.py +++ b/pymllm/mobile/backends/qualcomm/nn.py @@ -1,4 +1,4 @@ -from pymllm.nn._layers import Softmax, RoPE +from pymllm.mobile.nn._layers import Softmax, RoPE class QnnSoftmax(Softmax): diff --git a/pymllm/backends/qualcomm/qnn_aot_env.py b/pymllm/mobile/backends/qualcomm/qnn_aot_env.py similarity index 83% rename from pymllm/backends/qualcomm/qnn_aot_env.py rename to pymllm/mobile/backends/qualcomm/qnn_aot_env.py index 8b0c0d2e..bc48c7c9 100644 --- a/pymllm/backends/qualcomm/qnn_aot_env.py +++ b/pymllm/mobile/backends/qualcomm/qnn_aot_env.py @@ -1,7 +1,7 @@ -from pymllm.ffi import is_qnn_aot_on_x86_enabled +from pymllm.mobile.ffi import is_qnn_aot_on_x86_enabled if is_qnn_aot_on_x86_enabled(): - from pymllm.ffi import ( + from pymllm.mobile.ffi import ( QnnDeviceAndContext, QnnAOTEnv, QcomChipset, diff --git a/pymllm/backends/qualcomm/transformers/.gitignore b/pymllm/mobile/backends/qualcomm/transformers/.gitignore similarity index 100% rename from pymllm/backends/qualcomm/transformers/.gitignore rename to pymllm/mobile/backends/qualcomm/transformers/.gitignore diff --git a/pymllm/backends/qualcomm/transformers/README.md b/pymllm/mobile/backends/qualcomm/transformers/README.md similarity index 100% rename from pymllm/backends/qualcomm/transformers/README.md rename 
to pymllm/mobile/backends/qualcomm/transformers/README.md diff --git a/pymllm/backends/qualcomm/transformers/__init__.py b/pymllm/mobile/backends/qualcomm/transformers/__init__.py similarity index 100% rename from pymllm/backends/qualcomm/transformers/__init__.py rename to pymllm/mobile/backends/qualcomm/transformers/__init__.py diff --git a/pymllm/compile/mlir/__init__.py b/pymllm/mobile/backends/qualcomm/transformers/core/__init__.py similarity index 100% rename from pymllm/compile/mlir/__init__.py rename to pymllm/mobile/backends/qualcomm/transformers/core/__init__.py diff --git a/pymllm/backends/qualcomm/transformers/core/embedding.py b/pymllm/mobile/backends/qualcomm/transformers/core/embedding.py similarity index 100% rename from pymllm/backends/qualcomm/transformers/core/embedding.py rename to pymllm/mobile/backends/qualcomm/transformers/core/embedding.py diff --git a/pymllm/backends/qualcomm/transformers/core/observer.py b/pymllm/mobile/backends/qualcomm/transformers/core/observer.py similarity index 100% rename from pymllm/backends/qualcomm/transformers/core/observer.py rename to pymllm/mobile/backends/qualcomm/transformers/core/observer.py diff --git a/pymllm/backends/qualcomm/transformers/core/qdq.py b/pymllm/mobile/backends/qualcomm/transformers/core/qdq.py similarity index 100% rename from pymllm/backends/qualcomm/transformers/core/qdq.py rename to pymllm/mobile/backends/qualcomm/transformers/core/qdq.py diff --git a/pymllm/backends/qualcomm/transformers/core/qlinear.py b/pymllm/mobile/backends/qualcomm/transformers/core/qlinear.py similarity index 99% rename from pymllm/backends/qualcomm/transformers/core/qlinear.py rename to pymllm/mobile/backends/qualcomm/transformers/core/qlinear.py index 9e90ba8a..35439180 100644 --- a/pymllm/backends/qualcomm/transformers/core/qlinear.py +++ b/pymllm/mobile/backends/qualcomm/transformers/core/qlinear.py @@ -2,7 +2,7 @@ import torch.nn as nn import torch.nn.functional as F from torch.ao.quantization import 
FakeQuantize, PerChannelMinMaxObserver -from pymllm.backends.qualcomm.transformers.core.observer import ( +from pymllm.mobile.backends.qualcomm.transformers.core.observer import ( PerBlockParamFakeQuantize, ) from torchao.quantization.quant_primitives import ( diff --git a/pymllm/backends/qualcomm/transformers/core/rms_norm.py b/pymllm/mobile/backends/qualcomm/transformers/core/rms_norm.py similarity index 100% rename from pymllm/backends/qualcomm/transformers/core/rms_norm.py rename to pymllm/mobile/backends/qualcomm/transformers/core/rms_norm.py diff --git a/pymllm/backends/qualcomm/transformers/llama/modeling_llama.py b/pymllm/mobile/backends/qualcomm/transformers/llama/modeling_llama.py similarity index 98% rename from pymllm/backends/qualcomm/transformers/llama/modeling_llama.py rename to pymllm/mobile/backends/qualcomm/transformers/llama/modeling_llama.py index 119ec04b..6b65f34b 100644 --- a/pymllm/backends/qualcomm/transformers/llama/modeling_llama.py +++ b/pymllm/mobile/backends/qualcomm/transformers/llama/modeling_llama.py @@ -52,16 +52,16 @@ from transformers.models.llama.configuration_llama import LlamaConfig # Replace linear, rms_norm with: -from pymllm.backends.qualcomm.transformers.core.rms_norm import QRMSNorm -from pymllm.backends.qualcomm.transformers.core.qlinear import ( +from pymllm.mobile.backends.qualcomm.transformers.core.rms_norm import QRMSNorm +from pymllm.mobile.backends.qualcomm.transformers.core.qlinear import ( QLinearLPBQ, ) -from pymllm.backends.qualcomm.transformers.core.qdq import ( +from pymllm.mobile.backends.qualcomm.transformers.core.qdq import ( ActivationQDQ, FixedActivationQDQ, ) -from pymllm.backends.qualcomm.transformers.core.embedding import QEmbedding -from pymllm.backends.qualcomm.transformers.core.observer import ConcatObserver +from pymllm.mobile.backends.qualcomm.transformers.core.embedding import QEmbedding +from pymllm.mobile.backends.qualcomm.transformers.core.observer import ConcatObserver logger = 
logging.get_logger(__name__) diff --git a/pymllm/backends/qualcomm/transformers/llama/runner.py b/pymllm/mobile/backends/qualcomm/transformers/llama/runner.py similarity index 96% rename from pymllm/backends/qualcomm/transformers/llama/runner.py rename to pymllm/mobile/backends/qualcomm/transformers/llama/runner.py index 8aa4627b..730147d0 100644 --- a/pymllm/backends/qualcomm/transformers/llama/runner.py +++ b/pymllm/mobile/backends/qualcomm/transformers/llama/runner.py @@ -2,18 +2,18 @@ from tqdm import tqdm from modelscope.msdatasets import MsDataset from transformers import AutoTokenizer -from pymllm.backends.qualcomm.transformers.core.qdq import ( +from pymllm.mobile.backends.qualcomm.transformers.core.qdq import ( ActivationQDQ, FixedActivationQDQ, ) -from pymllm.backends.qualcomm.transformers.core.rms_norm import QRMSNorm -from pymllm.backends.qualcomm.transformers.core.qlinear import ( +from pymllm.mobile.backends.qualcomm.transformers.core.rms_norm import QRMSNorm +from pymllm.mobile.backends.qualcomm.transformers.core.qlinear import ( QLinearLPBQ, QLinearW8A16_PerChannelSym, ) -from pymllm.backends.qualcomm.transformers.core.embedding import QEmbedding -from pymllm.backends.qualcomm.transformers.llama.modeling_llama import LlamaForCausalLM -from pymllm.backends.qualcomm.transformers.core.observer import ConcatObserver +from pymllm.mobile.backends.qualcomm.transformers.core.embedding import QEmbedding +from pymllm.mobile.backends.qualcomm.transformers.llama.modeling_llama import LlamaForCausalLM +from pymllm.mobile.backends.qualcomm.transformers.core.observer import ConcatObserver def recompute_scale_zp(module): diff --git a/pymllm/backends/qualcomm/transformers/llama/train.py b/pymllm/mobile/backends/qualcomm/transformers/llama/train.py similarity index 94% rename from pymllm/backends/qualcomm/transformers/llama/train.py rename to pymllm/mobile/backends/qualcomm/transformers/llama/train.py index cd10befb..41ffc0e2 100644 --- 
a/pymllm/backends/qualcomm/transformers/llama/train.py +++ b/pymllm/mobile/backends/qualcomm/transformers/llama/train.py @@ -2,7 +2,7 @@ import torch import argparse from safetensors.torch import save_model -from pymllm.backends.qualcomm.transformers.llama.runner import LlamaQuantizer +from pymllm.mobile.backends.qualcomm.transformers.llama.runner import LlamaQuantizer def main(): diff --git a/pymllm/backends/qualcomm/transformers/qwen2/modeling_qwen2.py b/pymllm/mobile/backends/qualcomm/transformers/qwen2/modeling_qwen2.py similarity index 98% rename from pymllm/backends/qualcomm/transformers/qwen2/modeling_qwen2.py rename to pymllm/mobile/backends/qualcomm/transformers/qwen2/modeling_qwen2.py index 56b19c42..a43d8b7e 100644 --- a/pymllm/backends/qualcomm/transformers/qwen2/modeling_qwen2.py +++ b/pymllm/mobile/backends/qualcomm/transformers/qwen2/modeling_qwen2.py @@ -31,16 +31,16 @@ from transformers.models.qwen2.configuration_qwen2 import Qwen2Config # Replace linear, rms_norm with: -from pymllm.backends.qualcomm.transformers.core.rms_norm import QRMSNorm -from pymllm.backends.qualcomm.transformers.core.qlinear import ( +from pymllm.mobile.backends.qualcomm.transformers.core.rms_norm import QRMSNorm +from pymllm.mobile.backends.qualcomm.transformers.core.qlinear import ( QLinearLPBQ, ) -from pymllm.backends.qualcomm.transformers.core.qdq import ( +from pymllm.mobile.backends.qualcomm.transformers.core.qdq import ( ActivationQDQ, FixedActivationQDQ, ) -from pymllm.backends.qualcomm.transformers.core.embedding import QEmbedding -from pymllm.backends.qualcomm.transformers.core.observer import ConcatObserver +from pymllm.mobile.backends.qualcomm.transformers.core.embedding import QEmbedding +from pymllm.mobile.backends.qualcomm.transformers.core.observer import ConcatObserver class Qwen2MLP(nn.Module): diff --git a/pymllm/backends/qualcomm/transformers/qwen2/runner.py b/pymllm/mobile/backends/qualcomm/transformers/qwen2/runner.py similarity index 96% rename from 
pymllm/backends/qualcomm/transformers/qwen2/runner.py rename to pymllm/mobile/backends/qualcomm/transformers/qwen2/runner.py index d2f5be05..ce55fd06 100644 --- a/pymllm/backends/qualcomm/transformers/qwen2/runner.py +++ b/pymllm/mobile/backends/qualcomm/transformers/qwen2/runner.py @@ -2,18 +2,18 @@ from tqdm import tqdm from modelscope.msdatasets import MsDataset from transformers import AutoTokenizer -from pymllm.backends.qualcomm.transformers.core.qdq import ( +from pymllm.mobile.backends.qualcomm.transformers.core.qdq import ( ActivationQDQ, FixedActivationQDQ, ) -from pymllm.backends.qualcomm.transformers.core.rms_norm import QRMSNorm -from pymllm.backends.qualcomm.transformers.core.qlinear import ( +from pymllm.mobile.backends.qualcomm.transformers.core.rms_norm import QRMSNorm +from pymllm.mobile.backends.qualcomm.transformers.core.qlinear import ( QLinearLPBQ, QLinearW8A16_PerChannelSym, ) -from pymllm.backends.qualcomm.transformers.core.embedding import QEmbedding -from pymllm.backends.qualcomm.transformers.qwen2.modeling_qwen2 import Qwen2ForCausalLM -from pymllm.backends.qualcomm.transformers.core.observer import ConcatObserver +from pymllm.mobile.backends.qualcomm.transformers.core.embedding import QEmbedding +from pymllm.mobile.backends.qualcomm.transformers.qwen2.modeling_qwen2 import Qwen2ForCausalLM +from pymllm.mobile.backends.qualcomm.transformers.core.observer import ConcatObserver def recompute_scale_zp(module): diff --git a/pymllm/backends/qualcomm/transformers/qwen2/train.py b/pymllm/mobile/backends/qualcomm/transformers/qwen2/train.py similarity index 94% rename from pymllm/backends/qualcomm/transformers/qwen2/train.py rename to pymllm/mobile/backends/qualcomm/transformers/qwen2/train.py index fec5fdfc..1a8f25ce 100644 --- a/pymllm/backends/qualcomm/transformers/qwen2/train.py +++ b/pymllm/mobile/backends/qualcomm/transformers/qwen2/train.py @@ -2,7 +2,7 @@ import torch import argparse from safetensors.torch import save_model -from 
pymllm.backends.qualcomm.transformers.qwen2.runner import Qwen2Quantizer +from pymllm.mobile.backends.qualcomm.transformers.qwen2.runner import Qwen2Quantizer def main(): diff --git a/pymllm/backends/qualcomm/transformers/qwen3/modeling_qwen3.py b/pymllm/mobile/backends/qualcomm/transformers/qwen3/modeling_qwen3.py similarity index 98% rename from pymllm/backends/qualcomm/transformers/qwen3/modeling_qwen3.py rename to pymllm/mobile/backends/qualcomm/transformers/qwen3/modeling_qwen3.py index 2dabf5c9..6a8788ba 100644 --- a/pymllm/backends/qualcomm/transformers/qwen3/modeling_qwen3.py +++ b/pymllm/mobile/backends/qualcomm/transformers/qwen3/modeling_qwen3.py @@ -46,16 +46,16 @@ from transformers.models.qwen3.configuration_qwen3 import Qwen3Config # Replace linear, rms_norm with: -from pymllm.backends.qualcomm.transformers.core.rms_norm import QRMSNorm -from pymllm.backends.qualcomm.transformers.core.qlinear import ( +from pymllm.mobile.backends.qualcomm.transformers.core.rms_norm import QRMSNorm +from pymllm.mobile.backends.qualcomm.transformers.core.qlinear import ( QLinearLPBQ, ) -from pymllm.backends.qualcomm.transformers.core.qdq import ( +from pymllm.mobile.backends.qualcomm.transformers.core.qdq import ( ActivationQDQ, FixedActivationQDQ, ) -from pymllm.backends.qualcomm.transformers.core.embedding import QEmbedding -from pymllm.backends.qualcomm.transformers.core.observer import ConcatObserver +from pymllm.mobile.backends.qualcomm.transformers.core.embedding import QEmbedding +from pymllm.mobile.backends.qualcomm.transformers.core.observer import ConcatObserver class Qwen3MLP(nn.Module): diff --git a/pymllm/backends/qualcomm/transformers/qwen3/runner.py b/pymllm/mobile/backends/qualcomm/transformers/qwen3/runner.py similarity index 96% rename from pymllm/backends/qualcomm/transformers/qwen3/runner.py rename to pymllm/mobile/backends/qualcomm/transformers/qwen3/runner.py index 02ea6a5f..0d7499c9 100644 --- 
a/pymllm/backends/qualcomm/transformers/qwen3/runner.py +++ b/pymllm/mobile/backends/qualcomm/transformers/qwen3/runner.py @@ -2,18 +2,18 @@ from tqdm import tqdm from modelscope.msdatasets import MsDataset from transformers import AutoTokenizer -from pymllm.backends.qualcomm.transformers.core.qdq import ( +from pymllm.mobile.backends.qualcomm.transformers.core.qdq import ( ActivationQDQ, FixedActivationQDQ, ) -from pymllm.backends.qualcomm.transformers.core.rms_norm import QRMSNorm -from pymllm.backends.qualcomm.transformers.core.qlinear import ( +from pymllm.mobile.backends.qualcomm.transformers.core.rms_norm import QRMSNorm +from pymllm.mobile.backends.qualcomm.transformers.core.qlinear import ( QLinearLPBQ, QLinearW8A16_PerChannelSym, ) -from pymllm.backends.qualcomm.transformers.core.embedding import QEmbedding -from pymllm.backends.qualcomm.transformers.qwen3.modeling_qwen3 import Qwen3ForCausalLM -from pymllm.backends.qualcomm.transformers.core.observer import ConcatObserver +from pymllm.mobile.backends.qualcomm.transformers.core.embedding import QEmbedding +from pymllm.mobile.backends.qualcomm.transformers.qwen3.modeling_qwen3 import Qwen3ForCausalLM +from pymllm.mobile.backends.qualcomm.transformers.core.observer import ConcatObserver def recompute_scale_zp(module): diff --git a/pymllm/backends/qualcomm/transformers/qwen3/train.py b/pymllm/mobile/backends/qualcomm/transformers/qwen3/train.py similarity index 94% rename from pymllm/backends/qualcomm/transformers/qwen3/train.py rename to pymllm/mobile/backends/qualcomm/transformers/qwen3/train.py index 63c6d0e8..f44fa67b 100644 --- a/pymllm/backends/qualcomm/transformers/qwen3/train.py +++ b/pymllm/mobile/backends/qualcomm/transformers/qwen3/train.py @@ -2,7 +2,7 @@ import torch import argparse from safetensors.torch import save_model -from pymllm.backends.qualcomm.transformers.qwen3.runner import Qwen3Quantizer +from pymllm.mobile.backends.qualcomm.transformers.qwen3.runner import Qwen3Quantizer def main(): 
diff --git a/pymllm/convertor/__init__.py b/pymllm/mobile/convertor/__init__.py similarity index 100% rename from pymllm/convertor/__init__.py rename to pymllm/mobile/convertor/__init__.py diff --git a/pymllm/convertor/mllm_type_mapping.py b/pymllm/mobile/convertor/mllm_type_mapping.py similarity index 100% rename from pymllm/convertor/mllm_type_mapping.py rename to pymllm/mobile/convertor/mllm_type_mapping.py diff --git a/pymllm/convertor/model_file_v1.py b/pymllm/mobile/convertor/model_file_v1.py similarity index 100% rename from pymllm/convertor/model_file_v1.py rename to pymllm/mobile/convertor/model_file_v1.py diff --git a/pymllm/convertor/model_file_v2.py b/pymllm/mobile/convertor/model_file_v2.py similarity index 100% rename from pymllm/convertor/model_file_v2.py rename to pymllm/mobile/convertor/model_file_v2.py diff --git a/pymllm/ffi/__init__.py b/pymllm/mobile/ffi/__init__.py similarity index 100% rename from pymllm/ffi/__init__.py rename to pymllm/mobile/ffi/__init__.py diff --git a/pymllm/ffi/_ffi_api.py b/pymllm/mobile/ffi/_ffi_api.py similarity index 100% rename from pymllm/ffi/_ffi_api.py rename to pymllm/mobile/ffi/_ffi_api.py diff --git a/pymllm/ffi/base.py b/pymllm/mobile/ffi/base.py similarity index 90% rename from pymllm/ffi/base.py rename to pymllm/mobile/ffi/base.py index 07a01c49..96aed242 100644 --- a/pymllm/ffi/base.py +++ b/pymllm/mobile/ffi/base.py @@ -8,7 +8,7 @@ def _load_lib(): file_dir = os.path.dirname(os.path.realpath(__file__)) - parent_dir = os.path.dirname(file_dir) + parent_dir = os.path.dirname(os.path.dirname(file_dir)) # Platform-specific library names if sys.platform.startswith("win32"): diff --git a/pymllm/nn/__init__.py b/pymllm/mobile/nn/__init__.py similarity index 100% rename from pymllm/nn/__init__.py rename to pymllm/mobile/nn/__init__.py diff --git a/pymllm/nn/_layers.py b/pymllm/mobile/nn/_layers.py similarity index 100% rename from pymllm/nn/_layers.py rename to pymllm/mobile/nn/_layers.py diff --git 
a/pymllm/nn/_module.py b/pymllm/mobile/nn/_module.py similarity index 100% rename from pymllm/nn/_module.py rename to pymllm/mobile/nn/_module.py diff --git a/pymllm/nn/functional.py b/pymllm/mobile/nn/functional.py similarity index 100% rename from pymllm/nn/functional.py rename to pymllm/mobile/nn/functional.py diff --git a/pymllm/quantize/__init__.py b/pymllm/mobile/quantize/__init__.py similarity index 100% rename from pymllm/quantize/__init__.py rename to pymllm/mobile/quantize/__init__.py diff --git a/pymllm/quantize/cast2fp32_pass.py b/pymllm/mobile/quantize/cast2fp32_pass.py similarity index 100% rename from pymllm/quantize/cast2fp32_pass.py rename to pymllm/mobile/quantize/cast2fp32_pass.py diff --git a/pymllm/compile/__init__.py b/pymllm/mobile/quantize/gguf/__init__.py similarity index 100% rename from pymllm/compile/__init__.py rename to pymllm/mobile/quantize/gguf/__init__.py diff --git a/pymllm/quantize/kai/__init__.py b/pymllm/mobile/quantize/kai/__init__.py similarity index 100% rename from pymllm/quantize/kai/__init__.py rename to pymllm/mobile/quantize/kai/__init__.py diff --git a/pymllm/quantize/kai/w4a32.py b/pymllm/mobile/quantize/kai/w4a32.py similarity index 100% rename from pymllm/quantize/kai/w4a32.py rename to pymllm/mobile/quantize/kai/w4a32.py diff --git a/pymllm/quantize/pipeline.py b/pymllm/mobile/quantize/pipeline.py similarity index 100% rename from pymllm/quantize/pipeline.py rename to pymllm/mobile/quantize/pipeline.py diff --git a/pymllm/quantize/quantize_pass.py b/pymllm/mobile/quantize/quantize_pass.py similarity index 100% rename from pymllm/quantize/quantize_pass.py rename to pymllm/mobile/quantize/quantize_pass.py diff --git a/pymllm/quantize/solver.py b/pymllm/mobile/quantize/solver.py similarity index 100% rename from pymllm/quantize/solver.py rename to pymllm/mobile/quantize/solver.py diff --git a/pymllm/quantize/gguf/__init__.py b/pymllm/mobile/quantize/spinquant/__init__.py similarity index 100% rename from 
pymllm/quantize/gguf/__init__.py rename to pymllm/mobile/quantize/spinquant/__init__.py diff --git a/pymllm/service/__init__.py b/pymllm/mobile/service/__init__.py similarity index 100% rename from pymllm/service/__init__.py rename to pymllm/mobile/service/__init__.py diff --git a/pymllm/service/models_hub.py b/pymllm/mobile/service/models_hub.py similarity index 100% rename from pymllm/service/models_hub.py rename to pymllm/mobile/service/models_hub.py diff --git a/pymllm/service/network.py b/pymllm/mobile/service/network.py similarity index 100% rename from pymllm/service/network.py rename to pymllm/mobile/service/network.py diff --git a/pymllm/service/rr_process.py b/pymllm/mobile/service/rr_process.py similarity index 100% rename from pymllm/service/rr_process.py rename to pymllm/mobile/service/rr_process.py diff --git a/pymllm/service/tools.py b/pymllm/mobile/service/tools.py similarity index 100% rename from pymllm/service/tools.py rename to pymllm/mobile/service/tools.py diff --git a/pymllm/tests/qualcomm/test_context_create.py b/pymllm/mobile/tests/qualcomm/test_context_create.py similarity index 89% rename from pymllm/tests/qualcomm/test_context_create.py rename to pymllm/mobile/tests/qualcomm/test_context_create.py index 18983daa..94f42b51 100644 --- a/pymllm/tests/qualcomm/test_context_create.py +++ b/pymllm/mobile/tests/qualcomm/test_context_create.py @@ -1,5 +1,5 @@ -import pymllm as mllm -from pymllm.backends.qualcomm.qnn_aot_env import ( +import pymllm.mobile as mllm +from pymllm.mobile.backends.qualcomm.qnn_aot_env import ( QnnAOTEnv, QnnDeviceAndContext, QcomTryBestPerformance, diff --git a/pymllm/tests/test_nn.py b/pymllm/mobile/tests/test_nn.py similarity index 83% rename from pymllm/tests/test_nn.py rename to pymllm/mobile/tests/test_nn.py index d9a3db2d..403060e9 100644 --- a/pymllm/tests/test_nn.py +++ b/pymllm/mobile/tests/test_nn.py @@ -1,5 +1,5 @@ -import pymllm as mllm -from pymllm import nn +import pymllm.mobile as mllm +from 
pymllm.mobile import nn class FooModule(nn.Module): diff --git a/pymllm/tests/test_tensor.py b/pymllm/mobile/tests/test_tensor.py similarity index 89% rename from pymllm/tests/test_tensor.py rename to pymllm/mobile/tests/test_tensor.py index e935f10b..474e1092 100644 --- a/pymllm/tests/test_tensor.py +++ b/pymllm/mobile/tests/test_tensor.py @@ -1,7 +1,7 @@ # Copyright (c) MLLM Team. # Licensed under the MIT License. -import pymllm as torch +import pymllm.mobile as torch def test_empty_tensor_create() -> bool: diff --git a/pymllm/utils/__init__.py b/pymllm/mobile/utils/__init__.py similarity index 100% rename from pymllm/utils/__init__.py rename to pymllm/mobile/utils/__init__.py diff --git a/pymllm/utils/adb.py b/pymllm/mobile/utils/adb.py similarity index 100% rename from pymllm/utils/adb.py rename to pymllm/mobile/utils/adb.py diff --git a/pymllm/utils/error_handler.py b/pymllm/mobile/utils/error_handler.py similarity index 100% rename from pymllm/utils/error_handler.py rename to pymllm/mobile/utils/error_handler.py diff --git a/pymllm/utils/mllm_convertor.py b/pymllm/mobile/utils/mllm_convertor.py similarity index 100% rename from pymllm/utils/mllm_convertor.py rename to pymllm/mobile/utils/mllm_convertor.py diff --git a/pymllm/compile/mllm_ir/trace.py b/pymllm/models/__init__.py similarity index 100% rename from pymllm/compile/mllm_ir/trace.py rename to pymllm/models/__init__.py diff --git a/pymllm/utils/mllm_convertor_server/service.py b/pymllm/utils/mllm_convertor_server/service.py deleted file mode 100644 index ea8e2bec..00000000 --- a/pymllm/utils/mllm_convertor_server/service.py +++ /dev/null @@ -1,2 +0,0 @@ -# Copyright (c) MLLM Team. -# Licensed under the MIT License. 
diff --git a/pyproject.toml b/pyproject.toml index 703d4456..efe4a14d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [build-system] requires = [ - "scikit-build-core>=0.11.0", "apache-tvm-ffi" + "scikit-build-core>=0.11.0", "apache-tvm-ffi == 0.1.8" ] build-backend = "scikit_build_core.build" @@ -21,7 +21,7 @@ dependencies=[ "packaging", "pytest", "pytest-html", - "apache-tvm-ffi == 0.1.0b4", + "apache-tvm-ffi == 0.1.8", "pyyaml >= 6.0.2", "openai", "modelscope", @@ -36,8 +36,8 @@ dependencies=[ cuda = ["tilelang"] [project.scripts] -mllm-convertor = "pymllm.utils.mllm_convertor:main" -mllm-service = "pymllm.service.tools:cli_app" +mllm-convertor = "pymllm.mobile.utils.mllm_convertor:main" +mllm-service = "pymllm.mobile.service.tools:cli_app" [tool.setuptools.exclude-package-data] "*" = ["*.pyc"] From abf1fa4228e05cbd565c4b35e5c6dfeba804ec5f Mon Sep 17 00:00:00 2001 From: chenghuaWang <2923277184@qq.com> Date: Wed, 18 Feb 2026 04:46:41 +0000 Subject: [PATCH 03/13] feat: enhance configuration management and update dependencies - Added `flashinfer-python` to the optional `cuda` dependencies in `pyproject.toml`. - Introduced new configuration files for server, model, and layers to centralize runtime settings. - Created initial structure for various layers and components to support future development. 
--- pymllm/configs/__init__.py | 0 pymllm/configs/model_config.py | 0 pymllm/configs/server_config.py | 267 ++++++++++++++++++++++++++++ pymllm/layers/_layer.py | 0 pymllm/layers/attention/__init__.py | 0 pymllm/layers/attention/gdn.py | 0 pymllm/layers/attention/normal.py | 0 pymllm/layers/embedding.py | 0 pymllm/layers/mlp.py | 0 pymllm/layers/rms_norm.py | 0 pymllm/mem_cache/__init__.py | 0 pymllm/models/qwen3_moe.py | 0 pymllm/orchestrator/__init__.py | 0 pymllm/server/__init__.py | 0 pyproject.toml | 2 +- 15 files changed, 268 insertions(+), 1 deletion(-) create mode 100644 pymllm/configs/__init__.py create mode 100644 pymllm/configs/model_config.py create mode 100644 pymllm/configs/server_config.py create mode 100644 pymllm/layers/_layer.py create mode 100644 pymllm/layers/attention/__init__.py create mode 100644 pymllm/layers/attention/gdn.py create mode 100644 pymllm/layers/attention/normal.py create mode 100644 pymllm/layers/embedding.py create mode 100644 pymllm/layers/mlp.py create mode 100644 pymllm/layers/rms_norm.py create mode 100644 pymllm/mem_cache/__init__.py create mode 100644 pymllm/models/qwen3_moe.py create mode 100644 pymllm/orchestrator/__init__.py create mode 100644 pymllm/server/__init__.py diff --git a/pymllm/configs/__init__.py b/pymllm/configs/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/pymllm/configs/model_config.py b/pymllm/configs/model_config.py new file mode 100644 index 00000000..e69de29b diff --git a/pymllm/configs/server_config.py b/pymllm/configs/server_config.py new file mode 100644 index 00000000..56be4fc4 --- /dev/null +++ b/pymllm/configs/server_config.py @@ -0,0 +1,267 @@ +from __future__ import annotations + +from pathlib import Path +from typing import Any, Literal, Optional +from dataclasses import asdict, dataclass, field + + +@dataclass +class ServerConfig: + """ + Centralized runtime configuration for the MLLM server. 
+ + The fields are grouped by operational concern so that: + - CLI args can map directly to this dataclass. + - YAML/JSON config files can be loaded and validated in one place. + - future extensions can follow a predictable structure. + """ + + # ------------------------------------------------------------------------- + # Model and tokenizer settings + # ------------------------------------------------------------------------- + # Required path to the model checkpoint directory or model identifier. + model_path: Path + # Optional tokenizer path; when omitted we fall back to `model_path`. + tokenizer_path: Optional[Path] = None + # Tokenizer bootstrap strategy: + # - "auto": infer tokenizer mode from model type. + # - "slow"/"fast": force a specific tokenizer implementation. + tokenizer_mode: Literal["auto", "slow", "fast"] = "auto" + # Number of worker threads/processes used by tokenizer service. + tokenizer_worker_num: int = 1 + # Skip tokenizer initialization at startup to reduce cold-start latency. + skip_tokenizer_init: bool = False + # Model loading format hint for loader backends. + load_format: Literal["auto", "pt", "safetensors", "gguf"] = "auto" + # Allow loading custom model code from remote repositories. + trust_remote_code: bool = False + # Explicit context length; `None` means infer from model config. + context_length: Optional[int] = None + # Model precision policy for weights and activations. + dtype: Literal["auto", "float16", "bfloat16", "float32"] = "auto" + # Quantization algorithm to apply at load time. + quantization: Optional[str] = None + # KV cache dtype; can differ from model dtype for better memory trade-offs. + kv_cache_dtype: Literal["auto", "float16", "bfloat16", "fp8_e4m3", "fp8_e5m2"] = ( + "auto" + ) + # HuggingFace revision/commit/tag for deterministic model resolution. + revision: Optional[str] = None + # Optional custom directory used to cache downloaded model artifacts. 
+ download_dir: Optional[Path] = None + + # ------------------------------------------------------------------------- + # HTTP / API server settings + # ------------------------------------------------------------------------- + # Host address the HTTP server binds to. + host: str = "127.0.0.1" + # TCP port exposed by the HTTP server. + port: int = 30000 + # Optional FastAPI root path when running behind a reverse proxy. + fastapi_root_path: str = "" + # API key required by client-facing endpoints. + api_key: Optional[str] = None + # Admin API key for privileged management endpoints. + admin_api_key: Optional[str] = None + # Public model name returned in OpenAI-compatible API responses. + served_model_name: Optional[str] = None + # Path used for server-side file uploads or temporary user artifacts. + file_storage_path: Path = Path("mllm_storage") + + # ------------------------------------------------------------------------- + # Runtime and scheduling behavior + # ------------------------------------------------------------------------- + # Fraction of total GPU memory reserved for static allocations + # (primarily model weights + KV cache). + mem_fraction_static: Optional[float] = None + # Maximum number of requests concurrently executing in scheduler. + max_running_requests: Optional[int] = None + # Maximum queued requests waiting for execution. + max_queued_requests: Optional[int] = None + # Hard cap of total active tokens across all in-flight requests. + max_total_tokens: Optional[int] = None + # Prefill chunk size used to trade throughput vs memory pressure. + chunked_prefill_size: Optional[int] = None + # Upper bound for tokens accepted in a single prefill pass. + max_prefill_tokens: int = 16384 + # Scheduling policy: + # - "fcfs": first-come-first-served fairness. + # - "lpm": longest-prefix-match style cache locality optimization. + schedule_policy: Literal["fcfs", "lpm"] = "fcfs" + # Conservative multiplier for scheduler admission decisions. 
+ # Values > 1.0 are safer for OOM avoidance but may reduce utilization. + schedule_conservativeness: float = 1.0 + # Enable low-power sleep while idle to reduce background GPU usage. + sleep_on_idle: bool = False + # Stream partial output every N decode steps when streaming is enabled. + stream_interval: int = 1 + # Enable token streaming in generation responses. + stream_output: bool = True + + # ------------------------------------------------------------------------- + # Parallelism and distributed deployment + # ------------------------------------------------------------------------- + # Tensor parallel size (intra-layer sharding). + tp_size: int = 1 + # Data parallel size (replicated model workers). + dp_size: int = 1 + # Expert parallel size for MoE-style models. + ep_size: int = 1 + # Pipeline parallel size (inter-layer partitioning). + pp_size: int = 1 + # Number of nodes participating in distributed serving. + nnodes: int = 1 + # Rank of current node in multi-node topology. + node_rank: int = 0 + # Torch distributed init address, e.g. "host:port". + dist_init_addr: Optional[str] = None + # Optional NCCL communication port override. + nccl_port: Optional[int] = None + # Timeout in seconds for distributed collectives. + dist_timeout: Optional[int] = None + # Base GPU index used for process-to-device mapping. + base_gpu_id: int = 0 + # Step size between logical workers when assigning GPU IDs. + gpu_id_step: int = 1 + + # ------------------------------------------------------------------------- + # Backend and acceleration toggles + # ------------------------------------------------------------------------- + # Attention kernel backend selection. + attention_backend: Optional[str] = None + # Sampling backend selection. + sampling_backend: Optional[str] = None + # Grammar-constrained decoding backend. + grammar_backend: Optional[str] = None + # Disable CUDA graph capture for debugging/compatibility. 
+ disable_cuda_graph: bool = False + # Enable `torch.compile` acceleration path. + enable_torch_compile: bool = False + # Maximum batch size considered by `torch.compile` profiles. + torch_compile_max_bs: int = 32 + # Enable deterministic inference behavior where possible. + enable_deterministic_inference: bool = False + # Random seed for reproducible sampling and initialization. + random_seed: Optional[int] = None + + # ------------------------------------------------------------------------- + # Logging, metrics, and observability + # ------------------------------------------------------------------------- + # Global log level for server components. + log_level: Literal["debug", "info", "warning", "error", "critical"] = "info" + # HTTP access log level; if None, inherits global log level. + log_level_http: Optional[str] = None + # Log each request payload/metadata for debugging. + log_requests: bool = False + # Verbosity level for request logging, larger means more detail. + log_requests_level: int = 2 + # Toggle built-in Prometheus/metrics endpoint. + enable_metrics: bool = False + # Include latency/time-cost summaries in logs. + show_time_cost: bool = False + # Optional OpenTelemetry traces endpoint ("host:port"). + otlp_traces_endpoint: str = "localhost:4317" + # Enable tracing export to OTLP collector. + enable_trace: bool = False + + # ------------------------------------------------------------------------- + # Feature switches and advanced decoding options + # ------------------------------------------------------------------------- + # Enable LoRA adapter serving support. + enable_lora: bool = False + # Maximum number of LoRA adapters loaded simultaneously. + max_loaded_loras: Optional[int] = None + # Maximum LoRA adapters that can be mixed in one batch. + max_loras_per_batch: int = 8 + # LoRA backend implementation. + lora_backend: Literal["triton", "csgmv", "torch_native"] = "csgmv" + # Enable multimodal processing pipeline. 
+ enable_multimodal: bool = False + # Max concurrent multimodal tool calls. + mm_max_concurrent_calls: int = 32 + # Timeout (seconds) for each multimodal call. + mm_per_request_timeout: float = 10.0 + # Speculative decoding algorithm name (e.g. "eagle", "ngram"). + speculative_algorithm: Optional[str] = None + # Draft model path used in speculative decoding. + speculative_draft_model_path: Optional[Path] = None + # Number of speculative steps per target decode iteration. + speculative_num_steps: Optional[int] = None + # Number of proposed draft tokens per speculation step. + speculative_num_draft_tokens: Optional[int] = None + + # ------------------------------------------------------------------------- + # Internal bookkeeping (not usually set by users directly) + # ------------------------------------------------------------------------- + # Additional arbitrary key-value options for forward compatibility. + extra_options: dict[str, Any] = field(default_factory=dict) + + def __post_init__(self) -> None: + """Normalize defaults and validate constraints after dataclass initialization.""" + if self.tokenizer_path is None: + self.tokenizer_path = self.model_path + if self.served_model_name is None: + self.served_model_name = str(self.model_path) + + self._validate_basic_constraints() + self._validate_parallelism_constraints() + self._validate_scheduler_constraints() + + def _validate_basic_constraints(self) -> None: + """Validate scalar ranges and common invariants.""" + if self.port <= 0 or self.port > 65535: + raise ValueError("`port` must be in range [1, 65535].") + if self.max_prefill_tokens <= 0: + raise ValueError("`max_prefill_tokens` must be greater than 0.") + if self.stream_interval <= 0: + raise ValueError("`stream_interval` must be greater than 0.") + if self.mem_fraction_static is not None and not ( + 0.0 < self.mem_fraction_static < 1.0 + ): + raise ValueError("`mem_fraction_static` must be in range (0.0, 1.0).") + + def 
_validate_parallelism_constraints(self) -> None: + """Validate distributed and parallel topology settings.""" + for key, value in { + "tp_size": self.tp_size, + "dp_size": self.dp_size, + "ep_size": self.ep_size, + "pp_size": self.pp_size, + "nnodes": self.nnodes, + }.items(): + if value <= 0: + raise ValueError(f"`{key}` must be greater than 0.") + + if self.node_rank < 0 or self.node_rank >= self.nnodes: + raise ValueError("`node_rank` must satisfy 0 <= node_rank < nnodes.") + + def _validate_scheduler_constraints(self) -> None: + """Validate scheduler-related soft limits.""" + if self.max_running_requests is not None and self.max_running_requests <= 0: + raise ValueError("`max_running_requests` must be greater than 0 when set.") + if self.max_queued_requests is not None and self.max_queued_requests < 0: + raise ValueError("`max_queued_requests` must be >= 0 when set.") + if self.max_total_tokens is not None and self.max_total_tokens <= 0: + raise ValueError("`max_total_tokens` must be greater than 0 when set.") + if self.chunked_prefill_size is not None and self.chunked_prefill_size <= 0: + raise ValueError("`chunked_prefill_size` must be greater than 0 when set.") + if self.schedule_conservativeness <= 0: + raise ValueError("`schedule_conservativeness` must be greater than 0.") + + def to_dict(self) -> dict[str, Any]: + """ + Serialize config to a plain dictionary. + + Path values are converted to string for easier JSON/YAML serialization. 
+ """ + data = asdict(self) + for key in [ + "model_path", + "tokenizer_path", + "download_dir", + "file_storage_path", + "speculative_draft_model_path", + ]: + if data.get(key) is not None: + data[key] = str(data[key]) + return data diff --git a/pymllm/layers/_layer.py b/pymllm/layers/_layer.py new file mode 100644 index 00000000..e69de29b diff --git a/pymllm/layers/attention/__init__.py b/pymllm/layers/attention/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/pymllm/layers/attention/gdn.py b/pymllm/layers/attention/gdn.py new file mode 100644 index 00000000..e69de29b diff --git a/pymllm/layers/attention/normal.py b/pymllm/layers/attention/normal.py new file mode 100644 index 00000000..e69de29b diff --git a/pymllm/layers/embedding.py b/pymllm/layers/embedding.py new file mode 100644 index 00000000..e69de29b diff --git a/pymllm/layers/mlp.py b/pymllm/layers/mlp.py new file mode 100644 index 00000000..e69de29b diff --git a/pymllm/layers/rms_norm.py b/pymllm/layers/rms_norm.py new file mode 100644 index 00000000..e69de29b diff --git a/pymllm/mem_cache/__init__.py b/pymllm/mem_cache/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/pymllm/models/qwen3_moe.py b/pymllm/models/qwen3_moe.py new file mode 100644 index 00000000..e69de29b diff --git a/pymllm/orchestrator/__init__.py b/pymllm/orchestrator/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/pymllm/server/__init__.py b/pymllm/server/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/pyproject.toml b/pyproject.toml index efe4a14d..89d69947 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,7 +33,7 @@ dependencies=[ ] [project.optional-dependencies] -cuda = ["tilelang"] +cuda = ["tilelang", "flashinfer-python"] [project.scripts] mllm-convertor = "pymllm.mobile.utils.mllm_convertor:main" From ec71258940415868080af0a070e637e4cf0ca2f9 Mon Sep 17 00:00:00 2001 From: chenghuaWang <2923277184@qq.com> Date: Wed, 18 Feb 2026 11:40:56 
+0000 Subject: [PATCH 04/13] feat: add main entry points and configuration for pymllm and mllm-kernel - Added main entry points for `pymllm` and `mllm-kernel` in their respective `pyproject.toml` files. - Implemented a configuration module for `pymllm` to manage global settings, including server, model, runtime, and cache configurations. - Introduced the `VocabParallelEmbedding` layer and utility functions for weight management in the layers module. - Created initial tests for the `VocabParallelEmbedding` layer to validate functionality with tensor parallelism. --- mllm-kernel/mllm_kernel/__main__.py | 2 +- mllm-kernel/pyproject.toml | 3 + pymllm/__main__.py | 39 ++ pymllm/configs/__init__.py | 21 ++ pymllm/configs/global_config.py | 349 ++++++++++++++++++ .../configs/quantization_config.py | 0 pymllm/layers/__init__.py | 11 + pymllm/layers/base.py | 27 ++ pymllm/layers/{_layer.py => custom_event.py} | 0 pymllm/layers/embedding.py | 152 ++++++++ pymllm/layers/utils.py | 45 +++ pymllm/orchestrator/__init__.py | 48 +++ pymllm/orchestrator/group_coordinator.py | 98 +++++ pymllm/orchestrator/parallel_state.py | 207 +++++++++++ pymllm/quantization/__init__.py | 0 pymllm/quantization/methods/__init__.py | 0 pymllm/quantization/methods/awq_w4a16.py | 0 pymllm/quantization/quant_recipe.py | 3 + pymllm/tests/README.md | 0 pymllm/tests/test_vocab_parallel_embedding.py | 310 ++++++++++++++++ pyproject.toml | 1 + 21 files changed, 1315 insertions(+), 1 deletion(-) create mode 100644 pymllm/configs/global_config.py rename mllm-kernel/requirements.txt => pymllm/configs/quantization_config.py (100%) create mode 100644 pymllm/layers/base.py rename pymllm/layers/{_layer.py => custom_event.py} (100%) create mode 100644 pymllm/layers/utils.py create mode 100644 pymllm/orchestrator/group_coordinator.py create mode 100644 pymllm/orchestrator/parallel_state.py create mode 100644 pymllm/quantization/__init__.py create mode 100644 pymllm/quantization/methods/__init__.py create mode 100644 
pymllm/quantization/methods/awq_w4a16.py create mode 100644 pymllm/quantization/quant_recipe.py create mode 100644 pymllm/tests/README.md create mode 100644 pymllm/tests/test_vocab_parallel_embedding.py diff --git a/mllm-kernel/mllm_kernel/__main__.py b/mllm-kernel/mllm_kernel/__main__.py index d4888b86..e5f0779d 100644 --- a/mllm-kernel/mllm_kernel/__main__.py +++ b/mllm-kernel/mllm_kernel/__main__.py @@ -388,7 +388,7 @@ def main() -> None: logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") parser = argparse.ArgumentParser( - prog="python -m mllm_kernel", + prog="mllm_kernel", description="mllm-kernel helper commands.", ) parser.add_argument( diff --git a/mllm-kernel/pyproject.toml b/mllm-kernel/pyproject.toml index 5fe07eea..a8dbd98e 100644 --- a/mllm-kernel/pyproject.toml +++ b/mllm-kernel/pyproject.toml @@ -27,6 +27,9 @@ dev = [ "pytest-html", ] +[project.scripts] +mllm-kernel = "mllm_kernel.__main__:main" + [tool.scikit-build] # Build configuration wheel.py-api = "py3" diff --git a/pymllm/__main__.py b/pymllm/__main__.py index e69de29b..0b427fce 100644 --- a/pymllm/__main__.py +++ b/pymllm/__main__.py @@ -0,0 +1,39 @@ +def show_config() -> None: + from . import is_mobile_available + + mobile_enabled = str(is_mobile_available()).lower() + print(f"mllm mobile: {mobile_enabled}") + + # try import mllm_kernel, if true, print mllm_kernel config + try: + import mllm_kernel + + print(f"mllm_kernel: {mllm_kernel.__version__}") + except ImportError: + print("mllm_kernel: not found") + + +def main() -> None: + import argparse + + parser = argparse.ArgumentParser( + prog="pymllm", + description="pymllm helper commands.", + ) + parser.add_argument( + "command", + nargs="?", + choices=["show-config"], + help="Run helper command. 
Use 'show-config' to print config details.", + ) + args = parser.parse_args() + + if args.command == "show-config": + show_config() + return + + parser.print_help() + + +if __name__ == "__main__": + main() diff --git a/pymllm/configs/__init__.py b/pymllm/configs/__init__.py index e69de29b..86af57be 100644 --- a/pymllm/configs/__init__.py +++ b/pymllm/configs/__init__.py @@ -0,0 +1,21 @@ +"""Configuration module for pymllm.""" + +from pymllm.configs.global_config import ( + CacheConfig, + GlobalConfig, + ModelConfig, + RuntimeConfig, + get_global_config, +) +from pymllm.configs.server_config import ServerConfig + +__all__ = [ + # Main singleton + "GlobalConfig", + "get_global_config", + # Sub configs + "ServerConfig", + "ModelConfig", + "RuntimeConfig", + "CacheConfig", +] diff --git a/pymllm/configs/global_config.py b/pymllm/configs/global_config.py new file mode 100644 index 00000000..43783e94 --- /dev/null +++ b/pymllm/configs/global_config.py @@ -0,0 +1,349 @@ +"""Global configuration singleton with all server, model and runtime configs.""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any, Dict, Literal, Optional, TYPE_CHECKING + +if TYPE_CHECKING: + from transformers import PretrainedConfig + + +@dataclass +class ModelConfig: + """Model-specific configuration parsed from HF config. + + This is a lightweight wrapper around HuggingFace config with + additional derived fields for efficiency. 
+ """ + # Original HF config (populated after loading) + hf_config: Optional[Any] = field(default=None, repr=False) + hf_text_config: Optional[Any] = field(default=None, repr=False) + + # Model architecture + model_type: str = "unknown" + architectures: list[str] = field(default_factory=list) + + # Dimensions + hidden_size: int = 0 + num_hidden_layers: int = 0 + num_attention_heads: int = 0 + num_key_value_heads: Optional[int] = None + intermediate_size: int = 0 + vocab_size: int = 0 + + # Context length + max_position_embeddings: int = 0 + context_length: int = 0 # effective context length + + # Normalization + rms_norm_eps: float = 1e-6 + tie_word_embeddings: bool = False + + # RoPE + rope_theta: float = 10000.0 + rope_scaling: Optional[Dict[str, Any]] = None + + # Quantization + quantization: Optional[str] = None + + def __post_init__(self): + """Set default kv heads if not specified.""" + if self.num_key_value_heads is None: + self.num_key_value_heads = self.num_attention_heads + + +@dataclass +class RuntimeConfig: + """Runtime state that changes during execution.""" + + # Distributed state + tp_rank: int = 0 + tp_size: int = 1 + dp_rank: int = 0 + dp_size: int = 1 + pp_rank: int = 0 + pp_size: int = 1 + world_rank: int = 0 + world_size: int = 1 + local_rank: int = 0 + + # Device + device: str = "cuda" + + # Memory pools + max_num_seqs: int = 0 + max_model_len: int = 0 + + # Scheduler state (mutable during runtime) + num_running_reqs: int = 0 + num_waiting_reqs: int = 0 + num_swapped_reqs: int = 0 + + +@dataclass +class CacheConfig: + """KV cache configuration.""" + + block_size: int = 16 + num_gpu_blocks: int = 0 + num_cpu_blocks: int = 0 + + # Cache dtype + cache_dtype: Literal["auto", "float16", "bfloat16", "fp8_e4m3", "fp8_e5m2"] = "auto" + + # Sliding window + sliding_window: Optional[int] = None + + # Prefix caching + enable_prefix_caching: bool = False + + +@dataclass +class GlobalConfig: + """Global configuration singleton containing all configs. 
+ + This is the single source of truth for all configuration in pymllm. + It aggregates ServerConfig, ModelConfig, RuntimeConfig, and CacheConfig. + + Usage: + >>> from pymllm.configs import get_global_config + >>> config = get_global_config() + >>> + >>> # Access server config + >>> config.server.model_path + >>> config.server.tp_size + >>> + >>> # Access model config + >>> config.model.hidden_size + >>> config.model.vocab_size + >>> + >>> # Access runtime config (mutable) + >>> config.runtime.tp_rank + >>> config.runtime.device + >>> + >>> # Access cache config + >>> config.cache.block_size + >>> + >>> # Update with new server config + >>> config.load_server_config(server_config) + >>> + >>> # Update with HF model config + >>> config.load_hf_config(hf_config) + """ + + # Sub-configs + server: "ServerConfig" = field(default=None, repr=False) + model: ModelConfig = field(default_factory=ModelConfig) + runtime: RuntimeConfig = field(default_factory=RuntimeConfig) + cache: CacheConfig = field(default_factory=CacheConfig) + + # Additional metadata + _initialized: bool = field(default=False, repr=False) + + def __new__(cls): + if not hasattr(cls, '_instance') or cls._instance is None: + cls._instance = super().__new__(cls) + return cls._instance + + def __post_init__(self): + # Lazy import to avoid circular dependency + if self.server is None: + from pymllm.configs.server_config import ServerConfig + self.server = ServerConfig( + model_path=Path("."), # placeholder + ) + + @classmethod + def get_instance(cls) -> "GlobalConfig": + """Get the singleton instance.""" + if not hasattr(cls, '_instance') or cls._instance is None: + cls._instance = cls() + return cls._instance + + def load_server_config(self, server_config: "ServerConfig") -> None: + """Load server configuration and sync related fields.""" + self.server = server_config + + # Sync tp/dp/pp sizes to runtime + self.runtime.tp_size = server_config.tp_size + self.runtime.dp_size = server_config.dp_size + 
self.runtime.pp_size = server_config.pp_size + self.runtime.device = "cuda" if server_config.base_gpu_id >= 0 else "cpu" + + self._initialized = True + + def load_hf_config(self, hf_config: "PretrainedConfig") -> None: + """Load HuggingFace model configuration.""" + from transformers import PretrainedConfig + + # Store original + self.model.hf_config = hf_config + + # Get text config (for multimodal models) + if hasattr(hf_config, "text_config"): + self.model.hf_text_config = hf_config.text_config + text_config = hf_config.text_config + else: + text_config = hf_config + self.model.hf_text_config = hf_config + + # Extract fields + self.model.model_type = getattr(text_config, "model_type", "unknown") + self.model.architectures = getattr(text_config, "architectures", []) + + self.model.hidden_size = getattr(text_config, "hidden_size", 0) + self.model.num_hidden_layers = getattr(text_config, "num_hidden_layers", 0) + self.model.num_attention_heads = getattr(text_config, "num_attention_heads", 0) + self.model.num_key_value_heads = getattr(text_config, "num_key_value_heads", None) + self.model.intermediate_size = getattr(text_config, "intermediate_size", 0) + self.model.vocab_size = getattr(text_config, "vocab_size", 0) + + # Context length + self.model.max_position_embeddings = getattr( + text_config, "max_position_embeddings", 0 + ) + self.model.context_length = self._get_context_length(text_config) + + # Normalization + self.model.rms_norm_eps = getattr(text_config, "rms_norm_eps", 1e-6) + self.model.tie_word_embeddings = getattr( + text_config, "tie_word_embeddings", False + ) + + # RoPE + self.model.rope_theta = getattr(text_config, "rope_theta", 10000.0) + self.model.rope_scaling = getattr(text_config, "rope_scaling", None) + + # Sync to cache config + self.cache.sliding_window = getattr(text_config, "sliding_window", None) + + def _get_context_length(self, config: "PretrainedConfig") -> int: + """Extract effective context length from config.""" + # Try various 
fields + for key in ["max_position_embeddings", "n_positions", "seq_length"]: + if hasattr(config, key): + value = getattr(config, key) + if isinstance(value, int) and value > 0: + return value + return 2048 # default + + def update_runtime(self, **kwargs) -> None: + """Update runtime configuration.""" + for key, value in kwargs.items(): + if hasattr(self.runtime, key): + setattr(self.runtime, key, value) + else: + raise AttributeError(f"RuntimeConfig has no attribute '{key}'") + + def update_cache(self, **kwargs) -> None: + """Update cache configuration.""" + for key, value in kwargs.items(): + if hasattr(self.cache, key): + setattr(self.cache, key, value) + else: + raise AttributeError(f"CacheConfig has no attribute '{key}'") + + def temp(self, **kwargs): + """Context manager for temporary config changes. + + Usage: + # Modify runtime config temporarily + with config.temp(runtime=config.runtime): + config.runtime.tp_size = 2 + # ... do something with tp_size=2 + # runtime restored to original values + """ + return _TempGlobalConfig(self, **kwargs) + + def to_dict(self) -> Dict[str, Any]: + """Serialize all configs to dictionary.""" + return { + "server": self.server.to_dict() if self.server else {}, + "model": self._model_to_dict(), + "runtime": self._runtime_to_dict(), + "cache": self._cache_to_dict(), + } + + def _model_to_dict(self) -> Dict[str, Any]: + """Convert model config to dict.""" + return { + "model_type": self.model.model_type, + "architectures": self.model.architectures, + "hidden_size": self.model.hidden_size, + "num_hidden_layers": self.model.num_hidden_layers, + "num_attention_heads": self.model.num_attention_heads, + "num_key_value_heads": self.model.num_key_value_heads, + "intermediate_size": self.model.intermediate_size, + "vocab_size": self.model.vocab_size, + "context_length": self.model.context_length, + } + + def _runtime_to_dict(self) -> Dict[str, Any]: + """Convert runtime config to dict.""" + return { + "tp_rank": self.runtime.tp_rank, 
+ "tp_size": self.runtime.tp_size, + "world_rank": self.runtime.world_rank, + "world_size": self.runtime.world_size, + "device": self.runtime.device, + } + + def _cache_to_dict(self) -> Dict[str, Any]: + """Convert cache config to dict.""" + return { + "block_size": self.cache.block_size, + "num_gpu_blocks": self.cache.num_gpu_blocks, + "cache_dtype": self.cache.cache_dtype, + } + + +class _TempGlobalConfig: + """Context manager for temporary global config changes. + + Supports nested keys like "runtime.tp_size" to modify sub-configs. + """ + + def __init__(self, config: GlobalConfig, **kwargs): + self.config = config + self.temp_values = kwargs + self.old_values = {} + + def _get_nested_attr(self, key: str): + """Get attribute, supporting dot notation for nested access.""" + if "." in key: + parts = key.split(".") + obj = self.config + for part in parts[:-1]: + obj = getattr(obj, part) + return getattr(obj, parts[-1]) + return getattr(self.config, key) + + def _set_nested_attr(self, key: str, value): + """Set attribute, supporting dot notation for nested access.""" + if "." 
in key: + parts = key.split(".") + obj = self.config + for part in parts[:-1]: + obj = getattr(obj, part) + setattr(obj, parts[-1], value) + else: + setattr(self.config, key, value) + + def __enter__(self): + for key, value in self.temp_values.items(): + self.old_values[key] = self._get_nested_attr(key) + self._set_nested_attr(key, value) + return self.config + + def __exit__(self, exc_type, exc_val, exc_tb): + for key, value in self.old_values.items(): + self._set_nested_attr(key, value) + return False + + +# Convenience function +def get_global_config() -> GlobalConfig: + """Get the global config singleton instance.""" + return GlobalConfig.get_instance() diff --git a/mllm-kernel/requirements.txt b/pymllm/configs/quantization_config.py similarity index 100% rename from mllm-kernel/requirements.txt rename to pymllm/configs/quantization_config.py diff --git a/pymllm/layers/__init__.py b/pymllm/layers/__init__.py index e69de29b..6f70a4d1 100644 --- a/pymllm/layers/__init__.py +++ b/pymllm/layers/__init__.py @@ -0,0 +1,11 @@ +"""Layers module for pymllm.""" + +from pymllm.layers.base import MllmBaseLayer +from pymllm.layers.embedding import VocabParallelEmbedding +from pymllm.layers.utils import set_weight_attrs + +__all__ = [ + "MllmBaseLayer", + "set_weight_attrs", + "VocabParallelEmbedding", +] diff --git a/pymllm/layers/base.py b/pymllm/layers/base.py new file mode 100644 index 00000000..5dc519f4 --- /dev/null +++ b/pymllm/layers/base.py @@ -0,0 +1,27 @@ +import torch +from torch import nn +from torch.nn import Parameter +from pymllm.layers.utils import set_weight_attrs +from pymllm.quantization.quant_recipe import QuantRecipe + + +class MllmBaseLayer(nn.Module): + def __init__(self): + super().__init__() + self.quant_recipe: QuantRecipe = None + + def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): + """Load weights into a parameter. + + This is the default implementation that directly copies the loaded weight + into the parameter. 
Subclasses should override this method to implement + custom loading logic (e.g., tensor parallelism sharding). + + Args: + param: The parameter to load weights into. + loaded_weight: The weight tensor loaded from checkpoint. + """ + param.data.copy_(loaded_weight) + + def forward(self, *args, **kwargs): + raise NotImplementedError("Subclasses must implement forward method") diff --git a/pymllm/layers/_layer.py b/pymllm/layers/custom_event.py similarity index 100% rename from pymllm/layers/_layer.py rename to pymllm/layers/custom_event.py diff --git a/pymllm/layers/embedding.py b/pymllm/layers/embedding.py index e69de29b..0442caa4 100644 --- a/pymllm/layers/embedding.py +++ b/pymllm/layers/embedding.py @@ -0,0 +1,152 @@ +import torch +import torch.nn.functional as F +from torch.nn import Parameter + +from pymllm.layers.base import MllmBaseLayer +from pymllm.layers.utils import set_weight_attrs +from pymllm.orchestrator import ( + divide, + get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, + tensor_model_parallel_all_reduce, +) + + +class VocabParallelEmbedding(MllmBaseLayer): + """Embedding layer with vocabulary parallelism. + + This layer shards the embedding table along the vocabulary dimension + for tensor parallelism. + + Args: + num_embeddings: Size of the vocabulary. + embedding_dim: Size of the embedding vector. + padding_idx: Index for padding token (optional). 
+ """ + + def __init__( + self, + num_embeddings: int, + embedding_dim: int, + padding_idx: int = None, + ): + super().__init__() + + # Get TP info from global state + self.tp_rank = get_tensor_model_parallel_rank() + self.tp_size = get_tensor_model_parallel_world_size() + + self.num_embeddings = num_embeddings + self.embedding_dim = embedding_dim + self.padding_idx = padding_idx + + # Calculate sharded size + if self.num_embeddings % self.tp_size != 0: + raise ValueError( + f"num_embeddings ({num_embeddings}) must be divisible by " + f"tp_size ({self.tp_size})" + ) + + self.num_embeddings_per_partition = divide(num_embeddings, self.tp_size) + + # Create sharded weight + self.weight = Parameter( + torch.empty(self.num_embeddings_per_partition, embedding_dim) + ) + + # Calculate shard range + self.vocab_start_index = self.tp_rank * self.num_embeddings_per_partition + self.vocab_end_index = ( + self.vocab_start_index + self.num_embeddings_per_partition + ) + + # Set weight attributes for loading + set_weight_attrs( + self.weight, + { + "output_dim": 0, # Shard along vocab dimension + "input_dim": 1, # Embedding dimension + "weight_loader": self.weight_loader, + }, + ) + + def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): + """Load sharded weights into the parameter. + + Args: + param: The parameter to load weights into. + loaded_weight: The weight tensor loaded from checkpoint (full size). 
+ """ + output_dim = getattr(param, "output_dim", None) + + if output_dim is None or self.tp_size == 1: + # No sharding, direct copy + assert param.data.shape == loaded_weight.shape, ( + f"Shape mismatch: param {param.data.shape} vs " + f"loaded {loaded_weight.shape}" + ) + param.data.copy_(loaded_weight) + else: + # Sharded loading: slice the loaded weight + assert loaded_weight.shape[output_dim] == self.num_embeddings, ( + f"Loaded weight vocab size {loaded_weight.shape[output_dim]} " + f"does not match expected {self.num_embeddings}" + ) + + # Slice along vocab dimension + if output_dim == 0: + shard_weight = loaded_weight[ + self.vocab_start_index : self.vocab_end_index, : + ] + else: + shard_weight = loaded_weight.narrow( + output_dim, + self.vocab_start_index, + self.num_embeddings_per_partition, + ) + + assert param.data.shape == shard_weight.shape, ( + f"Shard shape mismatch: param {param.data.shape} vs " + f"shard {shard_weight.shape}" + ) + param.data.copy_(shard_weight) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Forward pass of the embedding layer with TP support. + + Args: + x: Input tensor of token ids. + + Returns: + Embedded representation (all-reduced across TP group if needed). 
+ """ + if self.tp_size > 1: + # Create mask for valid vocab range + vocab_mask = (x >= self.vocab_start_index) & (x < self.vocab_end_index) + + # Adjust indices to local vocab space + masked_input = torch.where( + vocab_mask, + x - self.vocab_start_index, + torch.zeros_like(x), # Invalid indices become 0 (will be masked) + ) + else: + masked_input = x + vocab_mask = None + + # Lookup embeddings + output = F.embedding( + masked_input.long(), + self.weight, + padding_idx=self.padding_idx if self.padding_idx is not None else None, + ) + + # Mask invalid positions (for TP) + if vocab_mask is not None: + output.masked_fill_(~vocab_mask.unsqueeze(-1), 0) + + # All-reduce across TP group + if self.tp_size > 1: + output = tensor_model_parallel_all_reduce(output) + + return output diff --git a/pymllm/layers/utils.py b/pymllm/layers/utils.py new file mode 100644 index 00000000..0dcbd1ac --- /dev/null +++ b/pymllm/layers/utils.py @@ -0,0 +1,45 @@ +"""Utility functions for layers.""" + +from typing import Any, Dict + +import torch + + +def set_weight_attrs( + weight: torch.Tensor, + weight_attrs: Dict[str, Any] | None, +) -> None: + """Set attributes on a weight tensor. + + This method is used to set attributes on a weight tensor. This method + will not overwrite existing attributes. + + Args: + weight: The weight tensor or parameter. + weight_attrs: A dictionary of attributes to set on the weight tensor. + Common attributes include: + - output_dim: The dimension along which to shard the weight (typically 0 for output dim) + - input_dim: The input dimension (typically 1 for input dim) + - weight_loader: A callable to load weights into this parameter + - packed_dim: The dimension along which the weight is packed (for quantization) + - packed_factor: The packing factor (for quantization) + + Example: + >>> weight = nn.Parameter(torch.empty(100, 64)) + >>> set_weight_attrs(weight, { + ... "output_dim": 0, + ... "input_dim": 1, + ... "weight_loader": my_loader_func, + ... 
}) + """ + if weight_attrs is None: + return + + for key, value in weight_attrs.items(): + if hasattr(weight, key): + raise AttributeError( + f"Overwriting existing tensor attribute: {key}. " + f"Existing value: {getattr(weight, key)}, " + f"New value: {value}" + ) + setattr(weight, key, value) diff --git a/pymllm/orchestrator/__init__.py b/pymllm/orchestrator/__init__.py index e69de29b..f1716d79 100644 --- a/pymllm/orchestrator/__init__.py +++ b/pymllm/orchestrator/__init__.py @@ -0,0 +1,48 @@ +"""Orchestrator module for distributed computation.""" + +from pymllm.orchestrator.group_coordinator import ( + GroupCoordinator, + divide, + split_tensor_along_dim, +) +from pymllm.orchestrator.parallel_state import ( + data_parallel_all_reduce, + get_data_parallel_rank, + get_data_parallel_world_size, + get_dp_group, + get_pipeline_model_parallel_rank, + get_pipeline_model_parallel_world_size, + get_pp_group, + get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, + get_tp_group, + initialize_model_parallel, + model_parallel_is_initialized, + tensor_model_parallel_all_gather, + tensor_model_parallel_all_reduce, +) + +__all__ = [ + # GroupCoordinator + "GroupCoordinator", + "divide", + "split_tensor_along_dim", + # TP + "get_tp_group", + "get_tensor_model_parallel_rank", + "get_tensor_model_parallel_world_size", + "tensor_model_parallel_all_reduce", + "tensor_model_parallel_all_gather", + # DP + "get_dp_group", + "get_data_parallel_rank", + "get_data_parallel_world_size", + "data_parallel_all_reduce", + # PP + "get_pp_group", + "get_pipeline_model_parallel_rank", + "get_pipeline_model_parallel_world_size", + # State + "initialize_model_parallel", + "model_parallel_is_initialized", +] diff --git a/pymllm/orchestrator/group_coordinator.py b/pymllm/orchestrator/group_coordinator.py new file mode 100644 index 00000000..d0624473 --- /dev/null +++ b/pymllm/orchestrator/group_coordinator.py @@ -0,0 +1,98 @@ +"""GroupCoordinator for distributed communication.""" 

from typing import List, Optional

import torch
import torch.distributed as dist


class GroupCoordinator:
    """Manages a group of processes for distributed communication.

    Lightweight wrapper around torch.distributed.ProcessGroup.  When
    torch.distributed is not initialized (or the group has a single member)
    every collective degenerates to a no-op that returns its input tensor.

    Args:
        ranks: List of global ranks in this group
        local_rank: Local rank for device assignment
        backend: Backend to use (nccl, gloo, etc.)
    """

    def __init__(
        self,
        ranks: List[int],
        local_rank: int,
        backend: str = "nccl",
    ):
        self.ranks = ranks
        self.local_rank = local_rank
        self.backend = backend
        self.world_size = len(ranks)

        # Rank of this process *within* the group (position in `ranks`).
        self.rank_in_group = ranks.index(dist.get_rank()) if dist.is_initialized() else 0

        # Create process group only when there is actually something to talk to.
        if dist.is_initialized() and self.world_size > 1:
            self.device_group = dist.new_group(ranks, backend=backend)
        else:
            self.device_group = None

    def all_reduce(self, tensor: torch.Tensor) -> torch.Tensor:
        """All-reduce (in place) across the group; returns the tensor."""
        if self.device_group is not None:
            dist.all_reduce(tensor, group=self.device_group)
        return tensor

    def all_gather(self, tensor: torch.Tensor, dim: int = 0) -> torch.Tensor:
        """All-gather across the group, concatenating along ``dim``."""
        if self.device_group is None:
            return tensor

        world_size = self.world_size
        if dim == 0:
            # Fast path: gather directly into one pre-sized output buffer.
            # all_gather_into_tensor requires a contiguous input.
            tensor = tensor.contiguous()
            shape = list(tensor.shape)
            shape[0] = shape[0] * world_size
            output = torch.empty(shape, dtype=tensor.dtype, device=tensor.device)
            dist.all_gather_into_tensor(output, tensor, group=self.device_group)
            return output
        else:
            # For non-dim-0 gathers, use tensor list
            tensor_list = [
                torch.empty_like(tensor) for _ in range(world_size)
            ]
            dist.all_gather(tensor_list, tensor, group=self.device_group)
            return torch.cat(tensor_list, dim=dim)

    def broadcast(self, tensor: torch.Tensor, src: int = 0) -> torch.Tensor:
        """Broadcast from source rank to all members of the group.

        Args:
            tensor: Tensor to send (on src) / receive into (elsewhere).
            src: Rank *within this group* (index into ``self.ranks``).
        """
        if self.device_group is not None:
            # BUGFIX: dist.broadcast expects a GLOBAL rank.  Passing the
            # group-relative `src` through unchanged was wrong for any group
            # not containing global rank `src`; translate via self.ranks.
            dist.broadcast(tensor, src=self.ranks[src], group=self.device_group)
        return tensor

+def divide(numerator: int, denominator: int) -> int: + """Divide and ensure divisibility.""" + assert numerator % denominator == 0, ( + f"{numerator} is not divisible by {denominator}" + ) + return numerator // denominator + + +def split_tensor_along_dim( + tensor: torch.Tensor, + dim: int, + world_size: int, + rank: int, +) -> torch.Tensor: + """Split tensor along a dimension for tensor parallelism.""" + dim_size = tensor.size(dim) + assert dim_size % world_size == 0, ( + f"Dimension {dim} ({dim_size}) not divisible by world_size {world_size}" + ) + + chunk_size = dim_size // world_size + start = rank * chunk_size + end = start + chunk_size + + slices = [slice(None)] * tensor.ndim + slices[dim] = slice(start, end) + return tensor[tuple(slices)] diff --git a/pymllm/orchestrator/parallel_state.py b/pymllm/orchestrator/parallel_state.py new file mode 100644 index 00000000..545c74a8 --- /dev/null +++ b/pymllm/orchestrator/parallel_state.py @@ -0,0 +1,207 @@ +"""Parallel state management for tensor and pipeline parallelism.""" + +import logging +import torch +import torch.distributed as dist +from typing import Optional + +from pymllm.configs.global_config import get_global_config +from pymllm.orchestrator.group_coordinator import GroupCoordinator + +logger = logging.getLogger(__name__) + + +# Global groups +_TP_GROUP: Optional[GroupCoordinator] = None +_DP_GROUP: Optional[GroupCoordinator] = None +_PP_GROUP: Optional[GroupCoordinator] = None + + +def initialize_model_parallel( + tensor_model_parallel_size: int = 1, + data_parallel_size: int = 1, + pipeline_model_parallel_size: int = 1, + backend: str = "nccl", +) -> None: + """Initialize model parallel groups. 
+ + Args: + tensor_model_parallel_size: Number of GPUs for tensor parallelism + data_parallel_size: Number of GPUs for data parallelism + pipeline_model_parallel_size: Number of stages for pipeline parallelism + backend: Communication backend (nccl for GPU, gloo for CPU) + """ + global _TP_GROUP, _DP_GROUP, _PP_GROUP + + if not dist.is_initialized(): + return + + world_size = dist.get_world_size() + world_rank = dist.get_rank() + local_rank = int(torch.cuda.current_device()) if torch.cuda.is_available() else 0 + + config = get_global_config() + + # Update runtime config + config.runtime.world_size = world_size + config.runtime.world_rank = world_rank + config.runtime.local_rank = local_rank + config.runtime.tp_size = tensor_model_parallel_size + config.runtime.dp_size = data_parallel_size + config.runtime.pp_size = pipeline_model_parallel_size + + # Logging + logger.info( + "Model parallel runtime config set: world_size=%s, world_rank=%s, " + "local_rank=%s, tp_size=%s, dp_size=%s, pp_size=%s", + config.runtime.world_size, + config.runtime.world_rank, + config.runtime.local_rank, + config.runtime.tp_size, + config.runtime.dp_size, + config.runtime.pp_size, + ) + + # Validate parallelism setup + assert ( + tensor_model_parallel_size * data_parallel_size * pipeline_model_parallel_size + == world_size + ), ( + f"TP({tensor_model_parallel_size}) * DP({data_parallel_size}) * " + f"PP({pipeline_model_parallel_size}) != World({world_size})" + ) + + # Create TP groups (intra-layer sharding) + if tensor_model_parallel_size > 1: + num_tp_groups = world_size // tensor_model_parallel_size + for i in range(num_tp_groups): + ranks = list( + range( + i * tensor_model_parallel_size, (i + 1) * tensor_model_parallel_size + ) + ) + if world_rank in ranks: + _TP_GROUP = GroupCoordinator( + ranks=ranks, + local_rank=local_rank, + backend=backend, + ) + config.runtime.tp_rank = _TP_GROUP.rank_in_group + break + else: + _TP_GROUP = None + config.runtime.tp_rank = 0 + + # Create DP groups 
(data replication) + if data_parallel_size > 1: + num_dp_groups = world_size // data_parallel_size + for i in range(num_dp_groups): + ranks = list(range(i, world_size, num_dp_groups)) + if world_rank in ranks: + _DP_GROUP = GroupCoordinator( + ranks=ranks, + local_rank=local_rank, + backend=backend, + ) + config.runtime.dp_rank = _DP_GROUP.rank_in_group + break + else: + _DP_GROUP = None + config.runtime.dp_rank = 0 + + # Create PP groups (inter-layer partitioning) + if pipeline_model_parallel_size > 1: + num_pp_groups = world_size // pipeline_model_parallel_size + for i in range(num_pp_groups): + start = i * pipeline_model_parallel_size + ranks = list(range(start, start + pipeline_model_parallel_size)) + if world_rank in ranks: + _PP_GROUP = GroupCoordinator( + ranks=ranks, + local_rank=local_rank, + backend=backend, + ) + config.runtime.pp_rank = _PP_GROUP.rank_in_group + break + else: + _PP_GROUP = None + config.runtime.pp_rank = 0 + + +def get_tp_group() -> Optional[GroupCoordinator]: + """Get the tensor model parallel group.""" + return _TP_GROUP + + +def get_dp_group() -> Optional[GroupCoordinator]: + """Get the data parallel group.""" + return _DP_GROUP + + +def get_pp_group() -> Optional[GroupCoordinator]: + """Get the pipeline parallel group.""" + return _PP_GROUP + + +# Convenience functions for tensor parallelism +def get_tensor_model_parallel_rank() -> int: + """Get current tensor model parallel rank.""" + return get_global_config().runtime.tp_rank + + +def get_tensor_model_parallel_world_size() -> int: + """Get tensor model parallel world size.""" + return get_global_config().runtime.tp_size + + +def get_data_parallel_rank() -> int: + """Get current data parallel rank.""" + return get_global_config().runtime.dp_rank + + +def get_data_parallel_world_size() -> int: + """Get data parallel world size.""" + return get_global_config().runtime.dp_size + + +def get_pipeline_model_parallel_rank() -> int: + """Get current pipeline parallel rank.""" + return 
get_global_config().runtime.pp_rank + + +def get_pipeline_model_parallel_world_size() -> int: + """Get pipeline parallel world size.""" + return get_global_config().runtime.pp_size + + +def model_parallel_is_initialized() -> bool: + """Check if model parallel is initialized.""" + return _TP_GROUP is not None or _DP_GROUP is not None or _PP_GROUP is not None + + +# Communication helpers +def tensor_model_parallel_all_reduce(tensor: torch.Tensor) -> torch.Tensor: + """All-reduce across TP group.""" + group = get_tp_group() + if group is None: + return tensor + return group.all_reduce(tensor) + + +def tensor_model_parallel_all_gather( + tensor: torch.Tensor, + dim: int = 0, +) -> torch.Tensor: + """All-gather across TP group.""" + group = get_tp_group() + if group is None: + return tensor + return group.all_gather(tensor, dim=dim) + + +def data_parallel_all_reduce(tensor: torch.Tensor) -> torch.Tensor: + """All-reduce across DP group.""" + group = get_dp_group() + if group is None: + return tensor + return group.all_reduce(tensor) diff --git a/pymllm/quantization/__init__.py b/pymllm/quantization/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/pymllm/quantization/methods/__init__.py b/pymllm/quantization/methods/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/pymllm/quantization/methods/awq_w4a16.py b/pymllm/quantization/methods/awq_w4a16.py new file mode 100644 index 00000000..e69de29b diff --git a/pymllm/quantization/quant_recipe.py b/pymllm/quantization/quant_recipe.py new file mode 100644 index 00000000..a5b493be --- /dev/null +++ b/pymllm/quantization/quant_recipe.py @@ -0,0 +1,3 @@ +class QuantRecipe: + def __init__(self): + pass diff --git a/pymllm/tests/README.md b/pymllm/tests/README.md new file mode 100644 index 00000000..e69de29b diff --git a/pymllm/tests/test_vocab_parallel_embedding.py b/pymllm/tests/test_vocab_parallel_embedding.py new file mode 100644 index 00000000..e22b52a5 --- /dev/null +++ 
b/pymllm/tests/test_vocab_parallel_embedding.py @@ -0,0 +1,310 @@ +"""Tests for VocabParallelEmbedding layer. + +This module tests the VocabParallelEmbedding layer with and without +tensor parallelism. +""" + +import os +import logging +import pytest +import torch +import torch.nn as nn +import torch.multiprocessing as mp +from typing import Callable + +from pymllm.configs import get_global_config +from pymllm.layers import VocabParallelEmbedding +from pymllm.orchestrator import ( + initialize_model_parallel, +) + +# Show runtime init logs during test execution. +logging.basicConfig(level=logging.INFO, force=True) +logging.getLogger().setLevel(logging.INFO) + + +# ============================================================================= +# Helper: weight loading +# ============================================================================= +def load_weight(param: nn.Parameter, loaded_weight: torch.Tensor) -> None: + """Load weight using the weight_loader attached to param attribute.""" + weight_loader = getattr(param, "weight_loader", None) + if weight_loader is None: + # Fallback: direct copy + param.data.copy_(loaded_weight) + else: + # Call the loader attached to param + weight_loader(param, loaded_weight) + + +# ============================================================================= +# Real distributed tests with world_size=8 on CUDA +# ============================================================================= +def run_worker_tp8_cuda( + rank: int, + local_rank: int, + world_size: int, + local_world_size: int, + test_func: Callable, + return_dict: dict, +): + """Worker function for multi-process testing with TP=8 on CUDA. 
+ + Args: + rank: Global rank across all nodes + local_rank: Local rank within this node (used for GPU binding) + world_size: Total number of processes across all nodes + local_world_size: Number of processes on this node + test_func: Test function to run + return_dict: Shared dict for returning results + """ + os.environ["MASTER_ADDR"] = "localhost" + os.environ["MASTER_PORT"] = "29500" + + # Set device using local_rank (binds to GPU 0,1,2,3 on this node) + torch.cuda.set_device(local_rank) + + torch.distributed.init_process_group( + backend="nccl", + rank=rank, + world_size=world_size, + ) + + initialize_model_parallel(tensor_model_parallel_size=8) + + try: + result = test_func(rank, local_rank, world_size) + return_dict[rank] = result + except Exception as e: + import traceback + + return_dict[rank] = f"ERROR: {e}\n{traceback.format_exc()}" + finally: + torch.distributed.destroy_process_group() + + +def embedding_forward_tp8_worker_cuda(rank: int, local_rank: int, world_size: int): + """Test forward pass with real TP=8 on CUDA. 
+ + Args: + rank: Global rank + local_rank: Local rank within this node (for logging/debugging) + world_size: Total world size + """ + config = get_global_config() + + assert config.runtime.tp_size == 8, f"Rank {rank}: tp_size should be 8" + assert config.runtime.tp_rank == rank, f"Rank {rank}: tp_rank mismatch" + + vocab_size = 1024 + embed_dim = 64 + # .cuda() uses the device set by torch.cuda.set_device(local_rank) + layer = VocabParallelEmbedding(vocab_size, embed_dim).cuda() + + # Verify the layer is on the correct GPU + assert layer.weight.device.index == local_rank, ( + f"Rank {rank}: weight should be on GPU {local_rank}, got {layer.weight.device}" + ) + + expected_shard_size = vocab_size // 8 + assert layer.num_embeddings_per_partition == expected_shard_size + assert layer.weight.shape == (expected_shard_size, embed_dim) + + # Each rank initializes its own shard with known pattern + with torch.no_grad(): + layer.weight.fill_(float(rank + 1)) # Rank 0: 1.0, Rank 1: 2.0, ... + + # Create input on the correct GPU + input_ids = torch.tensor([[0, 128, 256, 384], [512, 640, 768, 896]], device="cuda") + + output = layer(input_ids) + assert output.shape == (2, 4, embed_dim) + + # Verify output is on correct GPU + assert output.device.index == local_rank, ( + f"Rank {rank}: output should be on GPU {local_rank}, got {output.device}" + ) + + if rank == 0: + # Each token is owned by exactly one TP rank. Since each rank fills its + # local shard with (rank + 1), post-all-reduce output must match below. 
+ expected_token_values = torch.tensor( + [[1, 2, 3, 4], [5, 6, 7, 8]], + device=output.device, + dtype=output.dtype, + ) + expected_output = expected_token_values.unsqueeze(-1).expand(-1, -1, embed_dim) + + if torch.equal(output, expected_output): + return "PASSED" + return "FAILED: embedding output does not match expected TP aggregation" + + return "OK" + + +def weight_loading_tp8_worker_cuda(rank: int, local_rank: int, world_size: int): + """Test weight loading with real TP=8 on CUDA. + + Args: + rank: Global rank + local_rank: Local rank within this node (for GPU binding verification) + world_size: Total world size + """ + vocab_size = 1024 + embed_dim = 64 + layer = VocabParallelEmbedding(vocab_size, embed_dim).cuda() + + # Verify the layer is on the correct GPU + assert layer.weight.device.index == local_rank, ( + f"Rank {rank}: weight should be on GPU {local_rank}, got {layer.weight.device}" + ) + + full_weight = torch.randn(vocab_size, embed_dim) + load_weight(layer.weight, full_weight.cuda()) + + shard_size = vocab_size // 8 + start_idx = rank * shard_size + end_idx = start_idx + shard_size + expected_shard = full_weight[start_idx:end_idx] + + if not torch.allclose(layer.weight.cpu(), expected_shard): + return f"FAILED: shard mismatch at rank {rank}" + + if rank == 0: + gathered_shards = [layer.weight.cpu().clone()] + for other_rank in range(1, 8): + other_shard = full_weight[ + other_rank * shard_size : (other_rank + 1) * shard_size + ] + gathered_shards.append(other_shard) + + reconstructed = torch.cat(gathered_shards, dim=0) + if torch.allclose(reconstructed, full_weight): + return "PASSED" + else: + return "FAILED: reconstruction mismatch" + + return "OK" + + +@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") +@pytest.mark.skipif(torch.cuda.device_count() < 8, reason="Requires at least 8 GPUs") +class TestVocabParallelEmbeddingRealTP8: + """Real distributed tests with world_size=8 and TP=8 on CUDA.""" + + def 
test_forward_pass_tp8_real(self): + """Test forward pass with real TP=8 using 8 processes on CUDA.""" + world_size = 8 + local_world_size = 8 # Single node with 8 GPUs + + mp.set_start_method("spawn", force=True) + + manager = mp.Manager() + return_dict = manager.dict() + + processes = [] + for rank in range(world_size): + # In single-node setup, local_rank == rank + local_rank = rank + p = mp.Process( + target=run_worker_tp8_cuda, + args=( + rank, + local_rank, + world_size, + local_world_size, + embedding_forward_tp8_worker_cuda, + return_dict, + ), + ) + p.start() + processes.append(p) + + for p in processes: + p.join(timeout=120) + if p.is_alive(): + p.terminate() + p.join() + + for rank in range(world_size): + result = return_dict.get(rank, "TIMEOUT") + if rank == 0: + assert result == "PASSED", f"Rank {rank} failed: {result}" + else: + assert "ERROR" not in str(result), f"Rank {rank} error: {result}" + + def test_weight_loading_tp8_real(self): + """Test weight loading with real TP=8 using 8 processes on CUDA.""" + world_size = 8 + local_world_size = 8 # Single node with 8 GPUs + + mp.set_start_method("spawn", force=True) + + manager = mp.Manager() + return_dict = manager.dict() + + processes = [] + for rank in range(world_size): + # In single-node setup, local_rank == rank + local_rank = rank + p = mp.Process( + target=run_worker_tp8_cuda, + args=( + rank, + local_rank, + world_size, + local_world_size, + weight_loading_tp8_worker_cuda, + return_dict, + ), + ) + p.start() + processes.append(p) + + for p in processes: + p.join(timeout=120) + if p.is_alive(): + p.terminate() + p.join() + + for rank in range(world_size): + result = return_dict.get(rank, "TIMEOUT") + if rank == 0: + assert result == "PASSED", f"Rank {rank} failed: {result}" + else: + assert "ERROR" not in str(result), f"Rank {rank} error: {result}" + + +@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") +class TestVocabParallelEmbeddingCUDA: + """Tests for 
non-parallel TP=1 mode on CUDA.""" + + @pytest.fixture(autouse=True) + def setup_config(self): + config = get_global_config() + config.runtime.tp_size = 1 + config.runtime.tp_rank = 0 + yield + config.runtime.tp_size = 1 + config.runtime.tp_rank = 0 + + def test_cuda_forward(self): + layer = VocabParallelEmbedding(1000, 512).cuda() + input_ids = torch.randint(0, 1000, (4, 32), device="cuda") + + output = layer(input_ids) + + assert output.device.type == "cuda" + assert output.shape == (4, 32, 512) + + def test_cuda_weight_loader(self): + layer = VocabParallelEmbedding(100, 64).cuda() + + cpu_weight = torch.randn(100, 64) + load_weight(layer.weight, cpu_weight.cuda()) + + assert torch.allclose(layer.weight.cpu(), cpu_weight) + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/pyproject.toml b/pyproject.toml index 89d69947..160341ba 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,6 +36,7 @@ dependencies=[ cuda = ["tilelang", "flashinfer-python"] [project.scripts] +pymllm = "pymllm.__main__:main" mllm-convertor = "pymllm.mobile.utils.mllm_convertor:main" mllm-service = "pymllm.mobile.service.tools:cli_app" From 731ea71892fd6c9be64a0449d7d11f84948f03f1 Mon Sep 17 00:00:00 2001 From: chenghuaWang <2923277184@qq.com> Date: Thu, 19 Feb 2026 08:53:02 +0000 Subject: [PATCH 05/13] feat: enhance layer implementations and add new components - Updated `.codespellrc` to include 'flashinfer' in the ignore words list. - Introduced new files for `launch_server`, `prepare`, and various layer implementations including `LayerNorm`, `RMSNorm`, and `MLP`. - Added `ColumnParallelLinear` and `RowParallelLinear` classes for efficient linear operations in tensor parallelism. - Implemented rotary embedding functions in `rope.py` for enhanced model performance. - Created caching mechanisms in `param_disk_cache.py` and `radix_cache.py` for improved memory management. - Refactored `GroupCoordinator` to enhance broadcasting functionality in distributed settings. 
--- .codespellrc | 2 +- pymllm/executor/__init__.py | 0 pymllm/executor/cuda_graph_runner.py | 0 pymllm/launch_server.py | 0 pymllm/layers/__init__.py | 24 ++ pymllm/layers/base.py | 3 +- pymllm/layers/embedding.py | 10 +- pymllm/layers/layer_norm.py | 43 ++++ pymllm/layers/linear.py | 263 +++++++++++++++++++++ pymllm/layers/mlp.py | 199 ++++++++++++++++ pymllm/layers/rms_norm.py | 64 ++++++ pymllm/layers/rope.py | 276 +++++++++++++++++++++++ pymllm/mem_cache/param_disk_cache.py | 0 pymllm/mem_cache/radix_cache.py | 0 pymllm/orchestrator/group_coordinator.py | 12 +- pymllm/prepare.py | 0 16 files changed, 890 insertions(+), 6 deletions(-) create mode 100644 pymllm/executor/__init__.py create mode 100644 pymllm/executor/cuda_graph_runner.py create mode 100644 pymllm/launch_server.py create mode 100644 pymllm/layers/layer_norm.py create mode 100644 pymllm/layers/linear.py create mode 100644 pymllm/layers/rope.py create mode 100644 pymllm/mem_cache/param_disk_cache.py create mode 100644 pymllm/mem_cache/radix_cache.py create mode 100644 pymllm/prepare.py diff --git a/.codespellrc b/.codespellrc index 9ddb9d85..bbf02bd1 100644 --- a/.codespellrc +++ b/.codespellrc @@ -1,3 +1,3 @@ [codespell] -ignore-words-list = ans, als, hel, boostrap, childs, te, vas, hsa, ment, cann, thi, makro, wil, rouge, PRIS, bfloat, constexpr, cuda, dlpack, expt, forceinline, ifndef, linalg, LPBQ, mllm, pymllm, Quantizaton, Qwen, ROCM, silu, torchao +ignore-words-list = ans, als, hel, boostrap, childs, te, vas, hsa, ment, cann, thi, makro, wil, rouge, PRIS, bfloat, constexpr, cuda, dlpack, expt, forceinline, ifndef, linalg, LPBQ, mllm, pymllm, Quantizaton, Qwen, ROCM, silu, torchao, flashinfer skip = *.json,*.jsonl,*.patch,*.txt diff --git a/pymllm/executor/__init__.py b/pymllm/executor/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/pymllm/executor/cuda_graph_runner.py b/pymllm/executor/cuda_graph_runner.py new file mode 100644 index 00000000..e69de29b diff --git 
a/pymllm/launch_server.py b/pymllm/launch_server.py new file mode 100644 index 00000000..e69de29b diff --git a/pymllm/layers/__init__.py b/pymllm/layers/__init__.py index 6f70a4d1..fd9a070e 100644 --- a/pymllm/layers/__init__.py +++ b/pymllm/layers/__init__.py @@ -2,10 +2,34 @@ from pymllm.layers.base import MllmBaseLayer from pymllm.layers.embedding import VocabParallelEmbedding +from pymllm.layers.layer_norm import LayerNorm +from pymllm.layers.linear import ColumnParallelLinear, Linear, RowParallelLinear +from pymllm.layers.mlp import MLP, ParallelMLP +from pymllm.layers.rms_norm import GemmaRMSNorm, RMSNorm +from pymllm.layers.rope import ( + apply_llama31_rope, + apply_llama31_rope_pos_ids, + apply_rope, + apply_rope_pos_ids, + apply_rope_with_cos_sin_cache, +) from pymllm.layers.utils import set_weight_attrs __all__ = [ "MllmBaseLayer", "set_weight_attrs", "VocabParallelEmbedding", + "ColumnParallelLinear", + "Linear", + "RowParallelLinear", + "MLP", + "ParallelMLP", + "LayerNorm", + "RMSNorm", + "GemmaRMSNorm", + "apply_rope", + "apply_llama31_rope", + "apply_rope_pos_ids", + "apply_llama31_rope_pos_ids", + "apply_rope_with_cos_sin_cache", ] diff --git a/pymllm/layers/base.py b/pymllm/layers/base.py index 5dc519f4..3044e206 100644 --- a/pymllm/layers/base.py +++ b/pymllm/layers/base.py @@ -3,12 +3,13 @@ from torch.nn import Parameter from pymllm.layers.utils import set_weight_attrs from pymllm.quantization.quant_recipe import QuantRecipe +from typing import Optional class MllmBaseLayer(nn.Module): def __init__(self): super().__init__() - self.quant_recipe: QuantRecipe = None + self.quant_recipe: Optional[QuantRecipe] = None def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): """Load weights into a parameter. 
diff --git a/pymllm/layers/embedding.py b/pymllm/layers/embedding.py index 0442caa4..ec99c5b2 100644 --- a/pymllm/layers/embedding.py +++ b/pymllm/layers/embedding.py @@ -120,6 +120,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: Returns: Embedded representation (all-reduced across TP group if needed). """ + local_padding_idx = self.padding_idx if self.tp_size > 1: # Create mask for valid vocab range vocab_mask = (x >= self.vocab_start_index) & (x < self.vocab_end_index) @@ -130,6 +131,13 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: x - self.vocab_start_index, torch.zeros_like(x), # Invalid indices become 0 (will be masked) ) + # F.embedding expects indices in local weight-table space. + # Only pass padding_idx on the owning rank, remapped to local offset. + if self.padding_idx is not None: + if self.vocab_start_index <= self.padding_idx < self.vocab_end_index: + local_padding_idx = self.padding_idx - self.vocab_start_index + else: + local_padding_idx = None else: masked_input = x vocab_mask = None @@ -138,7 +146,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: output = F.embedding( masked_input.long(), self.weight, - padding_idx=self.padding_idx if self.padding_idx is not None else None, + padding_idx=local_padding_idx, ) # Mask invalid positions (for TP) diff --git a/pymllm/layers/layer_norm.py b/pymllm/layers/layer_norm.py new file mode 100644 index 00000000..54d94c19 --- /dev/null +++ b/pymllm/layers/layer_norm.py @@ -0,0 +1,43 @@ +from __future__ import annotations + +import torch +import flashinfer +from torch.nn import Parameter + +from pymllm.layers.base import MllmBaseLayer +from pymllm.layers.utils import set_weight_attrs + + +class LayerNorm(MllmBaseLayer): + """LayerNorm layer implemented with FlashInfer kernel.""" + + def __init__(self, hidden_size: int, eps: float = 1e-6): + super().__init__() + self.hidden_size = hidden_size + self.eps = eps + + # flashinfer.norm.layernorm expects gamma/beta in fp32. 
+ self.weight = Parameter(torch.ones(hidden_size, dtype=torch.float32)) + self.bias = Parameter(torch.zeros(hidden_size, dtype=torch.float32)) + set_weight_attrs(self.weight, {"weight_loader": self.weight_loader}) + set_weight_attrs(self.bias, {"weight_loader": self.weight_loader}) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if x.shape[-1] != self.hidden_size: + raise ValueError( + f"Expected last dim == hidden_size ({self.hidden_size}), " + f"but got input shape {tuple(x.shape)}" + ) + if x.dtype != torch.bfloat16: + raise TypeError( + "flashinfer.norm.layernorm requires bfloat16 input, " + f"but got {x.dtype}" + ) + + if x.dim() == 2: + return flashinfer.norm.layernorm(x, self.weight, self.bias, self.eps) + + original_shape = x.shape + x_2d = x.reshape(-1, self.hidden_size) + out = flashinfer.norm.layernorm(x_2d, self.weight, self.bias, self.eps) + return out.reshape(original_shape) diff --git a/pymllm/layers/linear.py b/pymllm/layers/linear.py new file mode 100644 index 00000000..dc583e93 --- /dev/null +++ b/pymllm/layers/linear.py @@ -0,0 +1,263 @@ +from __future__ import annotations + +import torch +import torch.nn.functional as F +from torch.nn import Parameter + +from pymllm.layers.base import MllmBaseLayer +from pymllm.layers.utils import set_weight_attrs +from pymllm.orchestrator import ( + divide, + get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, + tensor_model_parallel_all_gather, + tensor_model_parallel_all_reduce, +) + + +class ColumnParallelLinear(MllmBaseLayer): + """Linear layer with column parallelism (output-dimension sharding). + + The weight matrix is split along the output dimension across TP ranks. + Each rank holds ``out_features / tp_size`` rows of the weight. + + Args: + in_features: Size of each input sample. + out_features: Size of each output sample (before sharding). + bias: If ``True``, adds a learnable bias. 
+ gather_output: If ``True``, all-gather the output across TP ranks + so every rank gets the full ``out_features``. Set to ``False`` + when the next layer is a :class:`RowParallelLinear` that expects + a split input. + """ + + def __init__( + self, + in_features: int, + out_features: int, + bias: bool = True, + gather_output: bool = True, + ): + super().__init__() + + self.tp_rank = get_tensor_model_parallel_rank() + self.tp_size = get_tensor_model_parallel_world_size() + + self.in_features = in_features + self.out_features = out_features + self.gather_output = gather_output + + if out_features % self.tp_size != 0: + raise ValueError( + f"out_features ({out_features}) must be divisible by " + f"tp_size ({self.tp_size})" + ) + self.out_features_per_partition = divide(out_features, self.tp_size) + + self.output_start_index = self.tp_rank * self.out_features_per_partition + self.output_end_index = self.output_start_index + self.out_features_per_partition + + self.weight = Parameter( + torch.empty(self.out_features_per_partition, in_features) + ) + set_weight_attrs( + self.weight, + { + "output_dim": 0, + "input_dim": 1, + "weight_loader": self.weight_loader, + }, + ) + + if bias: + self.bias_flag = True + self.bias = Parameter(torch.empty(self.out_features_per_partition)) + set_weight_attrs( + self.bias, + { + "output_dim": 0, + "weight_loader": self.weight_loader, + }, + ) + else: + self.bias_flag = False + self.register_parameter("bias", None) + + def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): + """Load sharded weights into the parameter. + + Args: + param: The parameter to load weights into. + loaded_weight: The weight tensor loaded from checkpoint (full size). 
+ """ + output_dim = getattr(param, "output_dim", None) + + if output_dim is None or self.tp_size == 1: + assert param.data.shape == loaded_weight.shape, ( + f"Shape mismatch: param {param.data.shape} vs " + f"loaded {loaded_weight.shape}" + ) + param.data.copy_(loaded_weight) + else: + shard_weight = loaded_weight.narrow( + output_dim, + self.output_start_index, + self.out_features_per_partition, + ) + assert param.data.shape == shard_weight.shape, ( + f"Shard shape mismatch: param {param.data.shape} vs " + f"shard {shard_weight.shape}" + ) + param.data.copy_(shard_weight) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + output = F.linear(x, self.weight, self.bias) + + if self.gather_output and self.tp_size > 1: + output = tensor_model_parallel_all_gather(output, dim=-1) + + return output + + +class RowParallelLinear(MllmBaseLayer): + """Linear layer with row parallelism (input-dimension sharding). + + The weight matrix is split along the input dimension across TP ranks. + Each rank holds all ``out_features`` rows but only + ``in_features / tp_size`` columns. + + Typically placed after a :class:`ColumnParallelLinear` whose + ``gather_output=False``, so the input is already split. + + Args: + in_features: Size of each input sample (before sharding). + out_features: Size of each output sample. + bias: If ``True``, adds a learnable bias (applied after all-reduce). + reduce_output: If ``True``, all-reduce the output across TP ranks. 
+ """ + + def __init__( + self, + in_features: int, + out_features: int, + bias: bool = True, + reduce_output: bool = True, + ): + super().__init__() + + self.tp_rank = get_tensor_model_parallel_rank() + self.tp_size = get_tensor_model_parallel_world_size() + + self.in_features = in_features + self.out_features = out_features + self.reduce_output = reduce_output + + if in_features % self.tp_size != 0: + raise ValueError( + f"in_features ({in_features}) must be divisible by " + f"tp_size ({self.tp_size})" + ) + self.in_features_per_partition = divide(in_features, self.tp_size) + + self.input_start_index = self.tp_rank * self.in_features_per_partition + self.input_end_index = self.input_start_index + self.in_features_per_partition + + self.weight = Parameter( + torch.empty(out_features, self.in_features_per_partition) + ) + set_weight_attrs( + self.weight, + { + "output_dim": 0, + "input_dim": 1, + "weight_loader": self.weight_loader, + }, + ) + + if bias: + self.bias_flag = True + self.bias = Parameter(torch.empty(out_features)) + set_weight_attrs(self.bias, {"weight_loader": self.weight_loader}) + else: + self.bias_flag = False + self.register_parameter("bias", None) + + def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): + """Load sharded weights into the parameter. + + Args: + param: The parameter to load weights into. + loaded_weight: The weight tensor loaded from checkpoint (full size). 
+ """ + input_dim = getattr(param, "input_dim", None) + + if input_dim is None or self.tp_size == 1: + assert param.data.shape == loaded_weight.shape, ( + f"Shape mismatch: param {param.data.shape} vs " + f"loaded {loaded_weight.shape}" + ) + param.data.copy_(loaded_weight) + else: + shard_weight = loaded_weight.narrow( + input_dim, + self.input_start_index, + self.in_features_per_partition, + ) + assert param.data.shape == shard_weight.shape, ( + f"Shard shape mismatch: param {param.data.shape} vs " + f"shard {shard_weight.shape}" + ) + param.data.copy_(shard_weight) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + output = F.linear(x, self.weight) + + if self.reduce_output and self.tp_size > 1: + output = tensor_model_parallel_all_reduce(output) + + if self.bias is not None: + output = output + self.bias + + return output + + +class Linear(MllmBaseLayer): + """Linear layer with simple quant dispatch.""" + + def __init__( + self, + in_features: int, + out_features: int, + bias: bool = True, + ): + super().__init__() + self.in_features = in_features + self.out_features = out_features + + self.weight = Parameter(torch.empty(out_features, in_features)) + set_weight_attrs( + self.weight, + { + "output_dim": 0, + "input_dim": 1, + "weight_loader": self.weight_loader, + }, + ) + + if bias: + self.bias = Parameter(torch.empty(out_features)) + set_weight_attrs(self.bias, {"weight_loader": self.weight_loader}) + else: + self.register_parameter("bias", None) + + def _forward_torch_linear(self, x: torch.Tensor) -> torch.Tensor: + return F.linear(x, self.weight, self.bias) + + def _forward_quant_linear(self, x: torch.Tensor) -> torch.Tensor: + # TODO(wch): Implement quantized linear path. 
+ raise NotImplementedError("quant_linear is not implemented yet.") + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if self.quant_recipe is None: + return self._forward_torch_linear(x) + return self._forward_quant_linear(x) diff --git a/pymllm/layers/mlp.py b/pymllm/layers/mlp.py index e69de29b..1a40db92 100644 --- a/pymllm/layers/mlp.py +++ b/pymllm/layers/mlp.py @@ -0,0 +1,199 @@ +from __future__ import annotations + +import logging +from typing import Callable, Literal, Optional + +import flashinfer +import torch + +from pymllm.layers.base import MllmBaseLayer +from pymllm.layers.linear import ColumnParallelLinear, Linear, RowParallelLinear + +logger = logging.getLogger(__name__) + +MLPActivation = Literal["silu", "gelu", "gelu_tanh"] + +_ACTIVATION_MAP: dict[MLPActivation, Callable[..., torch.Tensor]] = { + "silu": flashinfer.activation.silu_and_mul, + "gelu": flashinfer.activation.gelu_and_mul, + "gelu_tanh": flashinfer.activation.gelu_tanh_and_mul, +} + + +def _validate_mlp_args( + hidden_size: int, intermediate_size: int, activation: str +) -> None: + if hidden_size <= 0: + raise ValueError(f"hidden_size must be > 0, but got {hidden_size}") + if intermediate_size <= 0: + raise ValueError( + f"intermediate_size must be > 0, but got {intermediate_size}" + ) + if activation not in _ACTIVATION_MAP: + raise ValueError( + f"Unsupported activation '{activation}'. " + f"Expected one of: {list(_ACTIVATION_MAP)}" + ) + + +def _run_gated_activation( + gate_up: torch.Tensor, + intermediate_size: int, + activation: MLPActivation, + enable_pdl: Optional[bool], +) -> torch.Tensor: + if gate_up.shape[-1] != 2 * intermediate_size: + raise ValueError( + "Expected last dim of gate_up tensor to be " + f"{2 * intermediate_size}, but got {gate_up.shape[-1]}" + ) + return _ACTIVATION_MAP[activation](gate_up, enable_pdl=enable_pdl) + + +class MLP(MllmBaseLayer): + """Feed-forward MLP block with FlashInfer fused gated activations. + + Non-parallel version (TP=1). 
Uses :class:`Linear` for all projections. + + Supported activations: ``silu``, ``gelu``, ``gelu_tanh``. + """ + + def __init__( + self, + hidden_size: int, + intermediate_size: int, + activation: MLPActivation = "silu", + use_fused_gate_up_proj: bool = True, + use_bias_gate_up: bool = False, + use_bias_down: bool = False, + enable_pdl: Optional[bool] = None, + ): + super().__init__() + _validate_mlp_args(hidden_size, intermediate_size, activation) + + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.activation = activation + self.use_fused_gate_up_proj = use_fused_gate_up_proj + self.enable_pdl = enable_pdl + + if not use_fused_gate_up_proj: + logger.warning( + "MLP with use_fused_gate_up_proj=False uses a lower-efficiency path. " + "Use use_fused_gate_up_proj=True for better performance.", + ) + + if use_fused_gate_up_proj: + self.gate_up_proj = Linear( + hidden_size, 2 * intermediate_size, bias=use_bias_gate_up, + ) + self.gate_proj = None + self.up_proj = None + else: + self.gate_up_proj = None + self.gate_proj = Linear( + hidden_size, intermediate_size, bias=use_bias_gate_up, + ) + self.up_proj = Linear( + hidden_size, intermediate_size, bias=use_bias_gate_up, + ) + + self.down_proj = Linear( + intermediate_size, hidden_size, bias=use_bias_down, + ) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if x.shape[-1] != self.hidden_size: + raise ValueError( + f"Expected last dim == hidden_size ({self.hidden_size}), " + f"but got input shape {tuple(x.shape)}" + ) + + if self.use_fused_gate_up_proj: + assert self.gate_up_proj is not None + gate_up = self.gate_up_proj(x) + else: + assert self.gate_proj is not None and self.up_proj is not None + gate_up = torch.cat([self.gate_proj(x), self.up_proj(x)], dim=-1) + + hidden = _run_gated_activation( + gate_up, self.intermediate_size, self.activation, self.enable_pdl, + ) + return self.down_proj(hidden) + + +class ParallelMLP(MllmBaseLayer): + """Tensor-parallel MLP with 
column-sharded intermediate dimension. + + Projection layout (Megatron-style): + + - ``gate_proj``: :class:`ColumnParallelLinear` + ``(hidden_size → intermediate_size, gather_output=False)`` + - ``up_proj``: :class:`ColumnParallelLinear` + ``(hidden_size → intermediate_size, gather_output=False)`` + - ``down_proj``: :class:`RowParallelLinear` + ``(intermediate_size → hidden_size, reduce_output=True)`` + + Gate and up projections are kept separate so that each TP rank holds a + correctly paired ``[gate_shard, up_shard]`` for the gated activation. + + Cost: **1 all-reduce** (inside ``down_proj``). + + Input shape : ``(*, hidden_size)`` — full / replicated. + Output shape: ``(*, hidden_size)`` — full / replicated. + + Args: + hidden_size: Model hidden dimension. + intermediate_size: Intermediate (expanded) dimension **before** TP + sharding. + activation: Gated activation type. + use_bias_gate_up: Add bias to the gate/up projections. + use_bias_down: Add bias to the down projection. + enable_pdl: FlashInfer PDL flag. 
+ """ + + def __init__( + self, + hidden_size: int, + intermediate_size: int, + activation: MLPActivation = "silu", + use_bias_gate_up: bool = False, + use_bias_down: bool = False, + enable_pdl: Optional[bool] = None, + ): + super().__init__() + _validate_mlp_args(hidden_size, intermediate_size, activation) + + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.activation = activation + self.enable_pdl = enable_pdl + + self.gate_proj = ColumnParallelLinear( + hidden_size, intermediate_size, + bias=use_bias_gate_up, gather_output=False, + ) + self.up_proj = ColumnParallelLinear( + hidden_size, intermediate_size, + bias=use_bias_gate_up, gather_output=False, + ) + + self.down_proj = RowParallelLinear( + intermediate_size, hidden_size, + bias=use_bias_down, reduce_output=True, + ) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if x.shape[-1] != self.hidden_size: + raise ValueError( + f"Expected last dim == hidden_size ({self.hidden_size}), " + f"but got input shape {tuple(x.shape)}" + ) + + gate_up = torch.cat([self.gate_proj(x), self.up_proj(x)], dim=-1) + + shard_inter = self.down_proj.in_features_per_partition + hidden = _run_gated_activation( + gate_up, shard_inter, self.activation, self.enable_pdl, + ) + return self.down_proj(hidden) diff --git a/pymllm/layers/rms_norm.py b/pymllm/layers/rms_norm.py index e69de29b..b55a0ea6 100644 --- a/pymllm/layers/rms_norm.py +++ b/pymllm/layers/rms_norm.py @@ -0,0 +1,64 @@ +from __future__ import annotations + +import torch +import flashinfer +from torch.nn import Parameter + +from pymllm.layers.base import MllmBaseLayer +from pymllm.layers.utils import set_weight_attrs + + +class RMSNorm(MllmBaseLayer): + """RMSNorm layer implemented with FlashInfer kernel.""" + + def __init__(self, hidden_size: int, eps: float = 1e-6): + super().__init__() + self.hidden_size = hidden_size + self.eps = eps + + self.weight = Parameter(torch.empty(hidden_size)) + set_weight_attrs(self.weight, 
{"weight_loader": self.weight_loader}) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if x.shape[-1] != self.hidden_size: + raise ValueError( + f"Expected last dim == hidden_size ({self.hidden_size}), " + f"but got input shape {tuple(x.shape)}" + ) + + # FlashInfer rmsnorm accepts 2D/3D input; flatten higher-rank tensors to 2D. + if x.dim() in (2, 3): + return flashinfer.norm.rmsnorm(x, self.weight, self.eps) + + original_shape = x.shape + x_2d = x.reshape(-1, self.hidden_size) + out = flashinfer.norm.rmsnorm(x_2d, self.weight, self.eps) + return out.reshape(original_shape) + + +class GemmaRMSNorm(MllmBaseLayer): + """Gemma-style RMSNorm layer implemented with FlashInfer kernel.""" + + def __init__(self, hidden_size: int, eps: float = 1e-6): + super().__init__() + self.hidden_size = hidden_size + self.eps = eps + + self.weight = Parameter(torch.empty(hidden_size)) + set_weight_attrs(self.weight, {"weight_loader": self.weight_loader}) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if x.shape[-1] != self.hidden_size: + raise ValueError( + f"Expected last dim == hidden_size ({self.hidden_size}), " + f"but got input shape {tuple(x.shape)}" + ) + + # gemma_rmsnorm is defined on 2D input; flatten other ranks to 2D. 
+ if x.dim() == 2: + return flashinfer.norm.gemma_rmsnorm(x, self.weight, self.eps) + + original_shape = x.shape + x_2d = x.reshape(-1, self.hidden_size) + out = flashinfer.norm.gemma_rmsnorm(x_2d, self.weight, self.eps) + return out.reshape(original_shape) diff --git a/pymllm/layers/rope.py b/pymllm/layers/rope.py new file mode 100644 index 00000000..045774e9 --- /dev/null +++ b/pymllm/layers/rope.py @@ -0,0 +1,276 @@ +from __future__ import annotations + +from typing import Optional, Tuple + +import torch +import flashinfer + + +def apply_rope( + q: torch.Tensor, + k: torch.Tensor, + indptr: torch.Tensor, + offsets: torch.Tensor, + inplace: bool = False, + rotary_dim: Optional[int] = None, + interleave: bool = False, + rope_scale: float = 1.0, + rope_theta: float = 1e4, +) -> Optional[Tuple[torch.Tensor, torch.Tensor]]: + """Apply rotary embedding to a batch of queries/keys (stored as RaggedTensor). + + cos/sin values are computed on the fly inside the kernel. Position offsets + are provided per-segment via ``indptr`` and ``offsets``. + + Args: + q: Query ragged tensor, shape ``(nnz, num_q_heads, head_dim)``. + k: Key ragged tensor, shape ``(nnz, num_k_heads, head_dim)``. + indptr: Indptr tensor, shape ``(batch_size + 1,)``. The i-th segment + spans ``q[indptr[i]:indptr[i+1]]``. + offsets: Relative position offsets per segment, shape ``(batch_size,)``. + inplace: If ``True``, apply RoPE in-place and return ``None``. + If ``False``, return new ``(q_rope, k_rope)`` tensors. + rotary_dim: Number of dimensions to apply RoPE to. ``None`` means + the entire ``head_dim``. + interleave: If ``True``, rotate even/odd dims (``[..., ::2]`` / + ``[..., 1::2]``). If ``False``, rotate first/second half dims. + rope_scale: Scaling factor for position indices. + rope_theta: Base frequency theta. + + Returns: + ``None`` when *inplace* is ``True``, otherwise a tuple + ``(q_rope, k_rope)`` of rotated tensors with the same shapes as + the inputs. 
+ """ + if inplace: + flashinfer.rope.apply_rope_inplace( + q, k, indptr, offsets, + rotary_dim=rotary_dim, + interleave=interleave, + rope_scale=rope_scale, + rope_theta=rope_theta, + ) + return None + + return flashinfer.rope.apply_rope( + q, k, indptr, offsets, + rotary_dim=rotary_dim, + interleave=interleave, + rope_scale=rope_scale, + rope_theta=rope_theta, + ) + + +def apply_llama31_rope( + q: torch.Tensor, + k: torch.Tensor, + indptr: torch.Tensor, + offsets: torch.Tensor, + inplace: bool = False, + rotary_dim: Optional[int] = None, + interleave: bool = False, + rope_scale: float = 8.0, + rope_theta: float = 5e5, + low_freq_factor: float = 1.0, + high_freq_factor: float = 4.0, + old_context_len: int = 8192, +) -> Optional[Tuple[torch.Tensor, torch.Tensor]]: + """Apply Llama 3.1 style rotary embedding to a batch of queries/keys. + + This variant adjusts frequencies with ``low_freq_factor``, + ``high_freq_factor``, and ``old_context_len`` following the Llama 3.1 + RoPE recipe. cos/sin values are computed on the fly. + + Args: + q: Query ragged tensor, shape ``(nnz, num_q_heads, head_dim)``. + k: Key ragged tensor, shape ``(nnz, num_k_heads, head_dim)``. + indptr: Indptr tensor, shape ``(batch_size + 1,)``. + offsets: Relative position offsets per segment, shape ``(batch_size,)``. + inplace: If ``True``, apply in-place and return ``None``. + rotary_dim: Number of dimensions to apply RoPE to. ``None`` means + the entire ``head_dim``. + interleave: If ``True``, rotate even/odd dims; otherwise first/second + half dims. + rope_scale: Scaling factor for position indices (default ``8``). + rope_theta: Base frequency theta (default ``5e5``). + low_freq_factor: Low frequency factor for Llama 3.1 RoPE. + high_freq_factor: High frequency factor for Llama 3.1 RoPE. + old_context_len: Original context length for Llama 3.1 RoPE. + + Returns: + ``None`` when *inplace* is ``True``, otherwise ``(q_rope, k_rope)``. 
+ """ + if inplace: + flashinfer.rope.apply_llama31_rope_inplace( + q, k, indptr, offsets, + rotary_dim=rotary_dim, + interleave=interleave, + rope_scale=rope_scale, + rope_theta=rope_theta, + low_freq_factor=low_freq_factor, + high_freq_factor=high_freq_factor, + old_context_len=old_context_len, + ) + return None + + return flashinfer.rope.apply_llama31_rope( + q, k, indptr, offsets, + rotary_dim=rotary_dim, + interleave=interleave, + rope_scale=rope_scale, + rope_theta=rope_theta, + low_freq_factor=low_freq_factor, + high_freq_factor=high_freq_factor, + old_context_len=old_context_len, + ) + + +def apply_rope_pos_ids( + q: torch.Tensor, + k: torch.Tensor, + pos_ids: torch.Tensor, + inplace: bool = False, + rotary_dim: Optional[int] = None, + interleave: bool = False, + rope_scale: float = 1.0, + rope_theta: float = 1e4, +) -> Optional[Tuple[torch.Tensor, torch.Tensor]]: + """Apply rotary embedding using explicit per-token position IDs. + + Unlike :func:`apply_rope` which derives positions from ``indptr`` / + ``offsets``, this function takes a flat ``pos_ids`` tensor that supplies + an explicit position for every token. + + Args: + q: Query tensor, shape ``(nnz, num_q_heads, head_dim)``. + k: Key tensor, shape ``(nnz, num_k_heads, head_dim)``. + pos_ids: Position indices, shape ``(nnz,)``. + inplace: If ``True``, apply in-place and return ``None``. + rotary_dim: Number of dimensions to apply RoPE to. + interleave: Interleaved layout flag. + rope_scale: Scaling factor for position indices. + rope_theta: Base frequency theta. + + Returns: + ``None`` when *inplace* is ``True``, otherwise ``(q_rope, k_rope)``. 
+ """ + if inplace: + flashinfer.rope.apply_rope_pos_ids_inplace( + q, k, pos_ids, + rotary_dim=rotary_dim, + interleave=interleave, + rope_scale=rope_scale, + rope_theta=rope_theta, + ) + return None + + return flashinfer.rope.apply_rope_pos_ids( + q, k, pos_ids, + rotary_dim=rotary_dim, + interleave=interleave, + rope_scale=rope_scale, + rope_theta=rope_theta, + ) + + +def apply_llama31_rope_pos_ids( + q: torch.Tensor, + k: torch.Tensor, + pos_ids: torch.Tensor, + inplace: bool = False, + rotary_dim: Optional[int] = None, + interleave: bool = False, + rope_scale: float = 8.0, + rope_theta: float = 5e5, + low_freq_factor: float = 1.0, + high_freq_factor: float = 4.0, + old_context_len: int = 8192, +) -> Optional[Tuple[torch.Tensor, torch.Tensor]]: + """Apply Llama 3.1 style RoPE using explicit per-token position IDs. + + Combines Llama 3.1 frequency adjustments with explicit ``pos_ids``. + + Args: + q: Query tensor, shape ``(nnz, num_q_heads, head_dim)``. + k: Key tensor, shape ``(nnz, num_k_heads, head_dim)``. + pos_ids: Position indices, shape ``(nnz,)``. + inplace: If ``True``, apply in-place and return ``None``. + rotary_dim: Number of dimensions to apply RoPE to. + interleave: Interleaved layout flag. + rope_scale: Scaling factor (default ``8``). + rope_theta: Base frequency theta (default ``5e5``). + low_freq_factor: Low frequency factor for Llama 3.1 RoPE. + high_freq_factor: High frequency factor for Llama 3.1 RoPE. + old_context_len: Original context length for Llama 3.1 RoPE. + + Returns: + ``None`` when *inplace* is ``True``, otherwise ``(q_rope, k_rope)``. 
+ """ + if inplace: + flashinfer.rope.apply_llama31_rope_pos_ids_inplace( + q, k, pos_ids, + rotary_dim=rotary_dim, + interleave=interleave, + rope_scale=rope_scale, + rope_theta=rope_theta, + low_freq_factor=low_freq_factor, + high_freq_factor=high_freq_factor, + old_context_len=old_context_len, + ) + return None + + return flashinfer.rope.apply_llama31_rope_pos_ids( + q, k, pos_ids, + rotary_dim=rotary_dim, + interleave=interleave, + rope_scale=rope_scale, + rope_theta=rope_theta, + low_freq_factor=low_freq_factor, + high_freq_factor=high_freq_factor, + old_context_len=old_context_len, + ) + + +def apply_rope_with_cos_sin_cache( + positions: torch.Tensor, + query: torch.Tensor, + key: torch.Tensor, + head_size: int, + cos_sin_cache: torch.Tensor, + inplace: bool = False, + is_neox: bool = True, +) -> Optional[Tuple[torch.Tensor, torch.Tensor]]: + """Apply rotary embedding with precomputed cos/sin cache. + + Compatible with SGL/vLLM implementations. Note that ``query`` and ``key`` + use a **flattened** head layout ``(nnz, num_heads * head_size)`` instead + of the 3-D layout used by the other ``apply_rope*`` functions. + + Args: + positions: Position indices, shape ``(nnz,)``. + query: Query tensor, shape ``(nnz, num_q_heads * head_size)``. + key: Key tensor, shape ``(nnz, num_k_heads * head_size)``. + head_size: Size of each attention head. + cos_sin_cache: Precomputed cos/sin tensor, shape + ``(max_seq_len, rotary_dim)``. The first half of ``rotary_dim`` + stores cosine values, the second half stores sine values. + inplace: If ``True``, apply in-place and return ``None``. + is_neox: If ``True`` (default), use GPT-NeoX style (rotate + first/second half dims). If ``False``, use interleaved style + (rotate even/odd dims). + + Returns: + ``None`` when *inplace* is ``True``, otherwise + ``(query_out, key_out)`` with the same shapes as the inputs. 
+ """ + if inplace: + flashinfer.rope.apply_rope_with_cos_sin_cache_inplace( + positions, query, key, head_size, cos_sin_cache, + is_neox=is_neox, + ) + return None + + return flashinfer.rope.apply_rope_with_cos_sin_cache( + positions, query, key, head_size, cos_sin_cache, + is_neox=is_neox, + ) diff --git a/pymllm/mem_cache/param_disk_cache.py b/pymllm/mem_cache/param_disk_cache.py new file mode 100644 index 00000000..e69de29b diff --git a/pymllm/mem_cache/radix_cache.py b/pymllm/mem_cache/radix_cache.py new file mode 100644 index 00000000..e69de29b diff --git a/pymllm/orchestrator/group_coordinator.py b/pymllm/orchestrator/group_coordinator.py index d0624473..2fec3078 100644 --- a/pymllm/orchestrator/group_coordinator.py +++ b/pymllm/orchestrator/group_coordinator.py @@ -1,6 +1,6 @@ """GroupCoordinator for distributed communication.""" -from typing import List, Optional +from typing import List import torch import torch.distributed as dist @@ -63,9 +63,15 @@ def all_gather(self, tensor: torch.Tensor, dim: int = 0) -> torch.Tensor: return torch.cat(tensor_list, dim=dim) def broadcast(self, tensor: torch.Tensor, src: int = 0) -> torch.Tensor: - """Broadcast from source rank to all.""" + """Broadcast from source rank to all. + + Args: + tensor: Tensor to broadcast. + src: Source rank relative to this group (0 <= src < world_size). + """ if self.device_group is not None: - dist.broadcast(tensor, src=src, group=self.device_group) + global_src = self.ranks[src] + dist.broadcast(tensor, src=global_src, group=self.device_group) return tensor diff --git a/pymllm/prepare.py b/pymllm/prepare.py new file mode 100644 index 00000000..e69de29b From 02255d8cea835927f7b0307d7218a7312cd43dc7 Mon Sep 17 00:00:00 2001 From: chenghuaWang <2923277184@qq.com> Date: Thu, 19 Feb 2026 09:28:47 +0000 Subject: [PATCH 06/13] feat: add initial files for pymllm architecture and launch functionality - Introduced a new architecture diagram image `pymllm-arch.png` in the assets directory. 
- Updated `README.md` to include the architecture diagram. - Created initial `launch.py` files in both the engine and server directories for future functionality. - Added an empty `scheduler.py` file in the orchestrator directory to support scheduling features. --- assets/pymllm-arch.png | Bin 0 -> 388499 bytes pymllm/README.md | 3 +++ pymllm/engine/launch.py | 1 + .../scheduler.py} | 0 pymllm/{prepare.py => server/launch.py} | 0 5 files changed, 4 insertions(+) create mode 100644 assets/pymllm-arch.png create mode 100644 pymllm/engine/launch.py rename pymllm/{launch_server.py => orchestrator/scheduler.py} (100%) rename pymllm/{prepare.py => server/launch.py} (100%) diff --git a/assets/pymllm-arch.png b/assets/pymllm-arch.png new file mode 100644 index 0000000000000000000000000000000000000000..37c48b2a087b35d0693646566dc9870c50786b6f GIT binary patch literal 388499 zcma%EcOaGT7mw&mv?#P_Tx5h2vTuv7-9WaocXqCM+e_-&WR$&SW^2lpy;b(k-u#~D za=H9k|9rpK?Y{5(JZGQJInQ|?$;pUs+q!S-rcIl+p)Q`kylE48`KC=|Dr8&0H!rJS z^=#U-a}(|g{y5c7vvrSnz;1Insz*QO zE?f$|Lgjhnv8uqXdygL<@UCJ>pYGdz7Daou)8g`ri=laS(F>_li)AH&4feXjzC5x! 
z`UR6~XJgA}vxAJ{l#FV#r}Tt}%odL#T{aOBlaTE^wEUruXv(O|~4WiH=(f%N=Eg5_fbC@u!ZjMeT@VJ0r3TGy!)%gY9dJx z<=ej8uaqL#_v{s941Sd>7hz^89zMq}AUKd1+ufI;cHx4{CQRYhYtQiiu=3FVr}-v$ zed0E_twStl*R5@VJ=o+|D_i+dPcGQjkbm?G+%u%4^DF=GV?2dR-XhE0yLE-mZo_|r zmjE<}Ch2c)(8qMhF5Xn-s?hQ;kbET7f2AW=2cX6M8~^P@{F;~R_KIh@)fMEZDj!b` zpx^RU6~aV3yEd#9|6}~pb>)zw&6Gb>CT@~H^!hi7KL>Ot0}@!Cwwru)c^FfQNVh*; zPZW&yWrZY>_3y9ex(gX~Glaqy?)|;#NEhS= z<%QdjLU2tNT2f!o7a`mfSOnKJ@sSNnCF#Rk53W-1sPMhbeRw6>#f4`g{+o@=B^yr% zG+%5Lxl<`HxMJJzzg@fW8YfnWm@h|uF>rr$>G5lLB4Y|jvj5M7;up4Z{VCe#LUqlb z4-nsB3{L$ADl8ihDgXQw=JL`Y(LCeiQeK^?Ft3wbi97258`wNaH0R=&FQUm1If*&y zCS=jb;Sv{_N0M^?k5V=slb^y=-YNbSQO1ag*Eo~%1pNHU&ctIl!e9RO@aVJ1#rzxC zCn`@@?~`w}^&0I4RzRb<{SmyEn)#F9qndj-^F?1)LiSvA3bS7{1A ztdm521Xo<4r4yw|3b_;{ASQlmz77%nrPeawI_H9 znBIvt-mgb+%i`)GZj{B_gB3>oztz_KMml@MZpcf;lx0Cs-`_-wfq(_UO>_AJ4axs4 zo>kricPht6H1b=)qhqU}Ewt3~*IwPEW)U&#Z)F-UdOlbnrz|5%w=zj$|KgG0z3_fx z0Rc>FP}=>p)%8@PknHQ(u~7r_Hv#Xt|C?(>FG;v!e&agUt=*aM9?Sfp+XX!Jk?)?X zQUg{&{_x@2pExe9=cUwTU|bR3*~P$(eRn3x{7F()Xav;#{o+obNeoSA*a#OVr$}V} z2V<^z2*e-TI&`b_;X{1*iHv?PBdhWNL^TW$<9-~w*t(_BnG${dz0Iq51DN%!kbQ-D zB`GcwzyC3w!#%0sB6SK`)jTRf+JasG%0l8rluHxXye-Ce3^-PFAXtWD{H{h9NtSE`1k&M(E>PHp1N~2*WhC4bT*MjL3MV}G@CIQuYpdcoWM|q~7`K{GtRoh9K3))3 zzYlJMIolk~zPpnKNB@!F;Ai)xfc8E&x%qvS#`9|Vd4FUsSi1rcB;{Toam||~DIUKi z*y<{lB^hG?>E~LnK%lJo`b5Kj6no^-Rv;5?CLuZ`WR!j`ldH@8wF!d@$3;7?z(T)j zs3s;bN-p{DiNjZ&>o2ZHY60zaRg;3(VBYPnF#Iih*67p6$u(Osd<~BU9N#4o=ue}| zc6dz!Acsmxq<$|e!3>ad?FwI+aL?oVLfj~`%GTu%|8Jp!w*VF})j!nzySKvU%wSU^ z;G)k@=!S0YBFGt}jT!B;%1 zw6fl{^S@P*SpKlV5^z<3^=(0q83J$niGnha^>Bmh{}CsQM;0XR=Xea-c!SQ(l3k z_;|W1k8pd|P^QsLb%cmK$J=kWS$)G8Zd7a`b|t~4Sa*6VdhKwY{-6VX$7V7KOH~$)u)+)!BKfsu%o)d6(R!34_ zT%6dp#aeNuM_g1{LCv~mG~PBR24T<{?r&A}`t{ftN#W3?lJlfH|KQ5hv`Q^>CGv9YOUz^abjXc;}I569vz{wnV>xNxZQ>%i4 zhT}w1m5iXLQ9@4Ibb=%1G`&^D8J#L=zPOUSi8wP)6|)@cS6fHndfN8%%HW=)X*ai6RS)$f%MP&FOnJ@V5OK+aS|*9_8-zyoJHh zgr(Vp&d88whNqfR%!yGBlAM;zK@OvD`CY{eWJ~ob?c3&BN8DXEs{Cg5;NJhIIn}F< 
zjAL|L#17?@hIvOsA!vBEPR3{~7|s-HELQ4!8IYhkp0{?Twb(SVK1%kVn`9Old4yH3 zMAPSQO8E9naOjwsW4}s-N373JhFjk-*>reAvA`Pq2j?$!77zhZ!G4mPIB2#yBD1nd zHMv%Fam3Axx7dnUnELHlnP9Pkiiv#gRL<~(_Ce9Ak&57hd zY^Hjk`*%y;820I~gYAFM+ytx(Cf`tLQLa(QnZ4J`U{TQ9a)%vSynmlg`ux zN0XU`awlF-LuHPv=^szhUy7D+GdTQwXYrn#+3KEVyDQ(2gfQ!>;f{RcQi0#{3UOBh zvMn-Tlldk|-;I7kL$dw!HC zd)Uy?I(i_pKfYi2bb`}FOqPQ8V=~IeE;fJLD1dkE;)hh;ZW#%17%haRX-Eg5<1WqH zYp}q|%%y%vm}T+j1JMJb8U5{?4&N!V)b-R@vuFEavZ#tc7Ob>56SS1UW*tRfurPsJ zBVYd7epS4H3DMi#fuL?@X}~>N8Do{zJtW)Ql6dL&abWfT6(lOB~~A)Scw{K ze#9JjEq5$n*n_)VC^@tsA0RAAv1yHloi3ZVA7hJudxP<3te&vZtB;w6Vt1U?fz>2CI4am6p&$yg^4=1r;`KL zz_zG8udYeG7eQshm;I1Xa1oN3sq#X40MK(k&(%^t{!80em+O6~41T zbkTgKlhauVJ3R+vH*{ZlO7;@8Jfprgz{-0hI4|^5c#YvuqAe=O{`IzY6*|DJJUADa z*uE?iqg_f^IVS2afK3|Pg~uyKOU;~;Us~5H;k2W{B)z2wE6Kqho;iMCBFQ_)xFn_U z8Dban;KEF~v-(ip(qf@PvSvoxd>5ZQRoRCfqS=~_=gk&p>+%$YC&IP?|HB|bZ!h@u zECmxLdX+P}x)=N$&)km|D+K1L5rsvY7~pZa{Ee2mW)%r zis{tGDp7l&jVEK3l8f8}$3sUum_;-C9&r@f+U0#Y^2DP+c*xvs&%paE2a#qVag1|Q z!1>kR+oz)VqTHv+EH4MK_a|^Kta*zQ3B}+Cb?C#E>j?R~(m~Vio}q+kllZGAhzGYK zFwreZA;mevj>BwCCh2NQz-brlGm5Y^ArtSnJZ_l()N#K$OH;Lp@;w zEw2D>JsJ5n=-t&Pq7^+Y%Z1Z>e~fjPbK0HnPAcY5snu{^!15wa+yo&3N6<`(o&?sY zbt2x@vo2@Ino(FktOQ_OlC|!2l)hewSI5(vlR`^N6(+OV7UB=s)L)l(Q+XuZ$=BvE zhUL}$GACukMz_FLz>~v;R(%9p9OAEt_P3%n;zB*n}^wNHrd=1*} z*z;Ai+22TOoW=sDvLm38533?cOK|99s}b?j(g8+-EsAqa605qO(V6j5KXj;qLvOYA z8pwK;I=x)bo(!2A@dv^@3Sxz~z$b4zeok5?dDyAv90TiKj-2sIuTF8@F7r4RQS@|u zfk>L%Km*O(aGtZZacB;|NLqugKUV8-!rnH6hyaPnkrb`aMH`KFOJL3wo0xSUi3aW9 ze>P?}@}4#~v5|Ks+h5SuRDLL>N+yP?xlyfP+G2*XtXmab|HMv2>qaiPnWrmnV(3ebaz1RO~M?e@Rj>V!d}cD9@pm<0dprEBOrGN1%@aU%(e{IEc-^5;*FCs_{U$ zi&2PtCzp-HzQIyr=}WQK>LOKR3}Ddd8R@R=iST9g4ZAdhh}BWA^+e<#d>uxfXt4LP zL|>Yb%9^NEi8=4j?mzP4FzQU!aIQTHD~DAQ3y&y>K*S8DN@2Z;* z^eHg~C|zNcL=k$_?{F3N>BI+y#Y7-n=yJ)|t#1nGRj?#p)0aiu4omOn?r;oeG#GqO z!D;L}?tS`)d0$k7S8APl7UCUe4rzFcFCY3#N{VQqBI=OjZB451Bw;b{{R<)|7|mJC zp9Kcm$b8}+JgBCTHLO)>+Az|IJ%4L)rrSF%y){!pL4s2kidHcc3hdoa?y@2o+KaY% 
zb1hiJhmm60Q;T;P=7Z6@4X6^HiOlwRJ<=a35lL+_&5|d#)&MG}JXCkIKex-zNCL}{ ztR@!T$un1+Z8s)CH7SNYG7Mx0dp@Nh8Hle#$I;FoPdH`1-n&1LMXP3Y33b`nt321& z(eTx6H<^nWB}>iSB8n%41ft@Wl83|Qdk&+b+NqVH?fNo%dtA2z0G(7 z+m}R_#zbRCY8}cL!^bDxq~}h=83yn_bN}X}HQZ>FV3PK5L;R=tmh}8aqu2}U_iy(X{EAyUO$nb8aa8`t-|5M{#H3`B6dXKgPYTMNHkxzg3jl1 z$=eOn278x)rF-F>_5hK=!}gLS>UdiH>jO9Qi6|z1)EN7Itx7(aB#iyysa_>kUOcC6 z$J^wjq!)7-B^}qr=kJXOZFdvR6#_xHnL?-^jI6w5+KozvbZ6C>AxlI zJeh=NFnE`oEL3>HWFB8Lk1z`tPenMIWRHB7KcXBn>e>}1o{n&5dS^C2UoGm?c=fZp zi=by`lM*k=73CyJtP?GCD}$Y!A+ z^aLj=*if&naKzKZx9-FqD=sYJ)40TGrx#Jx{JHT@+w6yP?0OrXs}EqSPkV-DjJQgk zZBl8Qt;Gg*el9P*MW5~$+H4xf-8FdNfs$>|HotR!hcZCj>s`H{QGABnBf;Tp5Eo_$^_jo&b$|I2L|BoA!dB&@5PF1B5uC)i>rquVAX6FZmzG9^!h(X)hcn4nIh zgb$YIKEIp$l?^6TC(%|#!!Vm*srwpx`-%+c0(lWr+BIc3OK(4LBCNG*UuwTnIa(p4 zFnqP{VWoz=_p6u_60C&=0No~+CamQ(3mLr!BpgwF4Q6=@67;@k@VC`fJ+x*c2nz(2 z!&~0lvH!T(fS3&R4u3IXnhT>1cB9xgV&d}Q528mX5HmL&-nUJk3@PAiv8p<`IP%s` zN*{f>|3WOlGc+^01MQ=k#v7BBQU5K0kT0({cwFm(ZKFX%fmr4=2g_C{71PkQOP@n?`yU(|Otm!clV2fZ^~H3f6rSOfvji#H9QBn#O0nTK9=73N&=Q`)X}9>*O%n4C3_;nHYT~r;$&z zX8g`4xragQoCM+|8QrfeoR)xu(izW2_JnqS)Dvpo+s?cnkGcVbQiTY!DhBI-$O%y3 zP)rz9D%Q7Ywd;;Am;lK^hH+|*g3jx^f#Y7*-|x}dI4*>+Wq-R(5&acLU^PaZi%KS` zHPMEA^_qA3Qd^uqPo1?ME9Fbi(wFCOD~@ZOZx2GS(>qOnU!v?C@_Ovv>_w89JzR4WB*jmo1PcANn4~^jq(X{kEfwd@ODfjQ z-lokCj+qEc5>n4pEX<6`s{t_;Uy5JLNVf5quLQ}!+cz|z{^RSi!6R*`jA=%~0>MNV z!_#lxO1i#D0J5^pd4r;~?_|?ZGR^D_54R4w_2ZFrx3$sdi#ybC&K>|zHF1Wt)ya#q z{obEwRATpO=ml6u*V!y(Og0)7>|Ov4=Y(!{%|)s_5EgX1vS0)A(E%x*B*6%&;{*KR zEbjDs2OCg~?!9(Hzy&)o$LPllXe`avS>T z_L5(I$5Zz>qj3vKjtuPvt14-6S-;UagtyadRZyUXuzOr^RY$!;-~q?+ubu4-OSc*T z&PYXu1H3XY}W`ym- zl4WRik(g~4`k8M5uoMou(YKw53hBga;>nzO^P?xP|Gq%|GvDY??nfd`r9mWEL^KgSBA?AdOOXa>14dLMyqh@SzK<7f-*cm@^Y| zI*mLjHZM3>S9rw4djTSYB6AJp99I{C2T;;ijv@MU{ zXLElWU-M0=H*-T-2_MlVA8C#PKP8DOw*`QmMMhtAjKmFtOAaNmFV3f`+doq>*dc~~ zaj~Y5myW#G)yzT54mb^UfkARka^0iWLY*wKEWPq^RRUsO08be#hJiHH{}(8eoL1+h z1rtyI$ciIR*e}zlS$!ovhvt2$?<>0E=0EuZx@;VhNb@|pnj{FcxIlCFRD`? 
z-8KJ_S>zH{U+D79j7~2$n9|tQWwpu!v&wU_fo&a4DScgTZir(*>{87LN@Ugz*mFAz zVn7-i$XL<&<__5N2f9yRABk$|o}yC%%_PMDQtTtOJejMheVY%p)t{$-7{ye6>oa$e zV(zGCv|_A``}I+>Pl&+)yDneG9HWE44t3t8mN_QtH0oZEqIqWO%d}9>63N(ZET8FfagCu(&DhzEAi!t6RFlgOsxNja7{-yx}FB z&NIa)+FR&)we!kmWdsLPu#UYH8q;TIfIm|bXxQ$pX4Bx)QS7`>=X~5-9c_^6Y>Peb z9a;^5q|odxo&=Wv4qC~HlZQ7vpe{H>>8)yk5-*jyZ9xAn3*cD_!uhTPQwQZwM*q~N zkF&47Brfk}Bs}`%Xz{S~(%hv~J4sxCW5{@jZk?R2E6?AD(C~B9=`-5|$*oWv{fLq9 zmTTnzX+Lp6u=kabgFQWfSWj4|?n5P@G?1n6Z znhb4J#u_J#W0J%vjsqOcZLz6g3B&?dP?b0IBvUyYp|9ZJGBiaWm%7~bB^0r#%lzfh zcIq4^a}%osZE8<5-X;EWyD}f8fPoJ!ll!Zlb3E(#`n8?}8Dfq@ldL6{PlL<6pKC5HMoCwMd zo)nzfiJ^62rdy^$7#8%UtO0IT=QE;klvefTnTh(3<22sl$KnB)6!+N8cM7E7H%q3q z#XmFut{i^p!neybkzyed5)iotGRnk_lwqH_fvKtlTT;O z)l^3>?C*VJ`tBnt+n_GFC7Rh?$U9RIr7Sh`=)O$ahc17!Tsdo0xa5!8v)q1C)4UtUE{5+_ds>;HnUqF&gMwS@T8ak*dSW|s#|Rf@QsoYIX%*LpYUx%zpt19 zJI4*WHcYov#98za+wJZJ=-M?#ePa-~TuJ!(b)Ojen*F%RgfnxYrP3j{w33@Y9x;0X zPa){uxG&RFXbRMfQsX9oddFk~)X_M`c&i$40(L9`i1IjC?>QN=)I0v-*_)E*1!rw4 zk1Arj#S(0jhMOjg(-#XtmzjCgV0=h`1+01reu_TXZtqb15y51Gy;g5<`C9FXryqo; znyW6=bKZJKA;8)u^L@PDCp6opD2>&KpJBs3`#-jO*#-c@_s;%j+_TSGc)oiOaGZWH`H{g+#{~9G(;69>#3|a=<$X$Uz1$_IR1|nm$g#e)a-#6atjkfwo5rk zM3MmR;00lf`BCpYV>%@Ii$KTob44@VK^g+RdyE`(Afiemr@7vHcYP(DCqcQT3t4sX0o5?i2#fH)C>C3ZwgjL2thJ!9V zYRQ~@+jR09&nR81O{4kJ!W19-B#0TKzK@Gh2Tgxu=JhjfJdG(3j!&*Kr&2DE|LXbm zKm@Soijq|JS(m=U=!2(DwaPWR`b_YD#L<6ix=f+z8K|4Bwf2<^?kCGEaIQ2>i|Na9k zUUzeVJ9g5xdS3@3>UeFR#BU*_Nv`ycNfRF1?@mr$>a4 zPP0dC2$Vn-TA%mL2)jTCmV;n&WVfN6@mGLEEWX(_Oi&B*@pB3Le}4&@zcB`TK!-zI zvqEiDzOUXbnlAx(EFe(n^OS|BI1w~got8m!sxa}Qi3ZE_R@GO|q7GafkKjDkh{Aro zeXr4B?3rzL2)fzW|C9aYSaW)1UTJnJ1|$^a`loaDQ$rBfVzxKJ zHZDEZG$rcNL0A{wFKuen$OkZu(LTkzOx0W8?{!W#nQ0_SQFRSvkCj<~5-$#q9!UJK z9al_-4)XTNkTxp|2UUlWPa(8%txEu8XDP+@MH9C#z7XQu9$Kg$QD~XoiZ*ih%jC9$ zkZ?n{$WP1;sf_oaVk(=leGkZMG>BgOIJP z;>9*uKS4NLXX6PmIu#vQt@2lQ~YkPpG7sFEjAr}6E zQ5;U9E;p+tCg(tVsOdxA$>4i_1rBNG`HqW1s6@e@ z;5$Y3tz&`E(|Ya)wP$Sw@q}Ij(>EYLN-U8Cr7qIpCc9)1$9R4H0QDJEoJK~&I31%9 
zGwi}PjNdukq)Z}m)xDKK#V#xNF(~A`H3*1_O_HdxbJflMzRirT-M4jCWd8dO?{W}T zbl#@mh@?O2A6jTCtekXdf2Qythz(>=oOUgde8NObzVS+GsA~d!ks&mGEvR8zCGoU# z=+t77ms;!vO=mrMeV=Rfz-pL#29q1jHLI@|B;SNnCw0dKnVz(?_l;u zJU;+}QpQ53<7_YQ`}X!mH<4RCN>!?IzgT0e9@2~Yv3GE9n-KOCIhrjt@hplH15oJU zB9@4kmu=b{r?A?^ViV&{DdQbHY9<2T?@|{rfLiT|n0_lk#2L<<#<04NGF073mE+I7 z>kJNVxup|;;AJWqKjm{i9Yhh~q2limQo6I+_YqS}dwGhP3`T!`*ugnNDV6XHc2O{S zBNmws-#XdEQ;3mgTwJYPFSqHm?3DZeejALzW4Y1hQ)g79yNN>>Xd7lK12|8J9M*<* zB@2u2?>OO`4pOt*I-?JZ5w>B>*|1X~d&oM0ZNEv%CH@}|k4j*#a%6O1NpCtZj{jpV(vz1jfJDh&rIng-`qLb0Fpr!0SJLsI)P$Yss^EVd*)7Lc8 ze!k$Ukn2&@wWl>!aieS+Geth~aiEl@_1T^uTTnQew3t~V++gC4$OKW~O^E}{0ale~ zi@PlcVJK+VP^_*X%zB#q9jM0H$EY*VvI@Psi^!Id`)G^{D!HnOZ?C~E%PmA}kyo%N zEH!uAe2o?-b((8x6PXfM$NlEh7l^Ybz1yg?hoHO zPet{N>>qm)j=}aG{^9Jg&#}YMlt^M7KT8c%y%2J;$@H6q*mg;DAn2R$8cb_4ElE*- z+Hv9wKRx=Aty;gsM2v>;(O00Nd#-aJx!g2k2-Y?sj9H@<*>t+>b9E}j$;Ykubzkq= z5L#eAczr`S zAP*$8HgStKRAJcZwE59u+mtdy-$(9*IGU1q%QF55`4_pq-t=RWZO%&u2IUiV8XBLc ze)ufp)UlPn8N16Y^r|ij)(JEzxn(<%Fs=C(q~e07<6s~##_QR5zuyFc0}{!ucN&+xI5nih?%sn z9BB^K#bMQ`thx^oA`?+q`XIkJy>j6xc4)4X8?N|?h(sAkcuHG*2Xu4BN{Xb_>-w80 zG!E(*!|EpN8`7e%9chtuo~`1jt!;{?2S(Am9H28g0_aZk&v*aPZtVi~E=?4+=`439 z*Y|oYHs$+CkDv#_8U=;$;V`;~PyBl9^y#J2j!N{8Qy zvW)wKNZqOFNMkbcn}MvoGCwfNv`?pgNMo<^0w~B!p1H6W^eeIU#>MEoLzTebf%(l< zTUuXX`$-SBX8jOXf#`M|tOMLL0ILJugZ%Wo;HTMesmT$Dl6SlH8CO5in#=(nDn7I> z-}JTvtjtz1*@l*E>( zo2thd7J9^sxdTU@nqOHG$prlYo-hfy(z@uJ5a)69+3DA?jzPb7LI8!iK%*4;5}>U* zPmZVgXm1NR?nUw)I@*Bar_S^ubm%QhP6(rl{55L`8$G!gi70-ZWrYBdbFKZE%Sj4! 
zEFIVSkv;v1b4X$i%Vh_gC+-S(m|{(kCK7DDB|T#Y(akbHUMr}h@13W_^yj}buTb1$=-fs#KZ7`z@{Vx*+mW<>M?xy)66yU50L}Yae4%#|c8UI`?KWf%p>aPfj zG)eJ{QmzBtX6Pz^<=fdk7dTnF18mjpK$);2Nj1iopSN6StYj24ljO>u7!%letI|F2 zst5oaucEAa-^Jazz$|Yym|DjH@&+3ikuayu_uLyj_5-$}F}ejG6mGExmaXKWizmJy zjt|S%C7Bo&(DLt39C#r@_7IA$ZTd4`5pCY|C)P8pIs2@WCOwvLy*My`uI6qK_Y?vB zBW!{LiRWfuG8$c%f-p&KdZs5j*U%|EY$uxm%xX~~bC4>N;W_jj!}OsjFZ*JZ*2|0) zpQ79|sX&|IV0GaU3Qy?-YaBB1uvG^ALsF$`2=;BfbY7gH7TIDWAWXY8sg%4N1Z&ZE zqd=eE5-Tcvf6&#_mv~SltAKtDNi0Qj?u+jk<&LH*Ih~AAmc3!#^h(CB>c$bi+HX8m^t(c+ zE8|l*3&T)Uj#ih8m;7t0^j{$k@cM~3H3EyLjO&?#=$Q@(zY<g5y_mvOW8}wU(mJOEFPwxpqSvp;!H@^_b>r3$HMUcrWMrtwy zMi-hh5N6$+`Re9$H69?X7NDG=KyD2RDaSCn`P@K>Nin z#Oalwy*X)A8%th$YGeQr8w$AuPud-@A9zzUi+$@sKR5|uNNivQ7U6vo=7_~0gH)dc z*09p_GK%UFY&euBmqv-9)a*RkA$PHFM@6cz1275m0Qtq-5|imIcPdmzq1JvJt{nuD zvd-w3cvfI6Y>Pk9YN#>}Ie>V917;b8XzV;~xyJktTE;gJ3^2KDH0SPKpVsN6;srz^ zP8H-4j3)fRTNlHs>%>mFI@q{&X0-j{ewYELwH*C!g-&CzYsdD!xCtx=V~%YYU#4;Q zKszp|S(t|v1kY?=@g#cNW^?%@)ie}(dlM+X#(XSMN)~7%QRIv`jfd<(tyu-=P+UiBdiSM?A&>kFf0;c46?+MP%r1+6@eHZ7FM42$Q zCGWS1SvV{K41*nj!sjMtJlaJHZr;x>M_=v+Eqkht96XV)zlVMd3j7X|&%EdygDZ^! 
zmovS0Zt3V0^fycvHH;R6F!O2>)Pv%7w)LY zCc~JxXp$hrEaF{z%dnGe&AArnZy6OCXF?Tk^g0gmB-lx(<{~&4Uv;PrqZMnY8$pGKT=5zf<4S}^mhK@Nx6OU3DMZjBR{)F%5;_aN0#r`M0K@D{XzWLPaJ2y}oAg!uA(9d-9_ zLq|bU!P?raLy>uqBgeT`xAEO|WqR*GOXX$Q=Q;|y!C%4LHcoN2rx^tJS@RHU8dtMj za?=sfa!2@o>~Kia)wo#x5Vafj&TBY+)(Pqz@~Z*o1_>i3-OKrzgn2LQs-LBp7k*!` zu<&W+017gXOx3Q_lTmn%(e6~C5ef^FLP<0x8~EfR5ErS_qkCS#lt@w7yz51LK#E$D z3bXR^7y-;aGBV7%Cla>kV^Hwa&~E>GIRAkhCK-+F`kLB(DG?kOxYYofy^3g5j4w^_ zPQ!9%sN$D#2j7CaXB=&Pj_7{L9qw^O<8wlTe>+4B4hbCuk!$$5j2VHF<>ZWld>+)~ z)by3pnV@&_!4h!lMulWIm1tr9L$n@lJ%A61dhLd!lW> z0sXANsgU?{T%T|(1iiahp}Lq^(a{%f9v5iUQWR@qz>oz4nnZ!$Y-mLig3|$g(ETws z#3YT>a*sD*oXHRwCIFLPD!4YK4oaJxFcf)fzbE<;Au9tYuKn`dB&7#b`9{@!xSGs| zLy166Y>t-zNy$|OSVNejH@_MAsHkkep$cW5zz@nxixh23i>OQXS&pX&~>f;7Q`An^%K6oFH3z)O3^ zqL+EuO}x$fn(>=YMZohZ^MalNSe*9;`DpwA0Kj!1Rsx)mZ3D0Y#n3P+sG46Ox7e*31`dwd z?|xOb^yIgR<(fk;w@dKNoJex|!dCy#&UL8=ltt%!k=RKvn!dZ&buJ;#W(9 z47?>R@)kaZ1#R-@kOw!?Y~9QS%`dQTVrKgx2jUM$4*la9?6brzz`cVr1qZK{+64_7wv zk>3MFE32R*0rGbz(Q`J#wQ++NBh;e}9LF4n^t-!X?aRtyFxP2!isFB?B6ZATU97{eQ*Y_%U!%(5bF6up=@pO zU>y^*K51l(<&nGTEsh0)bf7ZZAf89KsKE$DAx3#upd>8~5PMHXJ zMbvuB4|hhkicRxRqml4_xLQLK0lHa^Qf)Y`h+H5!e}0|V5mF`}0RcR!vKz~s%9kR$ z#{3uk1%lrTXKM(Txz^oa{p*jRm@kjGq6~$hOSADlyrce?sTY)Yz(v>Vi*6V*YOYqI zS9ExzWr4d~d6)oA&y$V}SYM!{TQ{7BF3?ePaen5Y##?306ql(#3n(%H&RO9Q4cym7 zyM^+;4TFzaQ`01F(2ZvOBt~7MBYG8b4&Fg}YFz{eZjG~Mls-re;Lv=0y9n(`@F6_o z@M5>Bbo52y6{x}vGU{c1rc>P_;amS!-4#5#NmR%*i!3K|4A4%zsT&y_q zf$I~y!e6K0jLT>$yLWeild@f<{&iXU)yuwWixRSlobnA4+oj69ePsI~E3mGW#eYB? 
z0=LEgh5Ha}MuPSds$+kjVc89i=w*b#4!A8_R;L6lPpqMs#2_m6EJ01o&nMUC`kgpo zGPg?Ek2aHOJ00LD;hh_MQFm7Zce1cp{H~nU?c&yjG1!^7b^WsB$V;w`s;1g3 zjbB4ftY7f|j^1+3Y)73U%s0i0fqfzjjj!#pxsf;oP5ujT9S zUjt5gr|`b22Hmu2rAaD#kwO;H*uQQ$1`_$wq;7xuI_ zNhZX`;VC)CdI@QQ*c61ptz z`-;epzi=D#2%31~P!h}{O@#n)2#FtKA&*4l{#2jzbyrpl6E_kBS2!EHEdEkifFK6U z>2T@}-dg?N2fUSE-n|nXE;g)kaSjZ0BOx+bO*K}G9-*yUVHc#}fs;UC&ZWJ(x7={J z{0vMU#q=MUtqeS~QJ5CaJ5WAz@mg;LR$T(&ishU!22Wv;_=d`!mFKUxNV?TXgX&^xNSLFBJ+&`O(4gb!C-yc#=EL+!H4*W z5E&6D?20k_)u>Rn3&-q+PEc&f7mzt5=YJnH=7&Np^((s#p420Y0Y~%rjO*uT#`eqb zy~saAuD`yW`By~6^upPsYLyfT1-t)^C^&tllLI=pIrIA|e5H|#o48hD0}t?S5O2%r zwu6g4ya~9_AeB!L2H?!YFEjw3QQ#*htoavBCem2^gYh@tkOQuaKv3CCL>xz2LOARO znyBAc5(L*xpbJ|}-39%cW_2@?OVc~vaASTI-Ffh7kK?&oD-12NzyGmcJT{joe} zz9v$}lV582ExS_Qq#Ob3rIR9xBe>@;9Y_j1{C<#8hip8-#cSImB>Hx7#j$2x@|&(j z&jM>%drd(VHRK6;{lRB$HqBq7f%%lMNr1>I1`s@7njQ+|HL&uaA>21cg@Bvo4^p# z>|a=w_piVdD?TfK)qIfACLxnQz6Fy@UWX)pX0CKquRp3HZggvTXMJpkPX0+F^^XHG zrMhJRs`jv9OocO3A=f&xbIn4PI|MjV`cdWHI;h01-g)Km79 zM$9_GNvSWu@StTO`pdw))ox#Cx{#XDIpsY4)*LaTUuuuhF&)=D^QT=blZppMCfZtl zAj!r!?a{9tQ*CTwJ;5=?eTuCrx|LmT`B!i?a^gfMeybV!aBQ{jQ+Ot^M+>9-$W&3) z{L|ey)5%+kZpH!6WdLY=DA;vnQ+Q?je*>ZhSHpO*X8bHNfQ0N6HBO+FdgVEQK)cBp z^Oykvbo2VT4&)72$)Ej=#x1abRf(oQQ2^@>4nv{?iz_1Kk^?(0r$L5gCOyT~F9mkX z{bp1T-k(O^11X5IvDlfNJpnF$bBe+8!n*XaAs`X~j(V0Ba-vOQ_uv34sr*?bv zG5UtaM_x(GY(h2>Z(Z~1guB4~yJkKNPp7w3JAha~{ocY09_4$4+8=9BF8kj!m9tNT{9)yg@DYw%$S~59z93%}$ zd*c7A;cTEo)nPzR0gt#ln=oC(T~^Y;?t$F`EKR9fqSsHE{awZnSO}!yn^=`Lc>XDB zeLfbz4}bDy40*ZqynfyIC=Dm2AdI$)Uu?PtUJpR|0-5DzdT{DX?O`ga4frmdG7;t% zso?boVCw=TSD+Fvus;v%KAB?Jett~lxDg(*4R$6;AsEhuTM6L;a=@vQIu(snD2t7x zOnr`E@ez*hDd}g!@B+JHy6|}_p(O)8AP`h~YkIOM^*H0hFs$T zF(Uaff`Oud$ZL|1*N7f!Ai<7FhfjqB62Cse0Q(=2-3SnI#N=7FmnT%_wjtq%$EyF3t2pA}=%$0?T%(MOXfkJryF z&paIb2PombDhL;!4{v%8gBr~BT{~BCU>q}km5vh;2ar+fj;b>6V%?a9P=>oG{7w>f z`ka9g?nAShWzot1m-jq?8MQ$7%hLMNy$qo@A!O|D(H9`oJB5 zK7F2v{l)Jx(?lwrGzO;*JnQ&&jqZzMK7a8W`%o?0LvCtKgBP1h-!$E1xT$Vwe4w{p 
z|CaGh{vPFYeF3RFHSV3;bajz~=tN+hS*)R%A^0rX5qO0-b9m~i4TUVCo^UP5*sXMmWrF@3< zgH=kEz=0i;nE2t%@pD->Tg>tX@)k~Zr@jnvpzY5yX}|SG8uw`AnB$?g7$!^36W!S_ z=aPilx^j){VIAsR?vCk2(GP7)mhol$_9xq}FlUv3cNQ>$2Y->`JzcyS-cde7=03G< zRIc+5c+4WJc|IQiKv#D8l<>0^zhyqT$GRB0SY!f;#k0N?)3#3nd#=D_lABSke4;C> zM!QVIT`4^9_UmQ9YT@{EOPjK(s+uAL=|IVwReO9UV04tflbp3v_4C8dcCYeE_eZSbZG=l5+}TX;b5`Grdg|)3uiWvD$#a=d@Q~U?P27 z^Wv!yUd@kRa2HvbG5l1EwfCBo^ja?R%}!#(^8_ZwotrFX(>hC(p1jDxz2c&$+c7wT zd6t)T7*rWyfh%J}Wh*>w2jz2({dG}SSbsEH3{?zBTK=b>S67=cDLL1V9NK(E;IMS! zO)9#sM$Ro3d5JAuji%4k=JH*JHiN>Q@3( zdvQjEJ(1Ch5nH{-uicE+Q4b-$+7BCMo1t+dpO1r)2( zwDGt9H5uURxHdEtK=-J7Hh=lW1S`H0x=?&C+6q}#8VYL9y8IQ8SiT8Pp8v0*0NS(? z@@s#ZE=ac8Pyj(FHli^?^8WwIaLyDyOb~W5p714I_jAo~(%97ofR%uVP}6eYMMl?` zU$+73a3236PUR$MaFyOJ!;Q0~DmD|z6DC&clBr1e<{-40#@7`s9^88D?-HU3n@KOx zS)%`iiqo9p|K*VTy{i~zxpvwDgZCR zD>MK&cE>fC9IOa%pl6+Q{GXVw2orMQh=-73o(b!bbwu}xcIo^*73t8x3ZBcO*aSrg zbA9K|e|Z5y06FE2QB}QNtoV)_y!`#KXUyjkj6jRbR$9ICf08}?>kAMxpTh{WvbHZG z&2U|-6QBq|1RCUdXLWYqDic>Ss&y59$bkFO$n4c0Eq_?GXe-}?3O|TO*}@KB7O5-N zlk>Np0Y;;sudSK#oyuCjC+7R^ogmR#_c(%egYW4m0afn22}r?0n683j{cnHkhSt4M z1F#N0gEHBr{`Z^)8SoA8DC+$zxPtM*@nGw9%K~DzAts0S1npmq&~vT{$4yk^MlFt|MG$sx&2^MsSO^) zhyV66mKD>mEF2Y}D@9d{mJo6NJIb#H`vLZdc>=x!D-!3#O@Z-!Dj8t-La~|Cn7;)$ zP3Q%N-pO`?BVoOR5;ep1$-Sg5#FvPgw|c%_%&q0+TB^myY%7Fp>NSGI1Xq0qy!F=c$!rlLM12q&i-D zSYFE7rR%HOuA`!(RB$QFqG(soGZ(Ey!Dl(qpE6<kiXkSfy+coD>KTuEI$lVao78 zSDY7|XznXQaM1Aug&lq{+(07jI(9%zz7v{``8!EWeunmUW0GQ z=o9j?Y=8$EV7DFa{Ttx0bzY|FTQGudj>E$#H!TAU)hX5kXHiE#^X0xN2^7xCRCw4A zCK_BB%>;W;n1u5WtW9jYeD%CkHQ4VOpKT%r8?Y_Z6N@*{0Asr4>uzw`Hxx_UA1Kez zZ34sMV-{xM6f)G==8K*1fJkb8bX0?yD}ZA%_ii}C*e#@R9NO@24zKAgqB1jMiwjQ?2^LpZN)Hn1jmt9vHG9dFSi|JRI#K>TvVGwwm~p*AvThsHv=TfB^T0UWu>BFfO|59-W^N zdye;$5$E%dZ{d0B&M!yZ*i}qY<*gZx)LCcbmRYM033lY3|7@A34~~{{>b*3-KX)Ou zjeC|4ImH*@-1Ih&f1#_k-Vqp<7Cw34u0u#HlA6J3mgtUj_;Q3!Fe*hM1H7goW`AV{ z%l~*f>#(NRKklDnd+e50DM3OSM(I&X7^nzHi{waU8!&2eY`PIfsFd_!lp|FX7-O5{ zB&V>Tk&PPp+~4y&zvmDB(CgaZ+V1b?{?z;R&RYD>q66aQqaU 
z0gm5>@cYZ55++0$ERGFL*n+5U|7l_Xd*wf@x=ov|^Il)T9LED4YbyB0D;gn+y8@|} z&vv@9a^wwU3}HzD-*w`i=iaSigW}eU;EM?@{TbhQN&=kaOYveK&oxczF(G|038?d| zMz7u)4@|W7zj4{ft1vs*Gv)9$z?!^YZwv$aPzMKY1}%Bh zmv|Bc2rca&u`e=yh6P7=s60wMz)4S-sj{nzag?EX1LQy$S}4&gB%SoHhJs1)>r8(=X7@5cZKZ zyhEq#aLn-Apv(q@ZfOl&?32W)tw(|}eie)9CmzY+_yXI1c70T{DM4C`ri_^(8eGF(S~`OFu*=9(N(4_+%YO3ZIMPoJgcJ-d+NpG z!(`|;{By`#WZxoFJF|HeRoe7|bM0Wi?AkIRO+(N%&PNJjpaoq?Usbn}Ot~%YUh+J8P+x zaCyMWWAojaEX)n*c!w$uk52lX^nH-D-VcAExC@dc8*j&7(EToc5u=EXmPz_rzI2-n z^Sqh2>=j%}#moUsC%|ca9?9I#HMq#jP2N`p=fC?W8-H>1_ZK;et5V%5^)zy3lS-i8 z+me(`z`rbmH5gX-6HIZpnBe0QS@*>gCB%);M)5hf!cxV*8-Cg%>q|G~u)t*$Q|1&5 zQc8G5T@6T89t`dA-@Oh%g~mijYG6oymKYWlGCD0_`RC(PZ!i&%(7Q9naF=~Uie1QD zO}+r}*eV^1Z78|t`)bdBb`olT(}ChJMXVWpWhqLo{=+n1rR{QmW%iF2fQ-0Ih|ua^ z281=8`!^lQQMp%6Z^=aiqCRieX>hQ~*KkX4s$?KM^l+t-U~ zuMqD-FG8PqldYafDrg_H89Mh!&Wm0=ES^QGmo09-ezrqohKV>ky50I>WH)ib4gxZb z*Zak<#&3Iu%vTBlj`O!I@)TXJG7yVgA1$l=zcR()j{#6J^W8ZDV$Ct$yDDOrM@@Ty zQnWH4QVN=`*c?Pnw3!M&NnOneWJDLl}=pq zj9wGENJ*Q=azm+&TLbu9{hb(Cu+;6|ddP4mLtmow`sogLPhrvi7+sjh=^dbi{NaH| z#Ql}Wj&D7xIolm*zO%Cz@c8~{FwH5j}D zjfS~RTC8FIzMJivEL;*?6U$dZi!8JN-z|da;Nd#&CwKGV-$y`|(*Hp8FKFsKv6o{- zhahQdM0)|p;Hmuo+-i=W7mRT7L#QRP9|i%47{m38Q*oStfOQ0eaN=Oa3Bv^qz_&vZ z$(_-w5AOpONXQ)@pJ^>-vU&YUW8t{fwRYWR$cN@y;jU4bWI(+G^I-HjLo(@vE;eC?2%!%rYHy#kKeWVI>- zgiCqA+cpCnDuV)alclOk1oXT>WJuTi6iaEyIc&Ke1ZGPbhamD1au1 zxL*$r^TAYaY`{38W$9503XA+$yLwgc3dZ+MZSDPo$It&20pUEfe0CBGddVFPDy|z0BQlnHd(_8-VR&V|K?o`K-awx}SKdxwApVoG4e_Q?&fYFVt zGMX8Q#;K5O7y7QjW(@?bCCa$*?Xo{*D8C=Pnbp)XngZx*7J#iQi7FdKmt2nHo7Fdg zC?5^r890Cv`)N0DBJGRz@i%LB16e7sw3+~voY$PQ@7CVLueE>(NHKhinVkUnk(%gC zZ4dDvWNX3%jyHZh?O7nwwKg()k!0KY)tJztQF*7r_+!f;=rmEGj!t@F9T;`qa{7*r zfIM!PAYje~H|1UlB^o>2&E6Pc$7VC2CFd7^zqKVIKf}NW&+S2}17uSMWd@~RfxS0h zt=&nD{seI=;kFA}yvJ^C?`L;v#=u`Aq@rl#5209zGn0EEBnq&rk&FRv2*`jzKzgrL zHdBS?KY-GB2CP0rfQaOFX(792u{Cr7$iCa@3)Wb@Kv)TB;O1Gp6FFr8m?@oLKc+pOj);>|J-X!eZvTuq{cl}pqq%#iyn;(CE23F#dsbTX-=45W1!|wy1 z!GOlQ8R3YN+4zcZmQG6VoxN_oVbPLFQOY%RX_AUbmAi~D72K+SxWogv%hMElne#lpc 
zG>QBU7;JVEH`A>5K@B+|OVMYDoNC`Bq6_ZeKs z+1NTxJ_o!9{&P|g$#{PxOeQ|WbKY;IeP3s-N1gb*U+tCtWq;|+XW$t$dTZ_Cwg()f zmK3PsMsXsvdV1_o`G>ke#Pe|Jxz@~z8i9yP{(zHJJ-YodKh*ap9kAE)&7aDMtmf=& zvh#6qC0?3O;Dfe2?0UhO=X$Bnt(2olQc)2X(bDs zccJ(2X8wZ}AbaQje0t1M_j2J8jpiZyauL$jVjNcHwP*oIZ+4P#=Sh7uVl6QMsw_+9 z)}dq$g_kWDd2r_Vn|%?!Is03H*NC$B^7C^JKOU{SNCr%BWpjo#%8}X$UI~yh)`V9i zv067r6u45mhf*G6bkl?DD@_j9W9FyzaYJRI+CsIq0n$v!+N<17(j|wSi%j%RcETL*&T#HaqAgv~>xo>O%U{iKq;Fp3cS94ExtVRHX&B|(} zMgTgL!z?|ldGHkLpXp*tW>gNmwLT)b{19upfYVq3-IDuV@vCORJxTnvPMcrtuxvYl z_G;k7;x=I|*Fqek)0KIcR+7P|{qAYj3u9yJ@P3eQlDD4(z%w zN{BpWwyf@H>6bf2nD5Y#T@A0V^x-daUaBsYZ=W>$+b1?Rb4EXHG29lK6|Fvj&LUQ+6wFMwfC(!nSpRqo+E;Hld1bwB%8w zN)}M>nU^}T&QZ5YGB^CQ(k`yb`)|=>QuD{I^RlyoxJ{xJqk<#!%?3B!*+X;TSmM_4 zRkfYwQI21@ZAViCsKT2TLidwVH~v2`3BHH(`SGVe_a{SyoILmEn-)Kffprn5GA^u? z@q)hg&0kbSS6>8Nd*ZKkq#A%TQ(7ni^ZAg&ZN+WKj0|cLostO13|krOkJ}IRE^0>p zYubQOzGuhAVT}IjjKgYuqAKf)M`(zC8+Ugv)JeLlLyD=8oCEew=L-N$sp-}Oerr{Q zxhkrTILQZ82Cg88myJOtaOL5D2ui7OGLkIAgK|Ls=e1R*A1JZKm)kF37c~RtNBjW< zQ)a2-W7qoi@)Y7FIjkp4P|^7cXCYZ5|sVC^}dHF`s0s z{VY?I*VuVUrqWP*tOmFbrOs9%utQ^SgDe*QBj5``mW(A|KUZIDw~CDc?r3izZw7In z*F^m|;T?pn4i+!ZP`Oc%9~zsarC-)vuPFOi1eldtgh9op@$?9Y;l?X*hcPWoUE!8g z=N%B}g3sa}b&$7ay+Fb2vz(qR@DPNb!B~VBs|k>>2UxN1%XN!fl&`C3(an*Cr98 zK{;MeO{a;=pP;@SWDsYSV?~mwG9f;0`jtyNML7E(jME>UK+Vwdzd}2zfepaNy16F3 zN%Kk$EoUfc(-e3zlsG3$j@Jh}5mnn}TrZbzEDC9*HvGfXp1?vB+EpnIwrK>iMuMUK z-K+rAcPyECp5W>~+63eSG0_1_88|HD?Yexq$D5v+L<3oA;<~?1XN_$EJK%(amqxpoAVDq>wa(DVBNYcSF4mG{%k4etKq2uBB{>?sJq zL{zyRTan;8j3K*!hG3T`ZzLM}cd@w_+8@@!y~h(~(yM=oitr2}o0*dK#Ts*Lt|G6L ztU>|!XB+zp=K7`KE89GojvCJa9D?9LV&7}7`}Y1$SV4_s{Fd0pea%*2RY8GhXV0ax zr<{G>?Tf*EpV`uA4{WYuS4p$~6~tWJE&^;A&up=?D%^+$b9qDT!ur25IO zjp6iqlRR~76O%I+O3w z0X2coD^>Ix#>UAg1$V6WEt2v^?#1RVF2{&akDb5BhOu~adi_L<3Gj-8op z2?x}PbmRg#AHYZ>4sx%1(!2(8&mUv5D4~6YgoV6N~K<*~t)SM|gF* z6B2lsoQunBmxN>Az9t;&NKk}KC`+QK35$(x>6st=E8hrQ6xHq&XKntQ9QE1dSD{^J zoUlJqooGbBtPatd10t^b4z-CnI-D{KKQGi|H&z(Kdn|n})7mUoQHmyi!_*YeSRIMZ 
z#rkzp!xz<%oEwwr?%CGl&h1o@jl|>edq^V}%G{zi++9NcFQH8-(I@SaQ9#E^$FZc! zIQN1$VSE@j*H5m;U$M*aZCti)|0FC*4ASr;|D@-R<&~wUiClY_EbnQA>Il~<{_&>nwbVJ1b-+?| zwUXm)?sIirkGL+D&Eb|UTOvGPOIR`IPx3UCt*68GpNr?Zmsi?^QItz0d%3*C<@{{BT;+8; zBf`{q&R`2<>g|x(P>{QoOG^-#;f0E5myNFSE~=@SCH@hQ!xH2=@K9}S-2Ah!a7gU- z4h_3ymBL8DWF4L&%j%wo9A(AT4)~8r%WjiuEW_`a+HzhhYDm^zb|BwM&n)q8E|&m? zS(zlHHcp@ns`2gNM#!C5UN?ymrUiVZR2hqWgsIep^E~E<8R!4?0uB>Zy;hYjs?~s` zXQzs>u0>NHT=g$YK`Zk=OrI%v<08Nac~I_L4vgjXzrklJuf;~BYYo^uSF_(h>YU=_ z-dWkmZ1vw{g2a46p%+hOH9QR76sEHTNj$k^XV0a^KG2;WP6-_w0gSKy(ojiqoE8X> zeq!wpRGde2d7ISA^aUpkeSMvUwlla#%5)M(dm5UGl9uk= z%sK!yxapQI_9oOIh=R;E_m`JnjrGS)LXE-nE-XR92UV7`pv1=!x=cgxVv}Y*9>(Ns z?Bg}3&@U)rm+Fzn#IWi~UL;UP1z%~u`=V5Sg23vkc{HRGJgC3W+Z?(8#~X)VA%GsP*Eo$X!NHNc;6mJUO3}M20z{e!kC` zH@k~BCou%|u9!}JwQ_Sgk=q!bK##stgB|z@te{Bx*sO{OBRwr!#Xnbb0e6IV9CDp3 z5fMUuPCZoV=GtyXeSKjc@10bI0-s~SU{EPH_kQ@lo|bL}h3xhSJ9RZL1JhfT#)pB2 zl5KIzdup{C6TIZe;^t&VlCN)Mn@ecA+e9UWJeT$M!PmS%6P9q+&1k9+%dPnt$2j#g@Thie~XV^0EoQpqH zY{_k>lJ?FR?wVP{@a}|cR$ERC=46mtIGtCF6Iq;Gn<_=12W<(bZ;V;_XBKAbGbN^% z<#e0b`r)Z?stRxq61WxqM^{kEyTNrES*6|F(!@{Cc4@>)$n-fyBv3yjeA~vZ%OyV1}<>{eRh1=+h#t3!$oHtqfU$Ys5ZYk4ELpI*Ok`g$bcQjgf@3nwqBoZkJa z=h@AhH{&lIH_U65jC_0PLMYGg2QF+iYcMN4%cqLE5XhdN@6vFmVBE$ulz?E-s7YmW zjOh;B3pbWmv+4bfu4@?exw?y8{d5J^c!N!?# zy<{hp5W-Gy%YHQkMqz46nvqQ-kyFNw!B@Y0%e(4Ix(f=Wd%owBWph;JX$Ol=nYWnz zg*z(fdgM~8`ksZa6Th1sX+YIY86r5HX9y3vh}pfzPiEEUHL^!-Xb*qFS+>{YI6$`9Be2&3gt&^nKG-xmyRWCZP zYGnI^xQn!HThckIxWX=OG|9*bEjXHeS|X;RC>PlX18yh*S{GEMd{db}QYY;7T@0Ol zS#TUD-U6n$)LRCw{#=T~`C_eW3-`7(rq8Qp`O z0}1+%yv7;DtnrG^$ z%@GONIejfo!P0+MpZJa`kI7I!bfV+H-l^A)?ygfY!?oik>zFglHS$as^;9d|l*9RY z&lGkl5Y^cTGn0*OBOYOFmWMo*NwE<;Sx8P6fcuP#n@aymOz_8hO!Y$74z2S>XS}3| z=U(AM$*1CL)y|$$Q?&0B1T(o`^e8+VrUui*@~j zH@v-y-C6zoK64|ls^_Oc`wo1%I?*B@X)o8oQGO6kRm?Xn9k$k(Ay$gD^9s}gPt}nS zC(u@DyfWNCHO19p%7rmz?NE0o#h1nY9M`VJQsBXQlUs-=vGl%2)~kXlS&MfKYRR`@ zldfN97h7tgdxliW_#llPDIC4O(at>ogw~N8_W9}ia!__3Jl{|;Hn16B>xy=3`G7R- z{Z^m%IQA2MXMjQ^YS2?Fb#tm#-00H|N(}Xs35%+}x|gQz1JTmeVe=G6o^7YcjEbhl 
zg2@9HQ%+`ePXfXvV4`9wGspQ>?<+oQxn;fHAoLe^x>lc6`1OB!$!W3~7I<#%&()4G|xFb;VyJwTNgqx@T? z*)R(dj(_KQ zn(IrO^6I_ov%pXH`x|+}Bn-TFxxn1Ce=Ew#23qCVUm-5M?aSeT-fg4N@8>V!0&Z4U z!k~NEB;=}iI~z8GqY;dTiic+k8#=Sybx&15iJbZY%re&TaD z``RLw&%-M9sA0QnYy!*ep^V{$n0AE8{qNI}sDCxKmU`>Te7(F%I!aE~k~4pc^5D(w zPhA|2W=wEP!WV{WN;+=0TaPh`_j_=(>cE9DjJURWePxbR#9lqh%(eRa7Q+-|E86|z z&{8BdNxrJ_?+c>M%;oE?DEF-{x+Sb7W-Q+HM$jmV8~I)W`}ADHk8hFD@we5so?RKH z`S1GS(!^Kxla1YGCymrf4PO$i^TugR#NqEq&y2n5Joq5{crosI<5-37{a*)4@z-E) z$f8)n=X4Zzh$Yb;{k2GwiGG#zg{2ZzHRd7&*UiEskE)b$y-xQ2C1#4n zBa@AC&4ov#*qZivDkh@2<5n@(XtkTI)wpn0#K%Q-A74~Vp~8AdPq;dN`P6FTxHPS7 zDyUC+Ugnq5&_mz*EN?}7wcb4fyk6Y+V(6X8yth@i9Ccgyjac+y%+y;Tt?^73vckcVQ z3xrw4GW)~!^7Ax4C9=u4}va>(?#J2u<5sB0*<4)SDy=%3#;d`SF7*t$eeN{Wk;^}q%|vRj~l+S9T`6>+CtgRz!5c6v$jfx zcetL_PLYVobgROw5<8}JYyXjIbw0lNAl-b~B(%xAXNw!mDdR(3dy?0S#5 z$gA2m-CvLDEV(6_Tg=#ZEf#6vbXEm>2Tgn5sZ^h>DrBQ5jiep`}2M(z3@TiWG0x_+Og{!%psn^@FQ;tzIU(4(@nLeLJ(Jc~iw=pte z2#P92qKWTqsvt{*fxp!FFUOEEN2Z{I#eQK1+X+Y*2;#?RPqu85TD);t-QB(OcpuhU zTrAf88AI#1YQ`+EX_)WMQNrqifutsqf8yX*>0=$N0_qwbvm@+NHR)*&7nz;$Y?^r! 
zTklP^SFXHKQzmd0Rj26lc*unaBV184|4V3n1(*C`|EM?Y=#vW}p%f zPE|ms*6p4*S)A24wP2nj^mRw*&?HcDd|=dr@qYjO-IBDsz9Z3U$KCgQjNWG<|NQ;s z5WVktRz@lLny?-N*x{7t*55Ewf7IUMb$Y0lB3cbH7fJmQ@wusC>g7Pl-O>bHw=xD zQ>dRe%91i@Wrp-hvAd>b75Ept95;q0sn0{roX4Ope%Q=(ebuo1R*(FgaRW(SgKqee zCf8ez`3Ok9Np*cA{vM6le*Wqhp$(leIz`$}!-6U)lhjqCw@( zu3yO90+|FJ7P}y$^^5`KbDvBr<>hx$lBHRFK%d% z_i$tzFhgAL*7Bad^1ohywLQ)2T>baw`bi6n5d9S@#!d3IiXj7X&-0HNy>P4S?Xl3G zyS<*D`f$0=l{9*&&vkjT-*q{&{&2orNd=*_zBK&k^VwT($(5J0*cqie{IrkyNk5rPVfVX3RH09%!T4X&sT;FRUDQ{X|dd zodD`u>BDt3gFmy_tXEg22KR(7p2I$_-DA@hPWE!B5_NNMCOEr&i`F`Fb?E+YkbX9J zC<=l)AttqH_Dt-V5{LC!38vjmH~B{qhh!eAWR>O4-5$*4GhEogp#JDcT7WiGu4pEq^XzFi=rtN*#w=nGpIOK7c8=JZhdD9i+yu zUeWj_us!h?ylM-q(xCdY!Yejz+7w<%IUf$Qd)6y(ItSKB1JS`4gy^n0wfDnoO( zqb&iP^!@s*K_2q1M6bl{)8Fzg#H6`?O@+JyVM?)=jtniG5y@5CSE6sr2gfdLfVE)N^{S7+$EbO3ReGo6wU2{6UK(R(IpIIdvC0xt zgD=Ph5{w^rM>$j@s*8R>p6=%$BZoy`i}hC7Ti=ymWHDd)XSC8w(R_QbE}pHB&zXEF z=-<)27Y?I}_pBZH(49n*$x$8!7HFfDK_`TaU$lQfHJY@47D|8fSpP6>&hK&CrKmCU z2-W3bu$+-I#BX87mnt6b{CIHFh}%Z>IjLGv9sKQzLqY{I3(Q|v&rD>!?hVSqv61$V zW4@lV5A5%_TX=m)RnC6);kg-Q5hI!0PYjT4!v29KD5G_eTVPH}U)2leE8FS)kWo~! 
zT*uqUshyvWRqTC8Y2F0G-@Catdv0$TV3!QOf%Rdq6cmRssq~_i`z%6hG-rJHC+oz= z4f3yTw{@N^|3V+S>iGZ8&>(vMdl<&hptbL5t`o8qwf2ZSxmn%GEr^%&;fqUE| z6L=B`nuEcpyi_(Px$rbp`b3SG2udUdboZfK={gN3J`CyhxmIliY^~Z!8cgG1x0p_# z9F>E!i%F4x+Yg{BtYdy_8mMGvR}`?{`;?Q1Yn|cLFtv+43Cx7JP)74tMa6qV2!FXc zkQrZMFJL+(*yxR2aNga1IlMGmB6pQka5{frdE-Th?W) zO2dp>ro{+Te{uTSvb94p^h?g@h`Jkv-gv6TzDK`Zk8%r zVKxYcobQ-abh|GZ!MAq^l`Wv1-Su(ZryD9_{Wk2`27?_)KH0#f^S33lYMt5~vsB`& zY-)(Z?`O(YF`43~(}@L(y?^Qo7311)r?;Vvy~iA)q}g9RgZMz;2er}D>2ZTQdA`|i z?-@r$&eEf9C~vt$+wGo>aZ8v2OYY;3y8EcnhFLR-zmiQqKsK|5ZC)oY2;I@Iu+R89 zcgef|$ZT=762~DnVzr*u6bX5zeKV5Gy2n^cOu#7K@@H9nQ^5vAMAurRc1mxSi93?U za*|%wn)B-qw`GYtbm>pHD-U$j(ar3F$Nb?l&xBs^2I`tsHissw$na)NV;km#(Ba)& zu;x!VKlk$o<8$=ag@^oD`+LG`CSj>X2!S@+F;42k`-My;zwjSJ(<>?UMd#`-2jY~6 zqVI)GzI%B#uyH~t>LrlLd;|KC*-N=c3X+1#b0HDO8N{kbQd&%$(x2DtRsVEv)0@Tz zKbgjF^gW%vnp|J1Qvj0x{`!~R>qA}1bqIsJ?w7kxwosB)|AD-s(ErHEnoxQn1;=U3 zRdt`+klWWjwtvz7-a%jmMGYRPiqtxAM=}YKBYTBgZ0Ei+wy@=pa9y}Z>Z*B`V$2KT z*4J(+c4IDSCY08UA^g3;VrYx|2#~z#rS)weI?-dWH+90Sb(oMpGz0zk-x4ZA_7P#8 zs*opL&f}i>j`3}7MjC}Bog`nsly&{AJxi-bwVBc|I_Eo4C5?3#OA-9g+rb0<&O*hJ zwSP)Qg_iZ%pdq;`=l?|oRx!VddXE8%7CP#WO9b4{%guCoC?h@zP)fKOvp?=h&R+WB za=*7$G?_USzs7W_LpX@6ecFF^(d(p|=&EZDqP`se8;qT|Y^6*g8}(O;0l>~@T*MwD=nsCJ=SHZ4mBcFenDs> zk@jl=k%O2NenEWhP&@T|wVJw^M2E6~-uK?N*>fj*M`T`2)jmHJE|$|8RrOMP)=4Sr zS-WCEYk&UkwzX5IT3vMxmg27Uf50VmCg>-kPG+Ajn8I(yVQrS`{>i5S|6iI$hYigl1OI@@a#QJGfxcZ#L$dU<_u6ir|*xUDr1*|NTm zqb`{ph16JMswcaul_5Ve_@TRAynzAB$H3SSOHYmss=1_kUv5 z5jo&&M-if!){N~S{tf$e$nk!(YNlIqQYKe-`c9m5ZEeN^cX^<(@pefyOL^Xc>3(K= zAhk)?YOpUo$15WGYD#^9>00;I*GZH-z9#8*O1Eoob6k!{o;j~ty_Km^TQDak^tNYT z52LY#d}ZdvjL-UDLDH9Mw9CCwgp+<6`&~u#Y94oCauY^JD*2q3^EpkBQrjG5D76pU z1+u4BLMsGTH|RUzqi@YyV+IgB9hJXZ{}>z?l*HX6(am6v?c%vFlwWHKuiOPW(^_d! 
z3wA;++}U>HmamJuE(ToM_w1%_S!M7$K4F0lCd&=&{DCxmOXXPFge~`e68MM2#ywg1 zmI}<`xtFHyu#NqIrCFoIE&_H?EcA=2kAP1|g_oCCO~p*FYSNwM7soUhW49a*<+O}K zHf$ZqwMC9=&m9lgmPz?MKB%r!_65D7C+DAVDc6kc;9ej>JBd*0M1P;`E3gmoD&gsz zFrUM@FWebYqXwMcX(6qU3pUW@+&G|K`}}mEHIx}XeM6WNKMtazufkM@>-jNu=pIxy z*hajkdGaIh$<;h>bRk!!an8rjE7_Aj8~IepY)|1kyb+_Z*hMRplZg8XwSBNZcZ;0Y zQW#?BJ7wQLZzg{P!#AyFwp^eP8j9&(P%h368h`z5&Nr~(^NdK(zxHPWLbqFir0w(X zFH?rezh2|`Jw8>A^XD&oD)Q6^IEmI=e&oaKas9%w!^E9a| zWF6g1N48S?D=5T2jXe)!PSX!w@kR~^$KVdVyP@we0jn>a4Uvm}!NqAk&!GULPj?(@ zvcqT$>Mj2&j@lesb?!{djQ8=$3R_IL!^g?M1+fsTQg`ZPDVM{hs?YU8{Tn@B9fncf zFFi)>r*QA$Pc~@{cCRb^H7UAyJsel_xS!vuDCdC+07wUS6 zFx@Z_E_iO8!qnLBnV4qZ5O8L3lT6h-yNByTxx(K7^WAZthZfj}3y+xqi@A;nLHE1Q`XnnePP?J3eeb#KA)wSLDwae1 z`e|kx*@e`XPX60T&W@b;>KEL?FxIb^)iL>SF*^775cHtuW~IF3S}s>1_74TIGF7uG zCVKMbO1ggH)0(^oXJ2zUHY25)=DpmB*9UFy1KfN`UOMLUeS1qyZS){Y8f|@tpIEd)S)r#T+aIq zv0(%Dy`9Rvl)#Rz72V+cMR8ioq^e^25##8`NW|v8pd?rsxx_bd z!PNvqHcVO;M)%!`(f@@@4Jhv~crIm|?Q95pF9;SB^(g3UbS!mJt7IHMc8}7-@ z>tWJ|Hdok4PJ<`C=b%fqd04qU4)q6r*vTD8wV6B48U=&uCKtDk^GBR zlmoAw>HEqO{0V*J{ACw~t|7ArFZVqAG(lsY`e*)rvgyBk&*gF{Q#b3L7FO|1<0+Rl z;fD6ASyfT>4^Xnx_&YKE>*aE`Wpmx^k?N^v<;I0OG$z>nb{l=W@ukM6eJJgdU`w91 zQWndez38*1^n?r83fO6_>|@9gi#!Zq-#uW;z7V#aJFj z)}r_5Kbj}guR2%Xpgnt#ALaJiHsdEhrEBKxkXuOVM<1F-bX`r2#R%w8NGS$im!|*P zl+!WrD_0InG0v`~CUMexc$YS?qgrJDlZy4Rg6H(HR$_;m?r^PL9ABrWT_E$!^anLT(agSb%oZ795;Qo@cu0qJ+ zy_6!a0Ux0Q^pX#+zKgaR!xK=mu zf9BtB-CkdUxHzo*!c7Ry|MX<|P_h-(IPg z%=Wf~tRu}8Q)4*VSork1{#Kblv*|~PW0Ds6tDwzXZP-2@y=i}@ZF6vMz`~ez$R(A# zZ_@lIAiF!+sg?`!`UQlDfQ?b)$u%3I|9@d@T z>)MNC;g`CG<=)B-&sTP~_-}eA>7obU*nBfXTkq}qtIIO2?34bFco}e<`l=^aCyIq|3C%psS1Pa1<|w11Fbp8aN1nemvWj^9?jxIVsjvT9a} z;%2R==}=1Wu0*u*REIyW;=9GPCTMRFs| z+$xJbczgT!92s}n^ZIid-%bNaeKZZ7Kr?f(;w+WY4Ht%WvqY?HI-qX zkp*^FwfO_rnux7}tzkarY3?=GlDILBTm6n@rXQGzT1T>uCduY5M5Tfl1FT&2VBA^U zl(A@pH(iTqqpA(+Fqv4L^-29HS<&D4Rkmv~frM5qI|teMf_2G`E450~e~&ii^$tLi zZnd@MN4y@EgyWEF219-(U0FpVj#%SjZNqo4+&g@*Q3CB@y$R2t<+Wai9d`gb$_trn zpaFg)iTE#BD+B7aaqy*hN>2IQx-^Z45BBsk@1gp8I%xzy$- 
zv`t7<{4;cR@YpWrU10etE~?mJMeXC~BayU5mddyPYUKe_6eQJNEh8A^l9~-=n;D2F z5^7Z2()Y@@2E8ndcW0|`oq^8Wdgq+psME^JET?@5`rBT;cihE-%PI&?kGME`qva1o zL~Si)UQAgwleuTW;_XxilDEO1T0>NDIbS%7RL&|;izD&ZWkfU56IwbsOHnOrQLa_S z>>~!jS4hT)XE|hVn}HlX+6Ra8&W9zdarF;TFd2*c|qKn(;^CuK!vOsJ5=j(2?su$4qt)70+`uMq) zH#z2ojc%nfw}%C8xFRgbA<(?MtK?n2XBO+!eXEE9AsYj#xW9?;oC&S%!eRwbcg#iE zpp^m~pH#{te)8ii^8R!Rb?9H%*i}+OKmdyqU-OSE=kM-&Yu|-c`3GL8U z>%BVAz%>5RT1~e*j9+<9*{0?nNIU4+9m}&AZt&cW-9yRfMu})%iNvbx317+;=|9k~ zWvIU=9K7m#&`ol&%G_X8;Tn(}JJkFL-L*RcGe>>;mA2HP*B1p%bfSQvF}~(IG(@v} ztmL^4{M|>S=Y#Sj&?fzuyZxB{wR+A5Sh&IJSi0EEc`GBVy!;)ytNY@>@7gcAS2@P& ztj>lnYy#cStMV3Nl~p_e*4?${X>-a_xLmm_=GHNoX?*{Wqqz7eRiT7&PNY@i`!@rY z$(NCTcgstj0fYIt%!p_)UM}1LU zKvtUNg{dPlS{X6*OOh2ddMa140t?F>yUa5a)mkZ#%h*Ql| zuQ5rI3Hq9?(X2A!)z;j_>F}1`toeH*Ou4`!0Xy|w0`l=~i4q6Sef+d{WiNFGzVPU! z=Iq8O%UC~)wD-r=`ALN!b-y0>LCkGfsY#?#ed+X68Kf!9{VWkKlI?gzS8|`Q^?j9$ zX~TF+#|?+TBX@c*ND$xMVG(pt?OuU!yhYs*uH+0x+=1Vav}*xWKTwRY=*L=cNhgbH zd`Opke>O^pHitpVY}{@AY2v9_+Q&8(n6&}_lBSX2B`@)Rr8l@X?_ykPi5_^-ZvL>X zR?q-$%bGN>il=$khIL#P@noaJ=T8@NG7>xdXixyVu-p6O6pQ9)riXWoe#_{0sH z6Ph^TmD-;^26_|=2UY-PrPffp%K7@YXs7JnYgQ-nP%BU~qK8*SnGG3LSwd*uG8>0h zo4F4@@=zUS?)5txEjy&y&&e-(Agzv`10kLA?pV;L-I34|&r{cl9TmdjPI+LHmsW5> z8x5Mik0`WwwmotzvGLv3*ahD{3O)z<%u|n#(mBDY(TA6-3$=Y%eZ!5yZmg7B52kjN zRO`wMP)xrifHO(_pGj(tH0MmT^C92){kNmCS5-ZISlZ}R5EQ@6MawX3` zrTMC`6gHv?C114c3r|Q&;oMx%b)KWCd$#ll?x%xJeT4^lYyGQlJ`t1 zu;S?=%ZhiTwkqwebu?)nFWp9sxB6Jn(FBk`^Za4JxaUst{#hAXRjl;Q1lN`TKY=vt ztpkwLWp5E@!S8sL_f*f<_nsF3pKzU(oMuyIb1*&tS^5Khz(pnd4p*}X>TC0=1O02% zSsqA-WRXOkc7wMKW0aeo>GeWZX@gj)_K3TWj1e6{jB^6b@&z{#ih7uKADUl2Mp-*I z{vbhYJfcoplH1-#K5pHs+Up{-``15=r7v6q?w7%kJ) z)$3s#h9wmoC?#Y4GyW}Yx8_6z?R9iY{V{nOXKyC;Rh zn6&PCQ?t6VIC5Nl;bP|6buw3#V5OotCl!`}Lf*N-T_BNit%FhL@j;{usUL4qe8uLr zgM4)L;y#M)>E{QjYZr1^Y&cML3pRFcx^LVzKPx$mY%o`n=uZ6b1&lwKF4t%qAy9jn zs;-{KVX)%+7&BSBTUo1%)ru?-4HN?@F6{$0>OC-K(wSu96n@mN*)~KnJq{b}zFyB8$Rl9)Ywq;_gw-xN~Wr3#){R zpbR$!@~uj66|#PWZ4Sr(Qa->c`?847$v$oj1FZ^5%^mn%gPJ=JXy|TS^P+&~Gwf@B zp<}rC$xS3xUDH;<h 
z$-kfg?PXSf$-5 zDsNf4w^bM%RBB20&U0zN)268^>tZHH_k^Tyt)oKA)ymJTv9kP4#cOdmv2C5b(awFn zU_MgM-o0gz5%4fLShw3^8Dd7sXP5c-h6@r}NINFjgX2%LBfW(@@!hD7wls#6wZyQ*{~ii-z#3X>t0PumYQB>!M>BPyEIOT5=Uav({2rw zzO|sM;97mQf59d>L;b6hxjf;%f0!D}UNLXEssrfF<;iP#4`!am#b;j%E<}$IODlvE zw6Tp7D7TMA+RlefsRSaBX+!gc+r-&zrbQj4b|H^I8bkz?3PA-@A@o`0lJlgbS6l13 zk@cES7v_hr;kFSA=G}kn0I6Pi!618P;xUWZ1TjK30@J2T)!hQMVc|IdGR^T!S_^A_ zk(h}yWV=C%q+b=+EL7=fG zReiBNuEu>vFh0iLdq}`$!i8j3tgPolTmn(bJw8&nMVxDIGJRjhU6js!ohQ&&dmbt1 zCxrGKKP_TpDS}7Td$zfw0X9)V63X&brpJ;GUkdhG98>^vK2x`iJJ_)M=4=mgNmB2y zR6%|<`cb2nj4S1Mp{I?K7jzmAi^b#fgs9S>d=vJbc2;!w$tFsZl;q?_dme${-N|Gde#}mAY?GOX zz=2Eb$f&7I+}_x7l<2+2f-W~y@PKI9|QJs>ORcd;4&)5aFkGto+Vti*(A+}NGSUjbtaby|mLkwHfwR0tR($-i( z&c!vJc5?EnzfSNBW;nW0dpnL@jdK`PY6>6yD)&6!KWYO+ata>eu8oVnJsXB`(m%&> zMcIDnnIB=9Lfwl*lO( zP2j@faM9i@pD$eH%7A`s06)S<+4nI$$3ZwZoYe@E&mWFRYoX!fTkQe;jP@^+or3xd zJXT$5>&}UZoI=0~rw!!a@4e^QyCGqzcoDe_btbs9=u*2oc6lk|F@xZN{HR!O@LJbI zh1D%9dXo-g^Krd3Qjw-bo+5G9$J{C6CR$aakDKe*F3$1NJy;7uftH9J+i#o1Z7<|( z;DA0}GkotvoIc2=9wJRGc#V&GeTfua{w^jq4OtXT!;e&IbXXH2wMO=Eos8TI?+|_| zMT2JEMa6z?n(}vXH(#YrsFD^XZ7Y9qAuYBM3)Rs-9;lx7Y5e*TY;oDZ1RL#te+@E1 z>yzZS+XJ>}87g$Pi6}!S;HhYdfP>^K?9)#;mDy4$ZW>U8`ZYkE`sRGs#!@=lgG$F~Vimm}7LNiu;$wHZfk9?x$m|$(nP0Nm?M}lXX z-(+3?@|2)s+#~ip(%!b;w7Sh-y~Gwdq;wJd3x&rgbL;kT^^ChuSHZ}(c`Bb=oUUOf zLuKSyYWMBMjl;RJ`6Y4BEE3JTUkOh1Y{Wh%?fDe7%ZKb2q-Q2$J81ISYud7|?Idq5 zui5A*y0ka)XF{||}eA0|(q!1UeI z3AqdgGyp|JqNFYOkAh*eJpPQ&V#rW5;_^o=Bas_2Q!ltKn_%BUNm{z?qu9MZ;DoeU zW8Qzb;%BIrf+r#@&7XQzG&wgFxL@d76eGK~K)-^?+0ImV<>xhd8e&|zVm@C$x?=Ih z{Hl{0HPPB#IP5f(aCF;NV_GD1!n;RBC}t4S)w&XEC@%5d{tG1Rd!;=Yta+QWrYGHJ z?fPfrvmOrpIk^!E$pua7lb9Xzugj3_RQd(AbX|I5Sjh3TB)7+$Je8PJo@3NT4h~oEBs9&BvxQtE zR0gB8FT3YVyM5P;?ikodoqf?mB+j@GfSQ^Z&-bd2C1d{zEd+??Um^FENLb9`V+tk#T5C9=cz ztuI?fTWgtbMb0RqZziiv!^ZEcq~ygusDbB(+R2#HW^>jW;XZmeI9DugZ`97J8}vAf zbTVhCK243WTNR~!mnR#iNIdcwq_4(i&21~>-g2X5ZSp~s5T+I_F%Hv`IeZhcVYHzi z)WIGj3T=JE*5;Hj62_GqS3=-$#dX+RC7E~(G%8@&ADRWZt9p~=-QjPSEB2X!59 
zTq`Uo5qwD6v9)JW>u4;{Q~Ut(efasO32}{zMJiXku&Vg961>|wi7Syotg&>oQJkKl zBWrt)Bi@31TR`h4APOTAphI!f8OevR0oUT)_{fwgUG1IFnQ-K>#hRNv7+S2BGsr)5 z8+in&fDMJVM7T%DsOa3>g8}{eLQ`yvlODrC!X^&NZnDX4u1Un49x-*^yP~|A_nxAM zj>_tD3)iFwufYo*ukE&m+40G(Rc)`AFP%lLTql>$H3vM2cZ`%YySTfyDnF)wnlI~S z!VBA&`+1TMbbMR})PCttGb{HFZ5A3EWPK;OZPnF15I|o1Ydy9oMU=~SI34OsWbuxZ zfkLaFhJM&T2o|yKBf0k1#D`;)rQ|A+CHHl=?*Q4OD3Dl@G0vh0CH(q| zyKN|t@bwgeG&lfW@3RQF-@UWy(+%DE)ok$@QD#xzgAf1bY>2eb%iYI5z&)| zwQ*c(y$os2OV}u#h>a8!Ds^;rwPrv+8PSog4mEHM#KeqjLRktU-AdIiy{a-cu>?V71rr3Ss2yG-1&$(mm0nuy-%l$XVvIkM(uCI z#FcfHA#pva-FU!s<1YZNvK6A+oZaXdm?>)=5vYz)=10()mJ#0k*i}WffmBo z5A@Y2Ind)>9IGsA4)87HmE3fc)U5iN%EL8-JiY;?z3VQ$pFv<#vH~9~@^xXKeFpt| zX73wZVK2UT`K_$zQ>KqB=)yp=(ALOj@rBc%t({TuX#}FJ5jPzb+vvTry&aF(w&sI} zPk@F*nepk&3-W`##Hhhb<4x(k=e@oVdhtrQeXi}figIG|$f=ZW6F%4nDXJ!2rgvYh zIHm2=FPj8?K}}1+f^cpFJzX*k`g)+?VHq|QaEHe$S+rKy|Kab*w0O^eGj{!H;*RZz@t;osV%WkU((d%^T4%}w(T4lKnv*UhEU@_p{ zru#{)w>ljO>~yoUL&K$k6ZMCMS5%(1qMw}0xr{we5fs~yRT!O_M`zw zICt?J^=#x%PzAzgfUWJI28D99>=J&??>k#_YHv|~d?7FKyyKT(*Xc}U56czcp+kLf zGx(+ta$30*ccAnyTz++P`TAuTK?G$63%?_Md2{e?cktT|duwW!yoCgildX8t6h(-i zJsMRzyI+1B6E1(5j7kEOCnd_)WMOQE^Ed}&?c7F2d<8hm6peMNa*`LL!X2Y3KdtBW zycX6;aDZ$GR$!-6H!R5hrQ{o|#1C~ChskE!UKah_OFF`+hl$MtN&P}Tm#vQjupZA5 zJe9>gQq$Fo7!lXlkF2onD^rnNi`;lOP_t3e06AF7DS63y65Ll?)JIOq@Y@5=+1uvh z%*ksr+v53FiZXFIQ<0u?k-q*9~jgH0VQ0lxM z$vSXoq;!pt<3fD4nJYpv7sSd!VsEEWvITV+LOm1vujFCtq9`(%Q3j2?wTi1Oi#sGH zUz1Q(uNj3&R$bIK@u#7Z6MN7n%*oYJj0%pb@EX!h|J*I+!UWRjaY-U&vSMF z)|M`B+2oV0KLyx2B^M5XSqUnaKYk6DfalvNb|YV=&WS~SLG(b~a0`bhaEh+$)&_*@ zP>S#h7!1SFPIM7+8fQ`3#U?5_mv2|u%*1)7c(y9e6=I8W@{yX_C@73dRZgV5uT$zC zfw~x7o;6`qyT3uxYoXtF9dgJJlD)&mxkre(Fu3fiwQ~^lym2@y+PMKTB@FuJvK(q+ zJYb3O%$#r3&~Un3v#6olpF8W38%zAjPY;F?_qEgG7x;I~rPn?Wvy(mSm~yp<>dcBo z=<4d;mhS3UzAHn{{Q95(_f{27-0t2LbYQ+)j~9UUoLk+qNXcZUYG71xdu~U1m6v>g5N#5)X|p=# z^-0-Qpw+*wn!GvS#i{)Gvy<}%L<+~>uEj5l%@UP?!Uc=#Wn7>++rHna>V2RaJB{oK zNv`hXrU+*8%f&j4e2`7|)3)YsRf}CuOI9egqN?A(2^P$q(%xNnw&CMEjZPre17Uc( 
zyRT(&lqmcfC0w9v^4-&kp4K6mE1JdrO^SrmPV|*c;-9b9WNa)85KOOIln-eNt34C$ zwCaSEgWiBb9zEE8HP{j-G&G@)+Mbzi9i)eDcMnu#r(`_VzW8PdSjAE+88YIQU)=h? zsY#j^#|fC5R%)b1rbjD_Ogy5!-2!#h`Sm&*;7EyEr|^-7Neql-9-@7%lnI-?Eq z=?o~ErZe3()OU`&+;jsu#Ocg{iki!HtIh=(g>BdYk?X#=L7TN;tE_ zsk>se{{_^$?>0B(5k=4FweXOXHw4VnE-l_{MLna>u#Zw);o2tTm;7UBpy0;>f&&QFQfNp24Tp z>z$ZA(rD+f0@S7vZq+&*ox8yX8$&hq2t;DJg{qJj{h9+hd{0pES~8|&h`xS3 z#lQT)F65=2%t#Kj{EEF^I!6K8ti3yNAI7P$a4OmlJK^d9`tF|VL}46#A!&ab78XX= z=HM?!7~~}825O8{73NuE2f|YwRi)GKREHbioJ^}}?)WsCgjunxlAs?uEy|56i4~cf1yLJ}+37;;O#7OPpcX# zTjRY0qcUZ%3o+Q(7g$8pDtmZ=(ubmEQmlnV9B|fibzj(plG;I;D!nRWyoG(C*vPeA z&yDaLjune2?aYvOBH{DXvJY!h}m#^-0NvX(<(52j~i=u>1-*{ebO*CyuU z__1*%?C~pmsd~hc5?*^Q{ypd`q|-7C9!|k~r#nKV-?MysLwkU!5>gLDTNt&DJ8V3! zXskyzu#ypAV{&rvJTKVxNH}7mT$y*f#aa`V!j=rfH$W8%=~`*?zG!kH>A zxV;i~GHtI_SE!Jw|8chdJ{>9F!+x*wmKeg1p!DFtmRCo1qvh2QZ|8QqXQ)Hm`PB8= zxRI35S;>PbNy>{)%jd@B=lgpreG;F0fWUwbE|u(Tkyj?*1|dUcR7l(YzVW1r#YLNe zjc>G}y$IZFDoA1y&sK?yFH+try@2kOM2Rrb_4Cc^VBo_!qAL!zPPtb4*@|Cn>>b2| zgslJKNrH z*vU6n$LWnT$wOXZNiCP5t6XiHi$m&ntGCVJc}4loCD}~bC26uYBPHbKXYTCIDCOwk z%*VyvXx-X1{hWyv**w^+t!=*JgnX%7pBu5uCc2Sga|ws8-y3fF@lC^wSpr)S>K@;Kl{E~EmjQ-PNl)Nm8 z>4}~@Mk$q0_<>!mbd(hnww!s)p=T9xmHSqLi0sJy!-QR^+LJR+n5cHOUAhkr!-g~bi5Ia<3vJ8oovT>MD=OA) zTiTM{=u0C@?fjjbY6l#el<+M26jLpkkjmKr^x6k zu{`kv_ha^%7cSHi2Z#3OTv_mkx8ILsi@2(&9g@F7<5Kj2i|bBDw5Vw3LBz?pFg8?& zXGpp+)cBlIh8|@S?3bWW9vn=u$tnNX%Qh+f(emUnu7)r(Zj&4I?8E zBaPeraN)cvxVkJWgvT0ym&{%<1!&F0QU#)^j) z#?`)|ja4I@9t?Ub>WtkAtIenV#@x_Z2zoeAJT^wmC(z>5o2Iv-U9YZ%7<4NpfX3|w z_pP!INZ*QuCp?tpK_UW2M7>(=8Ji+U|5B_Do|1xz9=7mP!zpmMtcqJu3uR0Xy6&ws zkp-a+7d;E2%6KI*`N?%oVIgV&oh03JN{JROI#Im;@p|& zHM9&!6nIR7$ux904!e@Q2L@80f-9JIJz-;rTDvvxJJLvn=U-AnJ%lV+s-03=Ya$%O za`;JOhXOMP3YXKltx4>kUmLymz%yw5!?nhh&do)~HjH*uc|F(*kS#D6NLC zn|TGOr(yXKErKeo(aVIu8_GN1+Zk81*KfDqf2o!Ybb1pA8OKU`eG@KkQA|tY=c{Zch=rCBCQIBKq)vl7HH6ZHnRB!{90RQkjkE6|vJ^xUC4+%Q&KX zn7KF{7A)R_7rk)W^#vnnHf4g(r>OBkDlzM-nu{4IMCpwKYpbMov z&O0v4Kf<$XoyP1XlWZ~LMxWNcK%@Tom3FtX=UTJ}j4X?3F-P6Dbmm^rUUf=5>#hdq 
zwR8|!I})cm?GD3ub6e*G@P`1ps!IuyylYV!D!PZ`V6~g~LqCe1-&RWly1s%+su9x9FHcV~xnndP zF`~KLiqBKxaMOC*`QLW({fdpJD_A!Hh#afqzD>Jwu0g3Ii@NI&z9Dh0Y-B8{9#=M< z9O8r4y4=U99xk?l>P?&uGorJ^a@cT9%J?(K%rK+P*mgo8w&L=DwsAw$uJTfBBAbJL ztPnbm#d4hUkHF}erva-gEQO{^9b_ui90jR$evu7!rz1EKb^@_5=S zF?Cy=G<$AkQ9B#>xK<*Qtws%`0m_bACknha@z|~|fmdTb(8L4jL(`~yJEZYR7jI}y z(+u$KjV2YgXIy&IramExeqpf@o0H5QkaJDrs-J$ER-*tS(>HRv=!4>(cG?dC14P&E zxy%g5u243XX$zIkLJqwNlA;{T`=cQ12knsn>wK!f&FpY*wcxpU&Yh;fuZ^9GzOaD# zDGs@dMK)^$>b8uyN1Vc|YXY7U(3}vea&*Yl^#jVLsj~feNQP|U5z?w)wL(>CSHm^+cxG(wumkH`*ICY)^f-)bw;Q-v(C9xV$}bC%r!cP`VBghpB{$R{{1?gWf)1zfgYmAW2irr0k@?QCV`iPY>5 z-7}T6bh8Z=s7TnjwxD+v=>WSm^XS`Q-uF_%4wG<1kB?Y@kLdEE!&s(ds55LOHTimf zG{l+%{EDobvfLXbH->Pl!qp{$+f@Shy1ztM@N6qqd(~;Ldr3>SZ_CH-l(O3?&afD2 zdtv0K!q~RkMV0f=Y8cpDYL6wgf6v5cNK^U1s+|y9N^!hID@uuRsJ7c&pyC@T!`_|e zPvXXVxzznQUvy{I4f!D4xOVJpF77(c{}smXrTXb+!QWptV!bc$(P*PEYnqL!D5)MGG- zZ^5NFQI>Ezi<)BLs-iBk`Vm#QIA)yCb^@YYDXsxgB{oFp&=Iy4>_ns-9}(y_x0g7j zJDlA()xL9W*KodQkwuiJeOcDRR8kIz!W6noyRs6FHnH2<5^M)m5lrK3H&=Pv##Lc> zGuK}@J^TjjH4(we@)iL>mHJ~AEUh)+9{1jUn||hq$pv|Ij&X6y={cmKlGUY|0!EAPDG<{&|8uQRwGa66uYN5a8oWi2*J?Q#Uh?n7ue}h93)=`7?P)>Y$a~{Z#qcb1EgH$WGs?K# z>gs67zLZ~%i{JGfVzC8YM9M?v^V$Lx;v5;e(P7!K_}D_Yw?_Nin&r4ctiplBvrW66 zyl7{%-pQGShk0cDlX`hvS!H2Nev~2uuQkm*2zD61K+)d--_X$fxVr+rjJ#kD4wx-k zK#lJJ#;v01Pr3sgji$<$S6RZ0aajPHcJErgc}sH}N@ow!+Wnv!f(FO~bUB#?aVx%p z(IP6FMneR@h-5I)?EJaJx}B3IPP_i@!KW=zcX-Io)NFUy(N}G7Cx^L1?aD`4!7xQJ z8$P~gFq{3&mq=E0g>{Q=%RF(8H^{C&A@qoI@l3NgM$9SlwAwXFrRdIr#wuD`ALhG0ytxcBr5Z?^Y7sxrxtKuGaY z{8Zz}B1lT111TBbK$@VMN$-w-NK|K)P`Zz*w1@?{ffl|EK`+9l_rsqb8GvWjs}#W? 
zAEFuvdL|N{i;3Xbb(XejV|}MMMWXCo=Y&=yo1k^=+4LT2&0ia}_PlRbPzTk8NT({- z>lg_3vxe0DoF|)%55&*yp~RLLznIF6%q*(iO02^tSnND0RZ4=-!TA8BHMR@7 z%sB}UoWB!Srw!_=i_YwgIH3c$L2h!g@Hd@v68VhopfQiS9#Gb7!^c5vsycVl=N167 zEI#HH%bb>wA`m|oHzFul&u#(J%22=qu`u@8x$Hi0sWS=|3292T{5_;L=UE+8M_-P> z3YC>B25VNf+ngY0=4CeWqCdXkc!I#>UNAn?;T>060S$g=fcia^ePvQGVod_Q(m-#p zXs|_)+PSgX_% zpLY7ogfQcD%MY6x?Qi`U;S6xr0iUl|9*q*>76&CS%c;>fa`OQEkYxdJXSJTxL4LXN zTxR6(3GN5z@>HIC_#iAYr1G8x-EmGUy26@&JG=s2ci8W&^V?o0%?DQBQJ(4sF552F zKnNBYSN@`{2n^wcgYF^lmMG<<8-D?YYGDs_dreNR#C@p#v;m&y5Wf0!hWt{&T|5q! zAE?Lb3%Qyd?>)p`VbInCJtyMjhsl{#n2fygLarv(nG)@R{3Jt*>t^w&$26l!ZwIPB z3tI6_Qe-fJn%oA)Gw4qw8K8j7!cDsX>S)#gW!dQ<+V;DlGK!M-BoGMG00?7!*M;3< zL*~Q=q!r@=D7QePI30J;XTZ+n0U)+&)raW6!FT2o@Aag$+mMXISxA5u1>A?Dxf1dS zh!rRbB$X)&pmiA1-Oo2pE)~73vO}7dV!ir z-|x~z+0UT2@xg~GgDL8P3;t>@pp}uyeUX#U2tAEDQ72u|mCGP9bWVYQnh2WBYj-sjwT9gmDWK{R7_!z^cuJiWCK(TkLeCOgJ#Cxz%lE*C!bA} ziWFAhgSA3==uiVmd#I;Et(VqGp5XKS=T##6!pQ@rBv14r_DAzPc;M2rxvt)yfeVfu zY8f{9nyT7-{9@S4V`wH4k>>`NJ;^k3*YpNxR9!j#EC5mcGIx`6bWd1rgKm81@u!QD8GOeRJ50!Q+6Bf@ja~I zc*^bcs&S#?jw4IFlNuB|IR_kU;sGK9S}ISp!SRYne3gx=!4DuCS4ecT+{<;+hlLWT zzhk~oVCl9&*A9fy4Nwo-b&14bY(Ul`LI0gP(XZ4tIp^E6FpeB=RZTMWE-AZuf?Q@+ zD65*(AoL3eIA)1JL9NCW5#Xb4iFh;LJu9IWzuD}1=qlW34|htLs<>76zZ{|fcN zEx-sgR)bVuHpr1m36(pofblifywff-IQVV491ZeAu4a^nPYzKBx`R>#q%CTWE}9B+ z(62=`odfm0R=!Yb#6#fdf~R!tPV;!7f(P=&psaEF^$eNwz>2=L;%_FF45DBLs!-Pj z4k$DJ95h4356ZP_$lq(tw-!q2&VK^PpGoMk!WpeBDSBs*kB{l^9yTt5s!d3TmsW** zikS}HG3>odG9CldrA03=t%H6E^!}Wi&_H0Q*VtW%e0V_lPULxDXM)BXGn?L@)zCTa z(%E8b3X*@<2_!5yHZS-aEc z2O+qi8pV_Ev$l`eN-F^dw56gtjA=_Rj{rV>+n{z8N;rB5x;z<*&B-dWgSvdeb0Cal z;|JxkH!$&_S|)LxpK+}rS6|juov=)`^U6EoFHkw>rwjHF`?;fg{}X#~AkuAPk!|(k z0@WEdy#-lZ9*FgAuDol#vtY9rw@q>-$tHUlI%ZT|O{hy*Ck#QgsHnb}%za`GR$0b>wn#e~DY z$U+|>WdGT?+$f|Ww)B2mBc|HmGo+lQ9r)CF)u*x@bw5PQ~-y!rmV z>Z28ZxsgM_emklL2GqufPNQhXDJ}Oa68uEU0c3>P3-IE%+0Mfu>=B-zxv#7VDP=`zsl`%LlfE z?xP60@(X?+!v7n-92g?_8@_!tG*Xw(SQ|~VMNR*vw3?Ez3jX%w@0i&OurZG^RO)c^ zM(`8Z?*jBEh<``q3a{z*)IfgxmSE$WufDPJ; 
zr%ET$;AQ?7a{O`iNr!-_lLO47tUFd_=I?Ffy!4Ly@7U`BUA(fcx;D|7Sv4u0U$+6S zL<&NlAIp5dphHMxfP((KkVgDpuOcVbd+)z3hRy>>xFDwngBJN%?S1Z_Q2RlIza06a z7y3NcD?;gVZKZv3Qjso&{oYJ+W;`blw zB#ame$OZ`)k8j}Z*3B#)Uz6}C<*~uknCSAj>sC5D?pN6Fdc)f6rS**uAKWP>%NSVf z^cYSIZ#b}Kh+2>+P#f@^&+i!OKghGO+V%FRlPHMZt*cttv6#!hg)6ToldR&s`h5*_ z!}9U==n=w*5;E+h*|5(%XImtkdtqG|arE&rvUW8FMZegcj}kq#9*sgJic80{-;y== zJg61h7&K0YEAw+P^)mAh) z4SX<;=TZEzKW^CGHYO{gfKtIoA)C^EV%>V<+J( zUK`ERuhi&DY7>+FNkB$_pwn=EiuIjAf~to!R40K3ouQU?4RC%tS-?DmpjDuNPD8?b zzO-@1A|bQ^Mo1timXSW8?jkrzzD0{S4q=OtEPkcSvcX^s)i5e=PPYC1xttf@4_45H zpRKwjg!8qT%#pU3KBUBc?Igwe02;xCs)lKZNcJh4!5XgY_B*-V1Kn&?0Aqx7lPo;S zhIGD*#R!f9v(AL42%}e^t;P+9ZdsP|{=JjuL^hQR;A@;FaGGw<=U9tzR5L z|9SHey^R>6nk0f;T_Y+a2C@{>pSmPi#ay2N8iG7t^iRR_UBLY(4M8d&Fr+WU$d4WVL)QGfMWbj)5U7t=`~u-YMm)rC#>PK4YsCnViQ0x!8$4BK zN!(2O>zTo)|J{-)C}#rJg=1ND$gi@^XpJB&>UU)P&1$j)Hj^fW9Pc3t6!_w=0`Jpr z%l(e$M~JXZl%s|#-wspB`UjrYu@g6+{O$E$cWd=0IG2t!bM*JC*3gu~{J6mHANoHa zCigiFJ%f45IHejqWjMz`11sZ zkWe@Sa2G%D_TNhpozY^!L;Y8T8iUq~PYr{Pl%$K7`M>S~dh8oMhwhDdoCf5{0uZe@ zD*}f1R>G=!aRMi#@~sH^$pKK2#^&lo8f4c6o+ z(LP=Yh5jDF&8s7ms>oj(nZR}t{5AK4P!>QjJGN*RDI;6_DHeDWy_FMYA~RqSOQNtoFT|Q9BNGw$UknM6 zvQa8Q>wIk?iX+@28oFror;{mbqYwY&=jqvU-%-l@Olx6d`x0NUam}daTDi zxeP5s#Oa(>L0B=Ur$vK@??D0Bo}KPp6w#T%hiCz7ldMNk#X0PO+W-v)AL1P&aOExl znZVuTFgsSn)kB~h5e~YBgcZ_aevlCj3n2bR8;291W&NogF5V^PvRCW7)%)Rrzw(=# zc!@vXIQrqp@TW@<<5x|6iZNzj@1wA+ykC`ok4KK)%GiAjrczg1k0NTEF{KqG=K1 zMIW*tQIdy>W3={6KPB_WTT)s_FfzIhNh+dYO8~F}OMK}B-4d?C%xzWMzd?9-!@nCk zhey;<2e`*w=2r~8dJ8RdrDaa4>;tK3-%pS%a9qOu{Lj%3KP_sZ8{qAL!FCc^Z>jMU zF*SwA*{-zuG2T37aA#MN82{6IKM;$+`FkDxI%SX>J`LLgki}feUKWhgdE;{gSyz?< z&}O5XPgL^|MbUj{f=>SXsnF#Bn!+ykIUsJdcfz#%jkx^ic4Rv>bTT&l*8i&n8hOaQ zh48=KQknZZ{!=olz$HuQ%uTKjl~UsqXh1`y&W}qK${2B-=x6PLv#7yY46?O~0SaQm zazWb)=d`WMzshXyx@RP;|3^=QG_8-0fXBGjpwELi$HB?=;cYHu64UwGup=iZx&=St zpJw#$ib*nnae(EP5}v>Z~>6GjY|e@9LEpg z0>AxD;Ov=*LtN+7Gy;8UQZMq}cVqL5{th{G1pezE5=R3{iRl}nbO2lj-G~pr(zpcX zXQ}Fx69;9gGNRK@5*=~FpTPa;K>CH4&?VHfE2u`{2pF5m^Et4iY+q3 
z_dV3*1qrfX1BtHlnS%FEez!a!ZZL<>_YW%9fkdsZjdpNOwZzn zfOE=QSw!|~6jS2lFHrh;SL)%vBO@MuHt=zdvRj`(0d$nQiI&Z`9xF2I?Cs5heJoiD z5=K4+>P^8A{@;Gn@0(tkAUusI_G(8m4j|j5;wvT^r?w9njTOfw2v86Q`hhk-ssHHLXlR3i1Qc<{z~%-Q#h?;F@+8^x zw^;>{07vXl{`U4UbOUh?CMHYT+c2KCmH5X;kG`&@iMP#Mpa%#gV!>_+tZUAx(pR24 z56^*fWRN7PF7d$|_!EzwHD|l@{hS4>_;9kAT=zy00y_Mj79F1GPxVHN1AMOE(y@gT z4?mWh*P^!^)R78IwveI~5W!in*??#3|I^b2KD$pfh$ROd|5|hnhls7;O7+cJ!s%Ky zqO)xhSOO%zoUYbJYLuDH+B>l`F_--Qb2|!nFN5t*R7t&l72E$?dF1u&YssB z-1cP@dF-8Wc;87h=&Zmk3}MAeILY@njfC*t{3@vcOwcIe93>p6TvUNvgR?$I5~!3MnlBS>?6!0xOHUztGa0308lZ;(qODN)N_6KvX; zy2q@-ZS=32Jn+NA$@C#2is2x08e+9D- z2|*lQCp&)T0AEQQpn!*|-ykh(;ABU&vHXX+Ula6?o^wxV8OO?*KI zeZLL=5akA$gJ2xUyJ@|eSCEsX4{W>`_wRW`2^0o)jYpdu=}6g^z#||-_QOOkq9)|P zoGwFYQB4za!Oxb}6&T2k1y@)WOU%8-%6dha_6KMBIiSdT(la3VLPWakX&$6U2wYJYJWyBAJ%SiiDUN3o@9NkLZLTov-SB?o4IU-u#(7G+f zV8#eXKZR`x33xnV*i)WZ;e>?oLwquV6SU@8@aOyOSf8~yo;@j2=6>f;eWkn{=f=g4j9@05`ULEIdYPnIKEay}Kp1?Y*h^6e%%00AD|+EbF?l0qJQ! z14An5?x<|<5776G0)W76!09pfaNf>_eTyCtZRJFrKMGiY?G*Jt56;b^C~CiD6hL4S zehf`|HpB9pjap)OUm{}dS&Dw`bwm`Tb^^qEDFxpT+hOVvlhf&QhP_0}hJw%^)nF3H zTdKW^1!)2s_@PQ!!|4_LrtmB8tCU97@)VLqTb5BmB5_XLvdP(N-vxq}Iy#R3F2W4892)fUgeS%3>OJop*?xzNk#%?g9HoX=`mMg=}SzW zSoILIok3WF+NfA?>mL=QW^0I2YMSqABow+8DbfeA`x%zfuK|0^8b~Q2w_Of#^e3Xb z@hYj_aH`MnoBJB6f|;sS>`MKT3t<>*i{54!NM$cd6J9OEL(43M*q7X)(C|U$FW&FXzgV- zv-K@jrFdR9L(F;=4X4mDiGOQ}h-5gVS6+7BOeI=U&CT5utG42+1Bx<-?`tWe^Bk(1 zcZ1=`dzy$~pxQ1#_UMiczbLTWZzRp%GV5+|yRDX5lx=%1@(-ayLw1KTiIbpT<5GZ9 zbSufnnNibjRCw%VL@TIW2B*#8sLrOa)0UOg-Roc)?Oh$MIx~DKlV_a*q^g;;f}KMT zUTLB|D7%(5UngtNc?I%0!8+HyXgK~v2arcK5qF9uC=6xg_>5d&D!fDhav zGPm>ac@UOULFSibVJ-$-+o_iI5TfpM!M8c_LDvq_^xSq^#PI+;9BzbXntsuFm!vw^D3ZSrf9GuW!N+}KC=Yzf$c%Q6pXz^1fsU+45hu1tw zz&utLG@U!ebF>~hh`OX&cPxIO;Bu=`)&0+o#Apv^LoMzU7Wha_EK ztoV9k-)AQwG?j%QdBIfi&;jhHBRIQovg#AV>1p2)sbV;yX9HC6!8X}RnK_1n zuN+OMJWksx{dAGgdVA2UO%IfN1R>#?z(7CnMuG{DLuxz8ckY+tmYbAQN~W#4N+*KQ zU#uYB8hkxu9?B@+0l!N&jZrmS%7e#|*a#3un`RE-c_|2#-Pbr8{PgaWLc;Nn1%zNQ 
z4EuxLqA^nXBJo0E@xi+OGV1M!PQ+W0Z{8j#kFT;%jA!}ySet+ybo&TRn#u-@$&mb_=}zuh04n)U^knm0()rl)AfbksO3 zXX14waXi$0;&^GznZ9@%sU1zD#1+ij9+>HDS>4j~%)6!@eGTBAfQax{{Q{y9k8Ltm zKXE{tpj@5pNR&UAQy#T=q#_~9sCtkf-ZAH~r~%tC03a|38+fy#5z?)>12ct(??fD5 zLiR~hG@g*oxC;{Q3BDf!y~paAQW)fZ+Q$>M@rMOXaSWx}A|_Ww13zr%^ETf9W8!h* zn@vpz>lxJP^fS@jNKUzsC~wl2AIp3cmEMi4XDsK@s?31dd>emv7D$`$ovJnE z_o=64PAx*^w$}0*Z#EEQ53%V)7MjTA-}cu{?FPMWXX2?Wp{Ro}gdHk*TEIMn^BA&p zepDGrv^G*~KY0U6Zcvr)!A;VLpE4j21kE&4__V#Ch?`gp6B#Vt;ty&DO_}pcmn=;g zIM#KMQD0y~L=X3h&|cy~GPO89Iz&MjH8(wwF1>sBs2KSrTIRY3*pQi2=}?*DoHWQ3 zTG`0=eCEGl#d!qhjwGIMj!kjWfop1uN)MN+W-8WBE?v$$gFoGG_ax-Nco%CkmKg;s z!@yIQA!EYi{NqrXDDp7oIvISwO~m_(4#7u&U&K)c4p8wXMfjv`3V#7s0sjOBZ{v3t zRv`&(dEDP9iE95i+@_{WE(kHAJKFMzUz{03CtV!pq1^~Q;IKa5nVX%Jtl-U4t9iRN zO_saUXqbh*2dBR5fsHYB&LUKbf*}(e)~mo#@``1P5vvE z&G+8HUXhS=T{d=jHorW+!gGhs7uN6lD}e=Y)x^b>F&ey=;QNf|RABXjvA(wNjEut4 zKOk8YI`orG%avwlidp&7ab0{vybiSiNZUKo)u9 zmh5_We?f#ceKAR2OfWTW0T z*k0k|{vv?T0*EVy6Y7;PdpKcH#fem4R5bP8=Sn&S8oeLP6eeB)X?B~z3&Xt$Sv}oI zI55u`+wdx1CU8MANp*A2Gtf=o2uv}gI>XYj%Kdb63EY2J<-+C&;)q?bn^2r#7sJN) z3W%uK(^e8!!1^FwW{6VZIC2H}p$+W~FQxKZ$ zL=TUgS7##5R#dOuCgi?rN9@B3;PXSwJ2ad!LBTJf*Z!dKi9WVKxMFIUX{^kHl}lD z!RoPs{xmH1Fc`4)nGWR0%jTZl`JYD95~Jo7eRKv=l)&mW!R?!4X;qyD zCr`LYAk3~00qO(ay-&=Nm1ZfL<0mOUPi;wC?N~B&%PxD44VMh7tkN<;AoAXiI_HBBxTih z@euuisLwNs0{Tn~QwTJg6<)~qe!xMkhb47CszD}{z|}>~TDnv$;#XqcBD;80Y?QA| zyBxZd6^ag}K=XKP`^Lrxia`{_MchKYBZ=0^GjPUY!^0#G;DEmS*jmY_LPo9kf`mA1 zRB9h3-Ud1@_40cL!8831C(Py30fKRryuw|`^r+Re$px1j3S^OpI+wIccAWgDZIu#) zn0O?ih**ZBF672)t|d{lu~0_lVcw*&wpy@@8~+Po#>GbwKP;6G`H-^t7Ji>2VEhZQ zZGBe#N&%mp@^wQNv6oPIf~_Nqa=Uyp3FZu$-V*Fp;-V{d3ocE@au#ejjV?7&ke6=( zMyp0&Tgol3jDq-}K-eksfsXmnX6n9_5;#d-{TfA{uZ&P!`OmbiHuGGCK1;^aiB@j! 
zx@WGbxYZ>ADLU;XAVAlarmt#EUj@}fpx9m$%zmc9m1ax?-;mN4HNw1)6t zxx5ws`Lz?sG76hY>n}!=+8FT_1RZhFfIMq}_=_CE&>Dy+8Bh9ksd1CXuEcUbDZN+n zt9AY`KkZ8rh(6>zNGvUX82VV4Z(RiCq{Kwm$IDK}@-na7xjc0>A9#YKTi3fL4ggv| zqGKDxWWbF>uX+1yw`Uh9x&hfL;{!#w+BVsrUSL5t>0o;w4Jv( z{~|I<@Mg-DBI*it*OX`B4UmnoOy`_nrDB^k*nBN2_eAaKbn&uWYkWsyJtr3_yYa8t zjJK#K7g50JNJTRLIVl_^FvYcTQJ1bfVg)x-%(9_ z9lN;&Y29qoP(H(t_k&Yp{SYW1KQ_q#S+r!_zcshU9jO0p02?B>HrDHHa#)WFxYJg?yJf4GAX& z9??UbEz*lYyuJ>j%Z zPIxwF>8~a5_wtXen`Vl)wZ6@I`PZN8UJ0i@-fBZNa{AV<&-WRFkR#MyztQWQBA?Ez zZC}KYuRdn!mH6MYuBIltI{w$sD(3rIp~>H-FlxrO2WHR1dZ*whnrn@@A`EpT8r3An zbtQ0Gro>tign%sOt;iIQ=JnoOx@1PpOZ&kEc;TwN9B%#nT|}NC8Z*3>-czS{3;?e# z$bmh%tJ03|MLqo2C0N0UBB4VMRXO=TbHef=Y#DwJm)>#G^Gcm!Tmmw$ zK>wGYIN`kS=6rD(|I{tB)qMfWKeleN&)+r#rxCR|^*;266v~mXa~oHtviJcCpKAX+ zaZ=1FB04IPShMvH_=~_zAfU4avkt-`+V%4GSBGG=h#Jcd#ZzgvS1Zp~k;jIw{IN9J z^)LH|Agmp=q9yQQu4-!(34~_t;wQmKK@rBzjvz{za*uMfcP1IZrzs-g@r zvjGkDKW=y-;UXfZE5dCd5(n349u;lu!oN)o+k)66E{-$E_yV=D%bRDGpZ)h!D|r|o zw*)F$a=;eFVQ@clz&}r|F930D+QXON>Hh%nS?4G0;!52QevJJtPGX&O)fBZ-lP?Di zD-&EX0Qgg&z4OxjwlB@&KcP<5f?<4;kzED!i}{t+cl{gGx9<_$<<#a=xe^N;-9~Oi zsvi&%tyM)7{7S&NTRKM{7RI3}|GS0O2BX>H{(^U9!mwr$u6`sY6j_J0mI-$Vv9uJ{ z36MiiNBjLpL3uripaSNzpKm9j0#@%M3W-S9aM%jX;l-YS)-?OdpNm3tDpvj|iIODV zL~<<&gBfrFf-rsh!8$rHu;_99Gc&2h0@!qAteu~=aRq*H^rOMOe0Yr zPMDHy!oF$*BHrSGMVwJLoi%oiB4P>tN%@4zo<(W%X6e}`Wt2-Q8%jyKcdLExsr~O z$nNVjG(uv%B2)s*spycO&^n5xVo+lwLPBKX72YjDK$zJoA(9458OJ6XZ`%LKPELq! 
zs@2TC)CLuZwKON0$MQH<*8B#<=c*wR*yZN~Ai;YPim>{S_3Q|UI|%A|)cdzz=Vm6B zr`8d2=y{2(^O)}!I7jpBBcxkeDROKf@pS4RaK51F=$Y{%=-JJHoCfWnW+}2JP{v`% z(*Yt!OJ32Q7r&effTH%b3l*%(!-sqvoi`_O%tzIsO9*txhAyVXw?BN{t9lxmeQ{C5PQt4O~1HyomxaB|_-*G=KahV5tI0EBe;a2w37 zairx%DRT7Km*AEH0}E(9S=Wpm#sPrd?Z_xCjg#&4q&w!lqIdE_hQbDbD zj$D0}0Cln;aWlEv{s8sMLVa0dD6obo=&t=+g?Nr8asoIb((U9S(to9Nw(Iu-()WCq z)bu>kcAnTO#ZY)Ws1#|=`d#fOa_sLpBzbeH$4~JgoR#TRiu4uz0&RoKnUik8DX^sv z4u`81^oj6rghL;#5s%_aC$3n)QE`8qzS=%H0nKCc&xY>t??z5se;=jz8R<#?r1{9s zWzJbz8@+f}GT03QT(#lSn)G5~nri_}2VaYV&Tq1_S_LvyQ_xQ(w_)Nf?JS(*uGYHu zE5{;E2Oh<6SWkLIPjv#5wiLz@lbwG-fI?*-nhGym6|rK~G-hM2TDfcS18}-Z(OvMO z=U9Ettr@GD6S*Lg_coyH=8F!ISBPG99pzZ{mTIVV0?2w~iTEQKJ=p8CpLq-EL;NV` z;2baW3axYq1~tiduJdkoai`LN_vyHc6H3CQ18b!ucVB(oqNuHiDalqG;yvJvvad3N znjmf*PN3urIJ^_srX-lEeC@N3AUnvP(oAW8LNVrGyB@_lk`9O& zcAO#g;0XlbzF;DGXbB(*xls+Q{mPXc6iD|*(_(@7GB|?b`xiK-(~;{Qv87ft^vBh> zj7C@)ho%Dj@#=(kae|h9Yi5t`x=617*R!U^r`Qs|+Z z-S3VH`{NjG9b<=AR6401ri8hgIU&wpAl!n#PR%2y#|gdq>0V*+gHw0968}}FzBOA< zMi)5+cT1M)6DsL(p_n6=h#WApA5XWkf)vk?ES*AYk{{HJ=;bU|SBWB+GRmrocy0`G z$cXK5yE1Zs3=eV+Xwq#sPgS6gS|D6S=v62j-%*O3SO0|^PPJejg7X8EK8)TII09`1 zIkz~UMf!IMkK(q`_3z-{>oW0EvumdLT+9nN!tNUN$PLwe4FvJKjo?^o-4OCIT76@WTvg z!r7A7oVl?pVlvTii%|Oz)vt7(-#=Y`s}>*!LLi&majLJ8w%8)|UaykOY!a^2hM!sN z9M+GWNWXr^D=-w!1G_W!0Xfll>&Z{9xLGc9yDH>xn*nA|u`>hmrIK(=y=d64jSqPG z&pJ1aw(Sdt10We}V&>-kqo-zCik(xxgq(1- z$wgN+=QaI9q??||7lW>M&f&$AbMprKYT3;62ThLi%$5+fFdQ6m?x~~x9@T;`^qX8- zkWE_9ye;96BEyK zUeq-*?HtkTP&4=KoncXLdhPM=qUWNsKUhAp>8~$^Mh?7)u}G8-;cw2fG49PY?Z}cj zb;@?=I?^s|=LrKP4y$kPf(TA6DY{*PWwxC*s=Wmbb_8R+8ynGyTXylk6T0<%NQ=r^ z3usGb+?*H`_eZAe>_}UBOySHxLR{Mh9?p_cU^u=^k(Ojg|C}n5Y(Y0ExA2(e55y;! 
zd@8|ZS@+kM%rb94Q8K%dk^HC3TJl6v$MsXL&j^Z)^d~GkWt}L+LwPJtyEp7s_!%(O zHI(uX_}K!xi)O!Dp}xF*yz^_KN?l9&Q~8%!yPSepX|}xAF-Wdtx%1AJa!k4R2yq#Tz|?wDq<7LYviMOtR_#O%dtU8HCk2J zuqWPyH|0`~@wJ?`big2qbM)^-Z^+`1f3oqUmhC5BEH?Upd(#T*Fkn{GD60#ZS4wQ`$E<7!JMbt(~?zx+L z#=dOc2O08qr98!ASDF*mo0BvJ6uV?T`p%@GPI{gELfJs6-%g-m=E)yAe)y==yh?9C zbBqioSy5MIpQ>|MZQkA}LRCg$2|wD29V_!n+i7OvSX0$-L7iaiLZa1$ekIv61eF%r zo*(T2l?vV=ZnnBE^ESOS$-akWv*HwEf_3OKVr)6^%DtJ%j1~HaP{x@f-IK@b>K`5QfeKFo-zuCM{J9=VD zPG**J>J8WmXJ>0}@LYI`}NWW@)*pi^vWgSXp=qTISo_fsAo9STb^o7yGeX z)W`?oe-c-+JG-*FK=0ZVtGQf-nlhc?R4bgI54;bSOnuAmx)P+Je20Q@p70I6Ym=%;4_#g2~vN3*KRf46QG-Q_V-+Ry={_1e{=vxl593|a)>2=@$C2f8e@VB z-ENlqQvC&4n>Ghaa$R_J+PrX5Wne)3$k%|bKU1yy6~<{34^WamA*r>l61Zy#dU zO_3kt#prXFNxH`safn4y-U!;fwDi<$d&o5&E-8o7gV<56V|%y;7QevyQN&U$s|{~4 za`YMYRu-@PSUKyKWNp)r1$Ok!{{H^zNH@Ji$!z>g{Kq(>3~q61i8~v&n1q;SSv-6n zsT3CE@w)*uvUoFf@!Z~Lf>z(UZ*5|a9+ZE>#C1&H#=R8wc)jD!A4 z4ZP389FgjME+#ejvbMjTqLab;;yZkgZxI=42(>)!CFaHLprSsVJ@aYIja0!A*YiD% zTET4#9S+~*I&ST})`#?*90!fwzYuZnk)lc|Ua*fo__fR{C4;*Y;~D7Rnp~3q#?eog zkE5l#!k0x=WUhXn52A}i)h2?O1k6k#TwtHZ{*mI}MZH;$rgdzObo7m_^Y)&xzis;D zgT3cFl|BhXXIlke$eL~HPMq7iE4gs0eYd4A{pyHupHSV4<1R@ddu8PuEq;G_)E8x4 znf4Q9m0A<$&#B5#V9NBKgO5y#rDTtF-MON{FH^Uhk6nHHt0v_xL}ocUp+*aH(>2D& z=EgtW*(RDC{7YZ>ASQ4uxLd$};(G1*dUUg$L8ZkV)zQ$sG4`N5derPMr>nY0h~~|X zxcnY$m-B3mDf*OmtjwR5RVvk}fzYFdX`^8?B4_ei9@C;_@IFlL);mTa`Ld1Dd(-M% zh@Wu96#Q-1e2Wb*b0c59&gvL>C23|SXVe(Gi~4Bf+`K&f4G7)No2NSc@Q)LiP)W1d zjk`>|_lsHc&zc*49(b(#y%~ca=g^;@Z28&lmpv)@?8r-m#0t~0!qZVboTCZ#HRm7Cm2k8v3JV7sH*)2W`BFb?P&Vu-p5d<3rK+BBaUSfr z@`#J$h?w0uoMiFBjB!-Fhwk_LSKHF`!`&Mb)f=odg$1HEPH^cKefSP)&hh(k9(~mm zoijW=1{{rCx069X1f}&9oavi9+Pg;qAuV+lS513hPP!WmHr-;5eTJtt2m@0wdA8&W{j5`|un`-S> z2jPR^I(>e`Q{QIkFb$VQsxSoz3q<)U8t)-hmmjj|_uoO&-EY!u#S-}k)4 zE|TH&ecdk}2EX&(i|(>BXufK}_eC>{MFcsewmI8e_te+NCruhDy?0K z>gfn)-2Q~yB!zo*;r`Z4RUNxSieqELOr-@QMoaSWsfg_a;q3~8N&(E| z=Rs3t2c0ksb`$aIs%?BB@<^hBj{3y)I%dtBo6kF;KR>*qP=On-tbW-|QvV(V2em5WftTR79U(_AyETa231z_)<$b=lYX}Te zTKE}n`Y_Bosbajb`=edHdob0~^8SXwCmhJr0 
zaWE;n-IaX9P|nvUC&JPVBb>6j68)1U8wk?lVYgbeZ)LK0olZMl_^gWY_`CAl#Ka>P zIQUss+>%AwQ45f_)7pTddEtCFcHfH7BH|+V2da?qFK!ZBU+HUR6lRRx@=?$gTu_2p4gVq(snoUBgU4ve(2tGPS95rgyZ`JWg~ zKpbgb7VJEK!}VHT*$m+sEw8sA-`T1q4xpwXkH;Sbv}suQc9dI5yad9uEzFLXJ1=NvnFN7fUM{O}xEc;s`C2fx4du?z_aQgkPP#?c zr4TK63!{B&xR$e3x#I(2&#e#dO>HivP;EbbI2IBnoxGDHU0p@f-*zR9@zI>_ee~es z^6Opqk=HK~4C{fSsCDpS#|_xU-W zdglr~{^Rl}mKReg49t#un&H{?KQPPbqc~NQCkjVA`P<7+41d1A?|W`_n|JA11Yp*^ z!7uGPgE}>vM+uWLMSkJ*>_{2J{@>?`1L> zd69|Y1Dd~8Z^O?mdyUV|4xRAgiq046yQ+CdVvp0r`)1HH_%?Cy2ha?9x7N%F8;7Aj ztr{;@#M>dAu--vd();nHv`DiEbP7Sb1B`7(#~qUo&a=a&pbfV8xMu{R#pTbV&fV!{ z;L=b!FSHPxH`%N?D;X#f4~M&ynZZgQvGhT&%mrb+ayCe^!&K=&N1VSLMNPt+V~4Q0 z#KB#rlnOoh{?;+fb;*Dn=lK~1mDCCl5pOe9lMBo#D3^t407MskXqgyHDgKtp-Nvw& z_qJKN_kMz`dvtdyd8P_tvX~@ zT0)58Jmc+=!k;Dq+O|LVh6~1Pnu}dre%r9gRoXPGmmNnJ5LBWyv(2Wadn00!x!Zcc z4LBj!amu<+uR;-A@Zqcuv-DSYHg&YVylPq#DhMfrmNpelQ~Fq~?dudQ-E%T!QRa;x zA*G*~>nTX6M_rym7R};{{js@V`dbJAp9>1Sxdvhu?3P&N1eLcFNDqd$yg}O=lQdgy zG{!z<-`PRq@wg=H)<+{wl|&CIr|tA@pp&(ha0FZ*|(3f=An&f6IIjQXC^` zmpq6R#nMs4t|l$$5#`&OquI9zEzzLuTp|Fzyd zCZ`xXfPF3qq7uNu+_p|kF#Ah{so5T5JF-ljM#NdTre%hFM!|O9RIlQoc)>N=q^c{D z?1li4x(Ae`=G4lV!L%txgoCBct2zA{n)|#k(qHD79kY2C% zg{?2SvOm=SEE4y?Dp3?c(lJ&kx?r!S$xHVhOW|jXO0SV#kp`cd2Q9K6F>qA|n!mog zncw_l!Q=ibA+GmURPV~mqtNF!hd;8Wirc+BwiPGf?g8qb=j|u@E;HdO%ufk`Ml$Zq z%k1i=ug^4TOM88sijwY8$yiSni+WQ{*eS6zol=E3s8Mw_XBxK@DE|artk7(ud%Plo ztC~yL>3Jw*YugSR4RB2w1=?~^N)wmplrg$i# zmtM{P=$i7+)~nl2jTDR@P|v;+rxd`gohTnEY8Ur?ZhCxNf!qDSCq>t5%d=b_!fjae z-b;!myJC5k-?lQ_YbPLECoJJ-;BG(xxQxiWNG}D&iBtJX89GzoEQ%Nw114`>}!z zM`KUK@3|NV2Wp=P&-7si)kk?oQ(We&ARsWqj7>&xT2(?O&Vk6v%t@~rrlqA_V))?+3?+L$ayT=+0HzO zJWSoh)CAm^qTImdc}`5@ow2(X<~N+EdiiJPtq9tSF*A7wFDq}v!CYz~@Y2a&m>D#M zII`ATX#Sh1CzBv~W&TK!!^~ZiRTf8#=K(gXKPYqPycf>t`BjROL@GzO>z#p-%PF8R zbN#I^BRzKu)|=&g-dHs$@Al`tEmYkt1nFIxhG90So=XQmS~p13rtj2ks_DMSq+V-I z{f;%n#3!_Ar$0|e0n_G0v(CJo8*VBU?WI>Qw4cPxHopcfIMR4-yq`658lssgDTT+* z;hsNQZvwnrBS9tUxV^)*+MJmWR<+V0-|o`Nq|QtZhPX9bOBHS(3{1{!)}ZG%&mYbr 
zTvfXqUFB8}wEI(EvQ!yK1{6X7NU&0x{0O=`)SlhHQf^*pTab}vAOOR;f6C) zQ*bP5D$pfpAUZ=w|Aq@>Q7zo7cbVlppWcJEg1#QH-Yuq=J^7kbr-nN^GK?C)`f;BK zi!kf|(A;m|0DT-nAZxpYiMO1I_xh)2j~A1S?u>d~q^$qx0CqH#3X=d74n)VEBIO9b zJlDN#C-p1o43z}Y6=2G&re+3HI$)8Ef&a56``-TvVNckJ21o#yc?r*4$6F#uR&2j7 zt$;~u9hl7C$JR)4<1pDM7(@Z-8@-$<1y^Vf%VgwQ^|phljRt$^_J6EUPypWMN z*`7QzlHKJ@Kb3^`q>g<*4_2TrLwDLwr^gY(jT#}-hL_OGFgqNfqCzJ#9BQpWL*ke<7tocX1MMJ?r6rLX)c8HOHKegvVbkmQ$ z0%oDKbA*gwRpw_|?A5yGd@6D~do#C9#!2!NHip}M+Zm(NUR)`s&M^zg@N)LnQQhA& z0an`YbAxi;vVA&Y;cTP}y6H*WaC2upMrdnpDteQtr7S{Jx7ZQAkYL zU_jgqV}Hic@62*8V9yDI$Niyo3uRvWMt=3TA1==RXkVq|1@2-a;sGc-jQ=cl`&A>@ zySrgJ?rS~7)LBHisgdpIoO*7l@6$J;uiszG z7)IM(ENbBwmh3dvQ0^RmU)0a4!z;o@h#NCHSoi#nw)J}vTS%J-;%+0plF^&f+IOta zROs(>9yrIeNacqbu9d5<&_x+N&+;Y@4AJ<~6_RZs4HxJrHg4U$BQ~gP&#h2)YF0T` za1yc1Ws%ZaFlh#yB z?nF)l65%1$vIfCc{J9Ms7G-Gf}KFeE*Ue> z*5@FA#_T?_lt*ABZ(oV6nZ0Fk;2v+{DALHhK`2YuSb6aZ1vTXnq-ch&`6_T{FM3#j ztDl-Jl+5A$iXXH@GMpC0Lzdgz0SrEjYv%+PKD>YzzcCF*+q9M7Hf(6h0zVo4F|ciW zK!at@DS?6&(`nz@-JZJyrhIp*{U9?Gw6I2J!57khb$nVXNh~WI(qqDkzJHKYNJzU*tyXy!qYoszTft~w=PxnGVCHAIp_M-~GwfkuL~ zlG(EZv5~0W{TCik)y*ZMks5U3IKkjF;#&;0zGN^6iksEjxpw*RchT2{P?Q(X1Np^w zZ{%3z7FVnz`ua|ot=x57T3TAcutH=xc$RjyuQwl7gG`YLs3l%cYi01=I_w9NOu;#{ zV&yV+ijfH$LeLLmD*G^^WVnYYMy;E;GovLNnqOWOOMv*T)KmDG^;TUdqVXFyAIXQX zIMJXwSlIIEa=3o5aT*cueKb`KgXt?%eL}`Cp%$3Uv|&)5~x0 zj+p+F4nN)Y(RtR?us*F>*Lk|fE`bUSuhqv%3v@OD20(o9edK-59mie^&y8Mi`0UOY zI`|fxK64}`p>5wEZ41UB5H$=TyFss(GgJvfT?NDb1Q+Rn+}JO!eOxES*+5*YLG`9PW^Lh`h{(08h4_TA*RcM zEEBQ^<4y2~Gl>jxam~PE=+Ucm6x>yuJg*9a%|M^K!1=5&vv_ zfo(<5V>QU9R-<-mC142rl_)Xf_gH)K8!J4&!u>xO@$ReqzVr#H6q+HGLPxSz#Cs&U z4QoVMCqcS;L=N{TCgZxeI0zi{Y`dMD^Mo;T(MM(GKs9by%dT+Y7alAXiI=+gkW7&9 zn$JD^{_XLrPr?yxFp=JKP%rE4dg!-2QfvlA?B6Vst}DYjrP3`81S)v1a8r)Fe%0bE z%@?~GUo1ZuIQq@fQ%b=r)Oxv_ec-_gn^TLB50EhZ()a5rOsxn(#1(rzBL-C4W^bz1 z2`zp4JJMjm9%Vq%eU1_)D)xYvWIW8&+T=Ixt&CxLsn)%<8?h*-@d^iFHN78^*{!Gj z@(b3m@8U8=B3{mf7Y*Oi0xu2o1vhqn-PMrFS{&x$_Z~>*@a3r}-o=F8Epmyq+7PZH zSwoU%K}vQ8Ny&b^zZ~E$&k8AmWv(5^x~EVo!$jo-*};eqpZIz^e)oUkCp-s|M|dpk 
zaBfAImWKOJgIr>HkEbxrV?u$vE6F`dRo8lw=`22G4QYs9v1tP%Nc?OK(5W&ywpPkW z_>6R^U6)ys+6Ugb|D3`fB)3gdq9jKvIJ*Pd?^d zQ?4F6jICHRM1&KmUC1>SGhN7Eau1Po#TWUQYWUoa%GmqgS^lUYkjRoXifrw~aE?@R z!X#5)9Kf<<68~}sTb^9^WoXHHJMqVVSP*1Y4scVSAJuxJDL(nv8z{&hF0sVw3`m$m zLK0Z7Ug%>6!_v>&Q3dPt4I~x*PN^6{;4XPbQ7R;s^WSH+OzD-UU_T=qJz8E5up+ivp=ce>(SE0*HsTpptV~%S*6U zB-U%q!oDeXhure-9S!H#-hz`tnGj7XDL0TX0_Z^zUE2IW1M7$1Dfh2-}R#W!0{icZMGBbR?)dc)$M zZ;H3wd@T5u_|Ct%z{6$VryW+0`!`JBJSK36j~;)u_sDkeF_N+9)iu4sj*VRXYw6Qp zRcE;h(AT7={66+Gx^*p4P)2^QmU96JC$7;7>*VL+MLhnM736^*y%CpA};@gEc%a;4ZE*CIYzQbr5`uZDe?pQ z&<$N zO-)|^-&FkyKSBqZQLP`lId3cj)xd1>kq;0{=BpYMKCScQn5i`p|Iy*P(GJurbU8_= zM^txzy`K26C5yZO)?&Q(;+kxh2Z1c}ubF~X=bA{&F~r-|QDdir1pwi5T&C30JJ*~1 zXkL^A*u<8*YKVMXL&HHy5p}v4j^tPN4(VSt;P~HF8OkoL#|9Ht`c>Opp z>bf%h>q6Z<0;4U*gP*y${+B7`)+mO5;sj>7Ot5YAuOpoPr`lK?(+W4ekpLc2+atrY zvh;TkV=t{A_+M=)rC%B_e6-_)WWw-@%dyJ&eyk3-GTFhdfL~Q0=sm9D=9UPhwvbI!R<)q;XE~-@_!7> zBi_}mM`7?=7B}CPB}9`FgWeXDJL2dbtr=R0?1-#;lTi#N&JbRnFg~4avvW{Sr#{sZV^+qnLpCjoSdU4M3)cCFINF%}p zQ2z}rGgh+o{?XQ|^_Gi^aex)n!JDm=O-u1M?l$Gc=k&g;(w0dZ1roH>)mHN=i(|m* zksn`2bh>O<=2E9QjIzVjiuPQ&=I`ZmIW+Y~aN*lGrmv3HVmlt;#kIV@Hk3tv1U&1g z)|74cMSLo1Xv;D+R+QR$jY*{I(n23VZ2q;aXnK+y1G6(KgG3E4J#XDp*i;ZPn=F@y z#Az&sdklo}-+C9jlqyl?Dp~kX0tHD}>eGfV&%WhFP^&BJMEvvzg1=hf zeB!4S)re&dgg3q_5MQwr7caPKtmluv3+S;rU_7cgdE95|;T*j6EVZieNmy+;W|fmd zyoiSPFOrvUgCyC;ElfFtaHv!d+N*42`0LiE6`ykDp2tYc`{Px#5foXN!ECM@^}h|_ z7vu$MQt0|g>B4M}YW$x(TJ8(vvU7`81ClcDA|G4#PEc3>7yvQ+721wSA+<93fcsi; z1d@@8H3`fdd0iWoH!ZTpcsUeOl9@6L%vaE>Q2!nm+%zNU6Bmn&V+prZ7RuH zJIy}(-NQcJ3`p45x#3VyFrNaH;;C_wW{P;EL<9$9OD_Q}aE_D^>(ynpR`?#yPc~eE z;Ct)h-bK@%4rueBTO~4@8a8RXL~0kwx!;%u;XJ4?hwr&a<)nAgN0R#`K=nQX5W)~D zVVqdgR*@o#dw=Y_0+qF_fu@9xt*3_c9EM)rZ;Dea3u?)O^nZs34a-=jL*j5}ubujy6L})V zIHV@v{$6{wlu7)}wW5OCPK|%CG-v8OSb7rRPB#a3p`H$~)29+)+W@~_37NxGo1tlU z`BQ!pP%#|2*;yk&@p^mR?3?HSog3SPf8o!-ZlS3!k9gV>Qi=woG@55C zhO@iO?A?y3{kfbAaM^4eVb)UsWtJ=`17}P33|i|r0#1Gf>h4;ii<(}Cw_ z^}3SGCUCxNwQE<(FZe$6^}mS6iZa*_B{#+ctRf^q7BvDN_TGj=AQ9h-X3Ykn4<)j% 
zv*yfCt$YCNCy9COwFRV9D_|cDK!U`CZKZ=KFUouZJaZW1>44Pk5_Gh4=SLMN*Qj#p zeYjZ9H2Qt&8VZT1{+b5>&uRl0r5+U{w>1Gx#n1|1KIxPPHV72Srmya~T8M1nD(#VV zaidJ7U1mu<9E2L4fUzMu7tC^%sEj=@`h$3}AF!{(VG7VH?XW}EboA3JWFrc))#MCp z9he$QuWbi#z>R*?fe-QVWjY+pUqA&RxJ>{HH<-YxAq8Tja6Sym^;VSuY(PpceZ+Xm z2r0id@EX?aK3wtzRALrAr%KMZ28rSE%oZK1fdzoFw!^lyVXw0bb*wFyI+V(Q3p)|L z3CFG$7f#em@&g28cCtlRdz?Z&?l=K-2#e_Bn@2`+d-ociy>Raj3#p9X%ssi$O%~Je zFa()5)fed%Tp51S;)N3^q$%LFJpBs$u*!xX4bI&KfQ`-2kDC{=DCsJ`AXw>{V+VMKME!(8}|RQcO0l1kBf*a9Jmi~+vAfL z!aATDADS+!9&mU{FkSBz$^l?>p$mzFZ(*R}{4q7Rv^N1pfZgyn&?G0_JgmX!guzRB zv%Gt85d$BZ`AQRZXv@-NOb#TZv^)Lm>o;iOy>YBSiPCLYjL%U@{Ne+KKMHG3&Qtc7ontagL7bKeI=Cu>vb03N%L zCy#1)3e5+0!!8`#_MaS?>a+3@4Z=3sk}bVHKcnp?&rR+cR-FFxWQ=lhN0imYUcZ3^ zD+yCaWNT7yYI0nzR-|?34ghtH7uf)=T&RU^&!&0LPc~a~n%*0S)i@Py^4xnSZ|=;m zSDW@(*I0$63w;+_(3w7w)G{0NB=h4}>*}YT$(v_RNqDs>oUK(7B@E}O#Jcpgv)_`= z`!-N0-O?mAeZsV5I!~`@Qfs#En6j>d=0teXd^u0z&s-^+%MKM(WF(H-lD-vT_P#o* zMU&ypu^NE+J$5ZhZDx#wvvA{^;+3m{1wV-#@`k;wRHd#-vlgLKA;_J0OhM{ z49N3g5UyqbA}Xdn)&C`nBL{%~qdk`8`)vV}rwO7mjG!h#MW_QDIT3a=s>I0b>7^R) zelZa}1sHq-1mNWR4ItJ>rNMTmBlvCD_;nR9#I{>(6-J*9*v<6EoU$nfaY=$*`!eb? 
z^@4V1F-k^&MMfY9GJ9kJri1`Zho5Phy+rmZY&|=7vjg@$OADt-)C?Nx=n`O$UK{(@ zH?*=Di0Xe?fIu$hwE)=mD0 z+iTM(Px#7t6d!Kswf?|a1Rwac`m7wOo?Y0UVIo?!$wf(ohs!soqW^YD~9a;#zS!FD~ z7ieW_*}x|?uhg40Qvu+VYk8sv$DDqYGO~sQ^BK1TLld2~PJo5f2u6Ly+sCDF#Bx6s z!*L9h37Pwg=?~^v{JBLS@Y^<_KIYy~D5?xf;R+r&#-(6xx5F{4q2nqo7xwI$w3^;+ z_3l{Bq)OSva~2hLO)cPN8m6`6K3_<qhD4RUaM9%#_+q~e1sBURoE$djH za@kLSvDu6bZQyk-5d9DlXY0*f{k>;^m8lONrh8N%>{Pj9Nm)Ia=P_`@#B&sAwu^v6VvEF z1cvJLmX$K<4i)eiogJ`hk4ZjfS#BNPI~XdHeD#XXaJ|j>UbhcQ_E^wa3Qh_# zW&2zTXuS#8{EN;hEpHs|CRgW~^|eR-MXx zWTCz@V&pdGP$>F$th(Bqjc2Wg{qkA0t$Jbx<10r1Ioi8z`v=DinUVeh7XX6!T1meb zwwCI_XZ2~s%$LiK^mwYyZy|j3YmZT8SVjAhp`tmlRg1Qm?F^1jBBdf@$5M1Ep@Y9m zHi1skw3^@Oo73YtAcXd28x_}mJV2#aJlV*Y1YVL2N}BhIWS*mQF|*8;@Uat?J}AqH z9-D!M$(|puO+-O8q`6V*zJzx8u#G}Q%a0by8$yPSBfP;%gsHe9jI{!pCLtqkR!XN?+)T*Jr^QI8 z$2fzvNxFq>vJe4?U5gyn@f~;}#Lg^@E5uA(VoVHT6N{XI4H)$WZpm}IFZf5(v`+*F z^5&^u`8cRd?yS?{mK@NLWAW9^AOVok2WzaVRV6;9eF2x_mU@Vt|4fMJXDR78<0mLv z`W(Wiedt5%-u#`jt^@ID!MsLS5D@W=r#)3nFO^uD{8QAbOg`O;CT%^}yCava9aduv9Tv8r4TkFxz*_`Ap z$KN~L6_v5#86L@e@Re|y>Da^w*LN#Sy;4cQM@{Q3ma@urNy#snETnO6{bO-8GqacG z#Z2S06nglCY~%A$19XW=@{=4fGY+1jx{Wq!E{uKV-o@EDwQ+L`^RpqzviAFGCT`u$ z;w+cNxiC%F6*g8H+YGBt3o*>hzO=+NLkx6;;yT3|#}n+~PH+~h$dvvFiCZndx(a$7 zoE+EB3Ggq9ZSvrnIncx@{k&zcN)J61-I204I)*1;az{HffXI$3WO^lv+ZNa16To*M zLiD8Mo*r&N={!4hF1y}-?O>Da{@cd9`-*j^V!n%!Z-Uu~l>dLdaqr$=?4%BYu{=7%ifu)VZw z`6jc^-THWEg7!MZjVT;P(?%yt>NHQkC^;5(V&GOq$MNa}6Hl`s_OO;IDe@Ns=6M_0 zJSr3Z$ezSt8m}wg2c*LDoPFDb%0g72u_=gqsRn&sg@Smz+4?wxKj{d*}%y+2@vuWWN-=8s4 zT9y@u2=A+XT4uvqCNHow=0>+nCmCNLd^(n0P#O8YgD$o&OM8$m1GdMV+9hZi)ic(` z*P`_`SUYl|zdiovg%%27m4RO|RB8=f%J#B;kzQXo4$8OL?^F2{hWf~9qE(2W#C!pV zy>@`Ft+4NmrcE`sK<;Wx@|v#B%NfoS-(&F9&mH@Iv)pAQB*48!Ae(CDQ>)mr(Ex1U zZ^9D;zu~PD*HcW!?}epM>wY0nnan~%u6M&UPUkv;su{nXn|eBY{a6sH*RAmtSJHq# z)2}Ep#_SBU!4INk5?r-h5zVF$_S8g9^6pbfl~d~t9PkrDy8jT$qzTC_X`+seeHOVC zw_HrqQ=G@Ig$!p!`_g`Ot=nSw(L>f?i4Y zgxR=#ah=1JG5)X?CVy@_ZT@ll(3F-)(@75*f1$@%ZjE4{aJW$@zSb zP&PQjE>5mr(N9-EAam=s(HU3y-6$cAN+~sjEfUfxDBXyJjHE?L4lqcoAPou($RO}t8v#A% 
zdEe*v{{A}$VbAQn?^yR**Y#QGSUmgn^gx+;w1Mi=JXWz2{4x1dfm<6`ua(bOUaTF& z#^o*#CiXZn9s6eLRT3^_*HvKPR}*RM$?GbCl|DFk)CpAM*=T3>cS3QU&*&9nMpZvN zEQm5%otnK$tC$O^x2XJ|eK}8l1=nj!`6~=BMCZ6;E&NK{4Pm$x$t*`>ImTmhiKq<*$&vnIsCk>D+aD&PoGl(dU( z+ID9?L@#GP0vBKoqd0+w0x90eA!QIX;jV%|1)>*~0^PQz`q#>w*-y1uo_$6gYI=Op zP2s9*p;l8Vq(U->SFkOI=9Qs2V?UQeBoYrh)=YAb&yC`jSrp_nAE+i$^L+`f1Y<-2uWuvg< z%Uda{!xF2bC8x4>SFEnM>s(4cm1-BZ77*~EVg6MDjd?Hq~h=% zhw)&a)iioc&qA9L0#f5`drxy4^ImysYR@z%_T$}wF2%+W-$uliBsY|%JbFvJE9mW` zHdQKqlXTzbj_moOnNo}Trn~J!!f*4eW&ig{kn7#S?4htHw;iPzL7Y>#Vuc0%p7{`t z!0~`TbsI|RfgBYKPcUe&olt3r2;CcFwG)sBx}YvUFZn<&?~xS8&Sc3W=tW2(a3+s* z7hg+u{*==RZi$x-V2czci$@p)ep9y}X&Ry2QTkn_}L1z6x;q1NMs-pQf*EaN{n#N7EGUS2Q~O zkR3tzOdpjQ>wHl=(W?j+K&)_cq`CPO@>OiY0s$TofIbQIbBc|j!dC>vyEp@}QLLrLZ4RopkuM~5| zIG&ymHdF7DI%;QzfD{&En->efKzddEF2)sXb9C&Urym}u0&dd$(SaB%FNX}^@PuRq z4bMSB>*^J;M)fddY{f0ekrZ&w$@*NWy~}F6H-zxIflEcP>Sk`~%%fi5XoYXrX}d82 z#<}E^lF|)FC9tLGSV$A)A!o~?eyn@yM-b#k_BqDhP4WsoJt7{Q@7|c9P?pL{MS0}e zLYumBq3+A8pTJpo1(GxwJm@T`b2}V0>Mkzd+Vo(mQ9g1w^P@Off9P%LO+Ijd6dQIT zhe_Piu#IdUCK6uX-E>)9WD@+&DBRuG?4RzjN~=C(L%By0p?I7+^xUpw>Vk2r1mtR5 zU?_e1WD#$No=;azyvLz)-uMycgG@*P9Jx6QW*>6}^#_z0*NUo+&|Z3fjV@%%>(F7y zg%SLwWsRS@PbyarLQ>$np_aoN$gYNMBV*3J83U%LX=Dlzz{^$8{kA7k_)Vbi&0nFp z-pc;@iQ-YTgCk+aRf)9_aV)^;GcpC<#%WjeF4uO~&u{LAN9OO~QEz@cpNkE0vB(tU z}=c96N5Ni`50KH_JNj_~jAp<-4K60Ida zDLKFYZUOpq>tceePX~4^iLvNiNrLuJuUUxVFS7;?|Hk8!BB|#+u4*N?o=BJClK(h7 zlQAoC#zg;kla>FT2oJH^wtJ8T)WB5}0uqQlxoAb{<)=f`rO}%A0saL7}SX zBf&AgJBewZE(#kOQ_V&h5r3*B_d62|;|9Pcd|z0CUWF%4Ut%|@<}(yQqNVgSyhIV^ zm>1i~@+dr~;{5!I+l2>+ZP){|qejjRB|s*a=yRXuuUPf~F2T^Hlz>~>76yK*hPGF5;&Mrr1MG7IZq{it|DI8TpS(o|=r_ zT^Za{y;9$Pa9RKnrdVA`=>5W>WlpxmdhDr8e{6Y{HqTuOJ-)HP8S`jrkA!^QWUhBE zQDYuRaw(3CN@7VtJOD{W$DdS+28VAaRJJU-;-&N)w66Y?PGn~Z=Cf|A*}(EdJSUor32qt3g&tRIefWA3)fo7oFTVdZ~{Ja{20z{JQezkVXtk;PoHabXX?>+oE0^3CbKr#mj}eLCB7 z;OItN8H(tdh7y&#*P>5EZu^BR;izo7U=p>=As4^G_`Ld)7hTcO_=wlr=$wK^>vjCa zQ!%vuoX^RU-kDvRk1rqSI5Yx9mN`vFf8ip?1PWUblS}$z9z#6xIz)Q)=WZOnoDUa2 
zV<|lQkvC5I84JZ;zB*&r@(Vbclepim0$ev@_{RaWaD&yE$KqFux~#c|{oE_ct-AsA zq~N0tq6RK6C&Ls&1Wpka{8P+}rkM{ie{zexT6g&MLiImM5;!gwW=-Voe+WM4uUzg9 zDtG!B!vJ;@LyB$;gB)KJ#p7QS6Od-l4ax$0xDrndR5*>Ulo5O>*kL-@S+ueEmybZ8 z?gt@Js>#a$Ws|;XoBnc7MK5gw$UlcSSSDZ>E(lYiSt_x= z2M7BdZ*dE2hi>he{(AfF_das}QYOxx{Y2dkr-4k90&9k+UA4n6clKnlCjWB_rd4#)3W^4ltVdh{@VSujKlPgV$5I z-#`V$lc>pN`Br&A3Fm34A5?9^>G_PcuwnYExEHj3l53w4Inf3qh2M&VwgViOjynM%_uEDz0|uDmwVopital-IFC)`&slkt%S=n0;udVAYi4Col)e6l2v~eS>eoG zK3=~=+(l3VuZg0L6}TuE6F}wt;Pfd8Wo_P+gm;(9^C4g;*ZU4uU`;u;RG8s8t)w;@^c@+);i9aVI@R5LPMO!> zwzt@=VZtt@_b(ptugSH!`$p^_b<(#N>2=x`>d5xF(Y z!v}`ONG>aj!)yn)R9MH8mb92ywl*qkrQCP0MY(b#7l27H@IHbZ75ONgnyNCBN2l1_ zhGesp7bqV)Eh!BC(G+gK0Eb8(>aRvyC)X8oJ~G>H_42gt=2X0*(VBTI*Yp01CWE@4 zuoPrD4mXxiOZHKd7c=IDG7tHoPqQ1dw;VxLE?SWQ?7f#@MoX-5zHpEWDtSLoI$RR; z>&MU+W8u7hlw^%yJw1|{_EWq+IpDl3YZr2WJsfxT-O%}`c&u~_Qt{kU$Rp$Jn&7L= z;&ODy6mW9*QIE5`Vu`D?$IYPP8a6ZD6*c8oCwIg^i~Z*WtPL**O%WUfE@_vA=KVc% zoHR}oFRG)ZK4lghuSEMB^yC^>kO|3bO3_18ya~%y2H4Kb)I+bV#-??OiCGuOe1a(1 zA+G_?$`{JfH0N|SrBlyg57CJ3v5u@)%ov<0t@}YsJS}uh^W*ppvvj7X%wttHE)>|? 
zCpAcraCjCG{FJ(KbX%p3Qz9(U{csNFmAGW4;0UwbLP1?&$1mI~x-At|M`;VwA{D?9 zwMZ?46}1wdSFdsGns)yrvAWoVQ8a{_(3a$dn4i8oPp_dQ)n=f))r+IQqxP$EF1nSRe&_qMUH?8LzKq}Q@T%r&1Ga$__()dCAzZQ4?R0{TunY;J_#Kj*An7awBf z%@r+S=KR)@CD3Kb)USwSn5GMED2^O_JkZh6Chf=CIwHRclal4&ORVNV4~ThUombee z_CUn!p*aroG&{&1WDDQe(gX;XL(wAURmGSy#tfcg7Zh*Ru7B06%>Mo;0=j!=hzlri$-xug9F;& z_JRTLtE^+d^1<4> z)8_F<7R{cZ7^45R3>!2aF7QHWVT@`vJ=bVSRDLAnhKNX6%WhhOofn$)a6-h-;3ZGUm`%KUjb5IwetE^Ga#lM4liTx zu*<93T0SlW>-8?HuN8|>Y9qGGpd@aw0B#_ez}i3lmi_HQBRaJCT0#RH61AU6q0gr0 zy)l&}Yzb7%YCK;#O}qi*F5OV2a0AL;lpz@&x%Ps7Ya_t?01M^dPGxP#(>_upvD5hQcy)@v@l#tM*Yg$Y)lHGwK4iKcXHF|qcW-+<4SZC#a`>JH6SPk^t|3Y;mkAwwjc)nb!Q)RTE<^W6He(i>q$5X?SywP&MK9gVig(D)VePXLi?<$UQ=X`w)75?EXR0 z+^DH~ioy#xqjf=TbzH9lGBgti%?MDRT>!}$G|$mk#9^Rq>s&q=8^Tt?zlU{`Vj9k1u!)Tr9(ch&SA5@KZrL_;gR>fZ!X%%J)ql)sj z-E4}V_?QgK)5P;NC+A3Sbi0GuJX^9bY$)eNI--p6vGJYuo-eYw!!)1va|cJ9)HMOw zTrU5$zMiPyE#EZ)2u{F77cveW_w&f*yQ|~%y=vQ1pa0`FrFPvwd3;14Qm>DLmJp3K z+z2RqI{(q}XB4OAhHz5fTP)9E11Dk44=Hjj5mkBxsnhV&dAySWOrC6VXB=K}Ivt`d@n}@!IbK)K4p6=>A zQ53I2l|uTuk$_v~1~4(sqMB^f7!3f-uP_8>>Cu#BtHD;l{(TZsN*@|6n|;KUb6txo z&4?*|l$0W!YaLpMAgJn;4@FLz&Ht2M0(H&(xLIYNTj)BZByS6;lScMipxeA&w@~xc zrLZgKa#g`Yt!o-|gCUXB+dYvIzEqG1~jIjn7Xsb%wc7m7S5qN9RoK>AtdFEs^okx|;$3Bwb&C0dftz zYj3$HYyaorX!dBf`uY8HVv~S#=tuE;BKu=Y>%hv~{X{GcgNOy>SDRph@g-e%)PXAYJLi#p`N#ylE7) zGa{~pBg5@%#Jf%oG0`WoRW=V7k<&3k-qM-#(9Bx_ae*zo$#h=YXy4D&1N=9> zTy#hDMNH3;N-$}D(guBA`IJnR87Gr#nN)`cy?<-UfIwKT8(XTZV7$S#Vn&VS?4$%O zIyO`huAR$p5mY+zi&y;_wabI9N|F2VUj3NY@3q<+1=1T{d_KK-X65@WdqtaFLnnah zbq$>)e|ik4^;y&dshDoa(B2F7A%R%_{AP&P!d-1yg^PVkXn-Q2VatfPqzs>1>M=J9))&&`mg6a5Vq4~>kzIP0QU=HGLA@xBYfJ_+(ID z?tijR<|b3^__q-4_xR*#5O`}B$z=6|;0LW12nmC*=~R^}M}(}tD85tdWrv>4C)6Q> zeRcN3AEOS)3BA{QK=SJ!5D*JKYbm09?lUF+2Rb@er*-_ty~VUl`ufN~2l96Yu*q+8wNHdB!FX*0v{x#iNSvsmu7a~mqUf423=o!h> zeMCZ#+98 z3v3@V;VK=Z;xl~4c(4vdeKKxZb(l(oYfvb=#j4kLjLLb(3<)NFD4TYr1CD#C?!eYa zRc|8)2mhMz(%A7*DM$HZUqOdDtY?t?%U<$Hgpr$-Wn!oGeOP@UJsxh=5fv;w?Wi^Q zQyDgK1*ZNXon4!_Q=j1dGB^91$Bj%Ws%@AD85dNj%zr%cjv*WWJw>;3LLjSh<_}Xw 
zlY5}8a{OMfy^56s)#A@;uFf0?y2Kr{J%eFhen8Z~rCMfiCg$QcPRvYtEvfUA=l&6Y zt)lrZn_-pnQ@Vw#<(^C5Y%{rEJLW3h2v4XcaxgYto1_Wa&-_z{iD41~ZY=2Eosckw zBklO(t?cK1K>Sz;l^UK$iI1ttQI6l1+zim3;&=IT{e;;-;$KiY#Z>qm4wdKuv?<>i zu{ypII+92NDE4bJ9ikD*D%arN*&ol`a$mtVR##6jHLE{#K2m6=hG`PCfLR*B^mviW zmELmWj=DeK6cT2U3kd&3kHfsA7>hDu$Ie9dI%Iu%b1*>^mM+G83exM&YzG`+#MxMN z;Us^_&@5-usZ;BZ3={_&C#ZaC#`Pd(yBSl$QMu@znK9O|Zc>}Gkzi)%>9IgtO*(!% zU|)YLE|ZIIaH=HVrcvVp2)Myx8d}dPL+|pGk0^7T&3upa%zF9x?fqd7srW-N=L&Kn00H8dGRt zSYe4{XcEdmFYbz+(9W><(h`zBuDj=SLZwjLfrtBReItSt_c-=R>~vP*+UEg|Hz5fU zLIBNb>qJQNaIiGC@)Y1wTXKddE}!EAe_)@VK+pBJGj`~GNZPqB!ZAvf0SCBZF~8>R zMPsJ80C(1z5=nC;b3BwD2zHF-oMADyxxBpFKkB&ZHpwshOnsarCWy3#r+dXjbC+Vz zuqIh;f2ZS}g6RI=v+P^FruS(4mWuxR=7C`h!r)>wu;>}|Q?8;IO%RRp!bU%ESYlF( z*58a+$CVpdu0hBrB<^wv5~Jyn3! zgCfOjpolfY9Wtb$J?zGyvP;3|QYQpm|Gp#)Xv_-PNl;zS#}#!Rum|8bl1z`H7mHNX zMFe`$#%!JvY*Vx12K}^Zc zbki0wa|#xP^1B=iJ}34kM6X3+?>PpXZ_8!>a15y%Cp4(QJpw@-6he`MSJx6M+3EBiNqvckR0w=nDgO%5Ce&X`bjgfio#*P_m z6q){bm>CrA6QA%$-YYI?T`nG&nzGUx_v+ov97&B$C3=;w9`IrI?$VHvw4@p_& z-*fWLnm>btJ52}l8rjnr@im>Oq%?Rnyi}}zHeb6rkYaZ(Zj7^@_4*X(o?jwGA8pO{ zy-qzIEJ|`oE%!ADNt#bzx;@V7mZIC1JInIK$O=**iSM4zc9pt1vQ@sbS*@Ez7@D%< z^MPYIHXU#>G84Yv)0&in7fQskYm1%dfsim?YE_-*-q=?VNvTZ|bS>iK2;{}TqDbX+ zN`i~E@ou|i#NvDEl${Ae)8VIG}u)TPfEulnnx zj>po!&n9crjNx347xoPguu8#YJcSxjriPZur5PEmhtBr7>B~Yw{_{^tC-|pt8XGOX z)ybVzk<>k_&3NmuIcps|*DRJnHc1wHs!b@{b7eMul}`+(F}-}Y8K)4^mrFf?I0jND ztL0J$@<3$J;F?>fkyA8oP!6g}1n%0AYb@b7eiXL5NLWbqQ-llP)NeFzng-zZqI|!{~z=l&DX^t9Z!@GDC!FjTa->{?9_BC_@nE348%@iU$N|Bk(3X+W;;xq(m}(}CazbP%rgvY zmAa;TN*w8!vL(Jnn%2!)N(F{3LC#RoPTL(KTSbecNGKZ%8%M+0vs#JqibHQ`lU?hf zMcQTg#0%-C^R?U&B{gD?isgP!Q<>(*zE8@Nm-ym2qME3ccUJ3l7S-qj(Ms&s3UGR` zRUUU;eMlyqthw6!L$ePuT*8wbIK+7;B;WT<#CgojK|`e7c#cz_P9AmD(ka&VX3k9b zDgAF;?x3LeOqN$n!fc_7hj_etbQ@Y&WnUVF71Ur=J=*pGH0^7mgY$~Dkz}CKz4p}+4~047AZNEKw}QO;EDjWQ)AGd>J|Cl}@y8 zqfW{YPp{k`mW|E8zQUo)jgV0tNnx$YKs-g%3`>@EZWpG@9DGAP_zauPhrXv;>DQGp 
zt4KIilID-*RBSxtWV95v0Log0Dq+2jBv;f+#ETb3&y+sy*A?WEZYkbn6T*e>nlU;C(e(Vt4`B>y+XN#+{7b>el&T{FEi4K#0d)D zhow^EsWxF3Dj|a*-HS6YC)uGFPC9jXD=EoyGwo+oMKp{R$Mp3&+}hQLILf+{OiwTv zW)7vfS$RElJWuR+(I#B{OD}b1xG8?Rfd6yyOu;on1`RgXr_i-P?&*T;v=#bM>yjVo zK5M(QbDL|LjjL|dDn^hR~=@CkUB z<0>oxowM}RFQrBr^w`pFtbvOdb!@Y?fpD%*ILhL#S`6o)tjHLC^gr3 z2s1dJ)J$mWvJ21Eqt zTf1(688Bw3G@Xm6Un`SQ>7*-RAyyK9_*gwqqH*7GAaF*k+WzjmCA6blb?Wl+#5ZS#RcEXQ_>we ze@2!!!?Xt#=~0WAcrU)zNQzXL>c41zi~(U&!2dDiG_$fkQeRHOq#?N(`X^&y9d>20 z1;(B)as^%|w+N#W(k(N$w63^~sJq?*#TUr`v<3~2s z94M-iA$g-z;q)om7qUYL1XKxQs;jH(aF*a&sOidE*v;=4z*T50lo2<^?G|RItNYE( zqN|YN#Mxw7TkU?Oj~25T;t7sPAkIrsebcrI6LWwW>|X9sb*H$v90Q^P;sTxpB-d$H z%!O!ID=(=soaEX$m5NQ6)$?JYCFX(vDJ6lMC~H`);gaI^^HWZFMm}G9&gcxgzwZ^k z=Mg8J%B`508zRC`*AlzS2MrUYb|ZSlIGb?Nvu`vXzwir3cO}fKxjJX@S=|v%j!5jr z*45R;j_1vcbaDn7wTCP0NguK@rrj-1FhOOFM2>cH3<&YtndmOdEUUXZoeHxtiRsSl zI_`x`AQ$=JA|-L0R=p~*b%kp9&RDi1i@GcAUG6I<*}_t_VbY}L=U)$~&|g+3jC!gQ zZkAP~d0DD?YE-usV8h!_ugw>^pCg)ty&O`T6qB@n8sIP*TK$gKVH~qV zI3%og2H%U@RP=4^v|yAM)r++G9z*a{4D5YDiPsYL+WgtN}MJq zpj|4i!4}GNqT5GW?qmJz+B3rXlal`6J-+X|yukvv!;~-}sQ4*7T)xkCK~{YI1E6oB z#3`0rje%6_eRkkb_F(57=Mau8hZBvLi2ELdRsxpyGVI!E)53`Cr5qoeXGUqszL99$ zaLQ@24iMk=DbUmj7^gUxmd34}M&SN1^oVa@3+JVE+Z;418f?&Fd|X+4Z}-uwuia;k zdt<^WkZ@!=_Wm)42X-&sA1ud*Y~*6#IrxmARd|u_>+@>6nY;1XZjT!`66w|--M-yO zp40Of`yWb45aw`2+u)OIl(5dkR-bmHWl;grfP|>_Edn;st4bFEx0^;wKEGM~Tzem( zc^Cno>z@WWu9!sA(~ys?akln}z~xPD%2@4Nzu(XdZ+ii&x}xP;*O#Ym0~KqZ!sWV9 z4gs$fkNzk)-;9Zz+Obg}TziWf=)gBr+evTx5=P%BT!A|IL#8o}g8dB>#5yMAM2 z0j)VW)7fq?6TXXQ_a#O%A4pQv-h9{)BUp^|T)OE-g*-+Uc1+0MACmBha_+5xBK9T& zb9Vn(S%CSlFkvphZ}_MnCCadnH@}Ay{2+s-W+>%Aplx=SskZBM7DrMgQk(|9a}FZW zF9P9~c2{8sV%^h%+Pz%2gYJ!%ONTxkP!_#{@M8=;iR~`nSL=qJTsCwfE(N@e=iVZ5 zfesvUv*2{)0;i3^e6E=j-k4pKPH|mOO%lsh?*bEJ+~I;y*3FTh%#6|B{~!<(#4!BL zfq_@tq+kOh{T!i(Q3p^sW57Ibxt6-aNK!iCW!0PG%tP2l#AO08p(r>>UqHGZAf}&r z2n#!bp~t15w*aM59WYT|u=@c2Ws7(vA(vC9B(dx?-kR)Z2(aglMCrXqZyuzc?2@*B 
z*RG9R0AkoglD8lQ+2!0%4K9fHY}X;j&TB|@SH3Ko>K?s?hB%>6(*!LrsTQ}|h7>Z9l9_X1&mC4&X0H>J{w=wlM?feLL=5T;F zw6`+nY*3C?FwcYf==w5%a7Oc(IS}3OgK{$di)21j`c-_UVLN^oLpR&?yxKX!8nzT(L2!6G zc5UZ+&F(=FS>d1M7+KRFNn;jRiC>M%tkrnvlq=bvew9|tRP**+dvpM%A1FER_flfx zz7o7CaA(5oZ)&={jG9sp=cD%Dc#xcccFB=j z{N!ptX}Rt93Z@ENDUq_V54*Xzk92HWDyAql3d%A_v`h4cFGLQp!@Pc)aO>(wV%! z_`p7vXq7*2edEilA$k(7!;qzEj27GETWFq0crqNkWu!n9SeKmXJ4G&$`B0OuGgvnZ zC6I_*Sz~Fhe?Le|QZ`+hZ&o=2Hw0cBgfDba8OaiCVG7X*RNM5wi3jY5O`iw+B>xrdW-1$hOW=)C_<2lvi|=aO@B9p#ju zgGv+p_rAJG_t=Doil z_SYL&zq?JW$)C6S$QVZF7m0g|_T)pGBWL52AcWcI&9w0&xcDY_C}}%TRVd!9gVNwP zFvln@xBMS&wgg(GMZ_?a>}P*W_WzI6?IQz!dMN#^r1_s)*!Lii?9`^#WaH=mCi6bh zg&Q>ycWXm<82_SqtiAnz4ycU*vG&N=55bg+^+g0`8$X8uXirgqEZ%?p2>$V(k2oK} zGwTaMJOs~;+Q?h%4`*Qg)&6gfnDGR_<4&L0=4-t<27bSSck13+o0DbZt^9oz>Qoqi z?_GP+)}usCkcR#pSp2Uy>l3_>{l`E_@VO6sfOh`JefdxwN?6?ZFl2nKEmePwzw5A` z_{KchY!UF!4hEYG+kdtxaD_*Edivc2V+h%AG4tP_`(IuOEM@;<%p&*oz?aPVpYH=M z_wSLgZ;df~Mxv5$FS!>*(q?9NY~w*(l=hDP*IRKSz)1fdujY@=oC5 z@3ayIsvLngRD|?BgxqJ;22rKo{VK+up#L?H$SGUVdHazcG9IK@ zgBL!UDN~=C{_s4rWrObsn7pyyv^LC+?IV2;QqK1=a9Dot0aK*h08i`2S+3@icj-TF zAH%V8Wa|x$?xyA5H2SD7S%(V$O`5HgPY7C&M$?LF*Q8;%dnj@({@FVec^?~tI`j_N z!cF-vJb0NuQ(=Qs{!RrUPM?`e+yuO0cA+f)FT8emlhO=-;BwhHg9W35E;K%(J zR2n$n7>)%W_8x|ZAVp~J?YPnefd?bG%YTN6e*@b#$L-p~!ek-(<@V1>n{nsbb9_q1 z@?X~}m?9j2<5>P<6z=1%`$-;GJTKF-#wlz%-}WL5an z*Myuh0|E7B1Y1dTYCb&w0e)E@L_r{8&+QDyGLR=r9YMC<>E^GY^S^Bbm{BA`Y!&J#K=PoQUlQ(1(;M+C|8*0<*O?3lQX*Ry%0|9R7Uv;Tp^cY|}Mk~UADo1gL^pUnC1z;^9Qe|`589Pn0{ z_wEc^-|2k5&?J2OKMVxMeTd%d%XRA6Ph0E$y_EictXKQe5D_p7mDZm@Lj9VIxPFG& zyx0HXDva9jp$;_65)_MvF7N%ne9g_Nw(ku5=?NE1+MbOF#anUZ%7=|y0L;j>gVleZ z0nl6m22@XVU>7V(&nW-uRJyV1{c{+t-{9}jUzap^qBURJmKQ-@_E4U*?K3Z@jiS z3foPyDhx7c=Y?d!peJP?B3K(HB07ag?_@LuPMM9nw|EKa^%LM0aGCmS6<4GwgEZdN zh$0jD^CoMI63l#u=X`V+CZHH>36AJ!I;Uny=-vGK6V%acp}}e~q+*(&*S9%1TW=P{ zIn;5?H(%KM2?ScoDxDcCM36P>gQFI4Q|CkDO#B(^j;ttt`<~7pnTD^6CPCli0-aBD zv)=CxHERi!I??yq!w(3^>Lnpr&^{hY2@77Tp8zrEZW3+Cts* zdQJ;Yi>skhlQ0D(kN#ItQB8?z(dL;I{gCuWa#kYCFTj-50o_k6)cO%d=Ngp@dQ}g9 
zxSk`&fc^OlI!_xaMR5|El1NDBvw6_Gbb>I`7R=T{+aeF*dP{ImB+9(8mEWOQ&w z36Lw~1NiP+k96zwlkC4n;2rMofXG1jdD-CBcLDAFC#0l;NRvS!*?_w_J#`Z4sMt7< zbgBkJvKg=v8q!aY$oruWw&DW&oUY=6fcyN!=!Ei8*-RZ*HmGzOhhIrEc($d-uk@{e zzs;eGJ8CBv(v}d~k1l1wbsNQ|pe%+|`)$FndD94L@b1Inp;OM|E0v(eGX-bUXeD~P11wJ~9iV)IUc)g}bmC2Do zX5n@@kUnBLbF0p-HMoU_tXWVoOL0%K>n?JhdNL-0G((Q*V`$%N(eyM9UtVKynA|XT zE-sK1k)umeO5t(8y566UlInrNm(FcXR?LYYjp~A8wjJqmYRu`}iWMBMLv&FI-46c@ z0jLEd&FLo65(Cbn?h!%9wn!7LPg2uKmAJ?2c^Kc20!`I_nA>qUZSOkF*_R-#z1!7W51%q@>-Br!v&tQ{pT zNL6bkMI{Q$)~;-P%<#s08pOJ!4{Eg7qXgW;snyhif zLYB>p_?5Z0q41b`7SDtgA*eT{<++8TDP{H`jD(r&>4?RV?Uok{kY;|1`{8C8)=bL> zCI+1Y^%tHw1bP)Qviu=+I=)Vy6zId!Typis z-pWAiTKZDjG5780t;34ewUg*7PJnTk*1n_V*v{fI4g-cW& zlj>V5?N4itP-l?>|AGuLZVY?|mTq1e5^YO2=9m-J5;L;zPf3hd6VSIbykD04sjjy2 z%lFrof@wihUvjwaw_xAIAh|q2BDesLA4VK;G*s((apY$RQv@H08_wW`5185K>_U_Q&}AP@w&F z@t*^?-{aZ;YB&OkLeg__eKM5jDE78*@~`k`+~NFazy3M{%2j?523ObP;>|=lM)Z)DUyRlAsVxP^d0J1hsp-x|n3IK|XBoI@sGpJd-We z5<^Mng%ZW_FT^@FzCawFb_doKg6yt7I$N8pMs%0Qq$}RY-rT}8(xRrL>Wdrv2XBg_e8SOIq?k11z$98|(0jNwQD2Sl@-fs6s5|3x zNuw_$NhTOU2jYTU`TT{F7T_|tqLi;@_*0C}J-DYZ+KJHcLw(8Y)}r?ZMVOp;(@+T+ zvhz7Ujyn|nFGoZU7mlG;e$6Ud4Bqk|ORg-=v*ttGF(WEh!lZ@Xvn5Ecl1l7Hr0Yg0 z)TswEE4w{*jO;+6zu=e2;`T0f?H#(u7SvQFJeM{L#*|N?9O`c}c~~xKVU$%``Y%Lr zrukT$P3D-NZ2Mi*^0B2fIV8FOqqP&H^j2;kEd>~{GnQr%V3O^cyHprImxQFWo^vJ` z3tQh2k|5%@J7HT{Qd?@ANy+LnxV&%9U(aUcvv2b`ao~Gajufp;%k#USk5+za)d~Jl z0LbPc#h-H88QAcC%1NL2D=y8D!&EU}nOK52V?pyVz1;}^3BO#5V5%YBk>gf^A4&Uu zg0Mz0Ft@EnL_Bz#(wF(RdzH{b3ls2Q`^dwK*o=HwXY@gGC>D|t?`FU2##_WT4Qfjq z;`KfrE~omQJ%QmeKp-C?GC>+sV!9(4x&NDSwvs7^r5*XR>Ic^glMg~LSx_4}+lsV@ z7It#!+@@H?3a<15Pz~NObb1&j?bu5>m&w!Xfww#(@$36-D=dHk>}7R}^)P~Z#BCKH zsrm9f7H8V^Z~dSOio)Iq^o*x&=k6~{Px3A`0Q>vHcBjuTr7|vhN=yxl9P?D>F{=rZX^*RW zr|K<$9HHN{?TZ;GK~SxWEQ-(eN>&HZEfSTUwV2{0mMKP%j-r75?u7XA8UhoCexseB zcl`X30Di$w2$dAS`&r~!O6zcYvI4vA4v3rtyRY1*i`9;r1dn_XKopSFP&TkXkk}5z zS*$@duDS)c_%*c*0iVX34Oz`S=m1^SOAjFiUZ_nqO22r)5i-jpaQ|%PJxl7l(8<^$ zEJ8sXm2plN%u^oi7{PZxonLTsQvFE)XgtA0E|eHSeeQ8~vzXuGhaRj0)iL7FW@0B0 
zf%A>8ABLQ{afC<$oysjU6lr04I@6x?Z2H-K`Y|rMXH=)a^@*(PT=y7sW|n1E+}{fg z$G+Wra<<99++Ag$2h@~bY*rygl<4Or1)tcD8~`vJJKmkRrR4QJlnbg_;-c}O~QrRyTO0O`o+Y0M4ghVz%9 zRd7eY#dY*vzT1B2&95^)xpBep70{yyL<+#4v1lh3mQ??JSthKFBS54KbZCAi%x}(h9eS<^;DkJgzkhbMLmV-{MfdRc zp&ZPW&-XsX=Pu3N_tIqGY4>Rv&LFYpc~L&nO~`XzSjs%%Z=3XF_*aLLU??lJt`KzX z7zKvwRq)p<@k>;*vf(It2^L$e*DiL(ie6A-_C$Q%8z%)!If`9E^D=mO z(=#3Pkk-SZvlZX~c-o!+DZw58X^=_z90jfU>wOXf@O~4sUr4ohg;DKsdl>db@Kq96 z=~9-`RKI~NL;AUa*q9b0`En=H|A4!>uTxQT`_JVU*-~GYLD!%8{LuuGDq%lP9<8T9 zacTA8G%S5Phy>qH^refl!`+|^jUS?+6{f`!9gFvjRam;u`*xp0i&OTA$ zcf?icizY9!_soNC>zn5=`oYXn3DQRV5=d=9i&C|$U z!)dq2De(jEmAb9HqOI4vprb&IkcsIGr`_b|F)-054C zg&rLB9Qy8e-Oe{?j19QXykYW;Jq&-vjdA4PW zyD_LstRl)*C7jf#@YgW$J>%}gF>^qUm!pIZsdt~sY;(S=*Wo@3r`6HYL!-+ha5i_$ ziMldjJT$h#*pxf zbS$muBiP{L6>>?X`L}+U3_nenI$|LQ{Lckb=iCV-vCtbDVV=uan|E{VguV5sZIJql zF*ee#^rrXGZ+_WMq?$ZzN){F;*`_VMlqB7Y_he30{j>@%QNfevvAbQZd1_;3MZK0C z-jjb)gkQ0!o+SA^z7(ydn)fKaKr@atVpggH$DKN!Ih8)*mTa0uylbzmv#>>TA+4Y` z_X-@ZCcqn-r>-p4j*!JFK5-_&rlxmy==|wHyn*@nvNGWgBY|QF52uB=v9fbD%QLnU z1rqIN7zn4@hZ$CX-e25Jnis*P)S3Uz+&~HVSI27@#}F5qqc8vH?rd{{;Qs~adFwv} z(Am&7B2c?x$XxLAM@^fD&BB7IAt!Nt0E0mBvS4fMfRsok;KSDPFH^6lwd68sNN-1x zLCuA|k+nn*eooG1vx##`TJ9tupGt$7w@K6f|^=DjK~#09_nDd$6l^yY$?U#@}dJ znagk%HWF?$#5tWoxdhOQbvez}XHUQ}zsnwxSc5kyb~L-V&q4khZVGPEWrlfX3~4bk zWJSDR+;#dUoegpP_8#FHFkOE=GEfX>%4)f4E6~I`BUAAfBJ@G;Rs?YJZ)c(`y5dd@(Y3| zjpD(uj?sI;jkOnzyUimEV+Ar4k#Ksprj$0b(S|q;d5UP?8cv$vt9QqaA24KSi!j5& zxxW+gLSl}iEor!bkuCWKdx4*h|Ix9SRh~NA&-Jq3_&~aC*s!M3nA@`38E3qCTMelW zy+Z|tV@r~=l#1L7FEUr2s^!I#vPph@+vfVfTJ1SJeW;9K5{&agplqZ1nxI;sGz=URD|nsi6nQ)KrgK$!pg+HWJmtT70Ppc zNsi!+DPr9srlm=6F-+Ce{$$w=R5SA}+x93dt&CM)@;pDe5E5SflcaiLH^n7@tp3iN zvs?Rkc*st=D6=ql?HzxqBJh)R#cyhQ^yL%r@iFXuao&-ax2Lv^7&RTSh`oX{&Tg4; znr>*q(%%{&#}CK8v$z_T-E`h4aTlCt>gR-%mhi(gD6&y(ggD!%<2`!q3^!bFf2^2$ zk-CJZefw}MGTs*s5N}JL%3L9d=3=%=IPZ;?3bfrF%_dYf#WaGApK2M0?3{!s$iW*u zmo5sUQb%5znh4=DNpB$wNQd|E5>V1UwA`#9PMV1QRiaU?RjpSo9j{3=Ed!!mbXRW7DPWL)Z5^kpA#MAeVM<-945RI~y 
z=EfEflmDogif4PaCzf#6qj#pT+cZGvgEmezQ7Vd})RTpEJ%&1Sh<$9J1oE6B<8Kqm zw)s=1j<_%HPmb9#*jweminJdj2K>^~s#<#3MCP}AO+uX{F zO`0WJWw@zD_3+=j$O@a$4uygl|rD|FmxIHTsmod+e z9&ju0COu1GWH_l=KEtf+p#42XoIMsu3n@ixV@cY}87hKVPFpTGw+Sl>ci^5R?0uad z9Mi$3jHLRo4WXr{&v*eoP9)JA@x5yV_}aEEvd2>hIpw4`;`T2LSQ`eU20a+@vs{d`^-sBjxS zW6;xl|MfH9fkjoLpy8m#o1b*Fcx7)+SM24Sa>LzB-T9`YC4}S7{ZvAMCT8iamDXm# zA3CDYcbSsf%KT#7v#dO;4#fWDvK0LBj|E;d%4Zhgd}iqp5wdP2Qv||z zr@mkn4~-ydtWQ^X)mPYsRTgK}G>t4itwq`K)Nzm_F^^xTWH{{_j$EF)K19=3)F1Lg z7@8A=q^T?E2M--di%ai=z%-H?!@p3~9JdXODmT#%zRjJ@xn;c1dMbhUHyF&r{Dke; z5qU=+puX(0g0;{Zqc*Uzk%m3kHQh%8P&eSfOO9eKgXD}^zjJ-$N|Y*a7bKFrKCJ%rhcPZwb6q19d-W7QatKG*_zR#Dn{Pl)~VRX z%O)eok^HkMCRkIlQ`535C4~G84;>*ox30Rk#g(*Q`-=3RqTXo7lrQ-vEt5!d+X+M@ zM~zX*cHd@Aqyj>cfROBJmos;NwXQ|r9Cy?(-i_E;W^#H_xtP|IcakRlG8`UvJO96n`6c+j$p>3n~m+#HAk*9I#M_FImo**k|t#`d3n0=)A>V zZm5X6DBPX3l)dRyDt6Cw zP@@Lks3jUoeP(lC1$K_LZ^1-7OanYj#hR)N`t5I>$|qOL4=ODavl+kV@cs~&Nu@K$P-T!$-9 zf~i&*^!F*LcomSa6`ZMyW${h@2usBom}=?XMzf>F=wB<7An3QAA%PI~Vu224ep*FN z#|{GLrN<|*B`ml_e(?pwh5dHYbgjPIC%;%`8@d-p;zFD+PSOeHBF}f3@Grp7_bJ8Z zw=8p)z@V_FU1qdQxo>(2abGOQ2z@3)dR=UNcabOLO$#?m${EL^KIOK^UdJMYY#Xpy za~W2E*l_zk5HtfJkSgC0QN-?AG+0Ea547YS-w614i!gLItqD z25uRPS-K&EMgACQ)kO_@p!b|~5OH2@8+fJ>j;=Q|EX+J3vgLtD^7lvWcpCTpDczZ7 zIx=^LyJF%G*-`_dQ~KC$>rd}oYnjD_AhKkuW!~sN0OQ}-gg(bWtaO1dZ#}xN`LQB@ znT8?quo>9^R{Y>tq@Kq*0{dalrR0dFnN!ORC0g~V&#zx|Dx9>_pVV_ zGTt78Zyj9txJB}ETBxC@5LkON;WRJ5@G|U$h%RIO_386j}xFu z!A%nrH49DkK>h&maf$X0=T_ zoKdw@w+!NpkMRv@NFa%_$PrOZTY)vbg0HJ}74af|`&MLyEt2&r>>!-(6lYNwx1vOz z%P+5MK2mP#9FPm8*vHZ{XnZ+Z7vCKb6sc<)&)?I9rM@Apl;7fo^XVuS9!=FTis#o* z_vH9bDT})`lM(ilxgSavvUv2xa$a-|B< z;ad7ElG8QkZTl>)*rQqWl~5CEF#l%9SWO{ENWNKTp6u7(< zcrKYuqIMK6xQXozPNS5uA*QMe$>z??Azx;md7#wALSIrMqoF49%uXfhdqFraQ#74E z@lCq--8LAecw4xe9i=uDDtMbv!p-wVXU@vewQ}p|j$HODl@_;x_ZB`lFRWIqL?F(! 
zI36R+NXLH9S+PlN#SRv?_vmreV$?uv`9hn%NXmspVwuKUIUhjH}U*%FQGTK7U@3^G_ZIzDx4D*h_%7w=P$@(@S* zW;qMf)=D)5BSI%br+zvudT{!$i6=XCprjPPl0z8wk<$ZFqY_Q@QkCEQ9+@wKS>sc+m5 zSosu*id%Lm3jGm0ww9#zO_P~}W>nZ~>h@Psn0Juc-pMjY+ww|UgZP4A?d%5g zF(Ry~!>D1ZMHfX%wg3EB8(_m$nC87NQA; z{8w01%<*WaJw2aNkT8L$O_MO2!z_Zn*WC*xmoKj)AABD()3^JyoRnQGcvIg}55OJSUU)`TV4!jr09n zbzUTj<0ofuag3`-UQ+he)u)&KbC9zi*aCxAgA!j@6!)z?y%JS&PwbGKCG$Wz%{&;IBI+5&u& z%XN>KKQqNfLGC$P3w}-bW*#@_PAn{D=?r^ksq{4B#BgV*f@Uu zksa=9RJdWTXT%fiUGna>MqR4iJ??V-p9OsyQ=;j%Pfv1pe>N#hq5^zs5@WOuL)&ig zr9->*=bEfLu^X|}ZkNe8jHpFn@u+Pj$TuFgs(g{h zd6;kMkg!R+`g~aC41>8An}cxXo*2)&ZIaR?TZ6S_0w|bLH`9*oJ4gG2E}l6aZGBO? zRvTZwu1M_?i2posDSgabH}Za4X7X&a%f^{z#wkMk$Aj|7o@*dh)`Mt@fS>{o0u?loxrfO+NfSd+}7~YJ5xk z;E$9G@NQ z|9~>hYHbZ&v~2v1d?bg$rBP4~!B(w9j#-ngYT0p*br_eZ{d|XV`eh5+fL;}|dNcS$ z$pwZ;dpe`_9vU$$)SGB~@u~(t0!q2wfia=O;-4P-Vb`;1dw*M-AYR!Tut)*L5trbh zd)i3nAJXD-{6l1O_e_xG!etLz#eu`XeB3Euj)_+tkOi!P9!PZ*P&7T2P0T)a0ZW$} zo^wxzHefz3r{d3@t4HMy7M4YX`uoX38#@9ECJz>fY%6V>X{_pV zw~wBBE=sE4Po*=&PXL@9d$kULV0pwxIf)zYc_7~DIBEWMEXwvrk)@t~hK7Qb)1FQk zpTkY(6E+tILG)YRcOEA|oZWv9H)jLAwl#H;7Z8)Akw8#{Evx1v6%neu#$&e#uv^c> zsJN|slj4+_&sY0@rWC|AG732|jj8;xh54x%{6?w#zVM;(@ne2`1qqJGcfodnc)p`Di<}IK!@zO*6&rU?FabAycyXg$%E|L0n$xSPLv!~aj@xd;v(H7YXzH6x$s#JajdV!kc+}K za5n(9`aDU9vX=ePt!8MQDzJqpbIDxoH4b#zHZ!s2QM!NY=N%lJd^S8&iPv&ab*V@# zs@`EY@^)|8ZX1WS@W=rNK9k!MwI@JF(wSSlzzA*g2*(Nv0xpLFiDY~mMt|&MrGve2 z2CsLO)-@GkbN=mzyquSn&M1ZyPUgLrGS&e4t?EON@59%yJ%=OCdAeYlLPZ_|>|d&u z!4fbcPc5pnuf(%RHN9PZcEV)tu+G8w3W5H^)CE~S#8KB@*KY_@9enCjJPwOO)I1MY z0r|!Bs-B~`wUgz-^0l@^gp9DQ&%9RXVo26!| z*RL!lfjQz|f4Bptq+%atd;ZizfD+PRpI~$H`V_}!@z{Pn!8kLujg8W>3T6et6b}dn zFe#!bt%M%S_$&tGZ-jk7S$X+m?sdw#e}Z%SUIQNpf@O`7_pH3o{id&l0rRx4u{RY5 z$z2wc{S5;3o)z8@S5mF3di$n|GT7q-J>Sc3+<%;0n)J-J~JrBjVO=zc(o+ z>yVt?w*YD^U<34s3klR64*&LNJ3fghX~B@n0XO&BFA82w4Y{o?U<(7*?i!$iOsB69 z`x&gRe7?U&KalpF&yXkwuT+#(jnS5CkxV73fzh#0aVSd7s=MHG{jhznpk23zn1>UY znx-8JGWiW77UY-kp5MZID-C{4$u|w~vkNsz{^tOdQ~dkY0tt0QV+q&8=qOHm5wVso 
z5-!|W9C7Aq?Clmt36twe+xqB1ysDp;_Wt7c-~ws$XOJ023sklJJZ6Bb*7uab*YPBJ z&6hsq{uB)e|Cxr#0D}o2&SZqr)q(V}k059GB&eI5GK{};tfl2HsA2LsCN^+99=dow zW-Sp7EW*wYa}jr)bc=k&$P=gUo2F`$8bOz!Mm+w8FOIm@SsU5@~vR|68Ki0n6V zeTeDJKt`Hz!K@kpN>M%+a1xX!bReLShe&i8R1U6qVf_uXUovXY#kYq})iL6y zaYidHQy@xn2^7!-E!z&soKavZ(LHS}Fx!-7@=Gytur$dIE9f~+D=_;84PO44F==4a zv+%B9E_+rvJ~iSWm4jvuH}k;!ILE-34u8)gb(X1CZa3c4I^3wnr}KeLmwj%q4fW?` z-ezYfz8biF2N>YxY28rp*WZukmN);%VC|*czEojUWds#w0S6S-@uuQ2T)w2dQ9)m< zl?Ls7!O+n{ly8bzx6NMth1l_PjSnF0AKRMcDMDein@#JTi%l;a`tu2?0ghB2ZA&%# zI``Yq24cSk5PYxDJHc3T7fjqOWEJyWRmxE{@Rajv+-r8gOzB-?$TEmi&ue{Lsox-K z>uXP7p6tPwK#lDffqf|fQ&)|-Q{-&+?`GiyQxx`BBI@ytdE+|29FtFgeg7hrsqWMx z^TU4+NvC#1o;L-{h+%fv6eYzs2GRFWJTQr2h((0@a*OUq=INyZHg<^k*4E;kdX3K) zejNH23s|C8s62-IxA@JpCuE9@iUznQaFkXqhkQL1602@~IrKechd;9>0t7Yk40B}; zM1?`_lb4G?@Ra;8UvWo`0XM5buruQvVhzrm4^SrP1lR#3kXjuD`e>lLFp_!4#{J#% zC=dgPWz;saN+YU7OIC@;G$=QS4<=KM=|f|uCmBh+;%TwVlPC6$*$59$_~Ct^-5bsw z533{7jhxf@+2xQ4oA{nl5FGz`bFlhH`Tak7>RQr9;-}xw5NT2_8f{E>Q=VPGE=mU? zZEiisFjN^*UhB%-i{bD3O|~mdkJcCrJlR103jf@%LJl4=`}IB9p?tTv-?2oPGX9LG z-h3uCBgm(+UEr|?f&L&L`4TK73S8XNIc)f?r!5;mWYN(y061;#Eug0&@XtMK6A> zW&lm@SlS_rS2VrTeR=L1WQe+%ifb-L@N z=A^pA)ND!2{>?*+S%3NdawIkzWUZN?F^gGRiXSOiTFAo{K5LlioIw1wQ)}A^=)neq#eFjh|06kXUXoLq?#`%xwz+&Dv$xMm#ijfhIDSnrLs^9XKa*_v43( z26`~!0V|JOS6+opafHEcYxPx3`=dBDT2z_zEFj)r4k=cy&5-Tnb}DBHPtz~4Q3OoW zamStG??@7*GcZ<`8XkK$LbfKh)+jM+MP;--kRbd)3TN9!yUvD`{=-dG30s}-F7z@( zX_zFV@kYOPfdG`bPr;CS73;-~a>h^dT0;%5!}yO+)S3G*i>o|xjlP#lb&tMxF2Szg zGXlG0rpL#zy^>m5zEb|TcGR+@2`|3hYFaLk@o z{VG=129=4V&N*eN@BDgiB<$F8ts@2OOD}}bPRvTHY@TuwyY3qcuxUhOCI1+hQ67~9 z<|KcmI`;VA#O4Uc78k0#|9*krvk_`W9Lv$Xt`=m)Cno-;O55ctC1emQ%jnIv$jP}N zU3^jn{gd()xkm*SmYB~15m8=8b@qY^`lxFq$VFTm)IvTo3(85t8f%aEq!@Rg!$u4q zon$OHB`13fdL5k^lcYxkIb3_&p~HqJq5ssalN^}6P?%zPxZHr`<>cd zWxs8$XtLm5?hl8`# zz5G!&9&)XWk&2gf<<+&5GLWdJB>o18{cyoPnX(xNAoB{n){xet@GFJ zQ_;k7D@qI2R?Rm4U*FQjf%A}e6Il@lp{hf8gQ1m#j{*yFVGg3mzS|4wK!Z4m=a7;u zID?MtFvNx8x`bHE5CTaWf$xqBr_rCW&wiH{->3*o29nUh7UfNla({)k!ZGpDxiY7K 
zVITm%a+R1CALWifa5EP`!+r!h@(sVNcADbpBV;M4j3-X`uo-qmD_tP?I>I}5l;$X zPG%-z%)>>d>R>SutHWKQ!Y4Z=i)~+3?Z}U3sZUU-D+9TY$wJmN7{zwb;9zY9$rFPs zG!ln#PFTK3Vfq67^gvcyl;obNpYFA;1Ir}L;FU(+uLs*iNF(^HO;UJNL#j%bc9<`W zWLSs86piP?#Y=zA(kR7p_D(&)(G7F|7!5V+GHWxR>dlG<*n{7lwBcXopUmDHzC6w+&EP>5DU*_olXn1@qW7}A3}bn% zK)Pu*(n$w?!W&GYg(o@uw5B!a4@B-8@cl)vt~_utR-u?Jsx@C;gI6B@z_N3QTNz~F zD!#GS)6P0@flru~H&rHg)CbcT=Ft#Po%JuJBa5z~ko!J5xb~ zIJ~+*l(%3D{_r@{b^9y-7 zs3b_EkteidrsU?(vj?Y(y$*`~`}Rg3`#^qg@d)Q(jqN9&H(R^eO5B$x7+y`03ozQd z>o&xH&dsH2HtCEvJ;I+JOKem@CME9M{6f={cTPZqelD7DB5g2FH=;0C`Wm1bn~gEs zP84M$<%8i-6x-Oqxu)G8TUrUStzZx@H^Hbo5#T2#!7!R2Fc=#BVu$VLjJY|hSk7is zCCFwJ9u~s+o}IkdS6vUhB1aW{(ySl)zBhanUG;X1soTPiqfx^$<=10YdAHan=ipEPt2a}xj)%Wfmb!;4RHwlBmzv>j88IUt<>1|+6?hQVEL<%>b zfYA+>&4v%&WXQqOp33z*JD067%v_}z(1Y?syzcUG^+=+}9FHNpaGGxLWl4Z)7}@9Q zH7VGgtG?J*wQ`Sy!sWubM%#=XTET<9q&;sU6?MYCw6D{;3IH=j#pa>5#|Wz{=*h@lW}{EL6Lun%_OPP6i(msVtPgWWZ51#wFAH;;!+zdC zC@WwO;6brhv6azl@hV5W(R$(#l=_Bn*@zhF`r1y&(`r6oDYYwQ|4ng}%G`blv)}(9 zJ|Rq79d6U_nD#~P8;dQgg% zl10S+=uz{WqFyh|gjkCB?BN0$zVw$3JSah~@N~4Awimo1DBH?|skp)m$#IK2D63e* zQD7JkqFy(EpwA&I^Rr`*6m9aVu|@(VS6FMY_A1cySzy|;t#e3Hl|cU$C1i2hXz@jB z)J@v&AFjM!4ax0Wx8A;T9tCu}Cr^l$iP&oD&~`XDx%$+Vfm%m`k|r%YGuuG(t?<(5 z*l02ydADh83Bv<*Y_V1BYw=>-SEhUaECKFM8JE1x>}BNuMM{O*lq6gFMZYNBTfGkR za@62iCBe8G?B{PwU5akrTm0SV3>0y@-2=f>cCUIn!&PnFb&pHF3|}o_yW0lOd!wm2 z;wcg$rBTi#r@xzf{Nqwy2gj}A79UuTc?L-B zwDxnwJ8s%kT}+)-hP-ok>75PgEHpTt-2!-9@YOq4L&19?F!+`eAT`0`-t&vkvg@4M^wUZPau-<+%?d%`9K(yXf_-YWLGMAQeR&`-D zw1>aM1~Jb#nTgd0to|kgx*lp~oyiz-)TnX=;kyLyOAL@9Z0iXmB}ANg2b=IIvTSZ+ zjH0Tw09L-bqFYX#g-MHe32WW*a_RaN*n$fbV^pfvQ)a&aSXF~HJ%!JHcy*I4>_<{j z$^3)K{(hj%yhs1J_>``+JB0gP1$5iFM!lo}W31^%*0kf2Z(c_6AdF}D{cF2coOwYh zlhvK;HBB@LVI+8*anCN)mvx-(=?3waU&dNF+Z*#X`hnnyx4!giKq_iSm)Fwmsrf(S z+6^lPE%e<{+(R1I|6560u^g*S@Zsq?*YyG=e`xwM+R4SS`-MS6W}VI zxa@^Em*xMF+vs&FPBXYIC4kGegp`7VN!h4|n7{YMgfBN2aGE7ebZ$xLAWEAK7i`7G zTV&&nM#H6S9z>A*xUBYf;Ql6mKqzq%CW^2F5c%cgK0&kdBO=cLER)(W{F>Uy0Gch&x1K!1SA<9O3f9P8_)qb6j>DiH15VI^o346w|d 
zOeUIxs;|)2!ZCl}^vh3uVBKmPPTC4Y*m28MBXCDsv2HF?Nw0>k`z>h zWjz#yL`N-{XBxl`sr8h%WuLPUIh9(pwgW<`#gla?)NIfXlh|_%g<3}kTos-yNC3BcXSBml2V*%qsz44^s?Q?SB9|&1D~d( z9Pd;cg~yvaLu>k7wpzG-0qXh!2YS&Pl-Hd7QpqX1rUHXe&eGamMWp)Y{td-zms!9T zpgdNSfC%DW%4tCfn`CF*$jl_53=hB2l{H(a_p-`UcQeEERy}ab@F=Y6MBoki&zjwOrcti4pS zJG^bBF!NwdDf4L|1Ek=6ohrMU<@?UJ>lXcq?&ZwA*YthR)}uIeAS+Bu-u5l1r?)aM zXSjA^!ds&_@>MCMNu^S0!8fq_syAmb{8JA`LFrTE7ew>#&+;M#x3mN+h!)o>T335U z9mF-2lvcAMH5q=Z(JkWY5j$@S(PP+A^In#6PjN?2UOkkH3TL3b=1UzZ6j?V5nB?m7@t-iG9+WhX8 z^=%T{^Jadv$+U5wgG#T|*Qt7RG>0J?p85blTVTlrZmKpc&>0vb23a zo^3JjwdeeKgb>-RDWkH}CaZrC4E3DjPn3drbYsIfGyLDNz#hv?N9^y~3Q}EEhmSn^ zz|dATt6FREDKkHeqpW)wZ_B*3*-WBK1^HYjqK5##kl@8xqTWYrK6_j+c?!6-?uALb z0~*ZKu9?z1Rx#UCIXet-x(eVnBtzV4Uyo4AY}*F+)Ic=9gWYIxQzwNK5cqucSWu!@ zHrM}rSVNr8!w)(C`q2#Z-YKx%TNjumqBC!!G@=X!j3#AQ3Donzw;!|>llcYQa=)N{ zrllngSSUF;KS(r(w% zACG5F;jWk3KS%6Zqn{k0JuUXhdtC|8rC6qtkMaEcDRS^;dQhD5k8>*Dp9^S@jJ$g% z(I6TJB(AU}aCa(BxnzD=d?V$m_@6J%2HYn>qu+&kzwf|2YGhdcx#YT@Yr>B9mxJn8 z!KEW;;^O5k@M_fEdidHn1I)%uj4~#J9vfc7&rvf!j?m7f=b?Z;7B{cVXa3Ef`pOd3 z@qoPx!SG*0c;LXm?-C5^gpSmT05@iujG&Lrcim_|Mi_%b)|u?}KbljBI)?GFz?i7- z0nf@2hn#u}>4`lj^R9WZ2?JYr$#*sgD5w8QNgpRC|H53Sn2OO38+;;yaE7}K>1!LX z*Kj_Vj9#8*EQLS6$4GC){RLrWBrX)Ti0_NyzLhI4eR`_vcI#4~{~Ek&PK5nLC7xQr z?jOfCqAH$XY)r|qG~EQj_#`z21hLpG2#6^r0JG^FEf(Txm?7649=xT87~bQ)a_b?Z zNu(tH#e#X!0RjyR05Rrcg zx@ub8DpuVNF3%EDV4nYJg%14u7XqITxDb%P0?`V<|(Dw1B8pGLB~Sy1n8_9 z6}chny?r@Vo@-5{7M$Wel{294jZs1uu=ibPjm~H3XCmb_c#QMn0LTIb`Cj>#O-DKc zaR!B!ToA9t)+ZymC5ivu#zpQS3=yQ>UWBfz)IH70&dXKmcQhP`^_Ows#IFcw^e{HA{y zMmOI66p2^=srDu2+-*`yt>LFSCF^iH*nwbZ}PV z&`PO9P;35gz6usu>9+MmH0a2+rrbRXf7$f&{0RP#RA@o!)2#od*<>U=q^$H}Q#o3X4HY-Nh@DEi16vS3Fc`AN0Ak5) z9ZLJE;GH#ky@uPh+GtCg2f^@T6>2Kel4byyJDpSqUowagUnLZ&&7A)n!#&)yX^m1F zD*Cy~>&4J$%p`~Y!^UP5Fis?LUclMlG4kOK($VYH0xkg0Epz|W2_jD1ab@|oZIM(& zBnPKv5x*I5d?W zTU9uxuNA<6$N)2mWW_p4w*UDKXpo4N>nas(J;b}hbJIp;OP-{Hxy!~d_R+J#ZI})4 
z@>hW-y+LBMJjl_A@b_F{9)6M33>Aladt7I9R*t}b>iF{4X!H616(e4Ad3DOwBzukDA+x`-b(fQgHRn2!XO8;fp;6jq7b<#VuwIhBo!_4cRl$T&L;Ur@To~_7{V&kqcA;-V59O4aKqj_qSC3oRxM(zq)*55PENPYym~bE0a!A=?edu( z{S8VSmYS?w6Uj<1EE4xyK18fcwKRI<--Wdf_`mtO)u*~<=-{3k_1$ISa=Gp$@0u$v zTFqQ(+BGi!bzMF@9FlT5$IR@z@454##uJ`T_}WreK@CYPkB@avS0HFG48*G5OS}Da z6xp!16*-LvjO`1S|DGOs34V1M6dJU?O_!L?As*}Z7fS6-&o-P*Yju~UT`TGTOYB>; zaJG8~s?QU{`$Q#Rqq{$=xrsdV}_@H^Fi!*rIkGTuy6<^gNW?Y&jcFN#Ok?le@)aG}&akbYZR z+bb0&Tmjs%g#M)9zvUtkheLYv884$peDPJSmHJ=fU||t+iUmyw!q(}`x@M-Mj5KAS zYi#N^Ie3SMPnS;?SGJttRS})yW*}lYkzu$Q%39+VJ+62;pJEv5CX}<3+iNwh5Al#4Wa5lJq&Aahp9LBe> z>;SxOf;agLcsfktuiy7cM-fl)L5VJJ`csyj0;U?0;S{*lf>xG~JUxk0n1rgA?`t5b z+FvqOHuHczPWcW$Tq=R=lS?cr6#qTwrfh2*I1%5`FtP@pTnR%;d(GSK?!I7C62{u2 z9@J56PDw-7nupiGqpM;>2N_hH3|&(3`}mFDrwo6H5!pjtX?`!nVNe5%|H`QaDk!Un z{l@5NTJYUeSdh%_Q!t94$=_3@h>O1)klP!GK9w}ncs84dsQY%Kb$i3)A2C%3iE9^~ zAO7dd2YbL-Ch(;@{JgWOmu;be_N|G_Oh8pq6$9?8NHUnT1@J5~0XSgdXIu%c=N2Ks59@s~GsgQ4yKn+otg8`D0PF~mz#qUb4S--DS=A>LBQr_jY-+>(1ny}uUY)Teh z6**wKdx<%jV1Rv2uElCzg}9O@hy0x7XE@%Dmskidh=EuY?AHaoC9?xChPi8ru=tyv zZKhGr)a3YhYP~s7x!noii@J|e`Zy}JwP~(LV`2#-l}qsVEfVtO!h(xaq=G$W+V4dn zIA6Gb1zXcRzhP3R0%Rg}7_Pi^$JbhX#$jvQA9$CyVzd{iq~qqfDQ-_fR21#tyAl z!J!1qux)zfM+p0W=Hd$heyJJ^y!Ij3ljJi;dB#rHsN#l#wik4+)FMqep9StETes1e<4Wa2`G1uaiPJIr( zqP2Q+L}_EddhkGfAI=L8a?uMPB6lSRI**L_JSMLz?f{~WRjQN%2k{K(-qc6Xg(AcB z!Kzr4mq?oP1+d(=05Ush8XGBBQ?A>E1l%#tAkNtRvSn%=2wjKY)cs#MxR{V>&kfxz zFn?&d0I!b?8UQgBi5k#msfV9>T*AIr%_^f0899^I@w)pXi$ol`YerKs4+X|sM)59z z?KFIDjZq22BWS2)YNiO;o=|)i3{fC|B~T|I*n5S-E&b9oYa+%J7|31@>N}gO0e8FD5pNYT%gcW$l8l zjKt1Ht(Hb1n6C^YVsF9L;b)O?3`CELi;h!+I5E1nq7peBi>T%hw&ZHr7$336*h`sr z_7nppANcLUlJNUOUF{T$-SCQEc+jsZlJ&6uALx!)sQJA<QZTKoIH2<`{EPZ;Pa(x+E`~Y>^&>>Hw(|leMVr>2kcv3 z^yCq)c7Qq+K8WmHEkaZ~2;`AnR3c#2TVbarZ9Pj0UdUH>haq~{ulm%OO-C{?UcZ~G zwLrHKxSIZSzF2eORVN2*KJ1c>{#z`Qpd#3QU)1SgoDrb!m$AEzE{A0gV~;ROUx}GH zapyDW% zWXYIjs4)~vAlsMWCzql0tZer=^HXXUh6Q2r1V?b9OfTAYK<5K|4@P$QxTN!ka67KZ zssY+R)kUzNn^ZGWn0QgF$xP~5b5rmBwEf4#Khx&0^}%3t#TO6UfPrFzcvFKZ&+tE) 
zAp?$wWx*`XCZHBjXRopZq)1m;=U}m`@-BGj@kFU`pq0PrW@nC~{PCP=N51hvFcqB? zY~va?rLLKH6IA^zVDQz7%`jxiAu|DAE8Jd6Qzj!XPH{>;_1*-#JEOL{$nJ^7G_x2& z`7iUgF_AiG*`>qu+^EpDeQ0^~(}0Y1`KcQBFLxt9>h1Vs&$U zm`UVHf)UrA+{*u;)5J(kb0Fc{86MK1TzTAt+lG*Ahe3Kdn_;`-(B3p!cN2rA9`YiB@szJkEgte(XRLO?@hl#lq9 z(qVjC*ax%*`o}gPvs3PE1Ti>AT?zgA)6YH!HfJ%KD^7}j3D0_D@lG=jbmELiwXF}n zO!LZ%j7boY zvn+r z_(+kZ`vEqe_tby?>+aJ`>BqAPzN&yYVB|9ny68AD4-;O^a)`ds{0sbyT1Gq_#QcM^ zl>u6sTG3KLuDCWrxtCil&~%5q*R!e7jgf4y z)p_9%H0HYWrnXPNKw_%IHumlN;*U+)>{H6|eKBIl9tF}*?CljzKQu=( z;@<9GVyIoR{+ph|5Jc-UKS7Uhi8~{6Z+n33#S0Be@G-y_@ZcO-IL;LWkEAlWy&GgW z!wT2FHVKSHkvJL{P}c33Oj38O+QlO*nZC~7iV8;k%5@N~H3 z4j7?)o4!4@EY0Ixe6RcRgt&KEc>}YEm<*zKLu~h3XZ3b>-F%t;-MB~T>#;=Ghfg+m0T8wU!lcH0Ji_1EV$tgDxp!hlC zDc62uolD{B%9SM$U?R)CK*}=bF_3Hscjk2Pxf7UT*3g8)elsb znj>z4RZb4U`El%NT@*wPH?PA5vs5yB=QWg`bpN%hThYuzVAq$6&9V);7s|c3_Nq|? zZXMiflBFaAhwZ%sGCz%oGi(33+@lYGAmv9AEFcV2x;nE+eYPqVh?Lu^I&*sgpz(Ns zWWPuccl4l%!5@VIa;OoAz+`?0!crJLtCZLg_(3#=wlpyUEU+KQf#h9=fI`v534CEZ zQD!C)H^em87=eH8&XR`i^B;k7^8wcfuTC3CwzP5dL1Yk_o^7G#lAH})F-uaQJ5G@CO zj#T@fJA~5{H|L9T&|q^{Oj*GOP>N3GfwYaUqAbV_eLmv|pwzDh-AI>HX3s(dFMuBV zT%Rsd#wXMXp9g%>j)0?RRk@Qy-<|@nfdT!D5aM&D`w-aW0vVYeJv{{4a!86*8CTSL z_|2K$!2ev(5_PQy>?;!h+4MS%z2IDcgSzVfZ=Rf($m#Rlz4f*b{ip(~oQwC5gruNHyob>5t78`ZivnfB2_MwUMz(?ThFu z`&{^p83%?s6olmL&GfLR4bC2gSl9j&S^wys@T=xo<%m<1OS!0vSz(lt=qy|ECp?oU z5USnA1-_Lo8lIMCPBMNClg2$Aaz;!0Gc=(Bv964_0Z4ml0$5vLls;R|AB1^T2{*^gEP0ZWr(sfq7#J<8ZVj0mEB#-4qui*c&BFF zWaj-5@LL!2WQi!Vfdf5fObe{D`<;o$XG*~DFnv~%75m&Wsh+Xf%E%=2+#3tu~ zC}+pLMVTtxnLAM&*2knyqK8);5znJ4K0A!oy7jjTe*XW$g5QU4!~(^=|F$9fIrCzv zsg_{rx^ai1x7>c0zGIqpLl|+Z*Nyk3dAu$N?6bnifdBi9eY> z2VW{nzNoY|jH&p8PB#D<*?E z30lI#W*%#)W#{>Vp<~1-_=rj?G5R&ehS=Kk9M`MR6l;)ii{3=Q8Q7o^;0(#PcvY&a zmyExFE$&Zbav=WBofCNDz2pqZi7F@508ZEmU>Qy5>K@DBX;m80f*}!1)BP}5b4Qv_ z9LRg{f9nP({XZ$sd~+l08-zy~N#}jtXVyj_Ut*Z}ii|Rc-M5vldQJe;(3>u0b>DN+ z_!8e5-LVwAnquQr%@8y?t{kphMR^kLq1B1U@=6*@>$I0Q?dLb~zX zS}qs?bJ#g0>ISr@pWD+>rl-+w9AA8$QQZXz(*b)k)m2Mw>8_kBwJ?c@@{zrlYifMf 
zPGD0vim-UM(SmVuC9jl6v~Kq%Jus=6qED3f*Y~>F`&bMt1HGmtZ-dc;AM(pu9 zRzf*@RMLXJMRA?%vsa!_=n;9uJw!^o8z%$fGS$eZpPO?LKQz@McUGXUw`o*2U=PBPv((a@FubXI97yX`z4D}7^~T#6>Md?D zJYak}baDv{nu%c!Bwpn6>oE0lJ8Ihh2nPVjw<#)zeXkdziV{I!@4zzL)jBdeNDpkcA2?fi1 z(I&F+ttc(#CsjUD;_K{Wm+Pgt(dN_5=4I_{2j~Wt;NUyj}lD_*-Dbu4LQm@G1c3Gf&FcW@B z|E2*}5rPf^5=}OHg0n42IuYw$*vB%3$Y(nVQ76ySL5i--R7ui@MQMhdn*1rI2fPXs zM#A3EgLuH28LpHjL1Wlo<{uEi?-jI*o036LaRZ=$zNl2IQs2}XY8f;Cgo3Hkh)=5Y zaU>f^9iu{Cke>zH>OEW@Pmb2jN&E&d!#4igQ0_3=kc|j!H=#K2 z4w%t@q$0pz9ZuT%^a;ebyYV1QJ~GGEjNB@W@0MpHrK05&;(=(fc_`(BHQ&h^ zf{3C%TjQJ-gj)O8p`~lEKQ^en-v!2EOGi^w0PSHhwzbCt)BE&nT=3fd)sp@vPRV!{G;_)vQ00^NG z;yp+JdEOf1GoMSxjQMye_s#VN0mw9m2q{6*B}(4g6#}@$N~_IYeAihs9xq(#7BFBh1z9`Wc_SGnEeq4cAIEijh#z&mcjV^QJW8^$EJK2&#e1O`5lRQuDScX6W)>n!!c5+s}t70~bK4tPg zY5TkEy3C@F;~~mr-jf*%$#K=@cU{B#M6`{sUbl69lC?fkRWkt>C>#Um4)TE!6)`FU+4pCM@PZsLm_ zqnt<~T`R7c9ZxnEW8->SCQ!xw@qK1O_qbT>bvUzyS2r}D?b|I@_H}THQ2+I}BhF{% zMG-r4>jvTtyW|A1LEV${O{8m02WXo|;+zKGo$gbm+E_QOo0d*{FZ!xcm^EqFSr-Zc z#XU|J?^iZCKCC%qUqt^_f!v1Vt*ApM8nBfK zNmt#Q+eLReswf}w+ZT4OgfYS03}OLlR|qxrM)(DNFyi4)WWwU@feS%oLS`VaVLEx{ zNwamNt8ADr>AKopr_@^AvX?-=w`b-P?SwTKyG&W(Wd2OFJ$HYM;fXSW57=>9Q-(xu zQbcXiVs6VNDVK|tXI*$ZUh}KD%+{u+lFu%L-xk7fLxtlAi)Y`j)&^zt6sCsQ8V&G4|!Ug*Nqo>`@PKXm62J;o@ zgIoJFJD(SDP87yu=Qo`Nj+if3R;Z>u%qbNw3#|)=iKeaUZ0BI0I-TR3?5THy7h~iF z?9wB5aN~Gm_tJy%w}|^Xx5@&n4L#@=4J(``=3ueTNTLs_r52 z*N2@Go36W`E@s#yir>u8e1gHxew&t=cOh*iAz!3vTg=|OVmp=fnr%KZCgSa8JW}?i zlWr(JdtJ$ssn2$|K8N|U7ZD!jKzfQ5~n-;#*!cB+2sVv>>(Yrm+A|aEKBo|$x;YsvF`!LBu1`L zm(kG)gnXwb!wpiTht3vrFY0w)o!Ru{j3{==Ao5TqP?##D~5_R{YXPA;l z)Lq=lgp_uUHo2-k%hmmBK2~ssCu&4>ezNgW5`3EW>`NYx7(AOf=H_$$czFPL2rlxn{3!FHg znIJ)`hc#7#>DkLhENWCu$Np5_40F>Kd_awXV2VqWi{G1UB}djYU6)S+YS_CJ2kpjE zYHIDC5w=msQW(g7XdHc8Vd|LCYq~aD)=q@+i-7|DpLpvZ~GG9`= zAq^kkgV`Noo>Dc5(r+H@G_m%zeLk^ZB3HH8V}{f`g|GFII0b$8l!4C*6XB#I8d^$@ zue6dSF1LR1J)Sb;)0`F3C5Ed>Zpww@JyabQ9Hs83aRxvI}Weeg?c#Q`Z7! 
z+OxKxiCazum-!g|y-Gz335-^wRSu4=-{Nb7NO83RGpu^qGGaJnG8P@1r|!=P$Z(vs z8aOsTqr&nq>b&^G6lWWK7SD4DH!(O8E}kryiyVb%YW(FqSvw6>1>Gn0wbI5jt*ppJ4AVdw>3O0kTe9JS7?5%>RUT&I9k#|+`8qMb% zgKI1nL~HQ|#RF^uHN-i=w@b%KRUhk7%gCek5MMWyS(Gwn5mCE_9qu`-d(r3;Lp*b8-AU?pgwjBKiacqrwyP1z&n(qc z{5b1?+jeAD=UM?D00&}BUM|bS`qSf&EuRuppUm4<-fNo;la3uvbuOC$^==vz=W{NT zbe!~iq-!@uPH+En&j*DgX*+* z^aaeyRxUBSuwq^Q`(2;Frt@-0jY6~Z6Q+DgVsh#6R{u!n!&3kAZ}DJ6iprALHZ`w( zR*xB4dkC0G(dfHwy^tg|nl1=}pcdfn^mjlC zlEL%E+Bz!QGADZcm(>^lcpi5Z1U@aw*QkfprVb3*M?fP zg4?t|Kf&t9qhr8hf9%leUajs4KwG+CTN0+k&&{taAz)qpL8bT~-nLp6RQ61=UoKwe zUHt!8o#jJ*06wXxljTn}K}At=>9w1*^rU}3PL~~ z3%eXFQp3BpU)55HK4dZfv1HgYpeLBH@$l`X?LSSKy41Z~J@QMB@Y`elc$SH_lb(Gf zn-rx>zwWz-IhCUD@8@YFI z3Tuxnc5cs!Hhd&l46H#rIXTp01brlLCXSn>WC6BpvqZiw1Enw|uKu`V4O4~IBuh>`@e5L&c&x!2{VUlkxmrk@8>cK9&k8 zU1Lc`vu+jUtw5isflKC$)PL$Z$5xzy@Y?&|-S(~sqbb9m9_@pJWc#0~$kk}HR5 zbypST;HB|#3hh6)v0BPK`@i3{=`pO2eP#xclOl@+YxVzQkxYw_HS2a!SYgs!S|3wa zip)QM+=bP(`)$RpfX;QzHZLP|R(Yrt+5S%(LWXSk3lg|QjosIzf8F%p1hV0unw2XX z^^48N^CBzg{r)t|l6NxKa1Z|V^rC0^XGg4rxB03|uX8TsZ;OLZBfI)@!B_sa_IjTI zm&hf3nquVx679)Jt8v`V*ZtG=KF9;Nvmvs#@9gHEKll%C@bjBLev1$RwDwJi$H~oq3Bi|^ z=*Q|T{Yz^bc&7*AqUKHPO#J7`_%F+ztQf3-91VK8cz2T>2*7an~PK9hu)Kn7pSxJ*N1v zUd-MUnV`6{ydLLHg74o>yJ6Fx=NavjZ^^ZLD6aqr?!=N<*P28cHqg!BtMybI)5{$%m*k0kq;z>lDXLXXAlmqSzAVG{*}yb z9)ubgS+!#q!X;cDFs%P|1LJo6F<0{;hqa9CmoMzM z&0@;TUmv=={e>mBE0El|{?DfRLnYtDygvGW^#5(+@NPg{^vZ`nd;aCCeC|-mwA;wKDVx5Uu#XO{kBfq+`d_&7W(qcb zZnRvp##@Qc$uRpB^C|cMub!+GQ@L`Ze*A0Y8ZKQv4`pP; z)O?12xtku9)Qa$SD^IlgY(IlBbl_CM-RDxBP}zij4jV{T^~-NSj{t9wRsRAIjBNl; zDW(3yhx{MEJY@I$0kSg5qm4(w$v2?G| z-9EB@*F%q*=#QFzt?Umm#SqV{&|-maSXbLzB9Et9{v@g@?&<%W0RC;kf~kynt|yyO zYk97I_#bPbwF58y`^v0|`miLhvoZ3r-Kfk$R_^@wa6RW;&COH25)@6$5|3jX{%np`a@uu0j^aI%cuY?85d;?-7v?mt)fTns2*#=MXw{Qe3*&XVPl%i3?O-KQ+%HhG3H zq-FbzeTp$<`j1Ps{4o^o!qdKfXLj&6`w!KDe+i+NOD9{g5P|#rIL!aUyZ&kmE>b}y zV#k9|@W8wO=@u`CzN_Kh(od{-dJhu3R71~A`sb6H@=|0XQMzB>;`hh^pNvjH!uvDE z+laf#&HsMQe)Z`qOT4tY*sJuAp&W2t1Od2AKt!}4rMFqY5Do*3OPHtX+*q*QVNk^~ 
zfk|~-&T)VRa)pT5o4@~)n2b0F5dw4)aE6W}!L5h1L~umx85lR&V!am;EOxn0;$TWQ zh00HU+wyw~>X0+8sRXPXm*1Z52zX3zQAa6(YW8qGAZZgpataA=0wrd1B7k|aD>)O| zIxJGG0P2tjBtEuX2s_gS`m9V#Gjj;7#-M+_6vVkiV&BLH@|S`DrZpgp31frMG)+bP zXqR()>78Ld*%Ib%2w47DiUZr_LXXt%0jx11)EEXKoNN%{>_n8-FpQmsC=Yd5A0j42 zT6fyVaxX7puA!km2bkt`azqkX4uE@bvN{GMETaX0nZo(FdFYF5umGf1j{NMHpNjmq2WKDdu>$D^XXz}jfFz$C z&3g8FhBzThkbcjp7VdoO+*;sMK_lE&a@65tZ)j4}jILE~?^9Hmbeeq*V0X0c9obfc z!+kIeRRgHo^;~lX6Ro7Hv9uTaKp2_C zu~qiO1***|sT!f%zR|K1g!f)4A$J(9U<>i=q$Nh-ivXe8R&#vuhIP;}U>@RAA$#K4 z71=QXKw0O~+c2sTvKxkH2n7iDhq#DaB{6+*&njnu!C=DEgZAAx{N;?X)~GFxe-g%J zI{+!&k2%6=I)dO{!x==`EZ87`M2|kW)^=e7pQ*AiqdmWQzBISSR)3NPF%cM`5ts;xF)~If$p@{L$VcFhi6Nv+mS{gh^?aseagp!pQ_rIr~iy8HFmx8|GkIx6dVe ze0~ggD%JBV{Z`|{|284S9VoEBah9WQsCaQc#R?GC`d`!~&Oe=F>!xQ_Ce(w^)sW=y)W%nc zLJ%n%s%l4Wi$X*-td2;rPwpv87WRO-{YcmQ8W_Uxj7nrt(@*L>3<^C!XIwo5T5NI! z=4fX5gX5Ex`MBk?Y9#NzeoZ#D;vm*!s7D|Z?sHeg9Ry@*F1nqNHW#`aoJ`A=@ zn#Iky1Nsn!-+l`={v#O{LM3e6a3oMXy+-81G8RVZbDwJfgxr#DTs8NaE^M#Fjpqo9 zPREnMc>Dd8mo!fuutxa~!}0ZkmCNaF3G=Tm20YH+-6KR85CgXD8E#E6Ad#z=vRcD= zD{YUL+X9O6K;x^OH9AHme`69{M>a4WbA7vUb0=?l`TlI+K3>mo=(ji&_NksC5w&K1 zac`?{i^oHjh;ii1A4NtKC$CscM1I8Q`oMf{AkW19!iNkRB%iJ)r9xjY>!4c5)LA zwL;Tum>oB*7wlzi15tyj>d5TuSmD(z|IeTYe|uw7Dts8B^s9Ha?7h7A#@7>PEUuQ2 zZ)gL8c;$KlYC6`N-END8icj+NrT|zNt38DP;?QKuRjp#a`eS)KK2Sve$1Zscp`y=$ z*-_BkB$~?6YZ-0p1jn z?VAKefHtsQ*0Il=;*mL#8x@1Q8p6$G3_KN$LttY0hwn=HvH}v%#RQFkb7+ZL9|oOA zbZ=xwYMA^3zrqxBYK1RbCrs&tIOVIp1_{M#AgNbT`5N7!dABseT(Y|0Ntas(o7f(^ zclpA~QMWkjw94uL24~?mKM^oUi2j+EEq`FVI)=EGcxC0{Lm6EXJ}?8hO2uo0!Yl+@ zL}R+#k)qiSz`L3qkbJ<85$7m|=ja5RqT5I`nht_n(kl!@51;vX9ra7E1DHH#_tm$D z96;xx1N0)SwCP1&!qgt;7gnM0q9%6^+`-uMh~LFAs*=N0zSNE?62`OohCsqK9oFD| zrYpE*zKRp%7RZ5yfl7Neh_!$={JG!Z`bqwx?I-Z?927mX`TIFT+>OzJSWU6HI9@U&gC;IT+qHt_MXVPsuh zYKLQ74Un{iqnM#3%R(cCJ4?#gqR4sb%d^-?J=aIAup)gJL-Sb_JXD<;m{4}rCfLJo znxNRcBksyu*@g}`G~RlxVDNe4H~dHaVON-)*>7H&)vAIeU_8-&Q&M^*XZ8y! 
zuIt^79aE2)GFA&VoZMKO|9y21qDGZ~k8$rPu(~`gD5O|YygZ6Vol5rxTh~Y4=1DD~ z(BIkwv#=KcED^%NZUmnwM?Op}s39ef6weYckbVJ_UWTK6V5D2i}IDl>9;%&210&M?K88`S_+CA;m$Rhg<8D07h9HDBQClV9`XT8RFZllOEnF zg_FUEx)$bAuXO@3%P`Pnc)iVUqr$x+xF@nbyZ!<>NXd#Ud5-||MtU8~3qbB^Tk0g# zafE*i2qxO$xC~V0^7P{p5j z+8$!|_rV6Iy+^L#2W-l$3%=c%YP;kgCp@4R6(t8zEwxc|QPuIMwC58XEM|dTH*i$Z zsSpr3+9F2fTStz_B+y)#Cgd)DN(U&Hh-h-t7o>X%-TH9JC}uSLM!?o0T@esk4N=u7 z`-v}i+v0#_+- zHQv)f6K^m*SjFKYMOWE*F6wD752skj>mFEXP)$|K{N_k0D9{GbOLKt8x^@M-U7}-; zYb}=XQsCg1nw}&zCrFVgeM_oVO+k?_KV&nlF@ynO|cYp5{CG{EjNWH-tm z4WbYiGU$(SV$iN#!GtWakk#q`k2PSX23M%&$Y(CdE>4xbUOcHXKzaQ5u3<>hZJ#0hhu^$kBhtpl91~d z4(ml?Sti3Ne7x00AZASknweS3JRLc?4a);`-t5~fF zg(%LEaWG7NwEaYvvL~qGO0prt(}7}Q z!Q*b}3QJN`9HB(C^*nQcfaJqH!%q^21gS!OPeeR;<}#^u6fPAt%l*R3;VzShFb!v{ zi1ovu3c5WjxcAUH&a6cU3jc`JHhYzepw=8o+tDp|S|NyCdUh-gr6S$GZwk8xJi9HX z>fHix_;PUEh2x$WJ8ZK{lE-3{vT5rdU_*I^#xmdPQ9A0O$4@N7DN8K=RmNLyULsdUgqT0zyxQZB=!5(UYN@R05M z=%m(0Nx;4LuczK)1}X)~m9r^+z(2{I1w15`LJxTtko?M$gvdq(^VUdjUdAw+-6=}> z4j`FuTIVY`)Syv2N-ri5-H0@v4MNTHxeMvhOY_rPGUWIKoH zH{|Cs9|Se5P^hR0K28q*m@G=@N<$+!@%}u0kr@VqplsSvZrn(V%5$Mu$7N&300p`+X6PTF3 zm&N^Co=r?9i}{yoXAC+yd3yVGz}bhVjVaYV8I(k7(}@4fp+)ObJ5)q21F2CC@O0=A z%3NVEEjJr#DcW#Vbu+jEyOa@%?LE zL00W44Zqwg5MFa0hQJumR#Q0PhB>HUbed|V1tCT=&`r?g){2@yQSHk)6e334-D4hj zDbmive5e@s^ODA?&RQZ>U&+J^hn6dL92+08{73ChRdyOH+q+r|fP3wwbeBQ3vRn^U zon9QK))KS%@ajHEn#s33Gb^qU5f`s!NqJe}0dJijBS_r-s#*MDrHk4mTUJCv9$^ za(+cCycG1BIvFhz>lLkB^igPd$B}jIm0R+o|Augx=m1nnd_mu5m{@5OLV)~A++ zyjWVIHhITLIlI8@eIsuq>nK>%cDaXhvLEXo8@NpPhrxg)t|RVoA**a)9qiOj+^6g= zr@?j@u_pCi16>{Ab|c{BK_dc*6}ABw&|xgUTg8(Tnq}TxN&EIwdKe5d%Z)4y!YP_$ z^Das#s21ri*cbMh$;3;R9m@$Q%eXfWc780(U&uiAADA^|dpXY8`N|2~ycZlBh)-Mm z9Cl6Ldb}F^NE^_EUEB%LQ-}W6PmtLgqLCK>0@g9T&M>Zenl`#`TmZaDre64B+9l4T%D67`)c(Fz_C%G z)GWq`@OV%+#{x#OuJTgIyrRETA%Vyp$b^5uj%B4atDnWB0s)k7H}V|Mk|X} zhO7#1USqSJE18L?V;Zpdtvo3p$n{6dQkxBdC zKA3!+xx!M&;|o$_?`Tic=P<%G3&$~uB=?v=c6_OHVxcm~B)B46_QQLQhv+r~``bkj zOz>2h;XXJ6=64uIGp!r^=y|l;AbYn)0uOpHfvx_fUmZS(!>OGYdW8}<1qA|E*r}&2 
zUr?QfWukImEElehh~F5L9)x8XLNtvYpYdWbYZ**PTdX9n?X~T-tjS(oa0q{lTeVSZ*JqvzfY8_8~Orh`|hqEQvtdS7(j2LyyuNA}(ES(l1eI15T(ecrw9E(%0JKj32Mt#i5 zP)M9PijGpbZ^@Q4P^* z=Tg1JXv^mK=Hn`tt&#J2x4q>YXV?*fmVZ)qM`iYVhfc{eZ{a}VK2ulE!E;Z-X<(3U zNNiOzTokgn!nj(@UE0d_9r!b=dkWF)4IsyGcu6DD28rp;L3@xyb%gP9ko7>7*EV#l z5QiO2R-%Afz0hs2@nt|D{gLKP>gQk!r$4UP0f)2mlSb%I3?y}4|A-rgd?D*(D>?KP zET9RT9rIMXQeDfKPGt|{TSNGulcP;}%^C*pt8!qX0a*t%!eb8qNDG3^LlDg~fP9GH z;;*byjzZUs4Cgt3wUZN4@%VzvK`5=hOnFKfNeE_Fu!gi{)Jj+&ezquD&?d2nnkMeV zU8m1V_Ui?Ht#FR77L9TuLO36h8{e-m37Pxt2Au^Y(p6Sz3m+2auYX@?R2yc`3-->q zZheoW?`n^YJwVkH>Fu*Z05LD?dee=a?S=4l>Qdnp6w+V9wCPb5>|byyhIpzL*^>Q> z=b@52sL%m|Q}6kQ1tNT@1MIS%NS+8XYwoQ(wIvI(uoS6D8@a5M5F_Q=n5^sjHNco* zoV=DQp7-`)l~)hdj}jzNfgEEC(X0tV=c9^%&R73p*Pap&g2x6_Sw*J_LiEeoFeR!D zG$2;M$%4|knbY^REO48rpn*X5-W<3tQCo<%z?)$ z6Z<^+u=L1SfUfQRFt9Q9eW!1)l+&5D!5Eo_cqM5;vrCrzQVjif*s*q0=@{nM5g4=1D&~=GZ4KDq23h=(ee{s$L-&DUz+lnitx)j zG>5~Y2R=zh74U$eN7JnSH`vN4sJZc$GEN*<6ld8jKO(Aa#O^%URhUl$C0)=Dcg=^R zHU!5y=kbBF+8m3Rvmw;$8*)F{LtN=^`RR#UgPAl9Key_Z_sucPFLuZuDMhggtCXSN zfR!^u6xT=0=5|~%dSi~u(+4W zUoT|kDu%lrP|WS7?;!_)WKNWiSS3@^2s}fr!iQri`<~Cg0#S#=;{)Toe3*+QpXi?T zsA!4XBi_N3t8lH!RN{G&cB9Q%D^>nWUz?%3mfzz^EgL0By39)>lYAkMLEAKozkV)1U(xQU~H0nSx|M0 zB3sO!^y&R;LK41u< z7~Uv0X-U>QYSEq^!*E^fd&g`U$32?-Gewh_wn2CY< zGNkY#KK;oT*HQ)w36OlqM(J`gPe5^7Fsx0eGO^^rmoNDsHzHShhiV=7Bf*w8S$ps0- zTwuaQ5{}*!9PNhLoy#Ed7iN`Ox}Km!c6Zaxix-AfOFE!w(fa5t3marX&m>>JrXl%x z=;H+Li$-l5x_EhePyzhD$AhC9#4f3!;<+0^Kkx_UO73F6b=FQ(_fyK|NlA!dEP{PY zpkAor|9JtDCF1~@)z+~h6KWH^as6r@@i>Nd?o+ku*KqC4&l(RH3yEoBM1yIwKKdLL z*7}^+11p!(|6W1ddD2o@2)C=%=t2-RQV$Tgdd}o3jusJ!i(Q(8QROyhUiINhP`%n( z@QAorCX}lYVRwk+dukI0m@L9vVj?8a^~bxvxg#wb&TdRwFc9KDu?Q~Ipk=ZYmyosS zX@5R%KLzc-f+MhJFyop2(AiMSl%-0L^FS#DCmkvQsa3`(uDFVG`e2?l2?MN`o8$fU zKtE!De)|(=o(;8ed8jt8W)YnNtbQ$(l!`MDY&75UDm3yp2$W>&gh`<>h5(^ONJTW0 zLTPtF=Q^!3^6|K?SmP)31VRQci_hy=+i7T)f?U z(|FBg7NkG_6GRWr%UrJqDlhR9oXW*i8{a~1M8_Uuwha=QOO3a=<4%#sFQYu&cR~-! 
zfi|&dhJ+~LR5R7H_8e=CJVT8v6sLLQ3lm^Wg6oySD&txWgV=<#wNFD&JfMNK-aUP< z6NaGoOUhrCLXFa#TO6_4Q16wqQb-7d>bX#bHrwg5dtK~&d;RtCBi_;v)r0IDo7>#{ zyA=HHV&yg}QvS`^E)|sVL6pZJRrzcib>pz1nL{DTKbXTEw{1x!p(v0p9RSp@f5 zxK;ORGmjLUC{F!rG$ntLbiP$8F;m#HWBtMTdzhn&&an7aWmllo!T4_4b%fu@YSh~# zFhXMU&6=>Ioq%+?ICCPn;N-_$YCe(^<^`FVxj+DQteKf>{>WZI(?=YS;uUESw~0!L zMuebEiwl?^ng4ztpc=p4P2BMYWisnG{i*o)V|`iUrIPT;Zs*CNouVJ`;j9%? z5XBy@7%C`$^u54y8ez$ZiDWUq8GvQ*_l4V-KOK2@>9+1bVMhqiS%7kE1=JXLLijIu z;`Scza(08jys%gMxb3HNjm(?=5^Y{+03j>Yf+vi15pDLlUK(B$D?tz5$YbVSFlVmg zMB!hE{}7dDVzzCi^;6oac%|rREQporF&(;i%c|hpn>baK1J6+uO0a#xc^*Qw3^a)N zRM)m5I^D877W!0#3l(}J@uJY-K-v^mZAIgh8KQlm8KKoXy49~Ky@%2-fw}bHVFQT$ z4`lc*!rrQ+k>x?yN08EkilZ^q&mDbxZ5W=fk0HPm{hI}Ei_Cf&HKKcEQiS)&V$%P8wIVGfFz43&ey-q?xfR^O32*KN-X*>hJGw*#)3@SXLa6ek@#k9uty+Q&`b}J4 zgbuZz?u|JiF=t^!MXxgZRL?=H6L=wITkY*F&9v$S+tvrcwPIoQ_E*=g5Pol`(%A2q zH%YJ+I(eSor75!#@gpaJw5z_p*yW=T-}S~?hyowy;o-x%`bZTmQ_1nQ)&dmygGnid z8pgzGCacWHQ`1VjGY#zD_G^bF8^%Ag8D(23D>b7zI7ZT;s-^=p6EAJPfBnk=YPTV- zNBo@_d?;Kp=b?mj;o}?JVGMqtOw-jLWav!c25Xd-aEHC>|3Za(gt}p$EMD^L~^!QJ@|(THY(dXj3)odAjy+FNa@%ok6xZJkaS z|ER}Cyq(=D7IylRb;$9QvaN${NSj@bO-AhVSmWvH!K*Ru!k9>!odU0pdU#7S=_dQ$ z1iG3=J7tDRM=C;d=A6B4%no|YGm;N6Avupz0I$Smj}(BiUGZ|i_ zGcPv!d(jS?M5V~LY{=4i@veVCOS^;mgIA|I?uMFa2vo4^sJWG|JY26-mnAGtnt;{_ zXJ*iV*uCW5{+ZjJb(K4J%15~`byFCcqM2D+dx)NN-08|5P9pEF+!Vo*D!Bfpu0{mD+C5yaa6UDwPzHz4AgTPh?#`nxh{h^FaRv9jTQ!}Pe35q zM2{Nfi#HRNN}UwFi0RSyOq74)rA!4xYr*AEz_C@n)JZyFQ0s!DTy0I~C0n&RLIdaZ zai|ox#<%eeb6FKbeLRqqqO{|sK)q}8p{r2NMQpc)_>;{d+&s$|eVFvn5ck*_uOY}% zMyL1)C;7Bt%c1#us$X%gTTu=zFWO71!tCSOCb+--TiIS~>DcLz%Xt;~A?JZ}nNVMw zsp}!{BzsYHw;+YzmyL>|Gl!zF`KWLOEwaLG@#8P08{==fUa|l3PLwmDDS+$xSu3D< z{$m!e*?m3kC`f#JwL|V&NqMD@{d+1kv)^_uBsLJGKUl<5V^xO*$2Bs z8wB$ru_Zpmb8bA@m&59~ zaV71@^S9mlZ0b~=O_S15$LwyHG0|KV2QSE1E>q^PkgYUE)uyat_jd3|3uUOhT`!xC zf1Kf|@O?2#;`Zsp+Q$_2k)t^6WGpu(V&GO!TG^g4pA(o5aooF|sc!gZNF3!ne7sG? 
zBUC%ZNf(kO{hn}%PABd}pJAvfbH(>dR?(@(i}klDi#J`y=#qHD5rGf&yQX_iGqT|z z%h<*L*>(NQuKvSgkk&hwy@cOyK^53|Nw@SKP@Cezo1DW9eMAd-)P}i+5BSHq5FBZ2 zQXF|&3m(|V`sPHuJnrA-mfmnEzBb%*Zw(@Dy^qsSFZS$oY-B903Ar+NwpAKIeFO9ID(r3yfI>L1&9vSVBt@MX_^3 zK)!6kv-lApbTHlP`aPnG724UTijGIIxLWA3Tnf4yQG1U-U{g4-;Vy+e6uKF#HyUHS z%>;$3EuIe`vQ^qYRip=~sFd;V!HtfHQ>jFS4e;WXq0h>MyOWKGNL<^vG|wGpHpVk& zp4u$>is`9#sqM^PVW5^MXvJu+K^EdUFm9?i9J|}0trQM#sXCQWqid5umYD~J{#+E5 z{ximQ*(fH`**QwK(DPLB4ToZOG()+rQ!qG~g0t||nT2{A!aRP46sgjY7ci72U1l<{UxJO7qXi01F zXUzfqZHW#8{l$N)VsZLRz(>Cr3vQ)>D2hjs4Gh}JWX$3&1jXPvd#5K4C2gt<^nq%r zNsHV7GI^^eL7Gf_$Pg0`w?Ne3_A^hK-kQ^t_bBhA8N%Za(`f$%3JtCxdkYME9Bl~Z z>i}3vVUXi}uwGwLzj$HBqO=+SxqKsWGK?;1=TF}CXZ(7iHAG^j|G?d*gDa;Yo#L<{ zZP~gX9ii0{)30Mq==p`%zd|a^u-l)jxFhd`n46aN7;3N##7Xi_51Ps6rM+jdN{R`B zV!Z54o>xf$dwLn)v3@ai&I@vZM&)J`rN&05xkMQZLhX1n(Gp#@zIBEwIN`jAcq#ky zf_^)Xbx!6gY3>r@5u^qgKAuN11!`33s#iWkWc|_VocXeMQ$}MB#^=*lN*f+_m~uqw z@@uzo8K40uQRpw~4oQ+K_9-z9P|7O}O6z?Qm_d!bOHKlEamGzA9Gvo0-Z`0`&IANx zW2dagBb&YLj3Om(xrknifC3PyiLq!p()X%y2l$;2jnFoC!_mf`&v6Slc-S6d%nqnx z?+Om89PBQ3>(8^LH>WK_jNzf&E2t5VS5U^33C&F0)c&6n!dPZz1l{Tc=dxE*`9?Zq zK-o~)I?EagIV068GPIf`(Bhkb4Q(x`-zo z>Brx^s4<1yMRqQ502f|sbwfd3YqXH><;Q2}rDAwtoi9h-0Lig;rV;}|fK=`pJ?;!U zd(GEaDnO_5oUc@?$K3L{_zIga#&Z2^2}94not4$jK|CwoJq)#0pu^9Fz%D~hZ$YUJ zCNm#KcA|zt3*fr>!mISX6F5 zk$v)E;d|&wT~yNPwZ*+A;goOEW5`nUP)0Qg8YbblAJ)g8=?(g1i#c6!RQ}nhogC<8ID3Z_TXdPl zh5#g}VB01=&m&Q!NNi5eq}X~|04ZoNhaA@%a)GOuic(Wl?{6&kVQfxo6JD6<`L_4c z=_9z$M=rkF*m!6T!LN}f;U%r{c>tmF2H#l+{1P1Gj5q&*KnOl5c7CIx%HOu|oXU9+ zMa+gi#ar9l(5r_Kbk!_V6!Q*Ynh4uoqFMziNfw?hCVfjmQ17d*ASa<>+JRrbYyIE5*O8*iRH1o{)kh-LD!b!joS>D-6nzGH3=BMVNpb&!1j!ajRN?gL%OhFY=x#N~ zsl>h-B%IGUQmTGm1Mjud{8b^KA#x(tBMQNZq4q}4sG35EDrg4vSm{CjTe5=oMyu%w zk04;h>d;B3-!UQj&1Bbt77z+D#>_yP2Ee1>$0s5t5|>cz=T5kD9{|hp8!Z#s%R)*; zgJ1X3!;-4sCJuaZg33ow=}_*3f&&^VyCNNaIuhcued@wcH}Uo_3cl|=LGr);4dF;> zx#@G|*AGyzKD`iTEC>Y@ zx+I%-9|w>br})g9*T3JH6c_9Dd4w$xqJ%`?I_|Q3ComDD`skTHWKI-ol 
z*|&4~sKimK@?Yo85`Od#-JAzxbiYoVG;Dhgg)}SZ***?J_%;V^g&(A=%Sh^TmVZ#kcl*Ru~U`lot<6=k&A%&;A-F&muTZV7rxi6W4JI= z71H;3fI1Q~aqTPo_q6N07LDa26o0Y^kR;u`{*81*RTmG_^1@IUD&x(83daEeAxu4a#_E)7Ked=`u6GI+P`UMlcHK zCsV(_Joo)uL{$_$FI;;x_7^C5(p&mt-mx#|Y@I&ESafAQx(p*F_tgwb+{y3v`Zqs- z)aY4Lhy!4jH2M3lLN`jj@GSrdIyyDFkZF=hXEj42sfgizVxSRvuS02f zCcybvzRaw8h2XRhqZPnfS`+f#K`W9sL@4@kClc}nKc9y1$TU^H&%Q%EVF?FC>XHS* z)I#Hv<(aj*zys%FS(o2;OhVb`PAhO+zG%FYt>CvgfkS{|+ z)+PmO(T7AA4^(gN0DfxaGqV7{Lw*08;aykX{slnDsQM<_=Y(;aJyY@c9O%$@00d(Y zgDC_|Rpkrn%?%&<3wM^ep$b9DFo@4S&;z8fkIsD95bWmE9v3HcDHa#z7xe>Q6r;h> ztI3SUvJd<0-uJ+H%K|XNdm+aRX%RR)TCwtBp`Gu5^`kNB-h2Q>A1$^OjYq?xSOFY( zib80DcoE~q8OX~$20<{NrA>c2B**zmaQhqyyJ73Hp(7sb`4zmEk_xs9{MrZ1=#o~= zbSk3i0uq&DTiMToy|*3z4aW{-CoI1=gdT}{z%&OY!oPtZ^JqFMfConY2t7QlHw5=< z02E2?p|v=a-1I~&p49LeaKWYAUFbjSRX7jjh-cDUgC#Q?H@YR$q0XEoWQNz|E>oRW zeIF)bYC`#OUAo?tQp{w@A6wd>ox~(g~_077uMWT&%E{-|>&MlAo z&|O;hnM#4vz(1j$a^8<~5ll-4%*7x8>eBEul69fI1zJFO7yn>$0%cQ&O;L97Wfs?rbvql-QG$jbuN7iugdP>Zd{6mgSHd9-++}Wp7ja-4Rxcc?LPXO_21553T2C7 zSUZ;9Sx9Ld59peGeEv|bS-7`IwJ5v{wBwitl|mLAqzbs)dGFttJLVTo5j8h`afESN z;lT93wg{U&@8fM@q$lrUyN0U^jo)*`y9uq4z4@$En$&v&lyEDFA!>+zKW+Sh9xDoPYKpbc2?r5bR8OK_%EOrvWd%oQk`Wk`AsXcEn`hj_%Cy$IYb z!KE>^CZZ}3Vr-B}>mL|Wprjqm&fCAg0v)X8C z?@iBxSRD>w{Dic(8Ja+4z-BWeVDpoLKOF-D8{|0yiMBuMpf1BJN{LP|OtMk8b3f~` zhe_^70Yk(=xfV1=j(cjhcL)afa8GmJZh~4r0!&A_+yLmK=-vR{bBqH@vz(ksp8FwC zztC4}q$*L`JEIHpTQjLIa{=h+%h|8Tq>FDMfK(E`2bU;qYM2f+Bz zCz(RtDR)8~!gp%9gq~2umkVhvQ(jTsN!(}*z5vzl6$Pxr8YFSHBH#s`$?bsf;cLU2 z=}VB4rvjR<140dDK}~57SCCRt3>~5-6!txHMo1&hKvuUH&(zTXRJoY;_NwWI1W4Gd zAv$s(*uoReVeJ9{VLJfDtxfMi(S|B4`M{6u{kaiXpAgf*Fc@s^;A~9_7ka-|Xs&5< za||xu1tQ}9qU$B_ujAa0-M75E%hjQ!%>9><`lbi&k{$@x>b>46?ge9Yd>zpUc1N!N z0H@IQPDmQfV0OoA7??8XC!S0BM+A7bBA0)2J&gb+&`c{LfMgF9#106g25{Q3WCdt0 z&H*~8b=9=)(FLb87Ba5*h<;9m8`cd{+nWbFvIdXKM0L~NL-UhRnk&}0i(v57y4t|L zv0o$)kdT%z)?}6Gnw@eG7)BHY2ZCoKkB zlB}Rnl?{^*RodB>T6Q7NDaY9j0t~U!=9wkRp1fIBEc$@N6%zk|!kesAr7hIW7of}` zxZ40VE0NYAVcT}*)l(J6jyg80A!<}#)vH#3P`4lRWAoWQ&r 
zgGUoF!eCGlz{5;62f~Kr^D5V%E}iop7^=%5#mc4E zK?zKt`NRn%l^Dzr+6p+xqlh^gOhqbWgpjNP#|&nn_R!oa{M9dg{X=U5wPJ3KCMQb>6Tm-BP45Qu9a=xZWi zb8+d~CPBusQKc2?9xiNQc27_j4!n_AtAqpi-K{Q*hx&W>DKK}@%N$|kJXu8k}0MFG`adYi}x_IB@18C_gE`!<2ZK)Rs z*`^0Yig_3(zCgDn6jLqw%Y*#I;kM{s`65+}+}?(#DaLJooV?y zRc5*q*DeGN)92;44QlQx?W$&VTANB9rxt%KE@_^8fALgRhHCcWP*E1RipwRH3jJUw0jV(^pl0(||k*_Sy9YLAmnI-GdQWUbA0 zbirLu^hu=1iH2cQ&U0Gvyn z5AlX<^ilKD`l8R;E>+5&renz0BeKl15V>Ha4&{vfr3A6<|MbU&?ovm3_Ww&O_tO&> zT0*aD`VRiGp#D}5{i}c1A3NxEQ}!|>%|ASGp{J>@_^$ug#raoHT<}6PlKAEioAWnXqyn}(dY|8GU z{KJnd2};a6_u(1)zg&P*RD@T5YZUzJ3I1EO(+GLCBk4oIPWwxT>t80rUv}4@e}_J0 z02rYZCha`>1snihbc&Dgw}Q-Xv*`D){`FhYCm;_fwKKT9AEm7S8lnGkq|g~C^ktKa z|A_1@R)s-(#Xk)!f3vgfZx-vB4|pyv@#sG?ccF2O1Mj^QF8$YheSt;5)PUhPqtYK{ zrEcVS`48!TI9+adk%LUK8E_!SKThi}&!zUWfzkZ9JnA~WDZfVS(16vUEZEgpvt6?3 zxvPRUq3cPc!r(6#20Dv8@b%@fv2_9LrPI%@5H9@n96~SndB)tJb@kepC*K5~>5;d! z`~5Ug^1Dg++gT|l1C`Ii~PeuB{XueVDvWHpG~6tELe^R@f?nE$do|2gyi znADlzKBD8{1Tn(TDNrVi-v9H3zd!oNq>h0wRO@%LZ7+a5Ni&&$lH5`0)7!Sre>Wc8 zV#v+vQ_X`)dD(*K|KPumj)3m1sWbeY3vQo#o1g&f-=+Y!Bl*fmk_*{;3$zqI+oMqJ z)1RaM@#r&Yw8YONL6-v~y!OF^Y)SAO`XX%l|HC0118Ejv(glC_Ug9HM_)(Pn&-Z{c zIH5dN8W(;W8Mh$X{%>RaPY&UiX>&0v#sgQ?>;CZuT`|IAe;dhPsu0DHC%M6Q?xo+S ziQg%r*8gY}=yzzfEH|Fb-+TI?d_uXqyCFrV!;p}cK|xQ9o&RXKuMCcmj*CAtM8?dZ z<33@BVOq$iC~9Ap|bw|^^Wz9t}k6(=l158+OBP+=M|~FNu z?!xlvvxxkw|2oY7^giW`@1(hbzn%98=$fcDJ;#Oqa-h;M+5hE4{x*=jL~uF>_CDv@ zcZT{d2>j!Ef-C&rFDV#%DLag&yDWpsNSp5G{1;K*JzND-J_j(Dao3`RZSmB%5^KtNec)=dX+0 z@Arb=0tfr5@rEZdFZ3@buAhmpOznRgNsle5y4-tPdD+XxM9LYfr1N2a4)K?7{BN_% z<}rLv8!P~CWaeM|Wf;)?!An?n{(l+?544hT+qQB4INlz8BG+sG;iBq>A|XSo@h+xH2bKy#9hYi2+WTnm4@vH79K#Ix-8MxYl5@`v4*!gpi7z1EQ z!62i(Bb1VxWQrr2e-F67@tpuX;)!~|A7%^u1&R*#cSitw2!l)L=SA?J@qh_1~kV)MjMclr8rhaK~_0~hk6LGcupOb1yEhJ`*Q*H6Q}LTL?`;?Rq*m;;C@{Q zeyrn4a29}pQM~=c`R%;id(LyS3-;GR8%?N)Qx3{$SMI1>K7+Wa2(#Uk271$EprcF1 zBdbc`rE)L`u!JoAPJ`sXyt+BRiBp%D=z{b_fP4Ft%xpr|F`F!b-`1-~e!gMQ>glJ- z;p=V>^(H62uUhvU_v>mFf>uX+gO>=fkJm6m_EX1fE`7WVZni4xza^Ri# z0WcYtfGSsD!5#3ZN*J`$O`un%NAuumQ4l 
z!>MUdaO+`s_D)P3RNt79X8|6UyiqBb;SK{7#Okx(TDewl=Z8NvkIc^R1XVbudyWpmoBT%ZGYwx_|k1)7#58zHQ5Xl+&B& zrWqGc+AVgpCB0t~M~@cMH-H?CKy+ltOGDyWQQk>dgyuM8R}I;OQOeF|G{43d^1_5t zY5{%F91EB)m>MJ2pDr2fn)E^%aP3shU@eAK@G447IV&rqJ?N{#V3F^*QPS`@`uxx&l7n< z_e;=ntkFO2ybM)%F5_QTZ{3q%kGULH&<2K-%RtyjiUFDf;-o^eYlcK^ssv1Y3aUH>t~p-?566l{b^Z%J}&{vJJ| z-Dq^5|1aBGnFiQAhWCt(T=OB~LW6i>$I_k-Z034sM$o~goVat64bVQFIAbx;886QT0RrQC|N{18NQ>;s{j2M&_p)LzE}DF z!&Z8+RCluSc=V%x+q^+wy1@G%-uGvQ;Ri+0g?n0;%@g6C09u2b8HPbwKYq;}sEgZN zCF!_eoeygPK5Glg5N$H54Dz53?vNI7>A8`(U!FZ6z#iIz?J5P5g?Y4?T$WOdrS-dU z0g`V(Hp0~wE1)QCJNsOJ5oE&# z5fT?azPi&j1r&=yIqowEza(~1uj0exVn&kHv|o83DPwHq8J8z(v&;o3p}{1yVC=#h?|r|`fC4e!h;FZ&*YbA^IXA3 z$Bw!JU_j*99RL+1A0gpa&NU#WKHKug@*HplCn&xx_wfSLi7p4eS-L4;? zLF8V9@ufK42a@P3Nr!psYkGf=oBrG*ehH(lN`pCi)tw}FaFz=$VD+)>GSX4$qQs^JV+aPvh$CIIDtA~pnh3Ov7BRt%tm5vbMT2sZe{7l^0|_)#Bg0CRm8;GzY? zhxE338;#G5Np=#($Z3X z*-nw=HOcbNz=~F)K&=L_{BDEtebZnisN>!uL*acPU9^*Y5-{zsgZP|)*w6s+^6r5M zWlCPZcUi#zXLE+=kvmSsU|0J9X$B#E-$y`&35v}Iwc;*r zc;O_~%aA-VlH(7m^QJJ-A2H7TO97Zf3wi?>CNH310;G}0_>TW zAZX%3=0WVW%OZ-IUXW*u@faXJV1O@I0pPc)-UH%6%H0rx=$B+g=7BZaFwtwUEBU2wc?&neOyMd{VIG5ZE|IArV%x z!6wT<%Iq7U-#pD~0f4y(ApmVMu8jemt38ObY9MO(Z2-0p10PiLQHxWD@047@WIKf< zy!PKiVBHH~ag2TVq%F@7hT4J1KEVEe(>=L^&Hz&f`CbJ;l%DUv{)>t1349tukP3`3 zfd5lZP=bzCjMJF9*~ z_y7fUK&5&5D{{Y4Dm(iV1b@^5Py&!M&0_cW>~q+8KKZ03=K z(0kSDQV>hEOol$ZAbS$Y$xeitmBlBVvX0>}kOovTJ7B3CNw06H6*v)IJZ(b`3#r-@ zyPteu^z2q&6uvMdmt3zMgx7YEb-1l{P(|r`6@A~_Ij{|esyr=36|f^OT^L@>(GHH4 zX%;p=(JWkF1J+()Duz%gmxzr>4*p@{=o|Akz2OBu-z|9-e++MderIAbbr4rf!_GX^n*qSE zxaV9*7tvUR)gd+SVvL)ToWy2l7-5Wg0K{&)xU4G^|m$GGuu0J==Jw)xd$)gtI8UB=ELd@~Tm{3MUiuk4& ziFsXdqd83pHZhW3%Kbv)eHU|1;=f0C(0y`=x<~TYaO5fq+GoG0y$G7)ti2BmbL{K@ zGk5j0QEYLTUk`jy3Q9vEE$tw~B)hURN}qEGn=iBwjZ;#p#|DK^D$p6lp9Mbf6ellu zw5w;Zs9<>_pU)3sEFv(*{yooGW`ZnxqNwK~N0f5F9T)*jI|T2uko21Nt>yk_53z1v z0r}!ts~i4mKr|iV1dJG^U@)yHPhx<)H8zC@sztbgdOq6-;D?KFcAnP?G5{HZ0Q_YLfUr^?T%j-MY>4sRQL8sYp2aU--g-U% zrt)})2D~mY3&`WhER5dbgsWN;?m&v10AwfJoNNdXXqk*B&Ty#;BU*WDUGxFlDWXUM 
zSgm~_GJ&tLAt^Yo)_*d_w338IqiAfa2XKh>4IQJ)kWHyNRnQAjQ}M&5{&yf!^@? z1L^Pzh$w>TPz$+~Yg`b9-{-;Q1@2j`2B^!_?Y$6RHR7u#oK<_VTBiAsIWVWgPnk&lyqP6 z`iNR@Ne=gYsdC>Gkhe~WiMO!041w^B71s|rZ@`aG_i;oD>WdnsKz>}Sie8UPo0_>C zfv}8@yzXXAbvUuIMz|eNd7i|R2T)rB>TU$F22?nJG;Cl|55t@aF|!82PMV4BH&RWd zkR?1Ss8v)ZR?&!)u;31ejC+Ilc}#Z?mk$rRsUK^n$__v=eTeZDz^+dQ6@&1a4?q~F zlo2#f{B8~`5!R4W2{NeL4wKxO1!m?qZFxfBA`X+BTt(Avfcr5FNJ~I(=rSQ;Z2&^(^7{{QCQZQz-Y*}A?*;N6LFzTD@QHx(r;a) zIcq$v>3&IYJ8`}?zwLp)RB@q9v4;iT(VuZ%pac$D@S?i$W-Xj-xUIk>pym{5!ubXE zR=T4Iu-h%$YpIB3JRO61rswCdPz&tsvH^nQy_-HnzbIUhrO_aVG_+3%>pbj(n9@L z8{c9)l=-jl3$22yhvR6hE9N@zBpJDs=2%a98yIsiAoa3#gi71`>oyijSo;glM7CI%3~=lznCvEh;4un+#2L@rnZPmV;93%F1?Mv zg_=L|dUsgcA1i|p6nYA+C|!AIMO|A(J191E?w0C2`Flk&q2oqI$q4h+!7R+kO2fCx z4_yKpj{TsITB9xeKE5C@gF{RK;fG@9sH&~fy?DByHE;qY&QSwzaM?C&wGgf?%i|{&p4U}=u#2E>bpWb20{9F(cK=8Ezw7W1jAV=yK zNi(VT#3QOW>=jyRo9UmCTd>{D$sL;yq0WBu5zC(=~tlqJCiW;jUqBix;iqeBp}cR_w? zPT#X2InNLc3nu{)%$wyaDJ?Gsh&UjwZM{*i-{lyg@*&PoF(N46r7}!{GIy|VOH@l9 zGH$CE#Sdx|Lgk79Gc<3=7b44ELlpLF1hY zY?&Q!pJ=~Hk|{D7LuAcJx+sSl<{Bq$;TX|0mkA)&L9Q;w3?4oe29D?I)dX#mo?gli zD9hU|{M>+y(@+K>OwM+0l$x-3DJPxa-Mf=dQ)14Keh2FE79;DOZKpor@{)!yR_{Y! 
zw)Zu_$a&AK(ukT66)pa~B0jiNbt{}lr(muF>{N{JMqh~l?Vd&Wb!BdM2Gej;`D6L^ zqv=J303T437q~6)X{d&d!<9vMlI>f`H9?mP7-awrnCGMW{dcxjHQc^Wc}ZFDSCbUk z@s3j0HN4Y_V*Uhwh&K)}Rcq&|9Sv}#)O6+`ZvF`6SQtAc_-u>Ip=yefx4hh2?Zauq zM^;?&-k?S!IvWfb=i)!jwmgViZD7HTL#gF=rA2uh}k2%bzJ2^kzIcXf~x>H!WC`cL@DA%@yFN>|;4(r;#bLB^AU+A_2w2UcZUZ|vS9w>E<8h}5 zwy5MC4gtkc|2ISa&##=^1hc%gZMeirM~UGTXfVXE0T0X%d58EU>MhhQKb+2eE|7k- z;@nb$^9>BS`LO@}o0tBm1px5`-_<>`zBbcWLfd8hN#2;4A{i8Yr9Fmgg(s&8{nAX~}H69knNGx-T&Jc${ zUdz<0L80#6N;jFn<(wVTRffx(=+@b=8Cg50in2|mh=s&{5WV1yfRj4D(778)-WPbF zxTc<90wAD2l3!}SAZlHcUw;4bM6Nvq7HrOC4POo+AVZ-BIWCZj_z|ND=%qEUfHkCQ zV`)T6>jr9T0B2upen^SlqrKW^4qkX|KegUA|9QE1fG5hJO0)|#&(5G&>;ERfzlwuA zn_QmlemRg{^C7vF-;I$K8MBd@ddrStzD%SnqK$pgT;W#txXZOUS%NS(>J;g{56vdx zy3vDPo&p0iHx?C@1+Z*sV7gM|U-9(r;_Lz`fGx8vUcul@fC#4qs&7^l4(w;K_CM`PTuom!QBPDD*^!bYbj>I01`%XULGhcuiSMrHGG z1osJ*1p#EQsoFHJ23c1(US~wOrGZeqm7|>MlX4tlwx!)OM+a$i!oYPN}w&VV(KyKAmO` zyk^f)Ie)?{)xGhwZDPn7IeuoG0am)6u`$V8dX3KI1Js(0q{Y7Ak{Eqs+9Mxzof^^Y zAZGiU@STb@w%{3og_+=qW&w<}H|ZRjJ&IU=^nj4Ba4MK1E>JL&Up}RHn`H~wz+ZLE z*G?o#*re`q`fibM4-XF>P?1mZID$xK3)%&FzF0Hm*KDA4{fb{9h85Ns(?78?PV@Zs z)-cW!k$|0$(n{~{PSNyOcalG`gkE8dlnU_EaG{0CwS8+}`3QILlD9{FIKd3RcewGw zhw*-6;k8#W+*1r^ZWqkl8|+d?>EO6)!By@@esGt(`WH+yj5}lcZxgE$kX7Q7Bg}0c zf&S6FLi0t^38h1nK@hrCPkg5y!G~9m8j;7hN&rY*p8vLmb)M-P@1iq%4M+o2c(LO` zix*H7s|c|`UU8qickSMUxEIlB%4@_uSBHQ)CbFc8eShLE$5%p=A0k~qYcFYuNJyOn zNyibhaz{ugOSxBxNA8=?H0Io@H8gBjCs+*BFnESX`zP>xz6d`CdAb(x@6em~iNXjM zL6aJ$qay*B3u>Len@x~NTCop9m^6z5bK>^E$`TZl42{rScw>rY`Mfo*ULIx~jK>GW!n zfm+NEq{|hVA?HJKD3QHwN0vK0wlE1{ZRD`Zntnm=Tv%fSK*j&Fq&4zg2ZlF@Z896GfS@J_QlJIf$;bCkgIm zfjz-te32!SNm&TapuQLh0b=<2Ksq-1=4~K~DeJjPktrP@ZybE;K*76_1myQFaRX;i z{&-1)-pPE^qhCR{#y08?(IjuqWb%vGW3EqL6&_8UOS&aI#nk^U(8(K=sK6do@v%;k z7DiNF;EGKt3Ja)!8Eowz`HI)3;q93PmTI5Wr{*5K`V8-R0dD=&ipxxK)(|?;C|!Ov z+X3I1VtTLqj?(CC@B|d}qK*%|^T8J*BPDxF21MT9V7{_WNhIn^_ zL5=OnQV3JM^d4$~iyE3)RL5G5xT)MkTve*8XSrX;-&R_$?$MvR3qmoJuhdjV3fReI zxwO)TxoS^mVNA)4nqNQMJ#)WLl+p)y1{VO%HS(Y#jB`+Z-*Hx&2sL^S{O@NByR|b0 
z;cMZ9@>E|z{KXTd97;x|d=BfS?r-+U>8;yJFv&)ax3}~Xx`Vm=NZp;7g!|8PCZ^*j zMn|*E$gJpRyFd-}CC847cj~QVg8jsMQbw6ULJ^s*)K05%bj?lGNkW-zfTA z0zUfhlu&dZ ztuDo{sd!#2$iD`5-sYOnanw%w#7H89zXKe!W$+w8S#N}l!PltD+Ztq3*n*OUDm}#r zCdC`gCOCPK_kf&{x%fH@M8V?NsT(H2p*6_LAEk32ZROAH@-I8lI|zjs2^M;H{?P4> zM?c8uYPfwHaHAYkZ#DZM(TIU}y!7W^;qB-4zJ^QUx$^}vbt3>oi+E1_uzZ0_z8ax2 zG5Btsh}C_ZkgUn9(7;ydyuA5{0nr06E6)JaQjC+cch@=_>Z4ihu5)uv_5oIyEawAbtAc&JE?YBK${FBnHF=9&k%E{oTc=o-NH74__{x7* zE9Nq)KoeM`WC|PZZ-Y!ovhi%V66$-unmc6ytge$Rrlc7TDeI77bvHRVnRgiW4}tW! z2!F^qq>!S0^$<<1YWlr_VGN$NFFFc0eHb9v;e?^HJUI3;ft!uQfRSuNdlG?h;f?aq z6`r|M@XBWtz}MdTU!Is?a>CAjN(XaCs=aDJtcArw1IE5p`?lS_!wM4zDVsSMD4VLY zXVzMI1h`Gq>=3muZ13xIgWhgN|8)`*RrU^fd|`V@CR52wf3*m-W@MZ?LTIBp?Mh|s9+&jB)1y9&dH&RxXdhEv^=nKYNMB6KZVYIl-OpjmqwPA2B3=q z5bVFlz*SELi@t3NwnAD%O_)V72fUo)KA$u$qfrnmrDC}_p$i#pPe`-qg6az127SQH zYA}?Vk+!#(o>(AQ7jFk}?RbJh+7dvKBXKW!@3j-ksC?F2?t zbT|U=onOfFFWH{N4Gri-C9iISa^yrfRk^c@EA71}hoGdb(m_9NZV4c=`+mTKVF-YY z={#XcX_Du}zT(dxzqkH$7Vap2A?b{<5~BtDHno|!1N*ds=h7si445$$b{YAo+D_pf zJ`5PLeHl(LY&n_M;n`d1Vg+$0#Jn30zVte)R`}1u?-csqWxw4{7qw9U@jY$>1ltH$ za8TeP`x|nedUx=3Eq8g}!*|5GQu&g38nagX0JO^OjG*3999@Aga8%v4PRXusj`h7A z+av8+{#FJnVu0DFffgSNx$d?V>loyP;XxM7#v065sP}Gjc>COsUb!yKgVzvd7SHTJ zsktgj(a38O+cMil}>dTr*kiG3AKQf_GaL)T2&Bh+n5}@=gk)7K+o{( zt7WOuFXut`b}73Lo@|=Qr%$FHo{&CS^BC(HN%7nQ6YLy6*Y?zSVR3Ip`V#9*Tox+! 
zkJ9At(U~p_VGi==Htfd@@<=e_G1@faEuXpDB`cZQ4fK<3i$k3%R4x=O+g5#kVrU2| z4Jq#TE+KgVyJxS!9rZwZQ9h)VdaU%;d*M$4bP7fWK>-9vq@k@=NSHsyRoRpF-myJ} zy~fFTV84R(+F2LlQ#XHi2l(c+wrMAu`AQ0X3naxS1P)NwBSdVWuERk3uap~q8ynU% zXpRi1h>QG2i>|#tB&Piq^Cr{5tjrpiS4a?@oVLKJ5am|vAE_fS7-)`SbMPmj`krMj zHDi`TO*M#cdI2JtAoY;D2$Oj;r_`b}!4vP*_M!zpQK{W*sj$>CTOt!8)fYE&=EC~BakbMKyoAAiRo~fzaT3HWYLCPKKn!T3dHFfnST3l>UPl7 zJ#g+%DPIt#sviXAjm>!RM4gVgUY92a{iH7=hk1MnCgF(ddwObD z&}t#9>GKaf<&u z)$Ip5&UzLm=Z9xIrZu^ro=Xg?y2v_n(X^0|t)_j=xWR>PZ0TVeX#+KvghOq@4)h%F z`yo)=njAYi^du=BS-3jg%+r_Y0-?FbP>KjE*bN^&FZMvSwX8K_wGijn?dFx z6_Gr~-a?LJ{#jJ6izw1CLyfyFlg7XSKdJt~?Oo_0A~R5tBYO!%4twXQq7A#H^_x6r(=r7PaBV0<;#{o44@@@8uD;l*)VR(6oj_L z4Caq%@?&L+`720CmGx#TaGANe!`z|640k4^odbKN3@fY%Q}0PnvdmHxlvkemDtEJo zJeb-Wwy3>|C9~Z~#7_91&nyhD~ztT>xwL96D-?prleRHM+ zy*w&=g^w_;FO$E`{lfH2P-)p~K(d^C?zz7%w=7gKe&1U+D0z;*pse>LJ|+wvWOa7u zWqPz@?@BMsV^2CD-_|ZNM+U_ramX>+Ds>3z{N_QT{URg^YzDgfqJW0<@s82ZvNd4O zbmyL(sK;zk;B21%!49~ zX`(F2{oAOX0hx_APe`X7Sk_p(&9^V*;Bum#Y9^+JRfpvN03%k;zfRzDH@VKL!rj(> z9@9En3gd$YLF$}tY6DUbS` zoLM;sX9tDqz%lu)b8R@$(PZ}vITVkUpFS>xfADw^Ad_h2`NmV;F6JHp5sJ)GSG!A& z+1BqFA2|n_isjZt(d)&uIb%JorV|5~$w&RSBPS-eP}B&TbGAdGQ&rVlG$wsHcdX7` zI!D_!vrGQwtnQUI7i;N;;O>C#@a3)15-i)NUG;XtJB8g2#PF|XU*5-`GWM78U7i8P zqbaM-6!m9kDJR|j!53RX=;D44sD)7!s{$mdt zy&EO=GNR~Y3v7x)7}T=&8YJCIQrK3u3C~_RjnqZVRD-%u^Q1xod`WIBH>x&?GxKZ#x)d~S?T(qa@fb&})a=85wN*V~ z{P7zL4JtCP5YAjp#bQi!z&PEldzicAfr=p)8Ra!R=nO20!g4A1CjjKy+E$lfHc<}x zLk8XC5v*G$m}Nk&8rG_0$exuR2vxbK>;|ZvxS0|y|7hYmN(<%((wSEeVd00yYs?dk zyPw~7PfEE6!p^mA0?g)pmUqZ)>bWhaz)?7C{5~<$SH{ZT270}2u+vy!@<{tQSjJtB zHO#!AfOYtQ3o*z(;Cc!9MR@_a zd{ z^uVUOM$M$D_)PC$(-gha4)MoK`BNu<%SG}6eDnSm(Lp?cFOV?oi+p{XRL04|==R%kvdP z8!{DI3Ez>Eabsm5`fCyXb8ufFgf8NZ8z+w#b*&zeL)PrMK0c!%NB3lP-}XpzoQF4N z{)X5(6dEi0VJ@JvYhkJhlzzy;pnAvchon6YF&5{^ns(T{I=avHX-P0-6>Ze{=`g;OrDVKtXl-zlF41dD z33t~CI6H${*9(_EYW~fktZrwvO{0d_Ot(&kI8h_HWb0@`(|KJbf)Lih6We>?Jj#RH zy(my>4w5`RtAPsVylr8>x+W@&;ofXWNj|4pz2>6I&|usW7yn3e8}ou&_QemKki#&#@1Qjh4)_N=_}z=iqFy#_KwNbjDrW1pH*`|SXUmwmxbul%>8 
zmavR>Le5c_aBexRYFG_7wIem0D(k6-*?ZdQbfiVh@jTocP&XI~g|mZmAuc3?8I|$U zZ@Pq{27`B4q}}5p1c4Jx zTMcv4g)L_#*|HR8g4ZqrQV9JYandh{mN}gN`45o_nio|6#64QC@DL5{(5_yo%xmCO z-YP$9$#s6b!;istX?g2$x{#J(KVNkW35*-1SCku)!;Cu4ZZMgIdc;e*4O-kQyEgl+ z@N;E!`HT8Bwa8jk2Qq4P1L2bU10umA7(cGU7!obRY~PaxK6`;Z+>7d{8`HzZxiK5I zg_af@eogz%18nUw7{m2;rHt7q;({*xH6+dslc}UX{`5saa!tqWOQ@t0^t=(Rf-y3T zAAhywyBqczgB6xxKl(Jy1sTi%_MUoxJ*Gl218tl`*?h)`lWRrc6#5L~u5na%oKU6X zE(zzQF_Ak{v}j!>mBh@>pnh2X4Xk(h+uL!4)Xr`YmvxAJr7;Fu_&lh=`TT|iNl7R7 z+R;o&lmmxZjvR8ls4;9r9<#N%-xV?6TKO???~y;-WBwV739`j&#_@}^?`yfNZf|i+ zf&#z>`&NA-dHB5ov&rvKfO0mt+4eDjh{1o9tNk)@z(9z`-naSX0~ZEP2@?Xd10`iy zV6(LIR&de;r0n5#q?NOP+Z!&^eGRU6t92Z}UZ={juGXV6*5u{DiThI$N>O%}xlthG zK0|u=k@z<>R>{_@kfmjbv zXqcl~gDguNr5lR7!-t@*l0jBgf}VY;W(BV~j@B=+l|uN9Xkmog8TmqEd!(@rkbC-b zD8l(B*V;{i!I784mstsOLay)gItnlhQrY!0GMVfKe>6e<<;QOPs4wicq$R}!!NmMT z)+ogyIO`MZTsCYcfF*CCxQip0%{%Z~0d5%a^#(*&ku)eXH-BTXIz(qr?WQI#F;|js zaGJe3NpI36VR8JHH{5xMl4?~E195Dw2Xb=vZh0dO5gdFUEi|7|)V~#FL-y2*?t$){ zMP~s|Jt#Jp+mGIw=`KEET3hPlbz9Bgre6LwS5q$^rs42$h#v(AYR8h|Zv#%DEmH&&=6r%yc1K{9CZ|*VOm4gwS+Mrzs4w#6>NkHA%PU} zmImy!yxlYVG=~q((c4rw!;Vd?q3x9uK^)T*4bu*cMD7R@&Ef5HJW=dT911NKq=YSvjJUk-A1d8Y}&BLSXR_lsvR>lA&E*R6AG zgecW&AHY8BY>7*OkstHt2Jsb(?VCWL3iUb~dUWy0aJiI%;Y_p^5dd4qKDWUc+_@1bD>3;dDkj(q)S<#-ajb6t1)OwEj!oNU2%QHX_lt^jOw_CkM{ZQ z*Q`Y++?uLrjUMslaFLVcpmO`9K6E9>;8wDa`-GQ*4kvBf6T_EJ9iL4e0gVT zeLkF4gtcM{WgDF_>{~kjsA!IcMyaZFZ9;!RfCL?304IuUfg-R zB~9GZlx({gl$HHeX879*;$Iz|VqNqawQvvRrkCiIxZ?bs3Uz$fyVB?=VYB#%8`4H- z>}-Pku^gAxM~kCPX&webr*($cF`8^=@4BX>?2=fV{eEhkYLNTdZ4-iVlZicwvleGR zi1^Q~l#ZR<^L#v#9t_s~uA#ek6Q1bHZ(`!5-k0ETo86V?aNL6T0NPfVF8MUMJAq59 z){9=S5nHkMC$y_OK%_787dpz#W{5M;%$AmymPUZ%LM;T!jNXpGRxDdNt%DA?qR7LC zsIsDp5bC3TRJa0o2%d8$-Y=PL#1PfEV8nDw0qzjvt*$n%>~T=X8Mxfi$;pR%Rk3!W zKmGYV{Hy|VeJO^hF6@Y#f~~rd$E$N%Eu8$gvw#6m0$i{I-WeWn`*AFdm$aQX<{>OI zp+Jr*trEZjjd6SDs8AIgES4!|18<`mI&a&q^C%~wq|No7g06bXmMoY7^WAEI;q=4= zSTKez%B{OJpHD40kcXiocYHrnHX`~`H=cTuZEl1vuv~j}Te6W5^E7wu#=crem>Gcq 
z>N^&3^7Fx@@)6rNB*6vu@B4y?P983%go>fd^}V6Z{#io1PyXhqkt^m3OAu2o+0CrX z3FInLJG|z{k)0(p1H^bbrnpjS|ECBx_Jn8cajR?`4yv`TRGE)IckR$-X57`GB-yAf z<}sIQ##}6hEGd7?UOrG*icKk3O3T|| z71^$X2Rao?PnD#*2AjdAm(6Cp{A3B#sc_3;9V2i@?3Hx0D*vJ|o;<*D8SXV^Hs~uE zOph@^K8t#kb%S(;=Ez{dnxrG-qE~~P)}(EgoIXP`;vGB_+qaB+6@z+=yeE|Q;;p~i zjhf8~;rC*|gnKQ}90Rw_<&j#L9qv3U6yY8tXHBKtEo?Wud+g1Tm6UY=2R3&fubyw% z+y$`gx%M!*xFdbXj(Z^P#}i0HTFWelcj@gPZ@f22#c^fy|2bEGS+9TNx2T~fWz-bs zlK5()hOk^NhPq^Z6KWa$u|lHItO-obS(&O5L9!%&}D z(X21|Lz9oRU)QkY5)U|O-OHk35z~EA+a$;O98t^;uf<6_vP}Ns>{aCh-7hKa;zqA- zFBK$!qI1f(eJ*QZ&U4Sa;65RmF1jKGlRP;p<6nb=m)x8}9%S;bnM_Q^bZJ zbf)wmDHbox9>&@3wmbw1Mhh1^wz_dhjZtbw-oWnG#LwreNNmV&h})2!BYF&K^TI>= zD4$g=j3_pdW*z7bojWyWDjGp)dl5I$gi^k`L3om7G)h{%ZNE911Dv*T4Moz4s^P&a3aV~ScWbF??IL}U-J#`7&e8yTVzRVHIns99 z;M~VdqvTSv_}hp`mCNO>bF7y~O^`+IQ=hg!GPb*+TC**~VkP3=r&_BWw-hFN zC7mj$!)_hklP=S1oLb{ILt36g)>m>CZ%S!DmKUU{!)8u+^4aoXD({-eenin^a4#jL zJh_zOkuJcI7)$T&+^UsTzL*4@nw$3S;^z}j4W_$Kl0rrX;=R1FbM-N8>NdfeFI?8R zeJf-OJbKTHYfbH8E@wN)yz#lwuJXpg`Nj9C)LqDz6bNnVJ;>7bJwPXBnsA!^Ju1RVmo|EJ9R8IaTla@*RAK|djmH#ZAfPLA{4&S#?P7^WS!8k29&|? 
z;W8Af$Lz@azR4x7ds07fIbieIdndElt+Cm*qb_3;QwwD24sy)1w^%Z*eT;fNt@kS& z_kL8L3i0Y%K5ELB`Om^OKp54{>M@Nl0)l2Tp6;6xB9Vkk@$h zVvRh~T%(D)Q16PeaY~=Z`9Uw_1gab;}Tgitz}q-2n$h`DP_(~hKLGFB(szuBxJ};#>_)9MJduUWGERH zRx(RTq0BpHLVd_Et~mV=Jww4oGj zht=-dI0(iZXMfguV%02xB0ZgOWA#~Z9X>ufinqYL`l8c)?{4<0x=QVME(aHyJE%z4 zc|>6U628!-8^5wV3D$rO=ZUW{;Z167nY=aJXKv|3=N42W*p;|`O~oMjUb_j_z2cMF zp_I4%GFaiqOdI5b)LUVa-ys&_uK`;aqe2I<2t$tgT_lxd|52h_`1TsE85cm962us0 zMmD}Vl^rn(l6(2MgkN|vZmGx}XFCRRo~h;KKE$<(zsu|W&NzMc)A@&IXk$C-*&a0M zd_P|NkuuHIn$rSBQU#dPRDZ5_v@~bre#)nb z$s2B;seFH{`p(vd)=R`Ue*>dQC6Y3Pb`iXzovDHj?a%X@um!8cPC zB_A(jC#~|JKGo6ubVmEG;zD^N6(9dN;yR;u@lSq0&6(N5Rj|qJ8gIDO6!AdvwbTL3 z7SX^pxrA!bru#NXM|F-Bh^w#d#Gdh~b{o6(qBJY_!uLYjQD9V1ukn4;44CGH&$Ma@ zAZ(LJGBoPtp`6Uj%cI<%6`7Mc)t`VWPafO5a!KlFJa1&COhDtt)LMPT@a&Q^n9(J_ z?09kUUj)*;D`Y*#NT|sxRlUwtx@}oMD(%bIn{pyylIwSsg4YRK&86E&r`3h?m?2>+ z>PMdUqlQ0Fv26eR7*X%vohjuv$UHCIa_oeSGgIyt8hQ?E9>ts0mZYFgT&~xrDbUQF zP*%+Kf2$N#EM_(jZTjCJuW0@v{_8)GxDai=eEolS^;Owe`hBKHc@x^a>OYD+q@^Vi z{60Elv|9*j8`~SF1`yG?HQ9(?(e$mLO^klHNg0y*2_97jyWI>+727JJ2WY$7WBK98 z$hZ$FpcQHIW(l_)Z9H7YCcgsA{3iX)-{))h#TiX`CBX>uGunfC2 z7oAn~0d#9c=2=VMyVx#f7yVjr_K+#lEq}2LaME1{(+lVXi`GUCr})1R*GI1a6(fyBWJX`ZAxrlmzo;iD>(L@>a80^kIV-L+O2 zId$mdiGBzDii1nb>&-i;p0A0z&VDyrlMmsfsYJnKcrkM0vI>)}l2!F`>ugTU(n*iM zdjVdCQ;pM66NZ&#sqU65N~ceutA2n^xFYkE*?r8!chFLiIW)uD)`UKsQONYdkR=a* zO5jYFPo6&WWUItu%ZT?!_T|=Rx(`+--1wbzgy~DwXp;7F+QaS>&c2NXDG5@!M=oWO z*&`!7w#hmAU3ETgy>Zuiw;8Q)0?MxU>uK(ORvA99gX--sCZJsO#wcQ~cwi?}2(XUFVXUe<} zkdz9xba35@50J2QmJ{BX1Cg;GY`C_a(5k@^L`>3$xc;}7P=XfJh0@_J__(NI!%WE* zVv)`tEz4|u5$voexljEaFk0B?SO_}$3(|y#w7c??A%glTy0Lfa%LHwIQK~Vf-%Q4f zVFwc|oz1P4Cc;^4^>5%CgzlUmmaxs9ZsUr4^QDf8$lW3HXnnxbl9S(oz=+KuNs>87 zO8AY)oleU8E@swps7yN66xi`|g}90I7vRIkbAH*ivze{G+Hlhh+Q^+}3_lGQL}ATJ zmpf$jQy=<-uRrU%5W2tA3U$!j-YH4pF{*4mVj}urtK{~*NQyoun4>+n4E=6PYT9trH(1+Q!hJG%(-2(OCK@(K#5A60sQBrC1Z9V5(W2(qQepRADVW^V}hSp9H#pVe2@ zRP)(%A#Bw3lTwA0A%rLTJId@Et}YeDCc#cR8PO=`D;(LPl%A?c|ZM2XBLX 
zBxb$5NexAd>0D%el_0COPF<|RuM_Z5r|H3DB%0lq$-YvFJ4eOWt&y6uZlM+PP4E9L zO{|X``1dW$=md25W3(UA$E#6X)1L^n)+j%uW$e-}j&HpzXD^ldw8)nwS4<}bd!}`d zVWUbUG&5ez`_|?+XM(tzl4?C_4lOypnJCyL3z1hVDQ#}63Ii;7%p$+NhFp{gpL4eeZ+7g9Di}kx9Ah*c*&B_X`x+}w zZ?gzAeyqDmo2fV9p_fORf&jVg36J_=yi)(|vFCMW;eniV%o8Nuc?`G5CX;4xIm5bH z4b38u3;0d*T{O#rc06Bh?{PdLd{AeV1o43v-91Z1U2Cc+&kq_iY^TRTcqt~X5%P{u zelW$_I9@*Vx~7uL{SM2JD~zIeVEHK;vngiE60iGo9(}|SY7nm1Eoyo9R{SAZM$nXZ zap5<5bqI5Q0I%up@u_wKaeHLM3kbk!$4+n@@_ut{GjAn=FQY!=mVn-L&}(YW6u6%q z2Z`jiw18^v5dqadN1b@prqKrtzy67CuoUpq)-ck(-LmlHo}y)dP%oisN{a@cjawQw ze}9Q)rR+Hj}Y-AMtNI&=LYHdOi zoGfZ8r>U{!QIedC-G~~i^hd_a8W3vOgHgxz!HMJC`thCvl^kReoG_K4b2xZ; zR>$kv6g79J7Dq>Yzj61k;?cYwL=9OmoX0*u{Av+b(&sMFhZZ=UupVZ+1#0}O0q!xU zw0=+gPSk2xUK6S05c=0Fsf3gA>fdD3|BUxli(v$6=1e-)BpDLUI`cb8DW;In%w~1a zgEDqH8iO7prm@ob*(l68*`!5mF1vcO=PiyxV7W#rtCq1XGnUrxS|Idi618;k(N=IH zz>4FSHB=}c5z(Y{C}85Ou?plkP8JJ>mY6;$`&>~U)eDlkz`SSklqyzF9N&`_(G83X zDUW-S8F|!S5l|AbylwNcxrnD>~nmp0TGV_CanZzUBliWcA0Qy(1UdNf z!ynoNgHue(Kdn{j^6yo&)kU2xZzI&%#(6SX?|lKM?yRPkcpoknZ;P_Le|heqkkOKq z99$h=|LS*V>(g)WZIa6dMve)%w>ya57dj47Y`Jxm{ zqbF-f>#_t9MS>9+tSkJk@uWW#^aN+$`rIS)6ZgL;^(jVfD`O_HCuyI=i1Idb`3c z?*jdnZX?U}CcPL5*2wq0k_N~h!r6x_My=H`CGe^!<4qCh<8kKk!;kDdU1kW}ZEFA1 zZ}?Z=A^%&Tb!BGE;~mxWTOr4TBHS=qx(KP#@gy@PA!YI#FUs*}MP6B*u0!lqCGG|egM5?HtG-_8G15j!hT4D#=!w~>dkn(S-SRDW0|4tx{XThDJ}iQP1_@+*D?Jhn)a+%`>lN{ls;TFkbjF>sd)7tXCaW!>fnTnNYXOq4N| zdZ}E}?{};rko@BGJZdHS{r!N#<42)Sw8@6-fpE2r-(aO0dU-!g23QX^YwhW8%_Zq!uofXDQ`gMa)KN;nUcj6En?mpm6 zgwvUEok{aDp-EkHPJ%D#@S*~N=ntx}D*QOHS@W{JB_Fv6-$u1gw|aypZtS|Kz|JLQ zjN{hQSQXx7fY)47|HT11E0jxx-kV{qpW1n+J6Ik%9TL$rbeUtn__44YiK?_&q1<dM`onEP-IZyXyD-G=FP6wgDmRP zfMgmX5VZn21Gx$^^TcCtb{3jDvmnn*au7wyd5SMEdvjJ`{`w#$i5eJlRywBiGi6#kKCI73_Us6psa z@mg?%9UDLkViRu5X91bY>H$vKql*qcV-Bv=JFPlvS14 zioJM+f65{JdH`SKtkA)>57GL2l$hd82uRJmc7TD)KJ__V8Z63JG_7)fkhYmZt~LgB z=CB;BT=FGrt+#l$e!uWQ1fq1U=~pd#GXDQdMWrIcGG$3I80E^qDJ z2e{8%LHYW*%_-jUWfD}`vb4JU(K?Lb8x#QXVofR7#oT~Db1$jq-*WF^$KQHfo_opk 
zg~?Yf%3t9Nb?&W&qpi3GeY3B7(AQwqJ9o9=|1a7;9|cl;wtf1#_hk2BJGU{4uA=A{ zHqHqVH>pld+qFo(vHkQ$IM0PJqZ*fPUvWTf|4_9^*wY4`vmXg7s=k`O1^e}nY-71F z&u!Wt8OgUPCUL7%)^vN#;DR*lRk{9H4;(z;!t?dh#A_8eVQ0&DDu{OocQ=tlF>_nA zO5P7;V7SHibugv8T%jduvC^G&E!ff_~@=5fm{D2D)R2WYbJ3lWv=zqvNh=%^m7^d+$4o? zR@8KI+2Sb3TxYTG(OeaouPM2_6AI6?Bwnuh5lJ1Yta4`Q6CV*g;_THrUoUpr;^`XJ z4XDJ#xx37wpTX`u#T@IoZh(zU!Mb30>H`Q_4KT3$1~WEU{7G%&7maTwC8vU~Mbb>* z+oC4w_bgm0B5aCdz6)Y_k{Ab$ZGi8tI_l8z3Ve;Vj^p(`GQKR>iXsB#t}GSe^Rq(E~AFVO)><3H-RMa^<2R1pWIoC+e^(J zpl0Kg*j(mr)fZM$L8dppSf;LtkJ*xK{sD9C%#uR^eiKm{Zgsa=@@Dg&B`564nm+QV zx_=EYCP=n)%JJ+?S&GJ~^9hXq92uj&{znW3^01Ij1Sjw5V&cih+;8EyObJ_Q$%UoxwHVtT%WB{aTu)l#O7{h+EQFn)p2Fy^1Im}Zma#VgrF~N zp12R{G}hv`)u__E)i)|K!9gF*N)zRz|QqhjhQaXZFmVfIPz z3i|ZsRXYrx#0|#{I9w-`n%uc(qh=9RNSn7w38e{3rh7%Lf zd`}*#AQ#3q>jnjtz@a<|OS!<&dd!`b0DuMXO=Yp}lU>UIhgGy_rUvQym$PFjxo3Z! zKoQWbRT1By3UvRS)20-GY6AZe@bBozqkZ>@l_ApRp7) zP(sstG3mx#u#V+x&TP5L-{%M&vRgWIR7rAnoCVm)!Rl>2-Pc?E;Z=lKgOof!VS^X8 zgUgfm*?&OkEI^4xML$62%?6F_f4aC{K(aV`%?yJmq(@_wrr~SnxCgV7yNWR8V zKQ>0Mp_^@tnT+r6=BmkCW;pqeE{*Rh%pXnsQ8fyRyGCi;nXR#Gr$>@vTZ-Z2fWC2Q z)iU;nx~pq(X{_Z$F8+MS|6EJKIr8^ENKysyBN|@yj{z5+T#-ldt=gz5KQ05h$PhZM zqAp5D=!^z;dJyS4mOXIa{uP1~U4*pDuZS6jNw{JkHboIiG>Iwpf${D7Sb<*wx^9~L zix{TRRVznSM&1(9T{TiRTzL8b(9&?Wpx6oF%ln9_Ff5!)5qnktSXCJV6+(UJ^-^7LR2ky&o7SPtWR z&D7dMnulqxha)#3-o7F+;uKi}v2KnSY7fvy4u*L@!BB$8EcVr?PwNqWW$nFo+v6kh zk4l?juspHoel=kLNM|XG<7d_I2_W^yn*UTZgV$gZ6CT+D02Ht9QZvc zw2W^mbUKVmGkKgc6ZUy%g}MoYL{8D_*W2<{lDUG%Lz!}PrWoZ&DZAK!iEopmS@KMx z6~?KT+%vT2J^_zEp`P!t7UArSIeJmskZ@8C4W3e}#i6snk7`@4eue#r>beJ*xDI(qdbwmqe!u=$ zBa?}da0fTa1MuTqDH`yRtQAIAH7=0k?A~LHKSTY&Vc~HEuLPS}%BN(HOtatcrJ9ph zvVN#?kBwS%`{=isq+Ox;LR+7aR^?*lxsq3knO7*SD7K;rDe5`|T?owQZLj-&rgh+- zr0<%m#z;v^V1Lf*w!RvQi04)LEPp6q*8DRYNjk+#b{)TBsX=?}f7q6YyQ!YGMCm_U zGz2{`_q|^bF1;zlY~4b!qE{!PV~bU)SZ@jWqJpTkdnXh4FKzS`lWW3@ zdtENKIk5;2!~$*7p()=`!az3YM+$Np3vV8lrChbNlB64fzg#$({Y7Dli4CfK*bN 
z0!O%j=nt9tt@~lWle8?`6r;D)wOKnkegan<|K?h!5X^s-h3J;+KQ;uuIuXv=-g2W3s`Md%IWc2xgOV<&8U69|QrUU^L{tHaVMas{2&%&8bbMGDrTs4UnSy^Q2dl zgwc69EuAk={oFop=}*PM9xd_pw9O(hL9;2g@Y|e-)m_zKuFoTg`~+cxw-(Y%arE5{ z0?OJNtz8e(YXcQ0w@KyTFDfO{5<$~kDnT#vtIV^Es0TYTdt- zITv6caOIw6gp_-KHz;m zglixMOx7(yyvC)~pJ&D@j)aOFWnBKv=S#Y1#p-ap?bhL;u!f?FHG~F_P1A_f#MvHP z)UQi;DAY{42q?MAV;*o>JTzpdo+M0*nGT=i+8IEUu-EvDR+zBiMEh3tm%Isb1 zk%(RG*v+-r$36nfOQZE(ufE8mA4@TUF3BAAFi(K!VUxr8h2Wr{O>>8dX_k@d*7Vv` ziQ?KSEb*_vtXa@_Wt+G~L+1_m#|HM}t@Qv1bELgkf50{PLtN_Zm&BU(^uX)C6{mkR zNlxc3*Pgv?ad2PTXP1+QYck{$oL&|wYw~JY&(0O(t+}wrbm)YAWxW<{oVv}u0mLva z3-%PU3I`40yzM<*;79yA_2%s%4_z8Nv7A4?o9&e>Lu(`@u=7kqwfr(pWL5<)>qup| z#aE*H$#Da~5;C8TpECb-XFG8*>O(NU`nsTg?MBwjGOFX_D8Qy#8BN8L2+G)is2=t_ znS>uY4s8 z|Fk~UJM7;(Wr!YfoHJiAaeU9xtYG@qR%J*#L2eRdSJ3ZHm0n2F3e==P-Q2s;c<-r& zul{GI)_ARF{X5(ill%v9pH&E>+{yuUY<3$5^v$BD3F}*3q#pG*fGfT}#EZZ18Cql; zL37rNmCuJRcKKEHR-dx$*7OZ#tCw8)3$v|7+lo26vG_rPKE3H6if@<;a>xtWWZL!v zMMl?D%>3QK{S~BUOr+u;q+hLTp@WmV?mW5;vt$`^;o3`SQ=j4Eo!f&sm>Qes$DCW3 zk}O3i9TeXS8n+-C^liMlAT1qwAaX-1^>wIo@)gQOz5Vywa~#nYD<#Nnpt>tYk5INxk* zCqgk35ebK^oT&VoIBr2QbX#J6Tk6c*ZvBxydw?Nv%i_+Fac(}=&!t9p?#hveAqxJE z^vW=ZjVehPj#d1^X~d95j^&LztHQknuU)Qh6Q5xe#ANKl2QhJ-C^61%4-DS5)DR~t z<^15x%Xm9<$!9EC0u!_~8x&^J6?yP87Ccou3PP_$F7Rf(RSWdR^o*U~c57L?*!fL9 zWu@?}#?Ki%0fY3`jN+*-2><?#Tavb(->1w-V98gXuB2$Av zR$JOHlm{?zA%gpuJ(jEo=tA=0#l@j&KD8yv-SLmOCSkK=9jo6))WtC(g`7Y?q>Afb z>~F6Wuw}*<*PD8&z%Yxvz=HFbJFDOh6wC^OSeyn$d~?Kge6p>_!4>+TKdPz;%|lH3 zi`pnwH*;O9nsZY@&Db|TAJfpi1rUo=#EiTO{)T|a#p7zjQ)`=&IdXsZ0w!-=*I}OH z=df?2?@DWO0Hf5t*}DEvv;-~R!KuSbv*Aesr@3(5ce`QAz;eLhr=Ju~*E+;#q(?=m zmVbDvqr-Aejo9MFYP+o9Zu}=}7yTn5Yq&AfKhb?cT$iH>?DB*HfBLm6AIdMa>bLcU zFV(YD;B&5cW#09t8rmi~eU!Yvhcj}e(tFG#A$AE;(8zV-Lbs3l-P%}Siw*J1dOW;` z`hk8u({vTJG;NJ%x%UEkgzcXSyaKRc)PWP?BLJ zvAAA4^XzZ3P;)3d-K`0e`O_$Y`KIWe;d95&glWmwMi_ITEnz zTK>;!-JmLjSS^oBa*;4)m)WOT5U`0~h!pz;+iKi*G^QH(n1{%Pu9=Y@hYr@Zp7?3L zNA_fM^WnRteOKCJvW;MgBu&(p?5u%K)d5wEnSW1%oj?Fxlef+rX2xrIU>^t@<{)7l zQ4yq70B;NuE-)|ruFYlLN{kKyyv0A5DnlkuX***73| 
zm=UnnQ->2W#RUhTJ>;7cIZ==|r}X7ua_s9TM@?feDXVANPhQGVl=i_r^!{zz8%>#Y zFB$)6^;_(WecM`egF6fi$_leaQpoO^=oMMJA*M%S4Rs+`*piM-jV8X1_YGexhnN#z z?`YMll1@9iuSk=jcP88GAnTDmlVp%>vyvq7qoPiYFUB3!JwxuWY=- zc?X=rv2!?y^s;2Cs|cLT#h5l;@1eBsHfNmB5Pjc1;$_ZsP)>CgT7;eF8NMGXd^t^9)fxiSAs^( zU3-s%rOE}(tX*;7d913C)XLL4CTqY!7lJR68fsmSzDcxdpH#LZCkpf82n!QY!u1vX z5jw(gb_4DRBphaMLUA$m3=kraT()U1WU6Ywod3m|!2tl>-B)=hvEkCuUvoB2XP~4g z*0R^;6sv@s>z3n&vsOBk)}gO$@8IN;=Qo=b*`Xj>8BKxr=eNplkUSJ>QYLebX<_QM zw9oOTpEmWQ(f4Kns~>m8w}l}HVge1cxexoc@+AE9uC}CHH^_}jZev{+s8M#{-h-h6 z=iSUfad6=NRL;dDjjQFq>Ef7B+JsY=3&$Mp5Z$v^)~!f*)V6BJ>d@O2^g`XO?F;H+ zGLiF8F z>{!V>o0;U}=uo>2&Iz8rHwZxwuXlU$j<5?*ZgkFD26AQE=qBG)fv586Xdi%47BxGv)BUrJv@28zR%`OKzpvJDh4AaurRP4R{HSWC#mbdi= zL6o+2+3rnz`#J3KW!dR`daS94^y3`CSVg=htDOOA(GoBBb4K6<$u<+rW&}~20@SAX znYo1{&2#?9%~G6B(&Z)gpe6cML}5QS@WAMHAp7{*K7u~$)B9gCzJZhlng3}WW}>bo(% zG-|3k5n5B#Bnk3nzFHUw_pRyAuV?)I03oldB*sx>*GtApkk1CSx#Dr>-7?wfe0A4Q z-!&{*9-dvbyC13bLz2<0fG_$4WJAc*v^iMF?U{I#FdOJmcpREc6%@UD0q>l!fMCv8 z`IUP@W{!rt1YIP{dX>VTs`N<8?QO#T42k*fhJ=#+#)ce;s>;s4HI!J~j5%h#gK{^} zVRFzE3h3ef3Z|35eX-AgLoPikZs~-%vP4CE*Wo4Jm!ss4Qx%=zOq|=-IX*+D`4#n} zZf?1|r^@Gw$y0(Gl*xCfpP}9m8|#6S){Z-=xoFJFW?^Z1OKzDy(d1g^oWG#Dxu#?E zhpOX6s9;(LTPc>bk}M)^QuJHfwaF@$&A$^_a*kr>+Kxh zf&+9dgp8>mMvG75Z|CuUi_!Z-X-ocl9&CUFOdn}~7HhVDA6A?Yt^@+~bLOyF0OeoU!3tbZ&+D-!OVaz~Fz(FdWFm<3*` z0%~KwO*=l`L{XwN$3!()iwA#%$d~p|0mYEqmQ}xMYfeLVvwrfDFRNDMb(&aoG?%5X zMjsyXnRKL0=ewd4&>Y3};#&u?bc3l3PWVD@U>7;Zm}X;DDWukbW|{ZAhj4E2Ep-=K>z`R;m~D&$tq;>s z%9C&M)5);5q+h*08+SPhB}a+s0*By9jlCbF)%z9G>5J5NiUPVZN)|`jw{4W%rDT`S zY08IBmAmZh2_!1n>S76(xO^O!rZ^c_?$LGmJamP3tOhzv%9ktU7<(bpC96Wro;zRV zU!(V)?BHYITlt5*>EHkUuc+Y?A*6fv_R-^-)HA{Dsx^l4VDpIAY7C$T`;mU67Dm$% zth^_~#a;4RNykQ>HXu1?NZVq@6{?p=jsv3SwmUWBT1U2As$TS~)6M8#JjXqWs+s)+ zwM%^M{2H6OKjCLPCB@M5;mQMDry)M&M9n8P)d;pq;HGts2eVPvpgPSQ7hgo2uldzt zYa~y7cDwv>9WcFuJwdU-c9$*+N_{HZqX!_VJ8;3suRWb((*%;uw}ihRIXR?dEeM0 z)~NifD)h6J?##pkeFmsgn7F^v@L;%+sq#naFcPe*2mmq$(Wq{r^^pJv&tK;a<=cnV 
z(WRvAx9QALzKQjmqB?PREFJd1ZgR6!TtJqNKa8-h^T6M6_1T5N)-ft1aSB$obRB8ma zmBP*i&HQHUbJS77JFyFL>Qq?gUBKELa;q!g$J_26!s=s;_zbO+SW9u+owoV~Q0u=V zs3sJS7Dfj+h?HAWaal12-uT*Ia1&&RoFnvX$5vpm%I@%}^H3dI=igz*v6T(;muOzU zSi+n112HCr?58okAu6kR4L>drppl>QED%IFFZn5p7(g;x2`chFG_Jzc6b6*^7z$G{ z%n`P$m-}*0F{o)l*zfA4a~@)u6rr4uOyo?^QHp6^LsHu?8C%`zFr&A4fywo3jT1qt8|Q z98X<(H(MxJ=zMqwY6^W3TN!L}47GEFlDDh?C)mDf`PhLig^AA6lF8$(!eW`-y9b$j z&VwWb&Ry{wkH2GbHSW>V#J4ATDN!92y<{&|Q3D@? z*U;BdyPC0^7Iw^xO)=7^R&$NF8lKvYOg#@_%Cs7{RZ~F~iE8P0K z7odqO7~Jv;7r?5uz1F*TneoY6lf_s* z8YXCMP3m>rzXdlnY5X2vW7{XoQq!lN!sSNv7EAmtcKVKG(^|%Uc;R!GG%?6k5`@LQ z#EYjPCCnfg$qT%S8DgkFpR^%SAu7|(TbyO5ZDtlgjw@AGm-c))HGOK2I=kf9hkcXc6hiGMP7zvF3G9=M0(tB1<`E^EB?=k)Vpv3ir@&o>lgN=`} zCrniX!qp!{JaOnj{{*?mcQ7D#$|=Jq$2^w+Db&a@e09ZlR(0w!559}m*Gg^08@i8o zag1E7h~9v>+V-rB!IqN23#qZ^D*Yo2O*bjmv9x$-dI0o!}W zYf5t<1>s(Gp!4G8YxBUcUyPF?Wp2aFT6CxzrWyN0Cim3{BiOgN?_bfmHU;9}BP!He zZ_;x_7(>b`J?0YY$uYSCD?(H2akwZo*hN!U{4^NYrl74vMFl%7&HsNBYbnjIxDiH`?)*_ti^e?99=F3y;XzIgO1ytfULpeHH?Xc z>och@3*s>3gWk5>$0IrPtLrnbNjY-9jI*Kg>Mlmi@nm1A84*_#w2{z%R?9z@5QNNM zgz)UY>J_nMBq3?l?%8kc)5e2h*dqfrABIHx&GZbeNJ1c zwwr`gYJy>35io4kA4nptkYi9o(>-X9aZ6q}za*)70|o=nj89{On;<#v5Ln{|4%95! 
zSs0eZ@!Q+-r-^l4*#)}#HKH$|T5V<0Mkh;0T}}OF0Dwh?B*tM-ySd}52e;h8m8m!$ zyx#@hbJ-dPjgl}v3NHQ#Qx7<7Y+A2A9eqP}Rrmx{CVKCX z98H*N03UL<`!n2$TPN+2pa>7?A?^8lIc67~g{G|H$LAo8GXBQ|K~)J7R$ZWNR-!QA z#j5oeGWF{(NQZ9UI!I>49>uI8Bdv;#Xn#5{bI3NDlJbK)d>gI^1j$_K*LESvDX#0& z_VoMYtGMQ2^}YChu9ZV;Um(K4x`7aU-DRbnU3NzV#ME)!`=fYnW{!bAdxe8rC8%K~ zl-l&Oj+ue$198+Or$(F*nbp-kjvs5^EmqyHKD#6pr(1R!Nf{t#I{cpM=5WNHHugR> zHK@#k>qk=t{OQWb`;o-G{Z_>A?{%J^Om(5=|ENP)k4T=M5Y#h%Pj$806HhYn8YE&P zBoByQAS9`4hP6L~))X1>21naNd%$x9HW9ln^Y^3aMKAb67sg)J4n#WT`jM4?|Fmnp z#TnZYy*T_fvBn)_;0|L?@Q=k0b{Pb>e;chZ;BBqBiL^@?AlC@Kxb8hAze*%k343sF z4vMiF2m6tipxbSWT{5^lHF-6df^tzcn)>4lCbfN2RI!-=d$NyGrB!3ba>a^XdR&@C zL0NsZoOhB9FALAHC$f%HjVCZ=4RAX)1X``(j|CbutuJg()(Vv!8X|I>M|q)F`B*h` zT3^iop1K13k@DE(!=4@dk>e-3C2qT-OE#sVb=+Vj#9zj%%0-&!A&G`g+POS`ajixi z@Kq;k&qh?Vb7zJG@&?P@x?vO1?ZXDLwHfO`GbX zU0s^g6gIl@mw0*gk(X1RF917PfXHF;;g5NA|MB?IlH&^4B+XQm_1Zr)Kn&(f^! z0e*m|vG-n-zi9mHb5(mVMq&T`p?~dH|K6$kNeD2Cq`W-)ZWNZ^7(%oGr)sxfS!L(3 zp2xaRN>Qozn?VBTiYU{2rAR-`16JC*f-bR%P>N+yVCT|d1ZM%B=hcK)>D@Z6?&4J0 z?x!Gq*~;SD)90%REihLp+Lkdo`G1sRA(a>-wf{N-x8dt#BqmJ4cg|+o_x{4)>*ybQ z?7yp%{y5q_*xonc?)V9&*xy0zKy3QGQ_5NUaMk1w>~-HEhr0$6^g578jorT6zwHqA zeITe+HX$9#w2`vs>p_GW6bSo)$CpsduSF>AuV{-IL6E~gK9-MTi- za#Ubhb1`O}FX2A`i_~%IMlSY78f0dx+Z9EUby&5;O15NvJhT;qJ6U`amyIMiBv1Tc z{78B2HsoNMjs1+pjjN$Bsd+(rbP0@pSyqgoDS4w3uLRM z1CVe-<#Xr3BaB}l;`#xUd13Y+zjuB z*Wze>6%zSQb`$*8MlsU)zHmqXH}tQa05VY{lujT)l>GZ!i=&c-x5b{nJLh2uhb*FL z)Xu}3l|ra3usf zA;|vP2$Fe{g}j9)WGSCK`+FJub2X}dXECB%L;r+o1Bv8CQeZzKxt9&iazT5(yWlZk z_QEK8OloRGS0jRIXe2%bvLxhfQ(b%fXM6)fLuXx;N3AUk)d-*IOl<@yYbxwsW6M5* z*~saEc6=oOVV+uls-{g)TJ0b#`ym~UPg5}QtN?@AM?imGK_c)j-Tc^YdeOOG{^wAGoSuvTo9ub#X>bu|hA1Tp|DzV{oZMAJtkK`uO^{t(ryK{{h_0Zz|Jb2%dnys$hEsZ}peMa7n66dm4(YYQB`#sfBx$(#lzDB&J^}{U3N;p+&lgb}rQTg$*15No_h-+^^exh9co392R_`@eb^}2C zfu=9ukdP4_uG6juwnJ-`9QeXtVx6USD?F5&*io&Rm!q z0XI+8+imYF$7K6`9D#~zvn&hrsP9rfQTyB_Kk3*wCKQE^;`rx4&OWBBynZ1#{N0{L{XC}8o1 z!)$C5zglYlTTvI=SWB)Ze{;Q@7TM zptb#kfF>3J`tK4E05+h=VoBst9C~;rYm&y~ 
z{-G|f_6vH>aP()>gdnwbBonXIN++W2*o4}lYVyMXwI-BnC&<2~HxE34V((uknW`(s z=o-9DPhyc=o(;beng8`Aj6|v4{p|?o&>x4z7+WjWb5AD;`M-=Js0$?drE^R==J=qdacU~Q zv`i%C+wDKQmrXw*{dCdqfDp?~6L+N;;nOlsHFkvlUsWXxY1cTNfvN=>B_+}Ol*y>- zZX!go+HYh*Eg^x?W_8$X5sBdvx-_Z9Oj!#NUu8AKctYknY%@K2wI8hfBi>B2hr}?0 z?_8{MX98L~S>8R+j&v;6|FpRX?S~#cP;tC0TOB?O+3#i5=CrY>=)xN_AX|O`IHkg9 z(W8%1sfo=AFh;hfd|+&;Yj!hFf3~@U_6Jj;`Q&fyg$PL2604& z<6#g9vW;v75J3M)`RDll*0&-bW`*n)(rV&t2l>gf3NXSImAN)Be|u+Mco!y|wl&BE zd0;!XM|GreZ?!9P1{B6|i|iKK=O4~#HNimT?M;xJCD-B(s{{oM82p6x2okItv#T-{ zJjb}F#h!XMq@vxa#VG}SpJN?%5Dsr&I2K?{I}hDmF?v(8;wiKhXS>7c8Ybpy=W?A% zC;YAeI$=3S$khQOM|N5+ZVjIWcj1 zL9Ah!6xrT`AhABfen9A7gE#vln7kQcTgt}~_#oEY9y2HAOTM}VF|OZWNM8#c+z~y| zr}pr(RUn~$3P_Swg+Ku4z@=`sS-*}!4#zr~(2P)aog|8aZuqTSkd59l3W>cT`ZmJu zWdVhC+_m!(3)OI5d;+M-H7HJ9W0Kc|)q(@x8J{+Xv(N=nVH2KJ?JShOroV#3c8=ut zbzl0m`=hBAu&G^)3SaZoGEzH7rzT*-mmxFFCnu0y4%r@GsD~~hx%;cryR8_}a-v|8 zNGJh+ZwumE-40n^Ib;{+PhmF+0Mc6Iq%3ttt-%!X#m`1+6*-buD}!v@uHwsR$)M8- z+#mU*c(z_cp)~R}jAvnqWu3NMhdp-FDYNCH$*(qry-g`gs~rw*)IH6Rc&Tjx4?u`U}nEu&xx zQ#}HQ*xfS&e;s?%ROp`6i?ofW5e#w-1Z9#7WrF&~vl4#9Shzkn!5gldOQ1yI-zL*m{gHj}%t-|~NBkQfBvP!%EVF}@;q#J3Z zTM>{FX)x%Nl~u?bz?J{0`7PQ^ng@X{@W86^4iusj)u#imwC7pdfE7WuzDk|V?TIN$nM zoeX6uP0Z{3i&jAv0+aR7jW(NSU*4Zu!>UWZH$-qvs(*Nl%5Lsx2&P`DiD!ZT{J+w}j%by^0qtO-tdI;;<-nWUhrSK22+&y7 z;t9=bZ0Gyvb|?kYw=dPBC|GmQ|1>tiHi)Lc=D4+ZGLU@UIUkxL&VW*TfDW!UhhLx- zS^lZ=Q&3i6`+UUJ#!90lMcsl#rU4i@$_|y&D<7SZbk-3O@Kwsud^VSro!nOZh2Q~h zhCDeBP|WiSxEDIyQMb`kuCRG!sD$|uW|y`nNrp%)>G_>Y`=HWG{+vR7OT6{4)d42H zo=8W#b6=JQbOMrG>3pv1r8vWJ_tEb2?9C-l4Pm56urxBdGOrP}tZ8N%#B*peA++yUoiXjsIgE zsLQ5vx!djH|8Kps$-_1 zbpMy;>H-(|^nbx-xZ^!2jm^celOb7SwL(0rfWpk+sOm9m0E=2Hc48{ZNQ>G^ezMpRF*X-j@%^kp1N5QozPuU7Il zZQzKhR{d@WTZ&^pMQM3P{A!D0Q|OumGQgBpiE_o3!7`?zNJH(vk`|;p=EJLTf|l~6 zN5?+)*tyFJZaI zsDVa!-tuW5@w` z$_O`U1jB-YtN9HM_O23>BP16rOtWU9NH2#RMeUtId)sNl&GWD-+fy?XHzJ*lZudd! 
z$_iyLmEt)JJ#M0|2bvVtvhS0%QnFv_2xpt&xcq2!x@z>{-P5CZZx)gl(P0cA$AHcC z1$U$m?^3ed%JnkHZA+kiFH5PGDacr7Fu16=?!O*>Bds$E)nMG#`tuTzGv@{-5r6wi zH(f?`*=k#xsIkBki(ICvWiSWW83?LR1&&jR^S!K^QhMGl$$qrnpL_pJzL%;+G;Z_g zrA|tGHbKFa<=1jwci8m!*3`{V_(wPUl3{&hxSS|+4$pta5;;7WJb5uPUjJtpLe7~s z*)X)se`D*DC=)Ja_i> zMO@+Ql+jO{ir7Ks?}1$#1F!e^qifEsm?*tzQkmDE87z-*@^~e#W$(feuMc}gX?-KY zH^6=bjhxbc-mdPnkmgjxNAO%Q#wlR|a? zU=F>Y{Z^yq8ANh-EQJqI_Q#M{v%(vdn&NA5{?gG1C3*LLjT3nWFY(Yd)wc}G zYGworO)bV6*Tywpr2Q($UkS|xRDj!7&-B3%?Yg45yUWQTEN2hk92?nvX@_>otwSWA zMDJDDlH6bfUu{B!iUOKxB$YGZqw8XoG!|iDUu+rhR_60KT5ROjE~HtMtS<`Zz$kcqSk5Xm~5S1X&5%!VW>0m$y?31jNZIjTQepyl2ce=Afa>^GEYcA`>A)OGv+)# zUjY&=7*-TlUSZ;ocIn%H9Uh(*Danf6@tWA8B9;%+Di^E&y1_+xitxG4)NPYWt_J8= zKc62hxY@FJ0D0uDj%8J0RTt^0tnFT9Gg)AdfuLxG0`UP_ssEH=;@u+NX8-VY2qvMz8 zYp0<1agf-}H#PQYoM1wE@~F3}IW)Z9yp5I290d5wL~4DaxVvaA76Ti_L$kr(^=~ii zx@%p-TT1UPaKy6>@%&)iS#w2I3i^|NN^OPKnokjHZ}US8Y|0eg&l{<3s^GIZ7b`c{ zS!`#gDyi|m72X9@+PW)E_01){vxGK!vbsK=(lqaH_NePC0Pou6Vt;Z%S_V6BM2+Fi zl+N>iZuMW-I#_>o6v1I6a?^>9+oapXV{ehM1+_Y2f{Q^)B?F-y#g}_`3$GKlh&iLa z?p*rCN~&RDh*B3bDBI1mwAHRn-E1bI^7zxh%6&Ra#h1adZ-@W!GrT89z8{4}4zuh@ z!H3&O*kz_b^u1OF^~cS3L?K8SlsLI8St02JrJrbp3ED=qZ%fw}smBLEnsqgjvR>qm z6rlKd-))lIn6Qi3yQ3t9tH0gYG2Nf4TC|Ch=mhCHF%+B&XAXP0zF%lb$AsWA+2<>Ar5; z$o3pZCgjgJ3>pTye5kW9tUhhi{bA0q-IS$s)09pH-&KVsC6qqF%=h0@Z3Zr&N*Ud2 z`9_g))_6r;|1@X+91}YQ%)v#I3ia1Q*}MW~;%%j{LuwI()m8?-c#;J4rHo%!?b*@H zQEWcHY@Yz2e)&C?NjMu+#wP?*BQybgv|`JW24UAV48#2@9GT6{-!}XIA#v}7$B6>s>+@UQ5&(Qh=#D;q7etw6R=N9ew?5%YRm9R+m5!H7X z$FyZ;19bGZ*6Bd+mHGUfHC!OVYvHjN=_dhQri)6I?=F~gzd`dcQ3v#(^v3aLhAoh% z+vvvmc{SVCml1Mf9-5#@ITW=x5Re4HNxre=&_TyVp_t<_Hyu?HM)o&I`L7b;j-6x# z%&RY%xp9$};>}Q_5AuAYw_OqQ`oVRzR*hZLyP$MhJf%Tf@+AJR}!lTQdT(fk?f`;5lMWbU!oy{tAYjJ$rnK6A3{sxE$&bC(kvM6REOtfj39IDrQ3(3K1diYj7arFdhi7iJa7z+4RTT+uNd(JgL zsy0JJBVDO;V@0+>@g@9Uf|>XB4xnCdCFXmuH7%R=N_zr$Q@@oJqr0w#pQEj-74dHz zn=TX}te8iJf90MAKf+G>_w*}=ZG;_+G{kuXqkR6Um}IR~c|}?~VZQzry4p29kj)&n z(oktya0q1`L4L4rR&E0fYNOL 
z#DefZwdNd=Ro0v>8?|$#N}YNSfm{$2Q~$W$s4NAN+%Uyg0QJt?lG;{4ii8tz5U;_P zFd;VOWYyTwqDl?21P?7Y1~HfEhREqNKPiFNj)U4OL^8O&`>@aV=bfV*xAmYg~vpaixb z%!4_bAVrbrN3j9&ONP^Vy1zNqz256eAI>wtBor(!`;pD{VO;EVp}dGCFEA{ zZ;v7Y%#s)s#qFGt6n+nW@6}RnJ#U2d^H+_GywTFlDv(b1_8nxuXA;p}4p&cIX;EWN z3G34`APr4&khl;`qqh{Mehy=b~dbwbOU;O)6v_WIx=G=rsC(_o`_pascEdd+wnO zKPCfDwx3AD7ZRFa+ff~}4Cq<(rWt)3|BGRp`E7o_OS(S5S62uA*)ZGF4DU>@UO)R` z?(~V1Yre-C!w3Gsj`4dE$GlXmGb`&?4f;G!^F6I8@s)8iAQ7y1Z-R)PZFZ!V#v61 z4V#5Ho%dA}FBRN0@^}L*(x)&&T1y}RwE2z=9;_FX15z*sjlOuaTNziuyC8CJj)LyC zyISZ9A?yJY`hE{EKBh|V^spIDvA({!2(t3@((p$MNUQ1$cuJVTMRhAsBf7OpvCH|l zPh3PI0z3;PnY@OtuVP0b^O}NlvB(&QypXO>j1H&d-M`wD7jZ3(?p&Aj769~TUrb3l*jEtCqRP7ZS#S~ z4F2r+oKM{#5TGYeC3XBO=x}@&!TONWVM`k`Ar|5ptvMOSQTHqvpS7~5zaOVkaNn9D zD;4(wgB|?PM#@a;jivxya8S8eMITy(PDjfg>x0(U8I<38k$Z3Czuam1cNa@euHx$( zv8MG%OXPoV`0o}Leg$fGpz4-^uYp5xftLRB8jwt&n>O`wH42>Y_CY&B%>zbxuyONg zgiD`4HQ(&uu>Y`cY2S1A>KEE5P6G>H5Xfm89&y#v2i1 zP3t}gEIh4iqDl)-X_;ks^K(>8rDYso5E5(zQX-gI46d|?ip_@VPTaRkvg1z9CltFTQh<-L%i~Kw#?om^;Or7 zEa;<^EXy;rr%w1&Nijt#$mx?w#^ak*F_lB!;H7fYbcnJKX->%Qj%-@@5N|zE*Qk+lR!+l*oT-x$X}f_xZuVhoVHz3 znMH~(jq2?H7gaNUa+EH9S&+Lq$BR1;`tB_xdc{o4US8zlt2R%^Wl^b6H2$_33vkdi zTitBKLq47B9Sq?E04d@uX`JyS;S%r(5V2l?#K>7#-0~gS*jXoJl`EzHMt%Vx2#Z|y zF&OKOLD}LwW@uY0Kh>;hpiX{w*ybYZ9RbW~QFZPOz?OMAjAx}#D ztj?r*fN{nMLQnh+e$dJIi?edR1!~g?)Q7)b(0?yH(ZWB1SwE^T?6Ytzf=^dA?7<(O z-xt@=9dSO&dJ@E)z1bE}_p;BCJwCvCj!9qOAxDfHvAT#R#R{*8t_fW(BbD(XAEN$Iuf8}V($AOpZBmW> z?^$J=VaQ~nAenJz;Obn*(2auBJ4Uy%h9Y~{f7CYaR1P>$TsbjKFFmnrUog^Oteajo zFI8O~t??xtYOLuLLBingj#0UXxmIktLCI8G`YrE?`P6m8@cqSsD(8SdnIi)}T{S78 zyPtP~O&2i@vvYaKXO>C92`&FE%>RhSbEpuhU{rR=9YRm)Lk*8kRm#zA>`z zWc{l2to<-A0Hs6P`%eDc2akWS0O%>YPJRJ_Nq7GxQTR7x)4~+6pda{4QoVIo$b!4r zTAG5du-9Fd1o(V&r(e*uDe7edSzZ$2s-iJQnN+T}rdfR##xd+L*8qE2tr)ajR%I?Q zL~i%fa7XGV(6PjOF9^DIoLn?A|I8Yj9sP9_uT}9p<)~?ywYl%B;q!-# zN||rGH|YX#d6O7GNlr8<$It7mr}c_&K2JZUf@cF975aIuN$S!yd(SE-g9@=yhX2($yJBSc zFV6NSaPU!g;xVQ>+5{T73DGkK+(~?c=$NK}nIsxL{=|RN`Y_=Qke=O9`Os+@XYi@9 
zVHTrwexvc|TOTn$8igx)wvEYg*w=Qb?fZ=^Ycv%pC_ISb*}51MB(_L7^Pav z8{WRfCcZ_-Y@q7q!*v0#GdF(zmK%4%FqNRGNwZYqcWWPSZy%c#xU#ZOM0{4q{RVxN zdzZ|t3&b?5bVWI&c&sMNDU-{!L~ilDH@&bEn%qMqx-|EpJPf5-mL}*P7_Plv&2CPRG&)n*TfzlhV8x{J_^J zM)h>dKCuAYRJB)^7)h7mw%5I>*{>pp`2x>v=GE8>gpN`+F8Xr@ddG{0s621=3I3Ye zT5FyVpCOI&k`kg{dX%nOz(9i!ib|JPm+U1+_B%piV#y9Ube>C;Okzs;AQ7M#nrLo% zguP1sw6bK}=z6i%6VWBOLG1YFa>*{?jL%knGm^zDcWl^qO#gcNZ*VY-xiRp( ztS}?j-fR!nZ;*W@#3Zy!IhIeMX46=EauAHD5Vsx)7RP8*`vN(~$wA8*oWkDH?{3P< z2y+nsgzPnr;6ZjbAYa13_Ez9Tz9rDA?)~GYI$xCn*TGKm`lUI*3SRF^Ui~=9F>Hdp zRpJG>VyZL6PzWI!8FQY#yfE0Sb$-jJTZ?A~jmJ)^B1**6?O>9YlG!KPYaUGxO6po-L zia2~<46I_}_xeXuLM^-5@7QAgOl?SYo87h+mTv&%h!A3v3ica?`p@ugTox*z`h{O+ zFgD$HDOhD6euKS(8@ub*Tk{AU%!X*N(6qzNOHH%2*wpTKZfzOq-^Fk5_Al2V#)cMV zFM;nyn8~R5^621ckE5+$JHTVq*&>Oc@)`DVpdDG=7r9vVbT?kl@h<8r;y9@UIp!CK z5=&pHTwSt{dY47S+MS>rdYAsf7fe3-g8H#p;2EcC5uaTSFZzfCRx;dAhxKa?S4eO2 z4~Y_ChGxXvFT~hLkxL{out~ReaKK}XCh=glLD~l_Ox6m>+psPSLt26vnXGSs`IYVq z(M!=+-KyRyyF4hq=l3W3b0&{vB9w8lo7$PIx7j*8!+nsAYhMrU#W6}p9;eq10%j&P zFa56P4t4na(I3RCmG#p%>plE1thOJD)ORmOL*Ra~U;OG)=rduNb7>%;_h>CNtM{0b z#6N|N3Um!WmIuiPg}dm`v1{$i4pZBp2hlhF{h>VE$Z zm7HQ!hgtsaV%GIvhSizoVSbe`LWBYD&XO@3*?L`y5rBbT z+m=KGOEbxHVYPbkU878wPLmo6>U7qmy+W0sB7jM0VMn$vc8 zD$H!bv^HrhZq3$wDQOpf8iucvkt-{+PRqb(uk8BgG<&1-XYP(M(i$HTa%dG(FEA2( z>3O!MD`mMD@m8U#!!G0z%5X6PHkw3Nw4(eLNN)d(-jXNaPb!|gDK^3G1etuB1P59_ zvYYxBj(FSP-w?DO0$B7VZsXpD=iZatPSCC_ zfboYIp49zb{}`1tI1Hi@%WAd3Rxfpow4|5{FLmli-YDvkKxD~bd~0_(-mZ*5iCsCq z`>F;Z&E`MSJT=LddDw=~^a5g?enM7w{`2vBl1jhD4@#b=WbX+4^L;_}9k?yGZL<55 z&6q1Vg*oVpgaXd?pov@S`QTK}?ZcBTICNBp?wL&8Vp#eBgVAORJXdbXpjXNHq%Ml% zxcynIXVmXKK+`wJ^!J*n`k6;Cp!b z1y8CA3|dmmc@}PULbe_#K}?kLV(|)2-jXko`Z>%-dx;5wTA_wO(90;uix#@`Cks+% z-%G>E^9os4W^h*+ zC`_opRaM&V2{{l0|9`Alf%S=8{$Pp(l3VK?LDgfXnT$mD`9dsi>;b~r-zo1&j| z;w?%0-`)j8vzaS>h43n-=%J>VUndNEQfV>?KR*U#ZFfw$=i1L?mK}*!=fb!1NnSpzRzQkRGfY1oi*NZ{|sXRCE|%?-3|ZV zAF&aMA06AUUYLn#nnCJmYkdH?VJHkc=Vjb(J@_Iy=|6flw21L^iCFj=rli!U@VO-j!N%$FCCRa|=ihB9J=RW{9gowtiLK zo)~;%zAz?hjFC`KhW6 
zN4NW2pJ5vX%T52rd5k#y?~222mbQC|H^=br))13@^}J4htiQ?n+;!UGT`V0Up5x6s!7rp@P)sqDh|aq17TSP`(GClT=)|+dX38X`9X~ zOfw6<35FLNy7c(&uworVn2f-c@OzPk%r@kV5X)pFtCq^8gclue8~#F5yNPp}k-w?0cPf|-yz49+(#cP1na?IXM$iVx}36-%$wzuSOuakbJ0Cz`<{f3$w{& zpi&5P29%Fk;=4=khWKEL({pTwrXsHb;fhC@k< zdVz`ffaJQmMu$QHy0B_uW(VE;DKPHKZ~tD`1!!KFZW45T=>=uwjd|X$CtmfL*)1^J zFv4NV56Uj#{^laMA^E{__~>H)8)tu3@vM zqwRB-Z4vW5e3aGdjdMrl&oAwSYfcL*GS2g-(-z5vMO^FC&ulX6wUREYzrfRZuy6V8 zIW$$0Ld?C@tV4YElN}Q6=Loym*06`1$1UENj*N_luN8Z}n?G4T`VP%7F~|bVyf)_> z>)5g%f2DVxKPGn#v<>qOi~TaBW4JBeKA@bFZUoKX61I$9k$&cuL$GbfgNc4i+*dS6 zc}ZdA*j?l|G%6wSeV4{Q2lrA#{Ox0>F*`eRSCm^}Z? z9~ij2)!cx&??u|mcci;*90)5*l*w>XQKUF!I(Bgd4%*>Bhxe?dNJ7<>HrZ8zh=MgvocP2!#q|A zMGVKJZwX@odh61|J)HtE9kn$@%Dt-LTR03mW|R6nW<1jC=cPx70EbBCNuZI1y$2k( zS`+<((+qlgyfL!`H@UvO{#dkIAP_UEVoSnRlt?{nqK*tWcE{EJL+RnSN%|KwH`mmEVcxL9n&u_xCZZ8yesgu(5*5k`Z>zKD3 z3BRQ_`LWt|$PFAsb!*m@3~{0w>LJcMZf8~*_4mgNIgw2@!LOD{UygR|Ke|6F*W?2Z z0_8l%$(BE~KgI7?6gqgK3Kv5+He*;^=G?3?3$osEMHt)%p&_BvsPpUIOBT@S5=-%F zh5gD8#Sk9E9lR@r`Rm(fUN;epqZBo*uP88&6u8y|Zx9cCzLHkz_z{4k*FB7;vf$H= zmphMnQn(yo#A4ie@B93pa?36p8}mq+A7A#0Sl4CriQg7D2|aWwuBq7$rygD9PmNo@ zP1@hND|AKRw*;fvt1YlEOu9J0)J}j{pZywAh<;etCe@c=v)@q(T5I7O^`}Z1q*`p3 zRugjm24rlT%}f4iUf(B%LpaOF`nNMbkIOUkTpP!?F1-~1Jv~`JBYHkKVml_btlMu+ zsb08O=I8B%MS9vVJwA*_n=UKOD+krHQ~uL>u9-*sV8QM^$(j0Zp_0SoSAiau;K!%Q zx1X-?Z9PuL_+a8li*AB1E00-wP^hxkoNZfkd0aMx6hE3ssT7vv^3e%d%f)j$?kJ)) z!$M@yv=%~->@_RX;|rwmUopdylq2CMUr3tn2^}_m)-yhj!#+jOAyyd(JBZIK1QG6c zC2Wv|^nx6CE&l9vB=rW^9GwRCxAV!aYHg~Egl? 
z??sEkk1R8i2|SFZp9>3OruQ?}cNVP)zr@KE+X_kzY@c)mVGjjv(@FdwEn8WXI0cTY zONBjBD0!DJM0s#0Ix2!^P2zj21{{fu?cgY63M0x=;cp%-t7Jb-V9cg*vISl+#K|xvU~l zI6AyHC)&=rOjG35fQOltxhVW)GQOyJues~#ImhBA!$ne=`FGgAk5*{VW!Cdtwi_Vz z=y9%lReEt)gXc?8@Zx?~T}#-S`X6u07elGXwL0q0RhfHl^uu(?AU0|D;ISKAQd}vcS7ev=V_8)`M@4P4V$~X@e0n*GFl;9m z`^)KxzkU7LPU?F|lzm&_w4cAx31%58619602c&z#^C|QO{KocS9SDHJ?(5dpuO?GU zcxfXgdF}AfE0Sgo&`E*tNPc93=W!Jkamg*Qd31H+I-s7TZOdUg{cS zFs0rEMZssPS8o{F7D@=>_LHJPc~fxON}bKg=b*XvPL6xS@+q4ilXOkBmL>ca;nV5$8#opvuh1m z%WW~HqHIb{7k=FL=OT-HCHCRxutUFC9s}cSVwR|5xnm@EA(uVoQ0X3;*!cA$1I&Z& z3Z5)SM@Irc!lUvP-ebLg>fejcYK?>i(-uiHw=u`xof9Q94biNjsOpHWi3p0A%bS=ZNB7}iu zUiZRBIZMk36{$bNvuF`P0g{8xrQLQ_thA$b!@BL)`rlThWIu7H-%H?ICEHz;;Lf^J ze()&&)d|9XHTbr3qufq9r6pFVcl~=`3fsJ88%!0xa#T-=Clq7Vh=GBve}u)1E>+u! zPPTe%OW(oU)YCqSvF0SeANUS#JRAkPBlqMwb)wgvOnkb;V(Ds>!UR*b0vb&1mopHT z$y06|*BwJj>#sSernD8m)&bpq^M@`|_jgHJJlR;swD{#uNV4-H;x4MKteKei$rRdr zhmLCgnMU6f4EnG0Ch|qX?=CW3iH{RYDS6^0Q;F;w887Si07@FW$*nWyQ2~DM;H7KDgkp{}z zpyrVYb~n|RfLW~NFWKrtX6-mi&==44A8jHq_4&f77tTlfJN3Ta3+TlgSOKiQQ1QB;3Ty^&kyy{@Yt-nv zH=lS9>s9+6gJ|xB*tiGi)6SHrzX5Az<(66iFPpF_hOYcTBVLIMaylX26T-6yc*6Nn1nP-{P!VpB6$5JK(wntxQQlAa7Hl&zj;`U5!z^n{ic1K zgR0uDi=@^mB=5RGb+kj>q$!Mg#*}KEN&T%oj3}ybZ&gNR=C<^njsd89$p(f~IzkAa zYMk#)J=J*1D2VeI0r_cZZbF6RgS$#qJ2sF?RKtW^K-tb@YW(+;{KYa z54Q5UMO_k}gCACsThfzZ=n(2h_YDx{84QGI)7eq!_D?}JU6DLtB9|Y;ZOxDE~X4nuFe+We%I3c4K}P!H15*j zYExV3#lw_jp6Ao_{t|c!)Zh5IpHxIpAa!P%w~EAASPt8uRhZzUF0# zP|uRez4aU6kyX7i)=aP8d%X9=+tmoZQ}(R$%9sJ$u-J~rX1ksX%LkbSBZKv4FPWNf z=No8b{L%C18w@?&m?*aKZlMKVJSEU(-*2>`EMUoK)fVW+#FUsM)Q=_oj@VV7D$B>d zG_jC6@u+ACDQ+33cIuAXGdVrEvIi~4t&e_Kv42iZhXtj~H3ffCF+$Aj*U5-+p}8>V zLgoJ8Au#_!5Y`1$)3tfSuC|x|U;#^!4?YEjM>^UkW~!JT#A$NqaJZ&c>KO{`=%yKC$+bZy@)70%K|gu_qZ z`>4^={x{+Dl@$)}{iheLMXSl@`+p*t9$OreW?{(>>|68`nsR1Ab&C0TKAad*5WgZ+ z0lQvboO=^~2f45B8F(+dUhsyIGBK-}n=_wv(+JLNp}Pr+*x8z}Z%5vySAMRsmYS3u zoaABlmNS`~8gV&LbKDkR2vKymXQkl^U;7Qi=A!0(d>KXS0bQxx+(F?tDzD?(_M8iKPhJ4y?in$;vm$m#Z5kvIH 
zLQI9(@6Y7(Ckuy)X`PU$=3Rgzzs2~7*%F~Y!Y`vwEzgyI>PKg7kTi4|>zAWrTkjNt zPPSVjt2%0pIm^8vXnKK4LOASY#uul;>dBk2d5mAz#V%t#qxmOB}^f zbGO6W01+u{X|~Pu$p7NmdTI8dcsK1Yo66)lbu`9{z}ior$HuDutfW7-B@GA^dC0}e zCzG+rkYN=oK_@2HbK1vWyH!%lH*^>M;s!&wvP=ganmR<)*x32;@h{?F2C0QU;YTz1 zbCtgyKMYo(*W~{F$a%4vkKy;xpIkH4ouX7n!04PG_Kl=eWPf2)$&ta{qUr3WJXu|; zzpnR@@{vTAEa|{DVeej}PJhFXEhn|RNxXpV4~c#U9VRKjbpcsXM=DDI{|;Gm8|}sl z^?Eu5C{>uDjzD;l(1OczWWQ3^7_c3i99Mm)!XqnqaQFLaH;1$5J4Z*}rm^ks8otMw zI$`3+xBfWk8#T=BX3}YTQ9Wh8k1P!O*ew}eb0}1ZjIgZmP{DbT7ZeD0imU%K)?qKFEmmVGrCXvBLz zcB$n!lAE{0QQus_G4_yNRu+)WKL*Hk4447CoJ*;C4g>wEz@!&)(Z*pbKDA~&+z#Vn zImHrvkCLTHgcoVpGX&+z-kN4hm8V^QgD{`HK$rgDv%VQ69F`<9Co2b*%|)rsG8C=Ge3;@=7CVG$*S5cgl0-?4@P#KF z%AGg%M}^hvO$O1Nf7R;$3Znr;qs_;UCH#k*&EqzZFm@(ba~2Biz6`El#OPQ~Qfq=DMtZLE$e)O=B#XXu}n zE8(8OR-qf@jNw^{ zcIAb86Xg@SKc0A~PN7K^<01Aij*`RE)siHSw)t3bzWd?jb(s$YJ6eLDqa>{rUXqwI zCoI)HO>E6cZSzg<(|sX&$S^Wp|DW6B66Oe4nR^V z%5i~4_|=`#tJ(pANMZ;^E}7{dNf`6F#m7-IOWi_3hu0o)lyL!yQ%RL#a$QXKotp=}Vbb*%SA;`o-KK9O_;g0JytRy0tSOA1?VX z9G|BSSn_aMs!cd$9qzbxxA7s-H7SN$fM9(*BH$ggt{_d0oiXZLZFh#D{u%Fg+=}bZ z2%goOWE(7*2R$4-wRcqKGynx!-G5~|@pAXEGpdpx<6xS%uJ{t>$vFo{#}nd<0cBnE z8JFW3hZ31qLlpxWK8eZlE!i7xkqAlkOoCExAoz8%=3I5HMg|oP~gcn5JvyE;aP-M0@T2iVJ zT9?=cFj@UBWfoPU6FaTkI91$`7PWP{1MN@Fa2>ogL6h4qFs`(G3Od@3nO;5i1fmER zyOz*@g$l>Z{&w*j(t44@)?b4KJ6`5Zx~l)^{Qg!iA(Fr0W9;v_|2!yuIRwJ}uJHDb zO3p}XQfMRZlP#|396 zFaNji1{<$%gMl2YXSBh0@`u@T0Xi{4B=F2Uqa9*Bc-`VGc@iE)(_$G)5whd?zn&%r z#mZ>qN{TT~;ZFEfPk2#88p_)1QOoE?r4V06^W*r1_9xWob0GM~%sLlI7H!9aiI!I( z`}Ji+1v@!hRuh0ar~F>8P7*tC%lJdBhzZPpQAXNxKuGSs;133DEWZmTEe zP=qSxySxwgV_O{6z*_NQy(!H*BGmqM+A-dshICkcP`<v##q)JrFj-A!#> zdLW0^JkQUtaFo#G342deU7l8z%LV;bt^FgHvkTwb6F{$|g&v}`YlI50A4y?fQs=ML zY0;)+yxFL%Yka$cq0VzEd8~H!1;Po$FZVjXh!MlLedeyD!h8AEr$M8U^g%NBk4P-l zJ)AfLAX42ln@9(ggVq`yj5~$5fW!(xWKNau(>)0M{0UosU6E2>pezT|N3iD>C~(W` zN{XftSxfooN%0=TG8UoLi871fYnxw%w4GVLTV)t_c&QF$AVakB)&Ap* zr$NrZ@&D@)|Mj0adwAnF{;o1|F)$!~Mh!0xC`=z&k_NT}@ZFFe`HMx4N>>|pDqGVub 
z&5##Y#s!}d&6Y~9_bF(;qOdE7P`*1$GcLC7=RNq{4bd7fovD6_=c;2_4V2U2@u|3MekNHcv`=1jn_Z(#h z*^>YMUqRTCDM5>ku`ABbR;bIL=MaHPQ!kvCyEi{q#M*s;4`z<#aU|M0;=mRhPuoR0 z7KQ7Pwi;w>ox3y@#27=V#NV*Ac*#_*2ia2tr~ai9Kt?g0{KP!&=Lsv|u=tVmU@3Vq zVh4hW!j*BGJXi#dmzd5Bl57+N>+eqH=L4ZyxPuQ+SY|ckv%A;IA#ch?*^|j>EpB4L&PY&*y~m zmy$C3vc6QJp z2*fatDc_g$>RM;lbHUt5IG-QsJtyoW{60nt`X-Kz%(mqLq z7=hlzEGYm1k)#xMo*lMe5i6|CgZw93r_6pSp@P;#8zE(b;_L0{v%XY@QWR(wEfg4~ zD;cvCf}&{64G7guP3z)$-%j|AO5>xX*wghg8b|=Ljb-1S27HtW3Hz-oSE;w0tU=#u z!ifcpW!tWuhw9dR@2})9e$GF6|oMr)-V7vtns-s3p8BCOy=Y4riiDz%U z0?(QdXcxN^3F~Qs;XbJ>QkKGH4>9-efZ8Sl z&P>Ti!%9jN`_0z-9i+-3@a)Ph4`e_}5b2x2-A>O|-SSgekJYEm}F5i0xju?bpwx`anfKMzLRH$`T}cR6gNydmciJxXFCj+*%2! zHavnLk*IzPBoV3oBVP9C`%QqVlTF=+IUx)K3kUl+IV~6CDJpD>hw3VkpVa@O?7HK* zT>p1QMK&4PyR!EvO0q|?$&9R!Ee)&4CS@0i?2#R!WEUYaLS;q@ABwVn@7rnUobUR* z{Bb&b#`E0w{k-ROUGEJK4i}3Wk^<7X>|^X9%LsVxce-4Q4#6*VUUchx#M{D{KZn6L z5avo#A@wW_FiKi(CvmlV2)|=a@f~-$@%Ot=wI?l#-Dx|f4*|USNNCxT2#3FBx&mdn zJgW&!q$Pw2WL%I7l<9XmV* zCy7KYj7pt{wzQegdlNW5J`bN4%&CmP@?F&Ij|X8!0U+m4H?^=OoL@w; zl2X+l8;0+YUOFhcKdQ*fU-e3-Qc1Mf9MoS3y#_<4rWiw=iRVifL>dYd{VZo8^W>fx zfX7sOKXuf1um@>5M&k=F8JpcE;TygoWd459CEzgPBlz8S-~fenC$Wq{Izf!2!!)%K z_a#k&?T3F!ma9Ae_ca1rgic6N82M?be))@cUlx-1L-mLCWUvKH$sj|M*^&L>sj$aA zm}Hyg3{_@%?Tsd*@BwTRcO;)AmGc1u5^HD9;b>TAND(y&GhLd9UdZ@)x52`ZRRPzn6{0vA% z%}Y&QsRpnXQsI~MBD3_Ed?1WqBWZ=2wdqJhXg;=djaCZ8=pnN+U@(@5 zN+0~gRHqJ6kp4DJ(~P*Giq?-6pU!6BvSP!7ZdBH>!awCX_kjB(>ZaldkC0(^0ok`eWb+-` z=kq{xXOq!6Zxr{;^3@!fLg9}1edcpX>9z8>C8OvnxV-JW#hW z?j(DhY&e-2LJ}IsUGtD)8ilZ-ywM{sHJz7H#`*5VN{Ia=unM$aUyz%hypt72 zoeIe~<7P$$oUE~fYPgYmCmJO)uB;nWH6_5ofxTEsHg^&U4YR9ETvd5I(nNgr^l-{( z;Yo9#tni)Ru0K|0!d6FmP2Uf~?mUt%{&;;06q=X&^9}bw<<$bRP^QVEQOU*q9-)e?4eDQd@&?XX_`t`eRbjjRee@ z=5->BPL;*b13R*O^{f*k@a12X<6+5MeFJ@g=UQcrUqD!FZMFdpG84orzLb+rxA5+e zv2{Vt4Hbu{1`c?^3aei~QsAo~u@t|Xg#HWAup&p=zG>?wk`|z*7rZKtES?9>XgkPS z(!x}zthBQ8tVs9a*0+l(SQ1ese9&Y)rLO65<=`_xM9IY`oJ<1 zn6LSA<5>Db|GQC0eSyvYlGZ5l*!dugqpaM#BNcV$!?NGOsreDRiZB53)#E(}?PnES 
z%##}lVb7jmZ&5sfg;aM77e7#by{`slNGHw;4806Nt9#IZ)B-DD;r$jt?TTRff*jz?a4 zz$|<}O$mxM9n4+zfa$ zi`RD#Dbs^Gf6_Ing*xyiH>b<-I$065!sg-Oa13%-`|zYnXwCq08~F)(j6zJt-AgAi zS3h2!YC}Mt665%8`l;AL96N`EgAD!;?7u?K{ca&5hwOuMro|bMFh_ruf4}UbEodiY$L%S*~S?5axc0V=61IAB`U^C_Dn; zgo(7l{)~Vv<=GdmwMsMgTJ%K9ml+cv-dEzPawdf7@`o_%RGk4bB8>5(;WG2t>5==u zV;Cbsi|l%$c!Bf_1W+6yyXlk3YjbR;xpqH_?P>6LW3mq~T$@yg*`H6r9OnESvK7=m zLKf|DB4!Dq`~XNBDwS@=J8>UWF=t)^thwa?B*BBX4=|gZ4Wt}wSfLYc7dUAEB*CF{ znqea#ArCdgiqY&s0~_|0#D1|Yd8I%{Iq?>tIoLS2cSqx6IheXMQSLqrqwUNSpbkhd zHbZ7U!z)MPRiV0gD8_8>HT*%*bFDZ11_Y14oc#*s=9Z!8r7|h(Zpx=es|hpAXVj6O^w6%6@2ORO3;z6T5ZyjNJCU zSJagBS^6JQVJpbe< zX1E&)qt|R+Km#ZUa)o10mDW7hFFz|T1wrugEd9ZeH^E={;-f0q)ytDqZC<c?j|T=_Z){`wp6upomc97Gwts-MyX-_@6;zHab2O2!(Q=M6Ua0&@KfXUuQ|$ zoCC_kK9r8i@HP|OTm{8z&2&zKC8 zXK7l|^E%lcXV2G)unt}oyP&VJ%YO+KA}S+Gm}lzln3q$LH9I>3h=96*_t7vpnz$yD zu+|AV8ENHlftOyVMCxlpbCk$>&`i3sa9{j}9=ub#6cz{-A6K@#g^Vyd==7K>Z|^&j zDh)G`Y8?eB0it15vH|1|GIF2qkDB5i)ctfR&WzFD>(;m8eMG`lk+>3y10*nDPx0&B z;MV*zU%VoY!pGxG4WVeht~huDhTZWvM0kibkYIWvZK}rF7+^G zf$8%9L`XQIViG*8(yQ$#CFKeA55%uk=&wMo0xQnAQ$nK?x+zMB2=xc+zp!;m;05Is z^!y?Z{@@E7N$~64;qu_OAYrm#OfgG%M~eZgMrTAa(!+ylpK5t|xqM0rC}uN&N^^l< zKY3=EC25(!R08Qc4-B6cm$zD>pl@PQWKMA|<>?VaJboQQuE=78Ren~SO!1dOv-%x_ zaun+OFA-VXlvID}I=Qd^jcgf3Ckt}(joT;ST(B3Ki+d7}z`;O6k8HoI5UAd}{cZg>uc+F@Nq9Dw2LzV)L*oyT3`uT?ydkohVrL zX^kn0;XO#4tMl~Z+n`;h!s}Gu9+515C%;!iT&RP4%2Z0n zwbs_&J}=?;VI0=e9Yh^155BHf+GQn+U4A#2bc^Yp^&NX+;~Z}F%`;yx!&0Euxx3IN z_yyToF_HeDR-QKk>qF`;jb{slm_LhLQ2N+@I)N6uzq5SqNx97*FObp0{_xKrLup6l z@UQG8oFmoi0uP=(dh?lRKkxrsZTy|_pqJnClXf74s3l@FS9hnMKxZI3(l6uhWlQ+= zk7~oum}fVn~DDlF=po_wx837YBK!`0RbS z_3H829nCWI8vi+!I-mcRDUmCkgj=Uo`S9bOMfkCd{vYR6#82in3{vSj2_wz_SW$Fu zUFUEeB7bkkuj}4%|4H{?+djNktoXSf{?-T0*M9Bbuebc|A^W-bk?`)gp^Wm2N(4K% z^T)!L9mBfvACLd(6nOvQRka-j*=br_hm8Mx%|EX$n;T^}O*w-2ufzO2fByEzccUx( z(=hxx13w}Ve``)iQWL*LI^O<$n>S_jSQh-V<=|s-G2(0ylDdq3N-B|J!?heShjv^63j>Rdbb5v!(yo z=+wt#MNfad^!Jh8ag?$XVd0X<&j0?SNRr?3Kez0q{0K<6=N*OmQl~&|qk68wKk&^6cO9I(T1XhXZnSZmDCg 
zbm>3dUKtP8o<>q8W?9=erv?Slk|e8{LnYro@S5q z{J5s?1C~pvDj+%fIXrJi?AY092kbunKTPxX3HZxl*>YkBlNcoBsGC&r;vK1WU2}Yl z3{9bI-6u!=H`*1#im)+}0rvRr{MgHS|1bzUg6{yo>t~6c)a7%kD z512^s1?s(L9#*NBA9YaM2gXrLM(79p`NyJn5#l;r|HtO-NT+|C)YNf6{(q9_>tjbI z{H7nv_Ho(qh{8X&E%@(~{qv8=;5YE8!u`a*^$;`u^Il|HD2fzzepU>t!1>qNBdj=4 zSn%UN6X_2Wztq>0<=SJ9RWm>-W%1qQ#*k$l{G{HnOnp4e|JlYKW&DF{{qMGeN|{U8 zng73MD3_5dC2yaR{r=04LN`Ex(&XHT;J^Cm@^7i5(2&v`k5UQykLL*Xf&o2Ril;Ah zAE7Ba0OURAX#Woae zYC&++0@h8;^WeF69L!#~bzmku0y(cU>*W6Ra$SvuhLQqtUXk(43c zE0W^hk{B+%N9u1sTueANx-HcAi02O7Gocd~Zvc<_xFlp+^tH z2>_g_F3!B!M|`6s$^hq?Hb zUT<5C-+Qp#aM48f9GdU>cVo$X6ifAQuSXmDXUJzu4wfM4oZCvG@bxe_t^8 z#9v2u#|MG_3>;&=qop0ZXJ`MjclmuykR$y{61qOZ%6Y8v6=D5)_kRpXf3N%YI`$$f zc>YTJvKVpze_Yu1vj6^~KmYi{>$$t30Cfe+{M(L=t0ur&1F5UOZ`(gVN`?yFUD724 z=f6t|*CFIzEdP#xaOd9qwf{$=;n1B5aO(Ya>&OYnd}B3t;s`&9r}ncJ@aLtY>-t;# zmHG*}JCEg9USv4B9B>_?{$JNp?MOn|=`m#D)Usn4(Dk}2VlV%zbnR!Q>8~|D0?W|t zA{77Eccx0>4{-fs_5R)QQA%aOO;VBSzsY!B=U<%pOjdKtoeTG4DR$g{7bQ9Z`>s3} z`{`r4+HoE3{Ezo9Lx4J^*N&D~@EM-@7vKNKjs8A+zrUXgEU}mGaieGP=Ju&cB(oEBAl?{_2DN za~vt3Qq~4c|LLZxsj;p=k@i2B4^S6`Ti+X;^xGsxu%|ly<1PRCnE@>w2pq-}FMgw) zefmFN@-LU;&xa}k@$UmYe;y3D*3<>u{1gAn_WRE%z`HLFjW6qClRE!@s|kL%=V7o{ zAtzA(e?LpG2uDKwmj1q8f9>*b^e5Vk@0uct7Rb7gKzOJ18=S}ElW`yo_%ntY(W3dn^U(3Cq`okb7u zicx64MOMJ24r4&oE2yl0x{tGW>GC(=^9HE$*NWVQ`tT^f?~1AffO{JJuyE>APh3x% z!kbHGc*Q-pF`W1Q@}~j}^w0z0-o&fpDeh(XM4n8+^6EGMb@Q(^UqCS6fzOA-y8Q?o zivblEc{Pa-hUR;YLW$EKjiAz5P+M5a4Vw~|Koo;lrA(c^PM~M~9W)LZve~rL+j+Ln za|Cm$`oB4(pX-quA5FID;ehIl^2qV66%7$gEsQt3kr8)>>oX8SMxiquiOY-BAlSWR zri-9+8;HnEfEpA!3%o@_?))NIb{}(Q(GYr^0$@Hk8Q5jSa`C|uxEq-J@2>w`+6j8Z z4#h=)nF@TmG3zu6Z4^Sh!CRlF#%Sn|&t#{|J~XYZr*~^7qXK!dB0wnJfWeKdXDlBn zMls$;=+4c60AIq29yG+2_v`03!d;B3E!Rv2<%Z0WG%u%t0K{F{aw7l>AVEFcZ83o{ zGTdsIO{~fOdU9A!%@yei3t3*KW)ryj9nAh0$OCKbk!gfMzDEN^0n{&4@XgY4# z(u)tW=0N+!N3=~mGf-r@HT?uB(FQ%czE`o644O=YZ?t%NZ7$8^WTjm>{%uBA&yd6U z-W<@G-T9{y{no@g_aoJaH!7_?mS0P;J9kM<<%6)*t1lg_VLas0XhP_b=&sWI=o+~I zFWCEw0|1-c?#UxWRWN>w6hXUrKU`v+qyV)0gFva9j}WjU7~|VExntJSpJW>!0fKIC 
z%KH&C@?>vHlz|?MLMVcXJn|cFK$ENiFj9uaj@G~J)K53JeyWc5*HP&5MlSf++F%N} zJ(Ih;-svJ#3{o?-9ihWAjN==PC0fA2$w4F!O-)=1`RK!}7#(5v>8{r6FJVj~6-~Z3 zj7C+7YwQ_Df)bb`+Cn|k0RYN72(5a`^hcpvQJ~tqn6GtLiv}khv^nwxgpN=ARTMnO zc{;EERXgRG)RTDbjXq7<4pki!DTk4yLF;YfoEoW%F)D1EAlXGM#a(1#_sodyTIJh| z@1=*Kj}=?;C=0Y-r9MC1+B^%rlL)D)@`EvNvM-vzGU$52P>UC@B{)ND?yG!mnfUr6 z%}EW!9}xX@@4tOB)GK2VE`~!Q45hu(O4w*H%Y-z$@>f0eqokuvS*{y^pUV@})!7pz z2SP0_6UCt)w?c(ze$;wkmAObIKoO5?B3pU|o@XL2d+vj)4|ND+H$)r@E}Fy+T2=oG zgKq)_te-oSYKgXjrlSvPXG*E(u@UdMCTSfkRo>Js4s$wzhX~AY@1~g zt}LGHk2miptAu0R&>GDaHuj0q1(e_eeKuBmn}(swY1DU0u}`;xh4WWnHLs0D_!g`;hIEii2PD&j$&)y9cjYRmlr{b)uz<^V{7$3E-~<23Uzb8HuZPd@-tfX zS3tEK+xkJ29iHCpj~>bq$BfW4$}oY}$1rA7r(M635S?`Cmv*Xf>~~?KX`3S6ACZg( zbWih&z{DuF!Dx;H$SM_q;&u=qM%`sptJNtzA?8*DSIIirk*j(xKwdOa- zYy^ZD^(H3h`2NZ1*6L6$afFNg9X8lnV^^sE`5Pe=4Lbx4(Bqfw!MdW!?GLV_Opk)2 zO*r{I`)xHXp<1o_dz)Xg1YqWn?hCz}n+Opaa7Pk@c(nlh%IQfmm|yh;plj0F?O4V1 zFH@eU(my>vf4sji@{Elmlj%)~C*2hf!I_MvUMJp%>^i7+5N6$VRVlmc-mQ$(%}9{! zmyF1*K2s?(>DqLcyOvQNH7S>GlnQC&-tn$`qNMUEyYlka^uCIE=Go0_2UN>FwBlBC zd7Y{{4mtyjuQ!7Xg)t_dC&Do&&30;>F@C56{QiO^A*cE;BGw}l*AM_7O}SGER^ z!n036{vG0x2C%2GRg(aQFx-wx0OPbD4H0&Rr|?FE*kE?uwl7%*(!;gNd^B+W;4na@ zu$)})T&T9o9YbSXK!lk|YjSp44fNQ2%vCDUgZcGi&%t(xleQf2H<8B0xC#Dq;PjI~l2%Lntp@7LA~;^dg>frVm5Xy8|nSt1(nuU z$L57s2U36+oi=U{giXS|^v*JsAj5*wf9IOQ@7|71m*_I$3i{PXvnj?2Q)fU}vOcnhF@?$zQ~HtT?Ipf+ifFstep;I5&UY~_0dfDoxe4+-w225QP7Nzvklb`phqMd z13-;5z=RBnig%Yk3D|a}`qjedC;@@GNIY{WVs~0RIrk1wG$M;if&H%MjYcagxoz_0ozOPS%a!#)DJTTn9Edx zS%NhL=_QJ}KaXMNmp!ofZT|gJX{QU<$M3bXOzb3Df1Jy_F}*6y$d|`r{{#XXiRxyuNdAFAn83;QihpVxd=9;>5g*ggz{q9L4>>A-?IYL;{`k9< zna3hm0JAGW?)1vDFz{|^W6%5bYj?lk%}D4E+@t?8F_|z?Gk%o@ayy^s##zuTl_Ga{ zZFlQniwtl9BTP4EQnrB7#nc}VB< z0C$Aym}C~!r?1?qmm+lwJ&cX2Zm>MK&TlGFwJ8Oetp69W=hljSu{uv(w*4`iNX}`r}tdVCj?bUPrR)K&s_uxUA zgp>xQyr)(pgqd`G;9PR0){A{d3{`u`yqM3(dPVmUr`-S~lpyn|koXg)pY1wUAMSM~ zM_?Jl9$>>>F}^|?$}AMpLZJ}co27BD2!syzG@3Q)o*@;2D^JU`n~ZLBHhgwH z!*D3}eY3)a@mL`{U(JISM7WB0Y(0!dcrIcnZ3D!bd0Jr`$hMJ(s4tYoHVKR|H0rEa 
zkuEZOy@*nM9DzrsaaVceqp#N0IEhC2g>^9Jp6qJpe4Bby0*f0~S}&zgM903Mi*nLA zKDP9LsE0ALM6NQy%o^Zw9JK_l>#WP`VDDK;0zZFQFIgG#T6#fJ8qrGOLfx-lZ%37Y z22e&T5uG(l5@LwndkXJ9nVoe3$q9yU*)Rj=$hou86^wHy@a~dnWbtatKWSve>%&CG zCnT_z#GIDMnFepqM9PjHALZv~0hvmGC`A^u(F}&vOGvV8XYd4ZR5df!o!?Ps(haDr zWWJ#klY`gQ(O^eGOsR0AcT7!_XTP1&-B6gEnrtg=KC*Y1(M<0nBi~ zRAcco+ur=yP}+VhUs9F0yv)Jz>RWrOTs=55Sp4e0FvzAn_^RT9=MDqI9xpqT@AJSU z#ZJr;%$v@}>QpU&jy_dPV;iR_SKeD{uN7H2J@YczRI-ii(^k3X0>bb5;zF1hbEOv?yA=qT3>onf5%4h%4GME5}>LG{0u$)BRS2z|F*Q-m2+L} zk4Cw;yRnrEWVEl(S+{cSan zfVEh3?>>NO!7;BMeqJ?ZftUGCtt0UmmD4d74J);a3GZabU7Ks;MC~8bDvn%iqdy+o z==OA~qasjIKF%RSWy<=6TFvFy4oyi8Gg7hfaA7^Z(=08B?p31uzvW?Yi<64&Q zQyqI&aX| z{$G)8tPr-c5^kbuBJ@!()3-83Q4VoPG}$0&vHZi?GtYq9E){!V&wWH%(z!&J{1FcO z!>Uj)7Uf)Yc(2b>9-q{>9ai}P^6Q7WybH3yn8jMWCs6cT;7KNL7G`|6+w!2iHz=12 zmD1Z7CGpK});W=QmEYEjsKL$n#0@SS6#CEHTAEXw3DrK=PTk-_m~}`?uno#H-6>PB z+$6!VgKXyq>hcVUn(X;P#os8TjEjm3SdYL19oposhCy^p<5%F6lB?T+hY^3oNvk^JB-boKe3W?}D`mrAEb-$-QC7yx zR4HKYjC~Laf@(xZHepKU%vpXRu~m|^mymKgeCt|l3%w(Z z`%g0K7JA#&n#Pr$I3ta#1Y_d#X`qlO(Eo%zL{CzYZ0eL~ug>hs3r84OaB%l?r22WH zGN}&bWZ5*=ENVLB{g14lutgIO9+c>G`aH-LILRqxG$tC&y0j=$4RmBnzLCR)DLn=TpLbP5 z%@GUR^FNlkQ}H3pv%i*XPJy1`2|Q-^Hb?z4oxlmTDrLT91jXHYVC_)E8CoY6L5LHC zDt1WP7Zn%pC)ssoZ&N7%S#QuPLyc==SGx*SFAA))y9iPu^bF1h`YGAe z`C;iUUC$x-G|CXNp@>6z=G>dTF*#*-0Z=n*=*n8V9DHh6W<4?Q3HU5~Gq!$slusD| zL7YxcUK-|%*e?_r47WN6qW#dsDl`l3!L;6J{Rk||sE${_co$&y8ib%J>_sEnViZSr zWhA1?pKTx#52qUA;>BA=g}$3c+YzQb3`NPjInN;&n!lhi=vI@mkzM>e1kjdKGb|69 zxoI{s(onTEf2P_PMsj+T$M+3D7=BC4zGosVO!H3WS$v`s!%3vcG@?j#xj8H1b8B^X z#q+B>t>Ux<_BK*whZ{=$1P!46rR7B4IRwn{-$UMdZ}Xv+==-Z*)Zg#~^TEv8$wnzW zbP^*Q%PuRjN({m9!WqX3t}9+8vfpoIA~q@${@K>3M^%W+%-jtO5j$DYt?tYFXP|}( zuxH_VuoGrFMDNN;Ki>ox%OO2VYAG+4^>n6KAvT_MC@7~n01eF~0a$Elvm(BzZ0wAA zwl|Bfp)4$j-c>1gedSsfoNii$rPfj)mMgG!lvPM!b2M%A8NvtUg^r*55;fTY-b4#2Py2uyb)>#f);B%&SHQK54 zwb;95MB+gl&fOKNlWk3Ij%TmH|ETk75nkF_{4@Y^s)O2?%t6LOrLTaYP^ZPIdZinR zQm>x8)uan9=sxxBF)EQmnU>>$WLBQtR_2`xw4@xa1qC`!y4{pA78w$ea&A^3Ut18N 
zvH}z#9;`+H)MZ9Z#%<@`{wT`)Y^h%XYhJyJu+_X8{5`E%3j{CTihVTUuoeq9U=9m+E2qUA~ACL8jU-U+3?ifKCdbP zWyZaYo>0GeHf=n)H6@iA+o@}EKFN)Klv-9Gld0oUG>`reG#X6#B;C%`?i=B7gd99l zDv1-{dwgl4QEB)h>0#DQ`CU|<@%uj*oNx}l_zY^2k7Nw)epn3{Sb)P!f=h+_YmNYq zUCoQLU%+{&pI2!ZQ^9{bPFYyoOp8Sl^ahE+1G6n?2vcYU_Jp%QDQTI5P^> zG*S!OQxO_Y$y~(WMS*`Cc+%C2SLS}D$Lkoj49CK2NNUt7Zna5_i)>Q)DdoZxc8;O0 z1NgufRLeY17RYj_?uvc~w-F~!y6~5i6Hpzdsny*?T1QN-V-l_xU_6Qc$HNf)w>d*K zL6PxMu#GaC)*!6q(CkYv;4Bdb@89)`JkrgFXh4r;TtP!A*MM-|+Y5``GHKl~0B@k3 z_vK)r@fvNdf>-Q#d(xL;t>oB3y4V+4WH(uoPRhiU27V`h7#W;H+Qahhc}DREt5!5# z=o0Z`kTE+c+H%N~wzdX51ZAH25v4-532&$YaDPx_HxSf5c&_!S?}&P#7DeSAy%)D* zIyOk7+3k?)YFiZJAN>+&kD! zQuhOw(By!YJE+5}PU_yQi7>#NGSql{TTMv$yjzX4YO*AYCFE_VYcCT{avAXwZkH` z25^|{ExyEPpTfu4Ll}JQ*11fqnaS(QgR*xp&g$>rC`#)bH88CZ^iZ2Hp|7DWu47i$ zCOgXPJR(hCU;meC-SiRM?>7@kr0HA=o^j4IJtmHD=aohsQ#e6PUQEw}p4W&^B#T=f zyQD24kD|WxmV!M}|NsS_}eS4!dhxudn zt{hz2hj*u6&ly&sDo>%JIq;;_dTr%!q;|!>Xb&sZvTK|oxkXXFEXaOZv~u_`g{)+g^_+IpHUf_=a=qLbv)*!##E4| zkAxjv)M@1fyYX(d##tIJ0=fO3GuEqnB3QgYRX=1q4_w3bYl5h%cm&$bsdHW2tgI=*+9YS!?FIN zby|#E0@JC%8%t1~UmmWrvH&`TNuiL~N z@2lf|Zw*s)+|%}aDEvy;lV^KPBMi(>tCY0{*+84#?dvFw#ol-01xS@y^lT$@k)y(e zio+p(F64(aMnFCU57px%D*McDU(B5}>RQrTp-Egb&4@b=H0%41!=_9jcsXrXL0}-% zK!p8BMJ-Pq6$-PzGR9y01lpViO~sEyh`C&6QHt(*rw~Ql4*nc9~E`;wDKFa;l`fE}^3wP!MchCAQWvRw_DCqb=`qJ(&d;bX4E*bJPfYfbe8 z$tYn;G|ocl%Yw0FI?|e`nmDEUDfpX3#&SR46zIHc3{^h+(cnJxu{+lMNyJYN4#5M? 
zo1UE4UhlCyfc!$1K!l*+UyX^wc<$po5CUuuiM$D+LY=(JVB7*u_Um0vAaWf+ZY7{Q z4GBEs2gofNFTlhOj^=~%5NJdxzq+q;)#5`yD`+3^1M^f2|8Q*jcUl{Wr|~kLkgJ#6 zj1x_(e6YAsSytatSu5fgdBrZY-_Za&1Fx(+cVu_eTTj>_6^g#=jPO0>+rz`E5r)o z@6OGsA0|tQCO>o{^|7KaH2vQ)_;36GSpMx)8Sc4gN2ZO>ED%DF!R1{Qqvt2j5KEP5 zXCAueiQi|?4LCG4$`f`7OT$U+UYyx+{@r9#lP$4Br^pL|j<-ADFjAkAd7xEzRg2ba z$=ZcX8rXE4iHAeEmlpF6jZX)|R5|Cb?@xQ)PhL3yz?VcOwPDmMB_EH-PnO|0VLvPC zUI-LZkscMjYO%@73s5!+gzg6~HBS}d_@^H}yN8hGw2($R1!uB*ceS1ixYVGW%%t2x zF{76QKsVPZR`rq%CZIzwl+ei|gKb05moX?-H|{jrpd}9~f{w^uDS_tkJR)clcISkR zd86^_qrgDr;%X2K)Ne-u6#}!mnm1mNF&Oqw&#eDM>C1d$Hs|NKspR+VTnPRFwoA?G z+7p27QMp}Uky2(+8+xPK{Dv0OBh)g^xqMs9jLXs*&}3(lWRjwkjA;Fmf8esuF?Y$0 zRS_15Q7vFb-mSk{R9r>CK=0JAPIC`>JmUImi$2pxGJ#x8TM$OG483 zqiFU`qXs0c94Z@(5w`qcht7eF1ObvEN!0*lhjTL7@ssi<#;q;XC0>{Vzzl7G$)F4g zGv?4ZUaR##{0KyrqJ?bm)F=`Z9$W;yk<1E7)QW7s0>0tV)CuED(3gJ{r0V#9ay6Fg zy~(&_3fB??i$;c&i8NB=EF1_#65-6@c`+`ag%w+ZD|ZU-ELrY@s!bgc1FJn93tVAT z`-dRyYXG+C0X=A%UGu_e!o^7Gf23V$U9M3o|B8l z0%%R<#pE0%IHEFJT9#JHGq1}%yB9%;`Z!O|T<$5Ztq}XG;w;mT`u6sp4;leN*PRVJ zxQ#U2YIYHtfE4(He2|HwbV!%ylr*I<$2k;FZL2^a#_DB=_0h=50OQGYRQOmtV3Ff> zDP039lOA<+YCKgA&X8*zQkoxnwVNB&E_k2&xT?WueMtm3fLnF}0$!pjYfBeb29I)jKq`@KKz5C@<7B|$s&)rd z#2iJO|Ehb3dxoboR6^7@9Xp9vcAQC9=*g)1r zk!CO6f|E7{l=PH*zQcGAstQ?Ysz-0o)uYj?6|jAwdIthppFYf};v|M90rTk($Lgz0 za>xVP>8}S@JzvW@z7e9^lwtPNhvidkR@y7hP5}Tv?8S9hvgI7Rvh1X1Wc>Jm5)y(+ zTNC*2 ze+fKfE#H1GlGD^DdC#5Oj=AXGop|uVmh+D;1G6af7IHK_qQVjL(v}$8ZqIjnN4xTr zDVSEgh$;YiH~51M7qg5znn&^j|d6=&$e@UZxsNf}0-(zXmpnfYs;kB|hStzSTQ#wV9#E{o9 z_>R26K*zhcCwLzT#By9CkqZendNIjCG|HGFv|y1a!oYSZ*RV3BJ6^>K&82dN2VQ}o zUMccA7;DbO>=p1Wdk34Ajz`!$QStBy`YK|f-XUE01&^}F%l9q_M(pd#AnSB_o5|}X ztggKm`{?o#6mTheWBcq`TaJF~M4`c=7Zv5)9^YkXgI0fag~yy^dy=%nB;DDk!(hEv zH1D&0;>xbPvFmz<=>A8x)EwreKP!*ZtGF{9Dc9vPE-GG)c|L!BvClHcX6Vzal*kf6 zIBcAHqOasAGLj-5Bt=~#=N3U*bO{?tv$lPEMV!fI{ufL)TMgi&Rtg_b&7f-(7b*_b zMd0nl%if7-zY@*GBHR>z;rYZQfNNz#E$IyQD3PfnCN0R8cTSV@z3JvM6jpCB7*Q`e6BS_9?s2z&s0 zvQXSAG!aq4P{bOCFqv&{FJ#@h<5zGSx;kF$vhAXrI$KXy(n?m1_?@Ed@IeD&YcX-= 
zW;LL6s12TdF+L$jn=?0Em?iHY^QDj0LmJoXs2> zR2sjZdq(2nlhyO(@<+?)TR5~!`m{bGVFq>o=Q@=U>q*t`ddyJSnI&TC1xPdnf?JYg zUTr94*7jn(@xtatCExMSoYx!VMcXFeLT8(Cn>PzYoCU>`%*~BN^3*}I>RvPCqXOQ1?!wQ_oc|3z3 zYLlnw{4ek80u}Uf4Ai>g97s(kR5|sc5ynARGxrnq;crK8U(B4mJkUxa-qvfM1cy@; zABiKHl#W)M!N$k!?zr4=RL&Y-S>V-kO+AQDM2>nw~kCRqRfp1n=%95DV$YQM(den5r+&C;QqC;x~zE5cLKkVJz)LuMhF2mvfl+ zEzlpw-N%|oIO5G;koAx&&vwNE<~+JKtR`6R>$#zu@=9PzjP*fy)4r0ewf9?*^=fW( zjv^zSfT=Xq&%3%}!jEqp0f>gsovmOIOC(O!JJADecbf9q zIvf(U$(Ot@?k;Fl@z)9`o$q`CpbpS|ag547J8rhMs!x>hSOf0LL?j;&7@`+xj(U-V z(1QIUd8sCv*OC5h- z2!b;?>J-1)6$)2Lp(5$?CjxZXDGKAJUsb7SlIzN2*YV;Zj!M4f68h>n<(78JrzJpL zQC52D^PCPMgbLBg?+A@TsweoR1>&bzuc*9}y1O|msUzqmg}VBoEndM~K;{qd0>Qet zPpfP5Pw!oN|2j{dUf(&B0ks*wLufQ(Rb5AHdFSfQ_Klb?*8*qpITKQu&2s7xB>&Rj z21vK6SSg23Ray;Sa|^PGgpf^A(5#ki^o7r)>B~zxPHS&prfkH@NJ6>!#VbM7kCmYy zHP1!UUFoExDfkJ!Tynq0l?as#`7%*%_3|YSWbJB&&8c8Az5^W z)cvj1?&f9DFk;yCtJ&5L;)%N`HXhE*8|yvqVUbo?ZzD|_P2&Qx-ry-32{gK<0pfKA z6!g{wwRjOuogAdpHx^uV97tD;DuO=;0f>cPfYPyGHeV;@{R61%;n~GOA1W(f^4JJh zy#K|V$?n_Tq$iP>+I<8YiV7ZSp4UiNV3~%71{1>M2Tvh@=h8STQ6n8m=bOhuYF%Qj~ z>7Fc&ROxM$n`0G3PO`>IjZ5R;O7^vXtrJbO7w-RrdT-U9mX;8x69tEoO*VZh6m3s< zKJ3=XKod4sl%ENC}|&5ANm;z9TSjT~%wq##nn0e_*zYjYbjgS6C+$LQa%&$@C`p}nC1FjXb1zcKb*FbR zewOGtLbyvK4@q9Wotcyv|32x>Xz49rbv1l9%Tbq_mL^Ty&V37^WM2jCiCctPj*sJ% zt$>hrNI>MmI{g-j6h=lSc%NB{q}M%F!xTxRv0dlAfmHlH`^ts8&>1%14X)huce{Q> zGR2l{Ep|NkOR$>_l8!A~1BM7;nV`?6RYCvyHf$qws@WNXaO(bEoVw4rbv)Q^085E8f|`0Tp7g+-uJ!a~FC zy`%dK+c+(!Lb(g^q*~)&Cfp3Je{BD*m+yI zv7?RUMa<)e3)sA-)IC=hwQl)`dnJvMiVZtAsIA?n7nWO=(l5T}_1u8ZsY>LoazHxlL$c*Hp?WNj`8KG`!*qo(#`N zdYx#b1Ym)N0mL5Zi2_ItUx`#dlx`td&@0Oo;O`2uM;o!$ts{p}Wz}6B`)XN;@b&dR zw>e=3>wCw%>mfxMm@5a2+gYqv2V^;LPfxV*w}|zeb9Efu*msJ^^_o>*a6c)R38=F9 z+Y7CHp)PDQHbZj7*(ndCR#r+cxXp9v-l#_YV4ATQ3Ol;qQ8HhHFr;TG>Lm*AD|>WP zb(0#fRDNI}oKMdslZpVQg+tjZh3L~HS$M(kZqGQJqpw$5MyWdWNQ>La?ghJca>CS43X)V7(>= zCGlpF<2U$QB>Knby$shtt1mq*UL^s8llbuI_xjHOD^cGs#OmCB{DRwT_G_lajHR!Q z`WK)u>OS!%^oeH0@U>kDpL!r^rcLggT+~y^%V2`I-R;fCvNusYRbnG7QT94M#(WW? 
zj>;9hRf=nNTlWBV)wQ9 zDCOBvd!@*J_M-^m;jS^h8(32eR0*f3BIy+erq+Y+g(-WExeQMoW{T=}4ebP~CeHksAb zV&ACk2yNBnQ_038-xWi`uQ9tZ!Jv30HImLNU)eGe%j(@$GBEG#N~kAC6O-ScB=n)? z5HpnHa53g^F)2KX8kr+F76I2WZA~^N!W)cC2U%5G&-B4*rNXT~een3|bo%L!M_ycs zL^xntUke_57vn)bR#w@f_VOUnYnf$MWcZ`Wkr##S+Fo-ronoj|JXhPl&~fLYP0d={ zAQkQ@#3b;e@a&G8N~U3LVkarKb9|i?bw&Ie)f`bUwtacO=HxtJCjF)hjlJ(OwMzvC zM{KfaamOD|Y2Ju)!9TVb2@_(+iOA=}=?Un=G8T11r|4zt9!=a*-P0^5@t!+Y&;*@S>`3^vF^p?@Lg#$ z!zOXUTNq1jETuG43a1t2y14~BLqZ-0whvwap=hY{$Ds&3a8OZ5JTpB`qf6oF#?6W@C*vBnW|b20rv2J-DI6b} zrVggb(zqCZiG71`vgWtAj{`7f&lBefRoR)=$*e^D19LxX0X*0(50139L>aF(t+bSn zR(~oCd%`%L=Q$W0ERo}&MW4p=e~f*1JlE~penc4!TZoio6tc5QMP-YSEi;>p%t9$8 zY1li3?7g?F%(C|?KDO-a-+8&~zI8v}@ALfr@sjww$8}xjIL_ld&XS@Jhui8Oj0)Ce zQ*#944w@ce^w62Q(G@zB3z2p^Rx-rYO3HYBjkR`k0X`8LeKim>8fxI zt>Xp5E;)wlCw96doz@0n@ap)_M}#w;xR?W`FG)w{6M_?&W~k?UkE>+4Z`BmP`u14w zrQXq_ca}Vz@=nBNOI>U_sulJ{B`B@=E<@*Sv9z$M57x)u26FXT8aG4MOY*Vs;m)@E z_9?G3BF4jn*Hf&$rW{lOW5tMR%id$Fx`;X1N^;u2(HD($!+t zz_TCa#vw%#jt&n)$Rf|YH09~s;&Ge`Hv;yOPi(2Q9wjlwTzLLK>}PG(qjNXYxglOq zP>psFel|8BCr%u`bWqOOB~5ZiC>~!)C3GcE%&OTrC*dr$CQ-BXbnn+tgB=E|Z{IYk zFVq`0ysG7VR{MtY?V+j$RxJD%jOUxW?C#(0zK+sTw(D+}9UmX@(jWq}aOHMQr-;|R zA02IwN0f=02QTllBq zK`jr|lOeh@b@MtSjWoB^iSDNki6aL|H^|&Y`@A1nC-3lw2#M#Byy~Xx(!N-!B( zO|SA$x6k{&P}wmI zo}()VG3PeIJ@x3B@XmJ9Em(!6MfUU%RH?6xrZvwWq+B_A?gIa9n=qmsP=V$wy%B5P zQ}*DSR;_gfXH9yIE%p-Q+w{i`=gZaUIWFW?k5or)pVs$@9w)S$1KyeYc1|{9=HUP|sfNYneMisxj5D9u(xKcq zbUIUW&U27&)NJAP;nH{Yqn|^nuPeC>1!ZeoAH~pOmHND${G_-FT#rb{c&$@Z$RJEGehcl^_cgE=La(S z>_U&)m^xBkD0m*S!!R@jMH@}eZeD)YM_3o>n_d-jn$E?&hk2JuCh#lk=KAbiu^c9Z(77TJn_^wa$&J1cP;t0cg@GVVvRh%D2~8c z5F@!Oj8%ClCWX6_dwCon6zz|^fwmkgHSu7hg+LrGPUYVri+zEyDo#N>TEZ{RCPIf9udt$ zBde4&Kj8J_!T_ZR`ns+rb^lA&;u%f^l&hj0v>G@)XJL%iiHxH0@<+O_IT?1}#%Zjn z-(q)B!#H*LUrdt22I&uNcbLT8axCicmn;P4?p223)3o+ocVZHnj{A0b44TITAXaIx z*@y()*FH4cljv*u*wq9ct*M=P--r4MXJrg84(VR=+B8=9x?Yj7FfeRr5w5&%Wjl2bQKfQ}xvpw$E8h2X2F3r6fl>>E0uPf+JAYJ{(i& zI%94;;sfm{(TZyP(7<2mG=Hl0Y3s9R*MQ>@E(yhnxRKm@|E#rpif9?1WE2CQ3v@x+ 
z#T%yNj}dUgTnbGyW7H{uHql=@{YyFU38U0?Tc*igFdFp^&d+U3x6mu`}YO0!dmB){El0aYUD9$}Jl{Ykh z0@wUgz1C&1g9lOG7)`AWU65aHToxc7fZ|kA59P(^(MLT%6z)Oo`$F+!>$kj_i!TOb zsG>r$H^Gm-C;Z*fIIu|WvGhZvmWtB{XimBxVbn3$bLTQQ0BgATQ7Z9=^m&LVI;sM$ z?LJeK6lvQ0?(cmW!O4*AEdUyP?3o25fL)Xj!-8t`8(>t`eEk=L^f&7vcF}dky24 zpV%CrG<4fl=HyT|hm@o6O-@F5&t^nGlZdo}kmWJ7IkBULNUV3GAS@ACr9r!Bf^;UH z?9)n?^q&IxxA+gSFPd2oq!=*~`68u}nKPvTaa+L{hpeZqW>%?x1!!*d+fA(067kp9z*5N|?bFt20va$}wr8>J|4}Tg z#M5_;572f2_VsrV;4atP!Diqn}j(hfZ;o0!icFNr;cF z(_aO9C(&>TLX66v3a|07`9W$S4FX^PIt8v0hS*!c5|NPV(4Kn?qih7;O01Ux=z#G5 z>_ELsv#@r5g*MS1$?X8n9le<)(o_56=>vrm z*vTehGZIQl+@2P|a@s$z(KY9V_jcOewvVsZjPysWnoRL-lfro&(eLs8Gg*8i%>pQ~ zs}(#vx3e|4Ai9No1+}2Ni_KlKydfr(bfqa^ui@(!FJ$NLs}W- zYe>ai96uAB+DhgWagcqFu@qm(6we9Gd*M(sv5^|{W}+7?0E6)<#59HKrQqbpv`P}G z%1pmQsFU;S(+Ja~^lM`o+0QW!seI))q&TzAh+<@znurD-I+r53@|77$8pOW5Z3=V6 zGKnpY-s%!9>Vx4QqfW3U@9lanA!zm31uLDX60Kqw9|3fD8%j zDQrC|Y@my44Q;=AcLVkI_huT+mrzi028K%J7JH=AI4IB-QoV`YY=~h@&V>$IWX{hp zwvz=em-Lwod?hNw*l5DEScSzGmoQ)|Y2xeO5YHQDI?TQoVJ8{Cf|Fn|Id{J>LFV>v z#p1tT!K6$SMAOu5A+YqHPy-AF9Fj8cEI>yKhN=vTr`okLNz?vlgTot!TKDG{SVx5q zVZ-3`_2Bvd)-Q$k=a}RBY~61@>5BXr#r4^mgRRnE?U?-7eFfog7L9M#F%?jxaE$*@ zP_-zOVo}X{Obwc>xtvv=hgR0hAT$*ErqtRwDRPZKkNTshXikQMdcE=~XNyYPZYdv@ zn^N7J6bU;h+%RI@1a+V+3>shBn|s!LDCSVJ;W<>=K*MojPR&ffNxi?Z0(aS65^ksm zkfhvOHk$}kHjJ;Q*NTKIn~GQU{+qkuD{{#r`FYphYZN;? z`wY!Sh%D%zaZB2WmA@#Pk?kO@tvs5-!P@gt<-p_DfFi4CHZ0E#UP6*e??S#Mu^;~8 z1

4enRlAx1Qi?AlAb>-}@;j+4kY0$e;mn^PMfArxKn+0DSGK zi}jSv2E~^QmbWE^TF_lA5Mhm!|Ljjs89;&9+kL@aQb4FG?%9=OaL1&=Us)}lf^^2<>CqE(z>Er7NdvhyQ4DceqDqV18OmG` zG!m4$9NhnQnT{9wt8&#hfF#qKOOt(hV>tU*I~51ili4RV+mcZ%sORf>zs_zjyL;$C^*rrANktbNb7wuUcvV(-6>*4?ARypf zz3?#VAt=-|>T?*2UhFL4dGe-Ck$q5|X!xE~6RCE-uqK;1BHAF?tB;*aKHv+@5vj0% z`amyIiHmX*?J$;KoHGU$EGhToYcQ($_^}0C#kG-Q$z{OwRbVfKzx>EyJ`7PWyLpF2 zuw|HaOZ`3lZnmSuLj5>6Es-%YUv7ueyVv0~{39gvcQ~_0wbFf7W(W}b+C%i(VakaP z7U0YY(4-q@nuK7kSvUyB9L?8CZo6CMj{>DeT6!BgRY<deR|krSG+8jzb9kO6!dxYNw8(B;oEAXuA#Y-T+E)(^grw6a}j-167B?3awwbJ zNK?;;w*pQ=Z-kppqR6Xz46`B}^<@ia^2eLU_n=O$Q`d8~maEFt1%Utzd1*8Am&{{ep<* zq1wq**+J(ie=*q~inlD|oXI@NE|(1_G)m)Kped&GU>_txf{&q>`bfDMBgwcH&UEr0 z;pbmb=07q-7-D!Beu#eCc(O4tAxq)Qnu&086_9=oOvE3bGfA?kN8CoR=xcnV8!Paaz;_Q8zt)mLP=L$t6!`fWomShJN+L&2jI) z;lFIDu-m;D&IIXrcx)%bj8xp=bGzf?=$&~ZFePw#Y#MO$a+mWIhCtbj0E#;<4$y~C zfm9(-f4-zRK??}E40~Adx}yoiUuJpgA<3ekW{J9bArHc%$6zm5jsst77@}qG0vtjFhKFcfQNXGWNbS^-8PfLcx)5IgEU|6tLWnCq*`czz-SoFioNG*cIy!b%-cnB zdNF*Z5Ewn)+!&4aW)CSiN0kfIq*tX&Frz07Rssua5N5el>_X)Wfu)ZpVnbg}KdVO~ zPsg-b7m$}n+)`WLa=q^4+yOOK)0=>LTslFtI0LH*JWLBbufsOMz)KD8mCn8h_qczh zR#>;v>j${3%wQ8#r+sgaCg%-94e*9i>{)}6qJ|c&i$uo+ZSR(E*D4GYfi<*;gOkCX ziDq)Rvx}DfBhdA?_x>&Y_(cc&T)V5`cAt*HP>n255z>eJ0?f`=om zg)%%}#jw6!@zllr>~#uhnZP*0gO8C{#5MNEhEZ2HV$RvgltT3K2{iti3wj!;zB!0y zy&}{Gohz6UrHqG|&<;R~K6M%QQ~?@=dmN_6ot!E~IJ_zGB49k>H-;dMI|>HElqo05 zCNXvg&2C0*epgzuK8~tUmL6pTrub+E7$Tmk}3cj99L2vAqYPo!(@ZQ`-fFCj_ z_#aA!%d77Aj@=GR5gI+khj(?cZ^SR_fq%avyxU?$$Tn|An!i~MZlvM@1c75kMJ zh%QfJqCbDb&-D1HO>YXLJS8@@F*BB!G@cQSngl6L4Ji|LK^SG$IT@e9Z)krNT1zBwj_7%^TvG~{s2s@p)w@)P41ad4wkr-bSZch$s1 zI~jDm|45Sl+K+!J=Z0V2C#R=6S(T`+unj~LCYA!TJ{Wj)%TKG(&FYPVMXL-yTSmVZ zUhGHH$NiFg!Lq3e5hg8Ew>eafocx3UD16=oDz~}29Im75y1>?Z^|B1iMHu`0{2qMz zBrPCy^?ouNVzB_xO#|-sifnsC;6#mK-?6e_ceuxE!#XrtK^l@RXBxt7KkP|4#z)uz z6MbYZ=e@^n7G9KgR#KD|FhF=k?x@Uil$c?_-xhq?ARK-X)w|0 zL7e5#%WKeZ?MbP_Apjv)SpysZwfcm5Y5gn#5#o40<%QQ1t}SQu>?cyJ+%(LpT!AdL zg=8y@l;o}wSQ0h6<{pmU5A%;S;IC9@_vt{M+qRU>a9Nli(SEN~&>YdtsH}Wki%6U& 
z`5vF+yLaG?aP#XNP+!P2>1u)6kQ>vxCra|wR`C}DSrj;dkN$wbI{j+Z6)LVFHkPaQ znXqAGuFy}pMoWbB8#mrIiI2I$Qr?5gVjv*jo0h}qs2ZyOMxp{Hymh4swo=amWTGPx zTB@~%S{0mvO{j%j+=wD6@{FmCu7Vkaiso5boE)Y9aWMa-fh>xPj-`J{jvb^!_1`Cw z0hD^iw(;r2_N!ys>|Bt6uVDbgIImN-k8i3Z37A87px%;P*q?PE4wfrRRS~1dnrBk! zZOmFW)d#&YfoBrI>A9a_I1i#JysF+Ig#wQb946=^mCAl&dMDmlC!Q;Q+9gPu8J zk$#VHxqA*Pc@G!t_e#lY!?OkXFJ|Qcdn(rC_>~uEP&X&X3MpxwSisbN>|B)3NKZ^l z;FS~Y13Ib!hejNU_020H4FECaAwh%X^Gf>PAOxD>;cylA;RN};S}`5SW$XP)g^Wrd zhOI!&KNMJpn4+me5l`aax@GLd`NhW3Q6*D9GJH zvI`P62s_FHYdJNrsz9CsjdRZ=DjB{@Nt=o4AC$A~aijC&%ZK;$1gPWbpH_?~0)haJ z`woJNfpVYzkbBNLjU@y#7Je_@uP9LrHJ~=Mg;N=YrRzVa+%IjU#`US|v7}=LK`VkD z(*xeB;4QxdK6Y)d3>#f9YRj>LwxTD{W^yV&Iz8Q?gQ~DQO^K!1HCx$>c@Eh21ZV

po|(t35@ zsoy?kQtRP4b8C|l24+vxJDI)SK_oWRS|jSEZI*w247yzs9GcnBeOmc77GYW*>(EFRM6kWfE;9KxGB;B9;kj8T24QKV@N%RsUno-n-`HB-7LF;WWqzYZ+0 z2__mF%^?>0a;C=xaXKORz6hexKDnWY*1o;9aj-71QJC*F9~!tg3>p7Xyo0FQNPc1v zHa_){i$8}7oUQ8Hy~lN)P#Mk=#*1FGs+aTl5xtsq-EN$P$?r3p@d;{|(?R$E&$iqNQdloL0Sq3+UF=*(neBbja z?c1haWR>sSiu8N_yFwc#ONQk;SsR41xVWU)s2Bcp1=5Il|Bcx9NxuDPPj!g%0*b74 z-c?0Uzr+zlGeBequn-e}&!PDbQ`tm|@y-*y>^y#;A`%K^Xq3$lh~}O`WX}5n$jZWy z;|=HZS`Z>_;x5BbbIBtLn4U}k`JRFqW(%R71wz}At04Pu$Uf5*Y6S->?NCz-yYmRO zFYou5StPk#1SVh@$S*H?#5e9TnMXGQ*G@4AJQm5!X>qQhl21p3#9S-3fiF%bi$;Eb zxqMK3f7786hT5x}p3EM?c(oY>3~7&;-BvoBslaL~bWYMey}NIieqd$DSA{|BjDzVF zoM+?rAYH?ESQdi7^t~(6E)}0bfh$tju(MRW&wckHqx`dd9$JqjH8Q7MlgsI_rxeU@ z1ArDi#nt&8R*7Tis~OZQ-961yHc)K*ozV;cMfUTpkW7zXe8iNAWHJ$bZWw^Q=K;`D zbVeA-Y{FH1XRiXO-LSuo;utk8()6i->(e@td!Fn(KdTDCl#Hz$Ok%MVb*mD=##+nA zM9}oNj&pZbAg#hI|DGL&v@3oGxXmyr*>rC*QvYpP7kBAL8#NB z>?||gVqPbrUA-0!E1`Sva~KtS2>U+e9aLmG) zn`{#S%+Q=!v!Pi6k38_~<)zvAuDAyjFF!$f0q3R2%N^(%5L8LJ>vO@!h+*ZReXXzN z%ac9c_k}CWETLaad{iL7VIcb4IpWnz&{X8k8!bB`5mzkkDJ3QiEy(Uhf@YW=PuiYp z>S`}uDKdVj>HaZYtp9~u|9>Bp5JgBVDqZGgudY5HWpn-h{$^gUw7aBIB2zVl_0XbD zhmtcSl>GNrQsTHNCequdnnQGMi1FZS{DFr=m$^hA)}DO&7}cBU50!m8-b!SzOp^=~ zVSrMp5jdZ~3;X||u-YpbAD-H$NOPN@Ln*x)lm?MLqD zsH;zG$g>%MESBl9E9p>KzF5Fym3&nct+@bQ@eK~861w>x4VtF&N!PaUGV;gJl6@9pv}_rS5n5u+xhXl&ecutlxZRJBNd*?5IhRw zgR9iDLTxvjw4i1VAo%s6l47VL;8GM-_@90B<0a;J#QwBq>zWE{uehr(i|>s}#oP(h z!|Of1`dDg|G2%h%fuiMMVk;!gk^~!htd{1)#P_qg^jT;oeRRC~LkOzwslMmAc*Bct zlfk{3WW1vTW}r;RD>B8$h@u#h&}Q;3YuFTKj7sZ5FT^| zRCyTcF@#C;A00ru^uz3-6C59!%wWLQ-ho39%@rKO3jN=QM*K2#qEbYn$7^ge*TOGf zco%E$OTixol@RZ61({!(?A=ufd$<^@!H^!xTbwgxuZqdbPEbd}y z^BU*3izIgP`S%PGH4E*dTm2Z^L`kqbFmHye2(WO7MECvr(n1u4kBvv~wr}qj9BntM zQ-A%)jS0;|C|7jl@E5v2Yc^URxZ3i#0FVAh>b`QDohv`*m188y^**`r5*6{5iHt^M z9Jswd39DnDsT*$@wa($R-~WtS>J**oNZ$I%Q1sZ<3VZJ@z|FXu5hQ7sUOd~+Z_-w) zmTltiU-GDn#pVjVvfEJpZQ_^2f8Q_qx?R?uq^srpt2v-+w9&yqds3bYJKv7n1Jjas0v8w8I4k;JFHhSUKvk)$+P}O zl7EEq^6z&KmrL2_3IFRR{Kl2?YNz{U4Xop(F)q8 
z0R&M+&1ZHO%07YS`J(}j0&)j@L{rfF7^7_o^v_|G3OGh{V9yrxYoIQpGWz$96m>Yv zY$c0jYrH*A*N8UNaSENg|GI5PM$sP{5)wAyBiMfbIwACKqCaqhPVRk#-QW0qavWLV zPa_5{Jn{MaV95(&Ymoo3wa}*a;3zw^cNa#P;SmX|@Xc~(iqXH`H~OzT!uw~N|FP2L zlsZpf=RbinX&m(xXYW-c72Qn#+!pYe|GxQdN_q$OjcN@n_{$sYWvY7 yWYuVf#$ z*q;5fdjR}UtQ0r+_kG?IceDOqKSf^=42s!g3;(Z`%cG1s^IsQn@5_I66&t77V8ZYV z+yB?E|MP}Lt#I)o2xp5841a%|n+n$C|G6UY%Vni-$t}Wibt2p#(*Lik%4-Zk$tDN7 z48KD4KwUju*E3nOFvo#Wa^Z&CJ7n~_iD9*bQkI_oy)E5Dov^d6{IPaKcpu8(b1XoU zJ{wgy&i3}@+w7m9=#t>%+pm|lRE4Io(d$u{k5ns{$F z)zRb(*g_IGH`z!ap{MbhcATE^eY^Ya@)_Vz&Gr#q^AM%S4I=yRujB*8F@B3MySagW zV4A$~p8-{62@yP#zjg-Ec`V#`)?9!+;^_^4WRnQs@foh*xBy5q3!Pp%v;lBONrf#A!Eq3jWFHcvCg=O5~1}U(KL|#z5csS>e)W?y~lnZ)*x;GoBEh@T(kTd*q0+P1@`Hs@Mab40iqHBHf3;c2I^n zf0jlD4F{X-zwR=f6)4hF{B@#4gNfv0a{-?kpkUV+pK`%81nU8><#noS3l+TLq11iF zYiy!F!Um(4p8m6Ib{`^tpUB?R=4Rv$%A>6|G}P#)F|1`(#5b<$EPVgm0qTQ1Y6Yel z5U=JZ-e5zU$XyiOtXFUC>0JwRPt$GwDIkpD%UOT`07CkuRbBvHYZ-BWQm*58ZS>-Ntz--1EB$%TP*`j6jq2Xu0Lo16` zD4#uQE*&tsvx%ROtT`JQfx$9$O!wN4;ZNa%9uVp=9e>gU4a_Y_H~6vL1$S5(90|A& zm*UVDa8J9J{O8LbuIwl_3Kiu;6D|`E5j$Vbf7`Gh3r7L}1r_%hCI1)AXC2tT$+k>H zJ?{Atdh^N`4Oou_N%-WV0?k7dzdyOw66>1pKfB@Acip5PZ>Q_72(EtNR{HA*)jko; zd4yT2?SLY^FwNCCx!wJBoB~YUQ_CY)>m9KU>_&!vY_GpuCi>BLPq*!iOJoGIdSeSC zXU^Z){p%!rq~92OUEXL0J5&%20W$}B*I-snslhEbV+f|H$mtzYpsF0Z$HOb*2K{y{ zzs`l|MLgz>MnJT6EOJvc3&%FImOkdeo9UfwPZaUBt9nk3q({2pSc~K?_vkc+@UDO< z=>;aror09kmGosKD5+lLke;lb%f51};v3q|EiqCvz;?`l0%N;=VA{Pt(s@EelKjck z!mLE(UCsY++eJM?|CuW=AY-lLzLLwpBDwovugglSvRw9rYE2acSO#E*Gu`(eRKqld zo5OE%C>P3v@>SJIz85W}gk=5gB2Ajr@Rbd)J(8mv?WuzrTYTfI4+n}IW7j~KncsNJ z(!C{*QiBu=rw#=k2H!4nOW}B&Eo=B^OZ@uocT~;%2pjBq7Bq3!FDvI-^q-RCHfqX( zI#u5+u$ZUX;@@gI410ED7)S|0rFy1;yV>?-YQZEcItt0q|FQxP$#iHoMCg7B@)ySH z^71;v$r5kR#G(RnCg4+zJvru68dIR_#KuKoKv)2Adh+E*2V~WKmZ4QSeW2J`2+LZW z)lyL2trIX4ec;;JR>v0rTO$oovCmM_g4k{&f{oWmZsnM?v(gCJCw+b|8!m{>rx56T zOaoSWUQ@*&B+`Qo1|pqg3cAyJUU6HvtWP-AaF5qayDbUQx=&Vu_bYyvY`C8mKj zc(yk7!f7sy&?KmYq*1V`bwQ7?9-yWn6NOpeW@Wi?yOa`ceeUL%2t-C3pc6KXP>xRP 
zT+#!MrO^1fEpO-{%kuyl?D)~pWnJRo_l%GWS}VD_()wq;vlcTCJY4W$hFkQ+;^BCsbblPy+TE# z^Bm9512#{LTNC46e;8#(c;5LB14AE*UH2tcCl$1uYQU@OLWfPxGg79WB-Ko5cPnJs zY&HST>H_LVG6*y^XNPyrv-V8S8SPTm+KIC5o)WLliiLL&4dQG4m!v0#71EWG!}99X<;&2U(=->7t{6x ze{Obv=YAeOA1OU~dQV|L8Md4g{>BgL)INyv;&L+&nT^)@rz2taK33aN+IYImB@jmr_ZB;QPnSw;?c$p=> zyI<%r+ow%_Lh&ZVLk=hhJKezAHzy!?5w2pZo}3A&+-*2V(yC!(OYio*$T%iOIa6^F z8&CgC8nlPUw3ma63{ML$iCpSW802(qW=sl-A+4+BGi*3!+L07I!7n^XGh353`<6Mhm&n>km84`qWgtzeY7um!^a^m|1CI^_8@^Ep!(D{%q*18Jj4}Zy3Yw8>|}jgygTK) zpMPHh{z|3LOoDU?C5rJY!Bdn&;#z`%~{Wb@9FbUl;VhEo>7yCMr{tCDJkYq&G>Lzn_yZ9x-*iUS7*gN zM!l7!Z$=H;<<~NaE6IY#S;48(XlC3Xlpk{lKL2O%D{4V-o^{W0@nDORHqkQrc!>LJ z_m{SdN<=}EVwUhulUYn()cJMK0Db0*MV&c}U5cHyU^#o{W7PEp7H2t+R`KV{l$&)8 zab=HAF$vQ#GfVpxw|^Vc4Xq{=dyxzw`_W6AUwU#a=LDT2JT56G-6rl;s;-&$AY1Cv zFgRP6xA;)#M`r57!taN%D6wzhKNp2l&>HbP!=APNBUi+kdg_oBTw%UV0I8+1z7dcu zc4ghRGZ0rIDtPerd3fKgDEgl=mL=OXp^g?{A!LahGLk-M9f-=$TccfWVX{LOP9Jh2 z##U=bi^;b5>L*ygU?#sw>Y-e(LEV36`TIAT>b?^C+~(S*t%(pqr-NA$D^A;q~HJtKhA z^BX8gG1bdOzdQk^pGL&V_I0aNAUoT=uyIi;jA3Jl|F~`X0wCwzBe(_RiR0-*kWrj# z(XDulADiSZPQWeP1+%iarrpQz=;Z@&VJ9o4YKfR*vrx|s7cSe-yB3T(aqa}@M-7q(U(0H5RV{bnK94zKLo!wyB% zuP3<{xOtpb%e62!JN7sH_AfN9Y=gS&5I4N7GGE>=r4JuQZfE&gpdg`-L|(N0PF@*w($Fc4hC~0B@7Fwusqy5DY)=20w#wE}a)Vr!rf5{w)=^VON-d z&4^O{$0XpiQ)S0-qMSv1sML~|Bl)B0`c;9g;Rx*Up}zsdhC?`e4hj+T9jvWx4JgHi zB|TzvF=mGp$3P`Y%qXQMlCj4xeGFngX~BrG#?a7CGi_MRB2Vr*R3ns^t}dd`Bz)e@W{FJ@CDow|!5zPWu9PyQ1Dr#WSvHY}@{oM@~qqq5Bs4gTA!58+PMg%6(JDA)g?!LW!wqLNxi>{`_dV<9}yw0w>Gp{~d)gU&XEq zd23PtjLrn56b%8T>{FN;-xHQS&&gsh$XneqOAG5vy3fKsKR7|=>}1;yoAY@aHH3fp z^QKWv zYDg6F^~7xR-VU-&B8w!uFX?(UKTghp+Y{}@NC*zED^Unl3Y9;~F6Bq$}u@*AmV-F>|{yD|#?ieH2;7f30wgHJp> zJWk0J+D%8Ufw-1}W~MA=o@u{{(m7lml*e^{+$FLhf~#$h)%{w2F*9 zJw4}T8P6#m$cA+NjPQxMN~qv!B;}d3Cn&=K(H&*H`2&7JmM>joW-HEGd7SwtU_6H6 z9%Z@-l_F3*IuX7Al)_5K)kSglSCew(3@Ao}=vzP?7;{R~PC`R&`=upLRrEp?TnCkx zKWstb!?0m-9|$L8reC8yH9?Wu9?I%n4Z+;uLwG)G;Hc8fgiShTx<10mEGM}DHA>-w zo`vM&efgPe+9iGFc5=%g%tb4=+?t$JQ~VHs?AK$!XnuK^hOc3*_{-cA><|Vf&FOI= 
zkxl1!tB_cIMPD*!VXVkO9o@rqB0;?o{9?5|wN! z%|?e7C0CRYi_-H0nzOi8!ps$tTz0B=X_kj!56;@XyT}M_UA) ze9sp3Bp}!D(RM#TaKJ{%QIT;=JTK#;d5_()Wug5H$wkQ}h@`j;ByHMX`}&VH1d~q4 z_(rwst)0SUu_xQd=e%OZ)HMW1%?jCvcMUd^+fWIvXWr0&n@7WU3ETig9Gs4eHtrvte%8l9n>dQYYb5l+J9m|9&Kj zB3~!KSxRHqEZEIX-pMm7G#vr^Ew2Dxe(jvdz3(@e1Vip0zrFvm>i&jD*$sOuN_*bD zyN2ria6!9;HMzx#{`r^aRmYV<`ZHoOktyrV8{kX(|JXbGLP% zDf3>1)4c4swMAXqA_Qrn88KCfZ$Fw1S9rcwg!-I>{$S%vI{c41``eZ5!vUi)?re|M zIjEF$EB*~|N9zL4sgCIDCqqikhvaB%JL zmpS)9`k%(FvGJiK-3j6%740*)Q)itHhz5SCWmeNNYoml>*t~J9`N-4 zfO6k+_r3q4?!$GG=t$c=ZdHQ=Hj2%r;owa>QTY0y5n>URW9GmJy`jdj@_~*wZr?LF zoV}3Fkvj`cSleC;9Y1F)L9^w24@~vu{kh0oG-W7~mNIn$C!)&U8M-G0v{+Tr9%HwV zJ^uLZO;^#y77i!rx{GnyvDX)f#VFa{#<&)UM;#&@U%h;}TPB2OUUL{Sj>YU|zy!MB zwM`eu;TB!r$${e2c1Mb)Q0ELKht_m~MJ)L;O}Oldx&`1pUlM|mzsW|;5#n55V_Hv` z@9tADcJ*SuIlGU|c&beN-QSNTv`32#9opxZ8$W4(m z2Z*L4$6UR0`I}N3!+vtH&KN05^=uQRB33`-kf;c!;W1Z$yr~ZTTEdrlc)F{WX>%<4 z3-pzQ!xz^@KGq~wYYjcz+0Zt-S)s?_?u0L7?HCO?I=ktaloe?9y`5>;*dS$JQdw+` zaI(wvoz*vfXr%NLSEHEWpT(vp!oeEbO{D(%B3g~@XY-StWP0|XH^>inuAxT%an+9ViIW+KsM z=hm&n6WadyZ3b9!^D&nMFVi2yJ*L1y`1s06ln={<3$_d1pPg1G>d!E!cL@#d9Lk44 zIKM0VJ@;2~Au7c*m$ikUH`BDN-zHU!!u{uQ$$s1i%APtB+n@CYueA>=6^^|l`FluU z(L)(1xJmrSsO1hjej5NO7Qj(`9J=k1uOVRhuo0m@sv9dauAyU~I$NZ94$&HgjaAokB>+Gt< zPIl=XxRQW$^VA-KBBSGCtAK`46kaR9)$+jj8S|2{ZVN@NIp%M#FHaTUYW|qw zLBTuwLU1iNoo-anup#K7uLJVFlGn<5A1e7b1Z6xVPua6TzgF)bC=xw(6%M2}ib?1C zu?<2ca?J&2_$>Oy3sh}imBTvD7HAfb_huREIephCn6XLkM{)N|Pxhk0-MXPTog(mh zKU+?O!$WkF-3HhYY4zH--Mronxy#OpD{TYdU`$a_R|c~D_j98$Qi0c&L%L^`)y+o> z_8aJUt8|;v{{+?-MQUi0-^BeDcy-MebU20cDk`pQfN!aIT7=d_6%`DpDmDHNEq+{g z)A~>6t~+8FdO}M~U?bA6ucFrx`2tuPp`=VI+HEGl0_J5w}WC(d>X=7UR^ zfllW|gWDjXo8Sr!J0(lvOyCdra0~9wS=0%~?c>H}(*~013eXL!tK@T95Q%G6fZr3% zKn^uYnH1YOlW!a)P~EWs@ra%`3EhT2F8eUka-u2M)cRa|+$n^ULf$tq+eZm&5ZJ;Px zp1aQeJYuUWT`w8(8+&k#p&2dWk*(X%?;Dq!l>^alY&*Nw`|-gK4j8@qC1+Z7lw5R0 zKdu2ocD2GrRrn5|p*hfg^%+iiz`Q}MZ1}H_$IAnhx+$4#dC$_E$J_Id-DhN`a)+cX zlKfR-}p7|#PCEJr(c!UixP@+?vhT^Oz#c!a57BQ51O?P#d>p)m@1 
z*q)$7s?yNPEfsJs9R$X%U(TEJ~HCY>tt5drxZunGKZ>_>_Y zQS;P>@R^&y#?6H0n95r!b1KjY8rC!WeDjd#B6TG{;7aNMxavpMiN(zg-njfc9PlhQ z@xwc^sp`2+vC_f1Pyn(O><6%R1{x7^d@ruolDW)~#m+|$ZHlSlwvN9%gO0KSqP#3v zk+{1No)`J=i(4z@E?_g(1-Tr?q!;pSgp5dF;!C?awDp;i{i-E2JEgAW@vsl`tU=&y zzP-8DR1z8}Xg4XAKnnoqYgc|D=u-~R1o4g6GawIXl|d~62sXm2Lo?z~D4Kh{HJ?gR zqT}XDEDQ?#yc_WL7i53d>OAUsB{r%YYNHn}KbjDxK%>`hshGjdLpcp~t{c!8#7=c* z#!XnKI6Va3utJW+t&@UyLN{r1N9#r=7@gxXQnD=v2Y8IOX2Ngo6;(Ol(ZH30)HZB+Hk#$QI@N)~uhDQ}-^yXN=T z7kM>o4TRq9?v%g2MCFwMa$*VTKJ$Z-!8gG5-6;%QPlnHnKP%q;qvigsOus@WvvBNO zvZNSMFTcaltm9v;hp(mvw~*mj0b-%*Fx3*{9S&**_6?jhLF>u#?>x4EypEh?FHC7P zKC7IBr_D+<2UIRj1A!KOva)wN6*=a;Jz2)B(o|0%R++HQCZNAX%b~uuFsi8%cM4nv z5rw>RX1)#P$*%-T$xzUirQl!u(F_R3$UFOWB}`+e z(eNPlmIbu1b@Ip=7;FK3H|!;}xTv(wJr5kZ95);r2(4;Pt!H<3%jbJ}5@*T;{&+3O z7XCID8SGJ|S=&HL9MQUtb(g^7xigpS9VoOkN|kj(%bR64o=F}wyh*#OL)Mi8p78>M zkM@VsDBlNS5!g9sEBYb+*FHRYulIMcbfhgDE{m~39r4LU&@qz=Gvn3+ozYb-Mw^Q6!=!~-u$UW5L)m(r%Ry(HPW znfuKItc!)sVq1=DhvnLyhAu!wA>%gh*4%5T>vzLA5|ee2S*XgOVOX~6zuJr3?{jY+C z&%a^=>5ZQwY(7LK=cVj zBnEglY|uix@54Kf`=3`l>JZ*|x_x>pKvA(Qrnk);qxEnPF)Q|h4~qOK*|&gCdM3yt zVI<;Qc?HZM>vU1L&?%maT&74j4!71?14ZX5TH+S;CLnlr6<3ir;X74 z-omD9CKbjIzxZQ0!0X_B_t>Ym=RngdBb@RDTL!9w-B!);K$SEBO^9I?J18+_qf#(L z6|pwhO^xD0=wcFvO1lQ?{m&YqvbI^eCHT_KbW2x%!|PCywCi9`b@M|b6iv5UId^gX zG5vjPEKi}koW(Ry5PCS-Lc1|)S$FR+l79$e?oHNqs5x%)LR<6eQP`iaP@UUP?AEWl zb0;AhBH6o{(B1Enpn?qC>-^7B=2uL2XUb3wKPNjQAips#&4H#>yy>ZuW2x_lm_TV$ z4B!9mGynp!6^XQt-(LqLM|nZw$2DxXI3dP&Vi|IZQkQ6*B)IZbYzX(x;|>`*-%y%p zn)r9Z5D7k_~1a_;7#?lP`bKRwFNFc zOFcT2YmK=+f~$eC@CQVr9{OhNK$UYQ zsqg4k#QmubwhhuD0(1!cpg?!?*fcVLyGvo9=Jxr22Ig9XVwXr3K|UlCbOPW1ba~@f zX&8c>X)d8MQ9z9qN>sf2`xCMe~7 zjhMuk7#Pf&LiI59lMsw8orbQfER_^;sA`Dw#qfc&%ym{?sU^&B(Rq;pE{T$6sc zR`S71310Sq%GvdVJK{}GubfmoPyqS8I()*Z(TVA4l-$P2ohTggAZ#&?urc>@`~ik` z7jOzp?}vOm*6e19?KKqjj^QrdXh$_pXg-+q_I$v^YwU(8?181SP)r0*^ToucuCZovUl0WcPN(90M0s8vx7M{U%0x zk0~o6{P#wskJ?3OS@x^sWW;NL%9i-|6%cKfQ~kzX^rPa`%%_78#1^oKo^uHs*IZQC zrv*4hnB$oQ%=LT^TK3{41t^7wv0O;J;z?j^mq`Bt7(D$;4(cK)&E`xYQ_%SFU1JAw 
z7pb*lw+%4QCVD%s;=0rhn8(Ez!5ki`sjl?*jbQ?k5co2n`Nfl*HL-@(*UO`^by-p|R^F z8WpRLjtJI9Ko2R?`vK0H<8RZnmhhKT%T^FD_*o0X=%BH6D_%j|GKEj`tiKG1`i2R- zpcCmyDPB=)zC1y&@778exU8n4rmFVSA0MMlLE6Xq4a~Xw^;D>cbB;_fSCwBf2C5D`A_Fs^?Q(Ab z>eo8Rz_J`WHz1s#oNPiM|0OR@E^>D1-2*1M2s5B(zl(Q)dRbFpU8Lg}6~dfZ9;p^> z_XV)%m_o!Pt1N)~ndOG49+LD8pP!^Yc-+dZcb23>QQpmi2qtjd0e%pth=*+OVoK?@ zlPQ!CNX26J6$8yFwBb8*^K5@%Xz$e7^P=|mRIJJ%n+p?th6r+W4>zriv<(QN900eH zcK5|UcpUTCgGT?7<$y3i<8R*L-dOw^WOrC(On$3gK9)fEMXOADG?1nQ++RT*F|9H3 z(qNmG>+)oKJpvAdw^txu%0c6JXG=CLT6wbqc%V!q7}P^K<%bQMwTuZ1*PfDsLa@|s zoh<|+x(Oj=tgq`ndC(<3^G1rDvv}&ZD(|ug@YFRh8#=XT?NGrRwz8r2k!LL!8 zto>juX0bPJ8=?)u4Y*j?puNv1fOh9`oeC#}$OyWjBY$Xfd)#?*XM1iZ>_sepzwO(_ zt%N}u+|y^h4A{Hh-|5U$WxEwFbBfE)zXt|liZ>}8%CvtE|xxR+ws694BJ=c z;ek!)iI?@#uCYnA)HBkP&laTR!m9lSUZzK*e3BKQ1l_tNfYuy?U-y1?Y zoVw_c3s6;;W9rTKPZCRqRNOj{9NpyA;QPR9u~ej>^M86}d&hGXc{8A-g+eO*K=xIL z2M$liE!9@8d^>25E@oMjgDQB~w7@#y1Cf$j-K{S#Z9BXTKHiFd$S<5OG46UFpmq)T ze)4=3P#hE5U=;#de52I0sIGigJTcWlKinI^OFV{}S1(gdYCqxfF~^MEE+;ifErCM# z@xrkX(|y$Lmy(L60Yqyh6Vw)u#QQY3r|k0JQE{$`mWr>tw%kv`jwU}1@h5*M zcJ5NjH^l#55=k5klx9j9R5sXZvuhuOnR4dpX6fRw{etB#kWYSidsvRjV0kNH$R77J z9EsrA7xojI2da*ogCz$3-S^GvI6>wn{X(nuEYoii_?{nu7X^Rv zZU^DomTIgbDUihiX@O}2!UuCW0XdNRD^MMz6sPyXUU%7oxH0G6s3c}F<{N#}TY)Wx zAHX^2g&55Ygb%_O!xOduxtfZ6TlC8(i%Q^I(eAP5f9;Xq$;j@%BsC$hQYgh8I2ctH z4sdce@XpL7{~u-F9Zz-t{vSD&)6PLVcgqafGgMO9j=c*ZBYP{f?Q$er_7<{7G|cSm zS!A!WN&T*uqI8eX_xI2JP`A!`kJsyZUDxw^KA#t_JrH!C%J_ovvS>f2@o1z{igHGL zTs|NeR#2$50@jC$v5U5;2&8>_T5GhLcfz7T{~)k|!q9?8L^9C#JV68s+7#bJ7vc?| zN*H$6Df9jdVV6&n>!81U0Dx5Uui(Mn3K|GNM$WMMA0Lq%pkPur>@D@OE@_3|0oy-6 ztz)5}hExQc6uEqu_cH@_px_waZPO0P%D{AEx~?Q%c@FvoTyd31j)F1`%dxh`oLf8j zr@hVjtBS>odSR1YgC0^)Lne?#2+0ci_ki}KjIX#-b?KWybFv3#;#Hk=-I`2vDo=D3 z1kYYxum{sjuDB*&W3{-!_6&o8$QZu{kPvBN8R(j%VMIMxgAPo|A%5pn)ca!8sGnM> z9}xcpVeo~I=QjXNX4M1@E*gA+g5SD+pGdc#C}YkWi|;eshkz?g)&q*u3eEe7?gapl zBBTOA6OvK0P1f;+qBAsG`5A?l;7~~}0Gj3DyRoQ$@f)ZMS=V}i@aO;ZzCjFN%on43 zMEg_H)M?8^iTL1y)1cqh3Y@5*ZI?br$AKanLOZ7U1>ggSlZ~e$5Jxi<3rO;PzC%Wc 
zaAMC2EHQ;8r8GC*DPDX9JIW2a7!m3uyjuW};>R?IKV>c^m6Dm1=Muaqq?daE8i9#0 z@FZFb6Gdw?3ZQi;%BrK^X_7|kz~l}mcB8iU$=M&VxhIk_hds^ISTelFi|eVhO+QxA$4-$gwV{nM3rJ}IF=YnS zkmVKv?_UN%0OV15t|^peywO}@s&QZhwv!hxwJ(!ub}iVwkfX!kW7DN9sZh0~zmm!! z9CeXi^eAM)dcR+8)|D0V@S&EJn3H+ow#WN3oZ=CeT_=XSkUvVbZsfeu<<*ev>_p1^ zU)wOu9qCjyZxvK(Bw$4MdE@$`!h$XCCkfJE#Aw}A^3ULKds6s<(*Tv_5xRurt?SyBcEgTX);RfC6e%68lhtic4>bs zzc?33-2JUne0~;2FC6Fd8+`*TzTi=|0k~S{AaYI3KNi_PYfGFc0YS^x^F6Dk6WDN! z_(tx~NjGdjrOok*j~uYJt}eq`i&pg4z(A^$1}6bv(+yvTS#+1ULI26`WmFd~e5swL z(!ZUZ+4SRi{ByOm9njSY8nx&!2U)EbRKr%LHPV03b9?eO99pY3RxficlM|iV6 zFIk2T)5~QRSg?#lk>LjIYGqf1Ei@a6#dfd9xd6~v$7x0Z*7++^P>4)4LX!gxmB;{Q zimuTes2t0byW@DqK`HS~H%yeJ97qt<+=27&p#bum>;pufEX2+FsFFz^;JNh`+S#zz zOY?9#t9Q4$=SdWt{QoV9Bq+hV0C({e;K0K?FFKV%2U~d&KM5-~fz1}+=c34QgluF$ zv0(O-^<>cD;zfAhw)-!nY34znqYba|p@O95?g^V8?Lewqn7!P8#+d~w5MX_|@|2tR z!lrwxSA;^*I2?KA1G3>6NEjB#ZXn;3k^@wSYs>q~87E;Uwcz*ivoHh-)RscV-FN8 z#8i6@Kvzpo@ogheYI7L2FdyJHi}W}m(1k3(oZbcK|B98_xUl`iQAdIE68D_Ca1hW8 zO(@sJREuL+joLFa5cCq$RX@=Wwkgjc{fO!o^33dj`nzF&E1I#D)rqE^W$`IJa~T69 zYJb+{@GsfEuEoW6t5`q;b`DhDvB#`_Azt-lVdU$cqm&h9`qr5nwPDVOEZ<8bfU^Zj z*#IQTKLllAQ=$?pJq^ussO#Gx1H)TAdTJ|3@^sVMX8>g!f`pd!5U5sYAMK{KNj}d= z_;?gMxd;cR%=?O$4M3_(hnm{<@H}L02eLq}^1Q@o^2P@s*MIAzUwk6bp95uKK)>7z zYR_HFth@gGd9Sr^5ZnUdfla#5SlrP$J=74>4p>&&nQ+7+AnfNKp)|8~hO8kc(O;tx z2qB+ri$1f|7+V*{5{f9kGqCsZP0xte?ggIJw!cSOlR$O1?@NfL!XzgBSik5K5;6sJgX~Cb-z+?PpO3EN5(z0l zyt23ZIsx^ixL5KrME_AJP}wmwxp~TV8EkV%LFJ!SvIOLYR+LTv=}y7!&|=GV;G}vw zH?=|sO&>7Y+n}y$Dy4rjak)Q1A;n7U#bNT=1;pov8gJiA=bEToYUPc&ndA4H?$r2Q zYzc4&Ah9Oy!Ly<`pz{Y1cf2};#P^7BerG4^O`!5Oi~&<59q1knT@F*?K))a1n_Ga` zX>aVab9+5$NP{C@N_zS$a$FlcC%J^(;W{H%tIShugUX2*0|Q3v7G#vWqVj?yXoaH+ z(x|3vQ!P*zXX0-TMftVCOiTu=pg6|pL_09FebAuL5G^j=1xC5yE0U9jIDCvUD8YgS zYfg#>w4l5jUT8Yn1|Zuo#2y!IY3U_x%>rJ(e^!6j^DF0c69eoKTf_gWc06~P9)#Az zDcv=SWzKOj(EA2xF9u*dlYLRs(CM_1CI~K zJ!{8|nhD~Bg~W@_l^Zd3IdO|XHxqG*gcZTK1g__gIqbpzA`vL9O|jC)P}d3ZA@m)C zWi{JCj-bYHER?tw2F6{+pFEGDi%aIvnUZ0a0hHdTKJpSBT^cP{A~;4NIz5iIrY{V# 
zZH_Ci?hu@HvDyUbsLQ@AIY9}T%3HBEawcs^Rj2-N|8-TA_MyH8X)MCvV@7WdD>&z4 zYw?2WrJjexi}kVbgsf`nHe5$DR0)W)lnBgI39H=mmUv%!MJXaZ7z=bq-zXnEZfPej z#h)n%`3@W@4ckLt7frZ|eTKkr%4~Foii6p_HBCo7?>VThOF`$|P8dVVZ)!-eWoYS3<0heO z;%aTh^^p%)L|$xuc<@cDKcF3OU{!HPBr()EUqo1WCF9he70>Qun=+3j3GjJTE%c&BbS^y|?5`O~lT z>?&j_d)f(u`*cDs54b>FEx3F3nMJ_0BsA^pX-45xTrcF%frn#8EZ@}?o4ZjmF+NeN zGPsp0BvR4VpWy`v9gU6;M6Y)QH3prxIC6ron2>t83*d(zah*@oq8$OsZh{aHxvwdF z31!36SCp6RrVXG=p_(P9nQ-{TNUn3KE3_A5Xl)Fk5Iw9yk#a`2`k+B;S`;l+;(Y7J z6GNLYrL&Rjbvfp`OMsGVNuJ=7-*hgA4ic5iN@t$cq!4qmk_NaP4nNqDxID-)dQ55) z<|)g6Y~a<$Q%JgtZ{UE=;-R6`DC67G;VI!^VHPgw;Y(#D-{2+m#xs2@qy@l!Q>NUKPn zsak405LOfgM1;792}a=4VkO6<6a@gniG$YpR$y_ILaU{fZ+?@|3H|Ko(+hQei;9H~ zIT`v5C11C&$=)*NRGWj%Y$>Nky8?Du)-U(=16a_!r@#G6jkkwe5ZJhd1OW%DJBN0s2tvY^t_NnNrYD1Md z=ckz+U}5&$uw17!|>a2Nl)sYY_96JE7w+iAq=y0x?j}=@Xlt=QYwDcTgIkI+abwVsF0%GIrTi8bq+zgA?>9G zyMl|yp>gS|a6(@x&A#vC13LYIOA>U)dXIVW$wXe@U=4X|&u8?=Erv_#8XM+GRBV4x z7euTvaGWxkP5&Ie;kDtc0Hnge8#Jqbb_9Camul?!^@LbsTbCH&pKlZ`# z>+sS&%|jm8-*Y#I)|*%e7H#_%Y712ebOS!iUd~2>rI3`^utc*@W&^YOJAjm)0SR>N zlq$l7a>NVSV&#(p2A@JpHAVfa{{dF%aIvfeFkxB*`-arv8K% ztuPMSXWD^>8ddUv+nN;b)|2( z4)}uZDV242io;W}LaRrR8bj2jrjAbN@rRyJ9U*lD$Z!${VmEbTE*tDpx-JXhq=uhK z`9+yNS@5Z9+Oi%U$QrAA8E?&h*j!V<2Gs98nCm-$(P4f|{Q%7d!_k)1N6?YY^w$@A zL>y`r0Gx2d?QbifEtAx;Y;Z2Q1!YWLT!4Hq5=Ow05hAa`!_TfrBf;-lPg$Ts-WI}~$ncx)g%mcW$qpc6qGD)1#f*0Ru0W~reX5O0kf)b| zYD^ETM}z~c;C!$G6VszQ2mcJtyBLVB{nPF(E7-S(&u`=0d8PLJ9a<1cX4{U_&maM~ zzqTR|)K|oZc0I0kl*!D2u2p-qcwqksKO%uKDD_B(yCLR_siaZM^J19V84NYYvWPGhhIvRaX7_LTCA=RinC76bgP&@?&zVFv7J%^L=&uiV|k_kqKkg)b6V zI9wE1j(o^r456cVvYkFT`xP`44C!_@e>(KD#{6>L>B_$&3UIRZ6~I2QwcBcbLhv}g z5ZP%pFq-8$(%{nI+U*ZVxTNB^dULKk-c+HT1(dJj>>oqpVT*K-*(D}6Z}I zyH>CbW^_VOHiFcKJw@qMM)G4d188cp=!CTc_#J%R&SDZ$RRg4#g+!Vs&`*SvIw7z+ zA50FwER`A`dieg>w)*>&NvD}Qw;kqdft2HnTlINmsFD!ZMh!S~O4~CncqNAa4P98j zXmrHG7&g^>>frB!!4!O6mz+gI4t!F)4R9~HJ=+z}?R9=1Y_^Bj3H!!XPHZR#G?*&M zP#c`2L+{BN;wSN`uZ{z&fgQF#q%}$zpvn!5w5r2n>k$eU`5aIJj_dDf@Dwck!z^a6 
z-g+MPiW9S{aETVvaa@|s01Q(J@p@i(Y6$nb-HO&5LbVR)k1Xq6F&vD->bVu5cqg_) zl7EJPs4qZ|DQKZ1^+NqHXgrG9+=PGL6d|(H0611twhDrZz+QlCS;D+R_9l#3JZX5u z{-{#4M9Neluy97f%{mR_9?DbwRb!9^dlsm+l>)DAs$&+2Zx-$_IQ|BJ;FBF8AKz#s ztzrS2ejOKYNoam!Q=sz_fTCX_3zj_q867)wYrjIQRI#~8MPX#=H9O;{=eFC;jR&8L zDDRTOJ?+h~#@w?t@`?Wjn_9g1YTWSK5J|S1nB=!j;uiT}91w6=VBZ-H(5)ab^y~Y{ zT@AKNaQEnGo+>zd ze(05*z#eq3enOF)5|#tjU$03w0V+`%w9TAuYiLbLUh_LwY)54Y*L&SDl6?m(8Cy?E z@)IjMJAhJ};qgO8CEjXZa-#l+Ysuj1;rYjriB{7Iq&~WIojO~dr`UXW7^XBfsHg+) z*)rXNh4oXfHq^}Hwk=qTsg5$?+W_Ui7EbyekGccV8`8gD{?@((#mRTxi>cNi%v9pi zJf$ipq)Z)Ta(15ilYN@I7VQBMx!R!c9ioO=W}hmoI!sG_%Vt1NN!i@2512q|E;d@! zqT5ZbOZGVwec~zKmwCaf190M%F#4QiHecI7)~W!RM;ksvt8VmPVVEAQhN98wXO=K;;qmNnurdVu*9{nYn5#4Uh-)E z066uL6fWad7|bM{sOs#g?y@an!s9Tf9T+cCjHDmOYLCwHpe9~>ynbUl)F5I)5&G<& z*CV+RyDnNJ=K^`U8+dWh6h-|v`hwPPh3CqBq>yVrs%9qm(sZZQpdrUi2?|VG@17kJ z<7R5NAwBzrM+j35hOqkTfpXbuC8t{|fs zHepS#c#IHO8>n%LTHJv5B+~4ds9wM$GY9soEEor2>}8VyxKV4z49F64lLL+4!eJlC zvLm|vOs*Of=7x|2*9re#fxv4q#vnz|uQ#;v(~m{y9S!%P|7xLCOz7YcUmIR)u>0l9 zF0-2r$M%SL9@GcxO{I#W94Q(?lS5OAMkrv?=A;J}EUrSlO*6#Q9s&XiDt|*SDCY;l zu}jJHHA}nL1@nD8>LyBNIxat~=#_h-mH)+oZfP*W%MjF^N=Lo`5U#2L=liz@D$Sx) zZ7{&;!}MoB1=^rHG(PHA;^rq(Wv666RILMPPZlIlXFf!D70jwAD(ZpO zB&g#94)UV%Qc^wfwz*!70PkJv_{M46p$Ap4r+soH)(7)UhvnWePp((PU_y3@@>2Ua zyh+j;g+#^EA97mA0Ps3z_{cmcwu5`$)MJ`+%3>Ko0>F4VS{$UbH`OJx!zz$Qv;*or z@Lo*-Mkn4HydtXe+9`E(5T~I0jZvWX%&Xpu^3)F1m2F$IE)A$T10~ywL+7DbjH6*7 zAf$Ym57w0(5Jnw#X}bUMmVXk!{8p-QBi=smbM=QG5;sWj3>`=Cg(1ZLM>lcIbErE2 zz=9#_t)cCJj3rPe9qFmTTQMIwdL4+lY?#;EAelp1g+1UvqhN!4$=`_4?jV3G_~~aO z%Lk0Ji~UESW;U+nY`-6T9st)LZ;9T(q>~=>TYy1 zx(ag^;Ouy@#ow$%n(uM|bxNRT$8vkP@K*fELeN82LHm`@+C%337nTW_UH$&@(WRh!i9Je= zl0euJzrW82A0tr}%kS}HvrmdT-Td#xM6-3}A*va7WSd?%CT-FJl#^j?>jZp^Cv7hz zrj0=&c0kc+q&Q^qOw1D1KNp&}aw1S)yjT!RKr|1OutjU`#hBC1)2Y@}+UFLV+r5kI zx2qM}+zOF)P_ayo$U!~%NOB&UdS=afQ(-pFo%i98V9AFKFD+OpODvT70Iu9Awtsr? 
zgw!R2^Zq*{n8GxVcBT)eX)mcK`z3}N1{t(xCdC+omw-Nmo2(cPb-D?FG?NqP?W0@_ zUvDSGnL?TW;he(6heW5v!zPWI_$X*bspOb_`$l5xpVg!;JJ}z-dtoWXrDsgD$vv`*H*T0|x(1Mgv3jwUWc+du_wGKeLG3EYjQ+wLr`o|G4f zPTu{OXuWd8dK&>~ly++L*)5$DU!?dW;;}pnaQd?j9|Px>Ef;mPkC`cdLvW7>bee5J(M45|Y=n>8V-4Wl|_Uuo&^11%odaVY{pShHx zuC!zU-BJ*7GatK|k`j>-V**bj&M+a-1G{Nz4_8bHlu>r(9FA7OhbWnHCFXfA`je`$ zJ(QJhI%`znurF9irghITU)9mF?IO~M9v6sbx%@xs*B`5qsuX%~m9Ms6 z%Bn=o9hW=xMiUvGvCTut+v<|+RShj8^51L`@ux7)P2-uQF&-sr?j&` zJ0IWA?vbuj`2=NGQT+0l4|fmPSMgCA$AG?5a?TOVIUYqz0tr)FAV0!@Mi1BnJ1vR( zp(N8rfTJUnz8~n4yYZOJ-0=slzry&wt+hmdp50s4z;F^31h?e9tcaAAIcEndAkkr& zkd%m0P2h!%hqNy=_S_+pc#a%1Q^tdCB45sdPA^K#EEsM~_Z|o)dgo>E9=dEgoRq32 zLKQqWWheJP?Js37m`w%WX+_Y8@v}oNx+LcH*UbAeVI6)6rG4L2WvgOc&qUOD?UbON zw}z5`pU0KzAa100jqw7`S}QcsM@RjpD}intESd~`pAyeQBcs%Ibz1h&8Zdf&eUC|}j`@r1v6?r(5hOCeY9ehn!7wI2itf^~y|v1Fr9Je>|_wLtrSPfoUr z1hU=Qlk=q64bXoMgzu(Qg%9gywu8pu9nXUXJ18DK@B!-57DY1I_u{n~AO(3A4{Dz?m*b6iitix)a>Zo*KDTCP8buS}Re7;Nw3pC+dE6TK zPUJbN(4#rFbRuU8LKn9ma9jNcoVa41Rr(Ez-C_)s+WspJ{fx5YYT{lN40SN3R`JS}XHj9WAZ2uL3mA+bG!^ z62iei5Gdb|O0Yd0V7TE*@SS#MVZKg3H1Z9Sg6b0p%o*TUVFKd0SY~cCP+`l+iAtP* zy^)_k{aMs_v!dhjUDEswDg^l;8&vr{`|bfymGYf2=(as%u`h>q$Dd$l(~&_Ww=}aH z;N#ENr?^ZaOyjDfQAVSDAx6!voGMx*z9Ol`=^aBu8t~K`QOc#yd7|wbG>|?!6l^|h zQ)GUCb336xTFBUV*e+=bngT8G848i8GEM>tj&Q~0kD3r(9;lwgVSOw1f{b^k?lyGb zOoe#(LAC+8YE@btyW@J?V;^ZPVgdhl#6i}%LUY&>kJ5iTUGi;TkXSX-LnPr*jzXdP!Pck2G@A+yMlsX^=FYn_`f`}{OEtYQK5VWnPYVBSF4olY7P+7Fj@QiH9V#?Fi?I{8FB zNO5ORO?4tX3#%4e+#gr$& zbm$$Id9=&h$Lf9EK4=)`OMBsNuq+sI(8Zmq>XPO<7u_@pQ09k`yEyll3BiJ;w$s6X zU_2xjo#Os64qf3uhswQpFGsqq%D~$*ga*15@adQ8(^=S&^B=G^gWQ)uYuERr7O92- zt78^Oe+0Gy9qRUD>?p6-v1hCZZ-#@&{T&qaA#604{7G21KbEMT1`!+ z{7DzY8lz31FEe=aV0RM)M}nmmeVGveuhj!)U|i?Luhys73?obMt_0yDgS_VqWJc(6 z`5Fs{1wgJm3tWKgif^qM1`6%K!#LJS_dw2E51Ocv44sD@b@hx60*pCiCB*wN^8gf{o7j(Vn%$5(?QV0^Le zS;b^ySbx;z<{}(VHz?QIjO&o=JO6lN=Ky9a1s6NPNIJAtcFa=U3XEn<+fLZG zLe~*FxtJH8!<~=GEFZpKqi;00I~EPS9x^||C^^HpF^R}gSD!I%c*GveDUDM>)WBn) z(A%KglzZ{Ld}#MzScc6EdH{*^j#=eg>cJ-d$#IGIq1Wq4yFn(D+HqnyPh#NJOnA5N 
z2Z`dWI-R!WAT$Vy`dGvnGWD0~;EM9WK|=sCUBr5b73KId!0jqCCA#-_ykLR4huwzp z`?bB#cQ+@gG=t@u!hlG+ZB0Axn9=cdUd!t|59yX{SKZGZV$+t*vq~wj;5FQ2) zf~h8fSR8R+k^B`X5v^cc0y-e2=rjE_p%Vm25XF}ZMrr0l!_&_dodLRlvWMt@L*}0d z@tRgVhk4&=(I9RGfQ~ZoUd7DEy9)A#q4Nba5O@2XxS$_#(~H9vHI_#jYk-y)wBe{4 zK5+cm_|V656{Y#uqu6dg$d%fa zB7kDYsq;M#xb~-{on+EJ)0ACwq`4?Mr)Y2$^h2|NgYXnTYE$qh_6OP-KWYW}SW7?j820*=MG(Y(v+H}NRCgOqz_y&5iya3gOue1aOc3T6#{LB4r?@>-)Fpclsb7 z_MP&tn|n|wpdC#mm$V)8AS%#by17A>%^1@|Ya?qQXdu+wIP>WlyDhXN$(3#?yZPNv zrd-@1bCMV1O-Cg;0^oIbCiLa%s|dlCjnEfv;Hg`NmmNcPCozmuhx>g&6aV8F&j4-mO+Yz01UtdE9CLhJ-l-E)e7LKx8@pIsX?hcQ@|Yw)G!RU2Ige(3s9vQKMH^@x6QYuLfPok@Ish%R^kaiqWp20I0Q} z57k8@ogS^8#N37MV~gLs{axnSHVJTesCbdfJB1}jlr9bD<`(ijR*U!;JrM>$y8|Z} zFH$_blnjV(dB7X~819jq&|uDy*p)`yM9@zT3oEkl`pB`E#f-x$K{H?sx>=Sb&r)rw zPR49A!_MQ6*nNG0UaMg^whvzv)kG*+n9#AEl5?v5Bf0vgsREFN+#vpxFBfAav4wfd z>MHorIe_eJeFO7n%Cgo`aF#*dv$6|0T&p#dloiQS+SNa-V%EP3szsD7$l-s(${j%09gugm1Nn+QvHO<) zk8gFBfzC-Xp(lso_j@CV6J7gdy1RO}>r?fGzu-o1c2HkHjf%eb4ZVLjoUiN8{Pw=K z6bkwW{RKDrC6jfHyS4IK!WgzPTd&>v>P@VBQ8I|#-S%Mrk4mpEb3?>`S#SOJt&?h0 z11$#egnd(2jmZ}+vuxEJR%>Vp6sv{(OkNy6*R$l&EZN0V>wvdQHNO9_v zx)ptnX|*<8D}Yv88^3-X{j~Os%OPzPjeh-0{|>Zh&B*`5r_9rUVVl=ei@Q#t#MA$Z z2nl+Vpq|@uEAx*DBKk?rKwu*oqW&wT|J4nJ?=PARZ=D&*PTjt^3nAe$}JLwXwc#*GOnczN(!RD0E)GI~%X~+zmYV0TqC%`R3`Q9y|%QuG22J91# zgbpz{L@OyICZwfpx;$|pi2hhlD6IT%O7-BuPa@$&`bTcPE=;d}4?M?}C%#Jqu4$f; zus*ae&=$r1k8M+gX!j0RBFWIJcQz!gW%|cg+V3X_S^F1*B6nn(U8d#y>eVa0!;a(I z3Ly8i-t_t+`qxfdC%d;2y)51*!9*EI#^>i|efRVqH_?~AL2Jz>iujb^hO=Qen0!EN zdvFicd{KAcBaGy&%@=R{ygey8sDgd5yp)JP1JNy%WH0?cTwc-gHgdi!?eabNUBoKX z2YId(4LGrTd$7Y8or}jtf|j~Za>50YfzC@8*oB2&vlQ0}RuKEl00( zBYDjsPb_cLRbL~*I%K! 
zE8V$%cTfC(KQqF1So#$uj@O>7{N6(nnzfa=db?{U;kw5Fv3o!3aT~_%Yys=T7D%ZB zh>?l@>c9W_({C>v00S_3=zdE7^Ir;@5mo>GxK;w=Z~54-Ti^zqD@db#zwjy+Fp+g< z$GUq)-?onFzP6axF2IEmj;q=U=Bc#fO8Mz0+b#02du9F z=9mG_G3gMBe+2opkv$KxHqmIQ{j{zX+^4wF^n2}tv#Vbc#z4xfu=)pB_}?HG8rOIb z_Hw3sdh<&tNmCK@1prD593F&6>_r2?6TEA7jOz?>AE<3yWIej9BwhQRFOi|jE9W2I z_!TRJk$BP<{PBMysRilLb$~^Ml%N`^>OIllR-N}nC$rUyq@qS2@UY!`S!i-|cq?cB zim8GF(dFMBJbX+C?kSJub>kiG(tmmT5pVnZaEC9_MJkZJRePU4aBEk^(e+DX*W_`* zzm3CLw*ZdWcnYs3OU{&1+`kbk-Q-gMLusfE!eXwui;Uf7TK$DOrl ze!}wGyH{E4QqW``38k(>^Fvm@5`J>JL2Q8}^Hfq5D42cOPE%DmvQuI!3|jHtzVWwr zlvcP2d%EQ5@82Xyo8(;ob&5K9CG(SNB6+(0MD*5xkNniUeDy^RC{6xYkpI2^!o(mb zD@5NA@c9*JAl$zD-@}@U5NT{YDHf8p; zdRpt>i#HRx(aPYfLGX)$j9BKAtLtp#Fh>7bmz1tv6vRwI9nk$0F5KX2SU;<&B7Vcv z`oBTT`WJr*r1u_hqsD~TA0$G-aV#^-|M6QHw)81Zs#09L==Zn#^422W_Fpls%n=@u z+4sYzSNDFIB~i)WF`Tue^w$?SqnT7DS!`3JbsN_&!PUgPy8P~r4W@l^whzB_4sXA{(af6|ALi2t^VgnS}w?lvVuzM`>DUBW!{p+?*DTI zZt;fqLX_(-2DRK*@-WVSd?^g}#Lzh9;Mn$0llcT&l{e}P-SjK?USDHa0tZ6Bh{3u~ zq4Fcyi!5z^JTXD6GtdwBuMJr99x!m~X1Q6T^lRwf#q3QoUv*E%XhGC-Bl@sHL|*?P z>+gQ`PpjWBi7_+32TQ!)@rvx~*Z%yS1-hc_bwTsbqm>wyeN@KBe0~H+DtqE-_dh@X z3>JsO_4Pg%HF~G|M<;|Tn;5J_u;YC!K}t$nLn^)9`v15Bs9`q~Z~LheZP3`Jcl(#Y z57l#Gv2EyzieR8jjSf)j8LvO7qJv0RwN#z>^SY^0NcOI)W+5ypI8SQqU;s;y3!Vqd z?+G1-laMx4?zedO^Tic)v+w$L<7&!xakVGIWPA!w#;uay`I`RgS_4NTY@`-F^R}O5 z)Y=I<>9`%I=}K!r<<~5@V>!mQddJWQa0Yl(G7IYk9>V$&>SA*xt!~i8*^0 zFIx0g&c5FwkDxxOiQ0c%yz^r)mq|WO$XQi~9;fKpwzd~n7xq8Y_Eg@t;gnc#zz^<>#Vf#gU z3EB{%+6*}kySWPh`1O=V#tdi{41$DPk=qn_tFZbbjhJ6dam$a}rjaIMTqho0*+Oek zx)@pL&CfmE_v3kcXBHt^xX2+(0ufmTrpTTG3E&XhU*uF-G zG4J0{J~J4u_wSof4GxsF5z&bsk)$uF2sG&4jUoX^&~U?OFBh@W?lAm+t%4tKFbE~x zF~*y33sG?qzk^Ebi$)L7^D2jrgWT^?G z*Uylo|-fN=p+zrd?f2MS{jL(71pl!Jc=*e30OED|_ zD`!4(&6dAK<8`@F-vc-exy_F4UHKMp(D`C3kLC~<0YO$N@GK%8H*p@aCyNDgqUCr; zD-1HZfpp=}t7wLK8DoY#@(aiz4b2wK@{C)LF%otYMgN`kyg=J?!iG^mw`>*Ap*rOZ zWMpn1}suTL2sT;q1^eHcJD=^OVVrR-S5Wd$| zf5@k8*lsA(x_N-ciJzzovk#+(t-qq;)+^z4FFNMV7SeoA&2!IuR0$37?josP-`W9-`W*n;* 
zWy%0)uOhIT6QSu}Z7(F`OjgT@BtU7}?*ky`ma1;Tn-bWaRwP04~rPVn4+O+YjCaS{)8-Es$kPxIl_|)c3Gv*pzkIR za-9`(ZZsUWaY`i<-1Or8+saaafdY?C0I%_A(`jD##8eOuwjrS~ zFcOoHa29pncmiF93BG8$JsO~Sx?V=;;(k#GcnqCmV5jt8+V8q97{en^n-*j_fi%wP zH(#0O_dOOB>qUce%g-oj0K3MFEqU`Q7GTkKzugL?$1vLQTI%u8&ac4h6*o9L`++?oBf2K+%ubkHXk;;SbxC<+ zT3vp;u`+#(k+Safm?fOr0%A@#c{e>ibGywV7$|1+`pn@juL0E@CXokQu~+_Hurxl@ z+;w62}tKgVgzq=It)W-O+ii{sESzAoHX6o{3;(w;h-kr=%$|!OFD~*B8+vfH98?H92xUrDv}%eUBK?StNI}F_+pP19Z?s_f70E z?VTV4K<*7(y-UFB3o7CQ6Ez#L-VkuV$j&9}nFG!b@-9yT9VUmv{ABjNQD84{Hsm9U zIdZ5$x)iYP=F&wYZ}}Ck0cg!23{o zH&fDXe**`s{Nn=8-kB(Fh|2W!>PCoUhPb;82(2MXLXN;Gtj__8vjDGb%+Je*TW2 z&|jz~!kn_LlX~vS!gu$!NB#>B1qJ^g+#T`>Rf|>-_g?{1mj{e27-rgRQKecOL9S2x z@onT!M5-qvC1#NU4Eq{fBDoqxm!BUo0mm0nlx<=gZ>@70PmwD-(-9>3q?}QG>AX(G z@h5{F;PEyJMg^SvPQ;U`Fxq$ChLb>vOOHWSr1M=Q!d=VioF6>Nxv0-MR$B_Z`UGB=TdaZP%1h*vI^!E9Qv&QsBf<<3b-vUjB?nDZ%Qx;oXk7S9_KFgL5pbu`$l^5N&g<>n0Lk>md{U=6F=LNA+7n8 zCHw@>L?++j?quWkHwiIf)}LLXk*k=Qgsx|(g$d%_wseI=X%)?Y1a*qYsb$-Ii(MNX;}v11|==xpazQ& zQPNJEgoPy|ZnyqcOf~H})%wez=AC3%wBTqG&%TAOx_r`c-6D*S8DVVS0mo1H}NB{3y#$dh*mu-3BBN5+9mV9D*M2({f26-F$?K|{ z)z4|Z&fpy0vri}cOXUY(BV0;IfaARzs3x6}M&yy^h`N#x(c@=vgj*#sT^#el-5d-k zIxGmmx5yKbhoC!;LapY7L+9BGA*nhl0tOrzVBaa?$=X?0Al(z>;!dZK9Heh;1&Y7i z;Rv8yCJ#rc=!0xF)8_Nl7+WOxfTsaosmw83bzDVfvgKmJEB(OEqz%l4T;GSl*qM;J*u>DR zH8xg*w6hGZr~cO?SwOSBzje@GKyW}`z=mESs$)btYT#HA!!mpxjxN%8|e__g;F2^_5gEF-WN`**EBzUI!bN)Y7Pv*AG^g_zpT4 z=CIgQ`Lj}IW|x@)VeI3kv$W=f!~80dW zhlyKN2|FYQWeewdk+zEeqrB20PLL3*p3QgAz4_U>%?)-~zdT#N1dc|z*erfq0oy*bmnUV(~x6vjsbCku&RzFI2pBA<>ur4&4w(;`B53$l$5 z_)Fc8WXX&)#CQn{$Z&Lofk=TO5V()tdb32OTBQeXe^ zdb|UPZTaXpi_NhX`vq?1^Ml2C^qdV5O)5!6HBRNp&}T3nqsZFI>1z9-tzSPNtbj9j4mPx7{bz>AQWM;O5EeGfqgFC_`O@g#WF z@`{RsBy~uB|AdZ8+Q0gv&R4wmm3TM$T2Uc9@-X)}CbtYVBaa-!Tq$b$r-6d>>|7w{ z4-(S|aC2cGPmElOH8t($9Toi5LvHHZQ+c5>Sp zc;^xu!b>tl(KrtJE>>Umc1qP8;xlzMc)viCg+)CspOIU5xn|RlI$O0qm0CPDUIx zpk8vugd8S|AH7yB^q7)!`J}%3D8c0wE>>>zO|%OHG9x?jZxe`sY}XgTQcwWVtepM^ z4@@B$6Hn>--U%}JqaL^ykhEjbZ#ti)wduQq0|2XzqUb;u6g%bzOk#SZbh4lt^#?41`x6y8 
zgI}6V5ElE6Z}j==Al3@)Xi#Do;hmfA5*NJ)HOzxL^JokR3Kh+L^e4fx#=4C`CG(xF ztmpjW@ZA1wL;&gFAf$Og8{GG$L=#KC`;T>h|W{9#bH)B54*9}wOyK_X6PU2wNx zVll^f;AkJ1_sl->F%2j=h22d4mN!oDG^?96oy;suXMWoMEK&Cmua!H2y3N!_hf%uS zB!?yK?wshCWzdagAP(V8fJ!3$9Kp%b2pGj7PGgsjkCu-aGz|cQ`W%3x^;up~OCUXouxIa~JL%%N3mvhJ zaKcT?Yc~|UgzVjoWW@JPF86S31VeC5Kj?Q8fx$Zqr7wlQ$b6Bwph>pzbRJ7>`A5JZ zJs7X8BVDA?3;sm()}SmRu#;zW;$%;!t^3iub$(np(@C|81?)EQairr347l*=Bn?BS zq3WWaSLHF4`$xDM?DxDLsY8%8W5E%*d={%zO8WNakTlj7TL+pnha38FT$W zmbB|DAIUSV1a3y*jO-ZSF^2JnduU{>vSs~D$@KQ7?9}0NQ%GikKm@>9L_9?+s1FqT zxVnV0Z*Yc6>rlHli{z(0R^(JZhXyy~SLV@yb5|f3#b+vZLW$ee*c;@=4pVqbNi{$k zISbTcL!OTa3!*wi)V6k;t7rBa8$yr{@>UsA)5AXlX4=u`FbpYX)Ed%1;g#WA=mlhx zetg)>DD6*7*tGNT!MSM=9riwnQ9)jIl(smX++~7A9vH>7Axb)M1H|q0F=;UA^RFwx z?tHz|@f#B1cx0{i?_heYA1_@}`R+-)c>I*{8Nbgi866HBuUm&7>zHrzx&c~_9TU+G zOmUVGVKpdI|75dAE*ro#L-3Cimbg*4&<`4pu%qB4BYV*_>>JKIAbfw!x~7tI&lctt z-qqRxP^1fY=&&h@m*nfNFhsKmW7r}KgL^-*X|c+_JzP2x22j*zS5`XwYz{j46_2F) ze|D9dgg&^GCK&Ta-KWUG&mfzBB>Nt6nl7t^`6ntgxXX$);@67rc=QIyw{mr(4IjUp@k2?J*sJdwb-+@*7)s0OZ+E=!DHH+7Vwu_`*e|{pzcslky7qicy#HZRR_r@;VjDos!@W8gH~pr2KJqwULcPj;6iFx%-qE@M=L|6sn;RAeK%#GO{oBHo8 zaf73X7?I(J-t!Znfs8z*o}XPd1izA1W*UAQoca`PkZc*T`#Cg=Ho&y#7X=IbfwM2a zFV|G~R=)hM6ulH7B7HKzIy>KdpgPlT<~qK}*CmmVGXn02>1YHG(uY7rAB?vI_hi`E z7a_`e+0c=-pdlb|EPJ8RQYdXe+Ht73j3q)B=EI%pKvxxYa1ja!|1=36a-=OwYXxnH zG1mAHBq+gvG1HFF*S>(cLlOlhyMoQV`=S6}0({T z83G*5j;Hb`HVeP){Bg#|KnD_oOoS^nK8L|_<0N@VG)&4RGZ{u4p&jnTiP4NZ*69XS zpK{)%EaY$ht=u2TG;|sFUrIZ$GQ`TR+8EIQ-ZgODWtA3bMEWTr=ejM zcIHrTi%m|eiA$RGF+#W|?UY=$1k|<+a=W~Vop7U3A}pSu6eAQ%_@{$g#(;L3Oqd_B z8BP|4qvV=2b1x^q)7$^DzrR$Fe3D{(06?&v6DJ)Ms2X7K!XQa4?UFz7AdGUnzzgTF znDYZj1>KRo*^?VY4_K0EdK2%&qXjbzTAn}a z3OZ%wc>N|e91WOB1UxQeJjG=!NWyjvyZR#Qbmfm8)=FF0%<|qRGfPePtv^o)ms8JC z^$0iY5UdT;4AuB&S!cuTf+Eu#-##hjg$mp{k_(6R4=jDc2s4YPTSPE6`qdRd?11N8 z6ljwUhDM%B9{0-?-+!2r(_OLvQA~J|2VgyM=PQp~NO;MpB`gjv#gFRRMWzswy;jP9 zB~I{X%s=vC4mA(Z$e=_qB&J9M>H9)c*0`?W6WFA$UYtp&yX|_D9b4UJ z4?x-bLV<%9L&CFpUm?w;f)WGk=)L}L*xH|9#p($UmH2PHWkEsc_iZin(EiD9*A}75 
z13A4c><7o32Wdr+S<#IZ!n~2*_#~H1f+d~m0?0@?U$P8>_WV?bk;KazVD?>Dr9`&< z?pe^;W9}exnd+O-OjwCUSoo}CCkNW-_ll1&s%!+oVGuZ5b!LtGs+q!(&RDMj$@8g( zrNpHJ<7|N=7ySQv-CRBmEa#%3qdd2x@ZQChFuOzZS?#URXWMeae`z5E{V6oR{B}bt zX$(Q!IgOl8fn(xy`j4uDDh7j7>@D2@n{(Ek#S{&p?B|%5@Y>!9Z93M`Q<*KVV50?? zQk=3xlUfQsz3p$RsM4kbx?!X;n3IWn>70^z+FuN<=6XZ_ys9*ROGdsCK;K4gjD;sd?|mZOv0bExPl8h4W89acJu(ET`&T&W4HNQE;oyi16g0 z^I;6i1PA%lsgPkE7%V*tveNT__{mp=z!VDije{ukUkt6|&B?w66X)kS5?gw>Egg^z z8_a5r>U7Tp@!PS8*_@_qC7#10F&R%T107f!cwK!|hoq zj?k>Cp8wbgG2yeT8_2urW?X}-uRE(+5Ean3cRIUE-) zsBP-1bX9Ki3`P2g&509*0e&hg{YZRDzNZI_m65e@vdjIMoP5Y3#-J8qBOY)y`6F|j zKQ74m`yvH4Ds-0&tb-PgQV}8PDb+b53=F@z2BuNa2V=7b!>2KrUX=#>vK=-W{}LR6 zMli9-szIF7Huo{HCC)pFif1Nn!#>Pe+WAP(`btQ_&MoLCsiNK33=?YzFK%J_^ck=c zcoR>SD|hEKU}gpN-e^clg%|ZYvQx=y*wN4<9?%?`_D2fx&qReWRH-bb3K0$ z2+t1$J5MyJ$@Kp*_TBMRzy1HnsT_SuWi-(=lU-)C?9H)P$fm4}%sy=^+4E%YmA#@u zk&zuGkv*~{>-V}y^SpdFM@HoSx*efk#LJ&4)8-d(-;t zL+V-t)x#zN+Da!91h=T2n|{0ElPkly)gYmHC9oMrSVM0Z_|IUhS+n%=N3l3&0YEO^9jk(KEAbo>+DrK`5V5&6#ty>nanOtZYD!A?}x}A3Nf3QjR zLuC@Qo94T$6MYvEwnsLQLm*Txy-ABUqm zAJI!+v)IC{pi_9;Vf`A~{=BFm!Q2cH1;&(apmB$=?o|cnk#GWbFIDRfXj(=rT0m49 zIXaQoic?p;jir#RFc{HB%Yb>?>f3OeHCRj&DmAl+N|{YNG8J_orAv3DJJD<6c3Tw^ zP@j!Sz2fM9HJaJYVo4?;vLttcf<9p>p3QMw#8iAiUE1;AO!kV~gYg7s1L)MZB&8?V z9*KcMD>yq#6Y5jx#;)BWhjfQM&vegmx;W3zg|r)LN6|QYgRtH?iNOOd>0y)h6q5`e z5#WSVI;|358KE`gISHnBMvI)44?6qQ$UDDYK*%zW(Hxw?m%P(|sb%^$5P5EjilPI{ z51Kyd$8!rQp=&=GMd0*nTv)F+e?ut!XriFw+kqJlHtLW3cYZhg@lPdC{(D;TD;gU= zhDaZF6I>kY1gv`$J%8X>p~m{JSj2k3B65B*`V-wHWW4>_|D=9wip~{czt1OLQsK#e zbVz=0C2am}<2ZXIhPo%Er)>WG%wMvY#iOhML_}(xaK_yJjSPUmuRf-3YyZ4n^S=5| z=GJM~e`ddcdzWSBbpNwH3B7Ot*xuo7{~_FECFYi;uc-q#L1{-X5Y#|zmJLJ99~0Le zDIr%Ld^r!M(ZN*AoS2BQJm=Sw7ypK2cC@`AXX<0seM^A~>-*+sA14s)_(33g#YMOx z@)}6=<{)qfaE_>VfC?BXX^7x!2Qy_OgdVD}F0^ajhYU^Kgz8CIh0vi8;I?|hKAAWC zSLmtxdrBP}url-^Sn~?+c+^w)k~{pDgsA+CyDYsvXa}I;z8>X@C6=HCln2$u@B!oz z{o|JY(^?Zs+V1PX!p66L2T)HNlQ#Y#3;jP9${R9w9fTAdTdE!V_2HDK5&Y^VzR`#O z6QN5FZ#c>6#BK6JT!~m^>>3;&Ie%UC$Jb<6X z5RfDRz%-|<^-6g{8skF2Hy 
zuGheYgy82Ql%*sN{{M52-Oj-3C~=U3{bY5)P44F#-F#%=ly~GmU%PBF+-LyfIjZ0G zQ1-EM$^Ukh-(NUzRADJ36j!RiKXljFqr<%Uq<%dA{}!m}UjV5veaFX}@qff=F2Vtj zR{i7m{H)6W*sl&Z`d(aa*ZQh*f4^7f;vfI|Pjb)Zqg37V>Ityk>wGQ#zOTE)`!+-H z&qw!jLHW|4wNUMR#sB+ffjs|@M#azN{~yi=A|%&KYiauJarFO%?*hO6=LoH|Jy14h666}@y zPuv*-Q(Ah{^4ch>fZU=tRNq6#ACFk{Bs^61!}Y%p@)wVli~fIZwp;`r>CKn18-U$& zn4!&P(;pwO@whjl-}l{F$aevVQw!Z+xwFAkw0S`N?gs5{b9KS0({@A7L$p4`UuUBE zi&S^~WE#=t6Yu-y`>G~Jc=+=7HJ|qmLya(v4gXH~_9nRW>p~`zBwS6U4_*jRJ@S`9 zDSF5$Lx>+qcOejZ7tUc?u-( z)ZZyJWod*fG|V%DGTHl3yGS;WIQS11JLE)=XB`+ZSPXshOV0QGMgNB#^dmNs!HD{I zayk5{m}vI*2!ginukiu}6WMuGOFAut@Bi!Mfgk_e=0D%%<}TC+X$6ftZk)e`MnAHi z@9nyuvEt{svGGfUUL!xf>~0Mto%U74?N@f~#$x+#ghJrcAZmGrCC#m0KNa{i(r!$@ zulQf1#rk~)qWhG%5w=0Py-1q*`|19lp3VB@lHGLAK`v#THh*ii#$pPn{|v~#J~OaM zg*%mR(%7UOJ|LC#C|`SegKkp3ZjGk;9M!oczZKRUI!+AJv47{m*Q8*a>Wih>B zCL)Q?dBExCcHR8SeTbBGlhU}cd-0+Op4xQqLcX|yo7P`1UT7Ax@9zWgqPE-aqi#YD zXpK@YgvkApl=w=!*?f<1Kf}YlGJ<12l4Nu(NMb@mRuslu@|5^LhuIJ#oCz=g;mZgw z;4PKTzuP|}S&QX^&A%GF0bWnlE#tUFb75O^J#-)Z2lkC;{ruJ-s&>5Wgi_kf`9kB?mc{eQd? 
zbSbYc;W;6Y?muGiSFFZ!5)b`-NN&86_2qpL-RRpoJk$3lLUxMq^5#fuJ8d`Jn}y9@ zz~)X~f3m;qQuuxx6Kv_>?&9+c;q!k!Cn4HzM4o@A^U`Aun;+B zA4r(|{t4TEUh?L3RKUh09k|RuoGjvo`|HO4*Qes?(Vl+LB^xX)8Oi?+I;_=q2(8_C z$`9%6WZf3L$d(cJ@7Tb`j6-%d06#d`98l%c4_lj$=VU^fQojSZW6}j zoiN%ooEdD$KOqtLIpfK_7Ume_C<0AvJut(tndb$($j4*s=kKidzsZpFyRHf4buz4i#)ofQR~+v)z$BP~N8ELn{8PO?*${?l2Ryc@e<4EEg#s-E+ z9ugUTy<;i-ya$P{Mete__ES26j;fRc#m&b_g6RQyZd%nb$|aPxUm5`9Z|Ds(|88%d zfqi_?+64X6`ZjP&hFOoV05nB0hf}rAm^=kMXdVnpyP&1056%6EaTqkWLBqTY&`sgb znw4=l-W3?CCeAD@fp%gHeucTJYS!iW>HS<5?>j=zSXv4{2Cjr!O9NT zZ!n!`Jt`EleIL{V>?E@lMj<-_esfz9q9EL}v3^A>csNk`K4fVSUl zq>w2MV*;rPzAQW2o4xnxwxZ{20fw{1(3Xz43b5`m;!gb&@ds%0plNB~`m-3M)6+b7yqD%qmw)boTK@re9 z+DRhi@b6*$SYo5)4`<8EMofb($*U?K8k*SAR+T&PrC54d`HV{+mJrG!MDOuQDRxXS zbVXC*d1j!Gwr<5o7^pQ`cj6L9;K}uxV9=kZagzEw2CK}JIvEK5b=YW`8({snPwUB+^$qN?=+^zQLr2esI zc}VotLrx^rFn9{75VfG7LjJ^!dT{|-xp$j5Fk>$nL(g8nGR8W)f|{xD@_kC@+SH%4 zN2Hxkiwrm=Q#1+_D~THn=0#N@j@Mx*K>7LM7Hs)q@59sbDSiA$i7c$+itfzAT#;8e zU2QkqRpIP~3OBVU(!%r10#g$U;E{0K>VQXF^Q-k~yToq_VM`eGm`C@?KGnr#~mX`Xoo;SYQ{@W=L z7vkmZMkgv^4j+blKBK^*u9c==Xfxbo_O4(vp<;r_qwZe5_We=ysJs0Y={Aod1|qay z#l_DifC%G`glNuOdN&qrXPIVw{zZU2+^12lcBd}hANv6TFkldPFqYl`)HT-oQvl}} zo*PbbYmi*rQ8V|BB75VkmTiG(&y%0P?`i{Pi0g-^%5~eqRFoaIxt}uh_e4_Gvgkc3z>1q`ih@4wsnc^+j7lr8W*$aR z+N=Q{ohqmFWfhFUOaRtnEgdT`gJ*QE~|H^q_=;yrXyaD)d?aw>ia zvHEsavdN>9Fiq)%ht_k#RN7&-2wbm~q-|y49F1NVL@9p=uRzQH)z2gZ4*W_cp@u&b z3wqyjnADn#cjfOY&4sb=i)jjdq~FcAf^f+Z&PQBxVBeOAh3lC$z#{xJ7JO-0}eNm#Jm_d;C-}79dVeC*=KDY#P~`mmiZ5L`9K;XMnp zR!PVH1l*FsVGD=nlE}H}@HVg4mfA1I?tUx+Gieh*$4#ogx#5x)Zr>zEx5yHtB4jN& zV*sec_@!ire4Pp+1;;C8HxH*A&Ru{=*a?H-F`(Jc2%KJ-m&#Bst;$(YZ?jyLk9gry zE#?A+S`>juB^r%SeLUv)d!9~SOtA9%1fk}#PJj|F3u$MoBr*To>DT{0)o4! zELmw02H@>i)C~*_29Y%zk! z@Jy*`iCVZc}{He5_1ml!rdDCP@E4YdYbY0jGnaqj96wsgy)z4 zGqgGsDG38g602ocVP@caUmiqCyX-HGtv>^8^vOxcP&)x=-{+))?NkN!!_6QC!Zbfyz+8tC-9#=k$bP-Inj6zDhhjYrc|XK@*OR}K?N}X6=t3? 
zWg3ub*BJp=dEPW=AuAGgVVHC8q7)sgHYdCq8Wft?{#t|lu-+!f7I*Ts>5(}ZVe9EBwUdD|E5+ay716`ex>q7&Vg*rq6G_>>Kt(rgV#UD4 z5G>7<>+lajz}v#X$grvI1kO*6$pNe68u0Q^sx} zMFlZJp_nqw@U_*{E7VCY?1fgfG^*$E1(e`y}dlJsLMZ>Nz+@HLXC!A9VwISTMqmEl}m^i96pXMER7O6Y&&%SvXhZ zYQ}FX{pMv06%e)j zJ?341asC-X;L><=mkS88*a2mwzl1$h{l$n6wPBJem~$Cwb;_c*13-Aj5 zdBH@F>mgguX}2!C2KL1L^8AC@@B(wc@FKJKZHKP8eX-KQnn8V|mAzQgXFFF2sF^m^ z+_%d8s_fNT9IDy3tsG{D-p)hTTHHp_&ojoWvj-))+fk$cN!V-I?Csa~|40ZoJ5j$T zr0~N@+G_|q3R*V;pC#*~dqP0I{v_QUn|P56-H;MVn`eon@X{Sw>p(*IuvbV5+s0ar zbCDi=;Z>v{&hYis+!gQpMcR%z=H9KuEY!yfOd(X{Jb3V+-25tILDd0MWKhXrFy$=e zQ@sMmKYL)ap2ys`*TD?sm9Xo!(Nm<}8yG?sbEF~;JBxP~?AtSpc+_NmkkuU#mf!5y zqlL3bhL`@2`6hFT5HQG`&wK+UXa{5{Uy8${hybsX^SBiF3-y9D=g2t=KrrBQ%t)#& ztdaz!>6^f{n+I_RMX_BRpWx`+^^{lJ_LN3Jqq2*^YJ=-)jAnP9f;7*G*`lWyeXu=Z zp5<23g%dT0sbe0(q_Q#7h;>1Yn;7c29Yv#%mlpIdgz>L1Lw`4Fe``fuI&}>$K;5?| z7L2IaEq&y@+>LFy(-iZ-HB}Aj@GGyMjH47m9mHDF8VNFoK0_T9L$((m3jEDjJ0(6lO|8Q0^C_9x-#b+@S5vsSBc;~80RF0rWH=(iOfRT(W55q0MHg?jXZaluu1(IpI-(iJw4%_GvRefm7;jfjUWfYqg#)2Sd*z+u_2 zRBj1q4VPk0oJnl>^m57y-M~dTsFvuvm8uZ(RDVy;fz6ooBRc)^wuIE&k!lTC6y)Z# ze^^h2<7J{I>|S$8rF0=iM?F>x`e5nCx0%~LS?*^A)YS9Oya~reLAPrh(Kx0kTkgVs z=X7zYq)7zY)EffAjCrD9^fgcXrxe9l9rKa-Zkez5XIpjT7wb>3bMTa?dI-kBFHye`5>GZvtMSatt9P(9D@Vlz z>;d5=|Cl|}G=0SO8=Rwv0v^b1Sq+wOTD5kcpv7Y|;H%Z{GNQZF@Nqvd_{lZgwpyc^ zhv9j~Kf1`T@(JetENRH%g@RZ#$L`X&OltH)!)cHwgB>CI<6BE@N`RE41~J8U$XKDr zm)&G83;tOLkO)mtQsl>P32Cz}(JcU#$u-wp0X%r=2RQiipbV-UO$4Kr;i8c*V5q<^ z&{g3^xCKde*?oQ_(g}E71MGAy?xdZjuvwPgI8u%o?^Tb#1wFR6I|aY?*i%MS%VZyV zl@lF(R3d#@BV8`$&X)%qnCQiHW+_69mvL?(fEGJYi;!6kG7aQ=wJep1Mm5Y7IrrS>BPKTzp`Rh$DA=BvQd+7n4ZG%jDCB;3rB z1Y<_svBr065N7JTYkf}Tzi*Pr^K_7gI?jGF5_d;IND)wv?158`Y`Ly3BP)<6`i067 zD9fL}^B6)Zoo1`Ju^7pIX;4M@WTkc5JhNhJ*=WO{-qqZry@1*_fwxh>aMl_C?SU+l zbvLRtu{R+iu_O&Yo;CUeo9m>3QEPI5ahx!wc-UiLrd3%x?S+s!b0!x~Mc5vj=^3%~ zF6hOf`95K-jRfaB%5Zd>|8^^%q!Y6EEnd^A5rNOnHJt1WdvcV^x7QMR_S9+Q3FhFl z5nmn!nm3gh2E4M@WKDEr;+KchDh8V%Z`SGrH1n6xr 
z0NkllgTssZ#DKcxwnYfzOQ(7n)=)u(ltqvd%!So!r`N;!ZAhT)>N47^3aezfV9~V&c8SlyhZd>5gXRz@ks7J2F%<1_Oh^YOLYG z_fTM5K~7|OW)6kvc`J%&(|>!dHSRFOb$9PJ+z0EkS*H)7s3C{QqYrKb{8VQDt1S3_ zeFkjLGnMZ#lIHO5m&5`<5M$D~$Ta-UB7&VacNgXpv8 zCb5SGuBqTQn#?JXYE(KJ4k?ph8~>IX{$+0GZnYQK)2G?fS7(9I+XYVDHC}P_)x{9M zRDn)SG`HTa2g}hMgB8I%F7r#qEc9nHKxo4_{qmW2Uyhw#5kKTO^n|j_#Kon{yZR3ByyWD^?XK)O5b(?X(IP|{K{Q?<;eRfSCBQ>Qe$;A?l6o_m= zW>frd$~rF>kne;w5cJ&nl?+W1z#VE(KhUbM+4E;Vkbv+EVbk9$IISGoBZH~Lb{)ER z-$8n(dg3F9VT=JAIS18`HM!B^^fC3}ki)u*bb(7BLQI=3dU1EY;u7$D3Cbj00B|;P z|x1WjZ|CaVlW5yC_=+byO-zjt)+f`twf<64? zYX^_nOZynLn%n@J#C(XI@!YDEgJEXbVncl}Dv4nrNHJ`PM0B;=TCTp>FfLQdSe24D z|B~ztSE4{fij>q;n>_QpcU5Qx1s!MabAEGhD%f;K^J`Mf6J%d6*?MsmdAQ&7VtcDt ztyyG~<-TQG=lX1lHH^nP9vO!M&C6PQZh-G8kvZfDeX^m8@R0i|tSl&MK7k0Br#7O^ z_Eu}{gofkD3H1PeV2yF6Eq8$TnG?`=d)tvsXq8XtA^K|*$U?J)Viua3G)|BTqdY6? z$%VV2^`$LAU-)6w%#4e!pc>B>)v6KSa*!P2MR_;jzi}Pk;R$5sgqSo2f)i0Osc?HG zj+lq^Dl#dTPgqRt5Zt+^cJwPa_98FQ#l$DGPb~n$Y6ly5D>blVE{p*?LY(o-8KKKe z6eCkXSyCzZ2(UDYuQr_kDabrX=ewXz(MOC|h7L=R;S(DVsJ$Om9Ws)YXXP1Uk|C6a zYIbzmb(y53nWg2nA-Y4@#3x|AYH@nP)+&ksEg8&c1Dtw?EKQ-s=SAbhQvil+V~{i2 zWHY{M=9!$lAnc5cvLWMcTNgul5W$!Szf>KHGut5tg=p}ha zrY|jC#Q0>+&gbtodja7C{j!9LSr2kU<;TvBl@iU&Ztu(-JvIN0yd*{^1;4@%JRfs7 z?xuRNH71ayEc16&AWCJbUqG-WkW1#_oabX|0LMVpc()Q5W`zuNK@RvRIM>Yk2B={) z=(TFJ)m()^u~*RB5k2hrmP4+G$VjksR2|2r7Ftr-+tKN zUBj5A9pWwFywmW&!FJe#FzkV&*#8T)^9Fo~){EfvW+&mtllk5SS=?(6f^aj+iw{Ck zDmq&r{me&6L+*CQ+KlG2jCyin#Z&j`eJnlNOyn6t3=obos0B8f^|bngE9ZG3;D58PA4DOHRHcRu>Ep(8-r*AjI(19uSH%~V8vh$kC1Xn?m2-_5nRab_^B{pZlxM)G@cyHsj~WhJI)KuKeN1pEH0MRwp$ zew#cvcu{%>P~B#Q@0S@T(qYJb(Rf>04-)BgScE*H;kJ2ab zgmX|*pvKT8gb>`_?H@h1Cu05Zf0a^>!YVd$y6c`tKY(LR*7Y^-F^nnTmm7MLm_? 
z-ccvS{#8==ZTPENB@in z$f*J|DBfLdh}2ZHfqdaUa@^&qlgQZeAni>ah=ZyCpR|OM^h6Z3OoWOYP(G2r;C(S3 z8duohJng%2m=kC2zBq7p4&DE5P}621I`ilV`pb6zvT|rQ9+ZY<5HA<{PVuPe?{i(e$!KyP+>7~0Wqb34~k3T4eCi}3@AMQ zx!5J>){VTkN(;aE}=>qp5SA9zVVZhC(99h@646624LqItVH z;7Rfz)w24PfTfCiYpy^%GEUN0#oI4uJPlCEtJL1C9MH^<^bCk)&* zP6!x1e}0?v7Ue&s8ijSA8VQpSGA!{j8o2I`ir+oSAY=zK=S(59;aS$Z9gw6BhUsTn z2HsY^VcTA*$DF*hvr+Izi${nWlenpi@j$2tCAzTs&b35_+cb&Ai?|gj6+vxZaN=$t z%uv%BM;I#~c=L|XYYU5Gxeq}{W@d2&4q=?ZD~O1wFe?{!P-4^wX#{q3fJlE~Gxk0Z z7r8Jhw;38Cv0_ajO=BsYm|^Qy**0h20mzxqS`h55Gp;O#u7cVXp~w-b{5Zi#-ar|q zGa%Z>XBXNmcwOmLI3qUc8r-aJM@`{@Dgc69Itt7C(fjR18%$^+L1Jo`ueu04G$Uu5 z+g1n>v?Om0l#>Q?T^L31CB*wq{kY}tr%fCkzO@Ea_>ECttUf}S4E1*nf%43iCixks z3;l%meiy_P@F34KwntoZn2BM6347oGj2;uDO_WkWZKRTsiHTz%BQcRR?XxY+hQqqB z>ibC(G+In_KpU77qY|&dSlkSvE?sCI3R-qigS2TjqTprCiV>cGM1DA1(WwY`u;qP= z;Ub7>k;a`veZf+=ivjM5wbg}Z@r!BvE(e?UT2^@-G(>t(ZP7I;4iPMU(7U>d1m~c+ zkx=W$rI}Hj`6iot;Mj%2I$au0eT3b9fP?56UCLzx4VMS~7{|QS-d{2oIcn1{)AQgx+0s}ZgS!|{p8NNy3 zOdOM_h_kky7nl6$>V^OjcMA)j*Dd1KR*vutrBbd8ySya5%F_Ye{V_Om@?kpqb~*K0 zybFMdUdQVk$0%QQ+F@2yUd#Mp=B8?&(c9?TPtQkv3rxqSnzST7vQbTOW$U#uvr{{F%Z=7?N46t!&Mx2%JccUvZ`2{OZ)UqE;c@WkNku$FQ^n@$ zm&+E_2QHX6BJs2@rX#K+xgCY+6t#!E=k5!DI3m5!bz&TJpYw^_*EXW>y`u5}Wi8`H zdrmXk!r`E$n|TpmzRkS5F@n?$-V=-d4hA6G&Vp|Zn_cXL4%blXfDN=;I)U?k6~?+{ zC`pdT(_xC~2`(}*AWAN@6#7%``wcJg2sD9ESs~^8fFTzLCJrGyuhxeTd(NSW8Ir{X z6H*cdvL{5=Z!42{(dUzfWHF?00$MB||j$?-Y2+K~n8OeVY!afBhJj~`hxi`ctnMsxaAT2IhyS~07K$=lJw$`$w7T- z7mNe0ToM@H#q*i0`a+6dWvXqIEkDfVA1{RSSdT$Um6jbHf|GfiGEJW-o)Vc939ztt zTF!GyNS0i(Zv}cA$SZfIg~mXhEXw1ktt;1Q5zl>}V1_7DXa~5LoGVdPY5cG?xIQ!> zTm)n^ag0qCfUIO~O==GV@^sW$T-;A&3=Im>>EDuD+0?y;Zr?av91$yJ>@^ET|L1JVd#0rS%jx?I0RAIA;%= zIkbw`W=M80H;yO1Ffn1A6O3$gO-cuB8f?}$S$IZlV^$zsOZE?@+efv2m1pwJ>TXxm za!?fM4G(^5wMtccNP7I-;ChItfN?IVg@_nRE8RzS<68-c08gqHi;J>e!l@7h=3JWA zbnU!TD0`^~;WGs3-)?LCc2WFuarWmW1h@1V%Dcdo29zDJQW_n`-I+-(6dUU z@C?s9;HW?Qf;{$#+iuJkda?_Iv}$6;DQS7^8BC=VA83{BL}gqWRpxj?aH^dkz!>hv zfEl2tPvWi4haMbn3V}%g59VOnSoUB<4gKT*wE8(eoW_JGDK4L;e`S5inB9aT55Zzo 
z4@(dRVu_0;pOZ>esG*VLYxcxX(0H48e;1*nhKsB40ePHswy9G{pe7#TiBp-MZ4^a$ zJe}>U;^?DKAERY7ZfcN^8W^C(WD3)ZI0ja^j3e5IMPH@rh-1Do>{E=iQX+foq0D8G zCY?VVAExHZ%_&{C_%e&})RRRI8k0i@nvo7EObx^L+Sr5GLolvY!8hF7R?1p+qR=K2 z4e<0H$nf-&)9ovO0;zuFMyNXKuv8`6T5^4)5KVQ8F1yO~IQ@JYYm%EF<}xUyHnV_& zrs3RcX!oB@;T>5Kv1Zd;d};IHoR@_4`HX|(XF83J&8TVFCV9W)k&}Dmu$m$ne=5rQ ztU~cRT2)*K-&tHBp1a_-b3LJgcOy)klj`Vu2Q?LzAOZCtuT$T@;x{&TFvNxL$u8TH zHcd}%m+E4(ckD}W4zEx?99-Z#(-PR}sqEXY1H&CV>N@-`{G3!3;g+qz+|I?a#*UtM z1jDnAev{GDD7)!^r%p;82nc#^)iQ9xlI`;phw^8#7Fj)|#hYD5{Dre}E0XV923bcR zwx&7eR@T4hG+kcg-kFk7sjy4nj}8y4%b(pYvJE$`aqqY5&Ij(uNqt(Qg={O6I3AZrZ#v|eJ}@*Pz*6T!p>V1SCl`X z-Vz!kK?5i>>lsWDSaTB!pUlrmRf8{jsyIF_aE^xCELg7@DtrZX6NTF4(Sx3ju`e^8 zt%l~E66h87A#FTUI432a#+e}HLpRhE_i*MOxxmQLETXiW#A?bR^^d~&P|GrA)p%B` zO|ONTrn7m8Q67jX(@nRYvxW)9$qdN=RB^{UMQ4X9f8*{)&CjF?F%TUIV`&x(@NEl4^`nell#JXX;jQ|a@MDXY^e>=n?U>})Y~ zc%|*0=ve9)F&KK+(Q*vT*g0glu6k)#B`Yt8Wk{5^&E4~Y@#~4U!}H&30VpQoScJYG z#XpCF_X?n2ewA{{>CNPN3_E%p-(Pcg$NpY^k2wVWg8uy110TSbpn0I3@%;LXBjg4~ z)RX31cNw7U>|HR&kt8eh$f)Uj5z2u7*O1d2Lge~cmt>9M>oBZWZUJ^3C{uHwUPu$CRXuO_@h+CO8Ye>-jkcfjTX^m*l>{(Cm0EgM%D#RB_z~}ZC7o2QhvGhQ23>#& z{$K})5!Es3k<|}SWW`rI4RNIh)cA{7s}d$5u@mEN)0>?Dq1LI%a2a(rhNfb=lnFJx z%vg7I4I`Q5WKYEud$No-_h?u{eHu5(0 zEZT+@^*4;xWY#V!)w&UyVAUS3>|}C`?OEsJC?amIk6d3kF#nqDG-CP8Yq@6$uUEIf zbd^(@M;g2wh6OYnJ|8i~G>*(A2OQ2-Zp=wfuwZuqJ$$Jz+mO=RUX?V5WH!2iR_CTs z`HVRv#W#DFrf%2biGjX-?pHvN^5;7I{?$+5kQ)t@fXhaYHrT1|NB1-)pTim@0e*H+ zs}!pIbO$O>%f}Wqjwq$?Z#vHS%)=aD_CY5_hUu6lFS{SZ;-`2IU;>RWXVN~$>t*z_ z_bZUKL4XMkuAKV{g0884g3xB76Os~^Q(}o1lV$b%_7M!?7=_)E={^*Y>m&&s!<l@AScDO}8oC8bjH3Z8{2=}ojP@8G%$nFRC7ZJ_@4RseuR>hc6*NKm*`b0bfo^^_U?yQmeeWpio zZNq~%L7B)UJE4vb+JUH_n4;0nXk-|E{c?a!28r4eNnyRL$hfV( zmFd;<7AYsI(oal$@OZx`W3=N-c)NfyLF_yuO&9sU`}pj-nB34*;aai3e;y>5bP-JD zUm$0Z5udS+Zk>jCg_q~0*|GsTBknAAOq9uqf|og&gHov$7%2ov%W4*KN+SzalbwP^eJd08`4V)A$?!hEJk^coySc{wk!(Vob;i(l&9`(SHHx}94~b}8hPYH) z-%2*7AU04;pj~fSUb$JkPT|3zW2{m7N?YF{jkp*gYG0T{iWT%bZ`Mrxg9fv-0@JKYyFRlsPKQ)t+w@!n=7F>tV3fg 
zR8lXjtEpHtg8h{%_#knR5GQqv79!2>DZ$Sv-QdAI9KPr1q%>~1R zYIR4Fc`J6quSZ_MImW}IKXT~v*BAhHN1A+vG?Blb)KAP^Lv5TDhwJk^JeUo$;7A3H zl(X|N^Wv$WyNjE4f?7IRF{Ydi6M-x2;n1x;mG2q!s=7q3-_aEYMRYd%?A4*`Fowu6 zj2Qu<))9j+$H;{jb0SG=gZcSD#4YmY1Nr{dcWSLYI11K>?Je9WHt5ydm6SDG5&BDJ zMdF`3jeW-iC}x9=foVW(O?iVG(}1MRpV&@A3n5np@BdMp!qfj0!7x}uabl8>x1|vz zu@bpmu{Fuffc|VJPz4W5)Ii#p@vL5=RG6`KbmR?3gC`ZLBPCNNEB9LQ+XL4j;y|9b zic+1}_OCF?nx9m(La;9X(?%yT4=QVrt`d6p(A!aGCL0F>Dz>0jk}d6NJ)c_e+G(?A zo#w~NZL6>_ZEojz(~h2t<=gJscv_zPn*vVrIWVp`J=_G*E5Igl_+rN1{Lxa!IV*{i zf0A*4i{k%Fw*X=kh)LWIKvu=;KcWpTzZzyfP>jcE9)M2Y7po8~f(VAN1ij%((7D&^Q_iKprZ6u>mH(KodfXB1Le5#cc$ruCtav#g?t*lFaiCh-HDbkZ)jo}jbFV9 z>f+qcHf>%RMwsu+DX9K(_DY|9BEATCnw#L33nVv3aLv(@n z9&`y(VU#H+$IAdZ7f+DBOYmzs`NhZFeKn+6IvY>`&!_A>qtXsgPn$seuxu+y@)4_3 zD?f1o26f|rUd{%NMF#>o0dXb|$V83g`a^S%q8ug7VFK-E62)#H-c;Ci-{U^{#pG(O zDvyNs(+pDWb7wyuw(F?Kal|h!XFy+QY7aUx0EOUQWebbIu$J;`(ZFkP%!!yrM^Gho z;mC3OI{Z-_ryVjS+qLa}WXy)y79* z($9@KqX|Oo2xYNuNvAb}Ym68XHPfDXS9^?|A~Aus6_iVlngaXf+^P{YhNLI`aQhS+ z2T-S2YZb-R@N$u~qhMcljj#9Ku@dr!gZqIn{FMC@9AbxLByQk)8e#o37&n;mA*d># zQ+uH6JeX=9ucJW6_F`ARa%KooCZ}Z!QrnE!1EvCH^(?YLn!G8q$)xjX%Yc}#P0+n^u7w;#qcvGvNMuSrx8#{as?@a8c!H7&P1&( z(>3?Unxe)mA}%5OPnnF2*yrItqIfH_(9Gt`+&t-k=IL&B?6^Hk7+L$q=A*ZAhl~GU zy^G>FMQv*9=PT92^FXU(OACx|4WBy6PNhAQW(N9}2Tk`jt#guq3Qf^_ z?zH=Dl!!AV%C5-a@8hhRuE(Dm>sg1ICpQRh@4E{>;;G!Ye~1%BhZk3D^^#{xWRmmmF~hF8pfDY z7bpk)y3ddwid!|Q=ee)yGW6($YIcE^1$aytYmM{*SmX<_!LUkLP`i%@*IpWlQ2-}Q zl{QKooIcm%wx#8Sp@J(~HSCyXfz_{l2s_8qJ}qDg-PzvoD*n|1=u4#nSUKLgs6F(f z2nkcBVv0_URtATU<`MO;L@oc7v4P$VSvKTpM{db~7bLh%Ihp-jB~|9W&(~VYju>{e zT#1T{*zmXF65_Qvl^!Y3S~k|ACJifNSztm$t)<((3Z)&fTT|CL+^0*3oAlF;AGvVp z(xvC`)^<{q#8JFo4eE7?ReEXTN)db(xZn$tBuUsv@w5Nf6acUY!x&`x1krbJowbTS z9?@vX1C^>{fSJ}r#QXlF^5bcpiEh!16eo_O-Y$t@_b!FO$OXT`!4^gv=vBG4J@DAt zV8A}Lr-PD|(bHBoMHkIGvJ~1&*P*51rdr)H6!-*Mv4rv(8c2}tL_p`aUi@yT2-D*T ziUqXj3hNJIrsGri6pC>8W+%6<(xxa1K(|}P_olaVQkK?(JE#epLc8bjU%(<{R6Xmb z$}fMokAR7BI@xN@_c<3e24RF7a;rT=5mymbD>oc;4)y$b?D}1r$MZ@u`rqk2*8u{Yn{|*GE=K#3c 
zIOw{y(_uz87QH7FR8-6=nY{@iK}vKhB|EWfmO0qSAd15ggz?L=zBAwoc*TM#n34W8 z8yRbyQB^ZCn}TUZ2-_~I-S<)nw)~QgY&?V?jXQTy690i`ZkI0hx-b%F%l9vMv^Af;)x z&*FjDA)SNZ*W%YPA~>-fzGO_%tyRFj-ph9?X$4AGlf!&=oEWXEBlZBiJJmemp0vQH zpsAJ4(1F4`G<7s1L9NelycrtAkz76>cUZVzPqHm2-T9$OzSh{A$sDSSR+hd%hVs+7 zI!6Y)x}QQ1ksFzenao7i8@>tF1qpfMfD$z58;41IPOwzK4%IyqfO`k5yJVn-#!nj@ zNXh=H{)N8Nj-QoO)f=ItO?3P5^rCrynuXz-;T_d?^dO1PL0Q71!ds4pW941KVK9wT zRoyRx(B+UpQ4d`Y7;y@=Eo$vQvHgUtZ9b3nK!H+#Ka%u35pnl=bnBq}t&lcYIIPLu z?bp}4Hx!Az0g5l&iGpV7>&#M^sjc6z*#XJbU`!VSz0BwfxdNJ<36Ha$f^7*HOtM(8JuT#M+ZY!Yzb4 zLnW4VbpOKb;4{b|pem7|2uw%10MU4Oh>H(gx?;18S}$YFMx95I=Pk%^1P3ity%a}w z5{|Btl6`-Oik3^cac=CgjYKz*1)&QhBJFn5ix5@;X@rNAA3PuGkI^Hnv4o~BgL*%V zD>UQ-BkVdEhTo50tY8FiIk!3FQHf~IXa`;5E&%?G5SuaJ&!%8)zzlq^Of$7Mpqs=984*Y% zLF5#8yp$K_7d=C>^9bkpck683p|>&XlFpcb;kRDK2U&Va9X;cy4~C>YMAGOCMnf=` z=fHR*drqWDE-ieq1VD{-^5h=HIL~ZPbG5hWFjldHh0+B)(;niZk$@gUMOj*y_~pL$#%KY*iFzT5KP&ixcAMjcWyZyvNr=$n%*u{*uyPIL)gzg-1ju#XNq z#sGY(2Ngu-%&HaGHl6*lbmslFTNwfL&!FpKCS|=YRYv=O$dlwpO|p4_N(q~_+^w;AJ|In*d2zL|Vod2}jetR) z-Y>x=$8^eJxY$GVGaD$XoPJklIw_ONK6gRbJZr|rN#H!rojq&R@sJTbK)nRxg(9D# z{7zteUxiAP6GLr$sRC?xs&d5DmgdK$Qe@)NVz-|*Iw6Qzbr=rCwLi|9cU;b_;s>`e zr0e~(DUfJX(FgawspdLweU7o@&E;hsllvC}dlV$S%W6d1lT@;5y6ykmyby9m%$&yI9WV z(b(+lrdc%wUk#xDj6>t<29d1Z;*5YLpfaSJN43X?SLw@DzSM$CpZDJ7_|c*P1Bo5> zV^BaUe1=AVEsnBvAB9Tt_0hTL_WO@3UiiFp6ws$5{oa82nX{f0dhs)7?MIr64&65{ z10b-Usr*%~A|9?w33n8H!BlZxP_Va>)3ZUB&T#GtB0Z@(q$7N0^d(Zd;LGle*wk7z zRT6^^IFK)VNh?9w=*PK8s~U#LnjvbSJspo+0FC~8Gzol-H73}va`<+jS?Oc(3gwRm zE_dabOI5IAzao#tCM7wWVSs{irA<)cs>%6di^zTL0r)q{+80mDpLQ`(Y5$kvFM6Hc!^Xygh8&BDap?f*q9o&k+}aJn;{xSaE!Bvi$tP z7R(TXjx=FDmK?QTm1D22X{V4mlGwk|+RL+9ZMzX%`u^qSBHv(fuo6RfIxiH9-H2Ha zRCOR2*-%%9bczU^S0toN8crH)PLR2Qi%=klq^3#{ag+v|3WhME3`%AdUw>Q;o!XK5 z(PIjw-ZKESoD1b*_y|Y!Rj{a%7Psb;D}P9occpzcSo5?nlVi(gG_i*wK419O)(~pj zB%u%$flj6Z=_UD&GN3}p79+VDduOc6!>Cvu*-1a@jQ7Z@O`!dlj&1tv%cBtYWO|MD zy}1TUN-~HKXA#MMzMYd+5{QsOJd!lCGglGdlSKXk%~XB^<=Bx&EErY#WHRJ$)-k)Xu# 
zRrD*IihYGMT$X|W>$5*tNPP&ZF`S&-C&d}qe|$@mSs$<5(f&2K=X*MuFVsO z=R8VJp{=Tlz}#(5TI--H*x!(#lsl`^W`BO?Hf_Ls?2IE-wt>nH*#O;b>j5L5U`0sR zN}1>FYhy_TWypPpo{GLTY(Ip+B{SU($e=Cfxn}x5$Ym+7KTy307bD_dq%|3wSY3GX zYF%{+2~&ac+;MN^!TZj;95qx`*TH6uz5Yz{ghjt0uGU}BPgywsFo@gzcGdeg1_yFj zvz)h=pUg}6EZzi(b~6~J)~kI%>I1s`@v=U=b)70x`jah>!EUR{8(?BG4$THgTZX}x z?St*JuR@XkrPqOiVBci|39xqPwWa)jlwEf`)o=Splr*SFl-0DN ztRmy6q->QvBgu%YvN_sPl)XqxWR;usweV*SxUdHhm_kG>h zzTWS-0$^C6P^G?fryFeC(JW|?w%CrVHN5)5Dw&k^VtzdkmUuTPbnCoV2g5r6Xz<5V zX(Y$8l$<;HyMsK_{JQ4g*auA7rfng+DqZI3Zu4?#>R4AbKHCg8X06_6pZ{hb0b1d@p3A?(YS~O*vh; zA*Z`Wc&ET>E&%oIy@=iJS9ct8+}aTYr&bTjnl4-$%VG`(>A6=Id35Uw&zjT8u@aXBz-F$oDzlI@=n1%)v{Z$#qgNfg3+4Fy#O5HTR1&FIxj zi9U*ou3E8$8-y;t0i{P{$Jh7lA{KES|g$Z=}-kz)=_hpEdBy>uWL<3Mij^)Y@I2me( zDrRLu^*Q-ljP*T(?P!8YQp1MBM_~+_BlSaU3K1u@vU#Bcc=pl-5>VNGy)%)L4c=hK zDfSYFHi!7wSf27{5RnG>PHiQz)7yJl?B3LfX==Tq3_%MF-*V@E*16Fq+$)p}UCury zi*y=VCT;0p_Gd5hGh_?2$<8{0b(M-y%&e!L%&zY5e{>||W>-Ck-w+F48k++qG7eFV z8gze$jLsF>bfBOjZ7F;n5XlJ5L|K)T`2)KJ{Q^QQ)^9BH0mA!;CV9=LAf-%#%@wn^ z=utJ;T%`t|WmLkVwtFCs)(AXx+vKw}p)_KUQWT?w?vFP_pPd%QLe;ah$?gllki-uN zx6O!-P!hkuZ;n%q{0WYCJ8{qJQ1IgepyluDCCr5ix-t0zXoNRZ9k~G|#-~w1ln29( zv*ZuM7^gW@@jG6sqPDspkd;hB`RL7Gst?eC4?P4Vf$z7SI9b33E?*=}ntr>A&sYl{0BT|raiv%xR03Zn11G)8|A|$`aWJ6PMlw}CoVSHMjb__iA z@8e9b@g=)R{GmOz6g90Ip3#+~c;fPt#~ooop_x4Ha}#F>F^>u^vc)_aN-_8Cdg#V` zq&v2!2?6zww?jC5yUPrqY}#h5s(?8h2F`qH@4L)U;$E8dDF0N_fOsr3Ss-u}D1w_d zx1o`jVlkT(r9j$*!N;Eb-^HO3IEYJAQ?_@s86ImhZze-uwZHA<_`;*yLPSes!_w}% z6}0_lW0Jt7ohhL_i@+E?9}K#8-5V@MI5mwi91%vmXQ#e(97%=nZ1S7yOoUxT(krLT zygpCBqZv6k*#pBI+$7$tcsbqbXoE(cA>PlB94Ll1qqiUStGH7!Bh-^iGYAXq+e=ER z+o*8a^$jIaZ$QMOUvc0`s(}7yiA!9}cxiX*`nR144Nv+MDMAK&A-nq{TfdKdeio3e`?$mW8A@L+_m!eP(UfPajG#=bGV%UhG-zU<-`-UDW&%M z`OgwgWDEOXKxo(1IFVgEn|L_K13nvCz*5}yPI`{K48RZgHOaavs4L;Ompig!76~*f z7290mv}SWjeX3I=ve)am24FL~1P{xzIVoJW)?zAAde5Y%h<`Bq+Pf^J(O4 zZlZAs?*}|BayOv$DMpcP%SF@J95h2ZP9LG2x#U!kHG^?)Yu@mL$%nHiH!SR(&0=l; zIjh@%ENYn-2!wdC*-;}iai?N!b{Q!pY>IPy?R^%)#GCr`AvfP`MlnOJ(W&zc(AV>N 
zN~qqVPF|d!@ZBu8=-9=v(_@M=U;c2Tp;!Fr`oR_qEmol>+!LWgc;N^%*5{#?$0gxH2)i#1p$!)6z^hApI$*X{-@rqKOS?{+L}`_ zFW@uO#m}!2UH*D8W=Fz5P}-Is6VQ~Go}3aPB#PZS)3bd=ub)NK2y$dZV1v`*01A^P zEA#vPz1OlU#MYYv3@(Ud&#CpM0M+&p^kCyuf?Q4%-P(r%bLa;otPv_p6X3`M{e&)_ z>y2KMXtoE&RtKO3-5vjlW~plcLreJlKkp2qDD)B#Z{7au6JZDQm9xoW(;5vc$WS|Z z2tf8(q8xZbTgRX2TO2LP_0{2$WpNzWBzvqu%Er-9W z$j}n9uvs-(`s2&lJZ#@?oWq(-tlneIN&K}y7{*aJvnm}!$2a`7HiUOp1^(Bq*YG+h z0M24KNQBdTEeibz_-4O8xbofaKnqLPV@yEJ2Hw6UIbh>`gXw}E0mJ(Mgu!%f5 zASXpMPO|w**) z(nV`uenojpzv@7c<$?tl(SQ=gS!b?aA6)tFKWRPfy8s(8tZvYJ#C9n}TDu8l48hN} zfq&PFxa8zy<~cE>R^k1amcqqw(fbC zALFbjHMCVz;}_;Idd-L@@lH|{M-&ni0khCi3;?jf9M!yBt4n3JGv`Fu~&b^-u$5-Xz(3bFSf3d3s8`XJFBjU`_daPefeAPnN2|#mCP;pR`#sAMC)E+9nR7( zfmthLguy+TbbM45kbAw2hU@OO4J)|n%h2Xx0-NqvadsVESv@5AOl8=Mb7igz%Vtr| zgvjkT>XK-rF#6skO!@!{7En2pUwe;}vv9uUXS4>{zJ}($d~jaf=a<8izqk$)?|;1Q z(n)#f2G95`!+R_7`oF)U#-01~<9}fPGb^EW!B+q(xlaKz&3+<4_+|Vjyk5kE@M~?B zzC#Oig&Cyd-n;XM)~zOz@R^-B1j$2y9VUT_E3IS1`ocA4I}znF%L(jO^cl1ka-25< zRc)SIC9o(gB*R$d(T_GiXG0p*JMqGk%X8QYqHj{Sj5mX{N5dQe7 zvoHnp-Ndszd>Y5LUE?_3Y7L)x6%=24`?UXt4!zuCo5Hv@p`fNQh}>;@x!$|_rUy6Q zbfGad=v~bPEWh^*Vz{_Ar7mh|tPu-=#j>_5!D$Yr3N_%C^JP=9>J0{|$zwqC$^e?m&@Gz99(N%0_x~@& zjtxVo0Iy8RgW2c^ZGR7R(YpVkE0?!oMIm+`1ntM)a-(qFzhS}%ivAFRe{dTOz|)kJ zd`|+}ke6`wZ(^tad{@j1)a>QOGw{1mD3F|A=anogfVI-~(ow+fM}_?JrdwN(X8wOs zf8Yeq0SAgrv;Mi?)jZ>Wr~Xifr?n7eS}qR#f2cn)wouv)s}CLs-}u+TWxho?`!~(_ zuO$Px8SrjzOZHH#*bpxW3;q`c3sYaPM0G>jOtZ3k|NAB`ePZpl7vTIKGQ0W9+EP#v z7XE&S{!yMV9LNy~6IUT7JA#q?f8AB?Pu>rtn9fQ%SN|>>aLrW}zN)!axIRwV!X)#( z-8iAY*7%>Cv=y857uN^U1r%b0OXz*S-+66umR@U(>jSAnnA21EpYKZf5qYGpO_(Ep z>U^kT{qd_mJW>QLcrn{(d>MBW+eeAJ@ju-bx0Sd=7?{`wj{`BndRqeh&Ob;9{!N_b z0mG(S2{7VU76UV}rE0_fa??+5KyYm~M%w;YmYKCDTHPlJNqEniOs(PChqw? 
zY*-SHB_f|C^}X~36Ar3q%gkh&QA5R-W6U4Jf0_Ae z3(V#ER~pTf1+1rCSvJ(nS^RQ+d5g7+!@@6VxvrkCbDF5Mu?JC?@>9nOyIl!1-lP48 zDUt*Bmex~B^SZ|>vOljlPPpbS%xIKZ5R-={W)XvPy2+B>K-%6}Ndle+ql<-e>*CDM z(Dir6+_K?U+aRO}EP8|2@rmU?1(&fs`ScI)FCtbZ7 z5Uys4#_OMfX1(lW(n;&8~s6w2B+w$=+ z&fC{DPIi3RN5ej+!nLp5Uap2{{%O#{jy9sn{-NKVHu_6Nopq?bsv)Bd{EAGdfoy$?U- zB)q7-{c`EPPZJhGCB9JvPKU?zxz@RwaCr=#KYz%vel&|=>Tv_RMSkl(=E~N|JMIDa z4E!yxwR60mEkFNXwZ`%vnQtSP&|#xj(KOA!(J!qkRoJyvJxdV{C;5f>MVfU{%$3b@ zOBAhZlJ}3taSZ?q!@v2M;{T#GK$$M35B4K(`jtaf41HK8Qzv4gv`;L zDYt##mdQHr_|q+0d#iOX3NL|Th5P89=RN)`ot{$$=ON7Pg3dqfwcGFy0u;Y;~&^U%Qs%tkf68sq5NJ^R)ycP@Ixqc z?r$Erf4b{caGf~VI zR>n*vTB7ltYe#ui@&Dz=qEDa$>(|mOUlzL+ToJwe z?~s=BdQRZ57Mc9x4z51QavX)b3I9q}5#Y1>*`+7OUL~yf?HB8wa^?59>u6IN)y@>b zFR(ScaQ}n%ytJ^8boE6^SB~2OW`9HrZ#S>aO#Ire_513K&?V#9S6W|z=XHFIZh38W z{&|JBV%xTdY|*`2_qS*F@hE2KUtSiwA0dH~nv!X(WJa(xyKz&$o^R#5e-J0RSy)@| z1);Jj(pUea{#Q8I*57+A?q@_H6(xntFLfWrn7?QJN0;_-?elB9`bir8v3Z{P3BYw- z|4(VDwNEdv+R9a2CE#DfGU+T4aO=u?VtYuB?Ogk5_4j`WAkGWPLT8qE>kbUggDB;{ zr8w6w8Acs$dXi+*Zwrd4BVMM({uiA~jL!C}eVt2BO2&y9!aZQZJif zb(Ln_jemq<&KUnK$L#-Qh4b^^o(tv7Yl8dltT?|`4D$_krS#PCZR=qKDOK@WtGR$R zv+%bTTld~_hDeGV>0ajjdee`L7=~2|S(S-ZE%xj8`;jXAV0!HRnfyS4m8rS4W4fmG zetmxFyXE7XkE(x;De9v#03ZXsAMft}l|#i%a2uPoJncW4#&F2=bfUo?}pgzO-Bg^XC*YcB`1vhP3&g2G&$WC@xWE$pevs^ua>r?aCPY{4m4}b1E zli@73d=&jTa!A!`(m(B&dRX}g9OZ8i^aq_dV*aNj%LNgQll^`z zc#3uEa{U}1`W)jcg5}e-{`P-s98mu{2s5oC3v$9rl3z|1-uX`pS{=HfS$FWb1yRF8`Ke0A#jK)Cj;K^xLrBlB^w*e>$ye*(1y+ko|jGCD2t@|KlzbhHl-D%YXRm zD)oqzb7HnufT$CQwftVhS)%{z-2A0|iWfC<;QngYy8#pFPAydKf%MPzxE)nv8f8X!*a=m)ifT9D6PMgAY zDo(fl@#%lpMsNWzFvIHK%MEs{-Uxz>!vDbu#Z;p*>v;OZ?Yr{`g#J;1{Lb==9(LKh z0zfu69?gR%+t)dC7$QXDw6-e$Y;b6h;{Ws>YQW4?gv(uUzx~U9J_r~lWXZ$%Zv3u1 z7I)$QTd~wt8SMD}Tz*RwwJ;wgwEdsO4?9Z3^joXP>I-6`3I0nZKOCvvZCh91V7~E~Yc)IDp&yDNXB&4!HH*21pukmLxs=|3hRz3YGxqGMYcy zKgDMLe>4I*G0_0!rgZP_FN(lbq^tiT3ZxyZAh_`^UgC_4L@B>X!O}ASXHfvCL`22% ztc4U2)^6c#&gw&i&WhMXNba{`zTUb^h`W_9MO7VG8{Lj9CP$?f7du{$g|?t4-iw!U-ocqTj4s@)WZU&UAnU+o~^bsjIt)$&zR2vD7i5puuLKjK7_2) 
zYCrbGA~pp`7rp}LB)(2*8y>GBxM3g0N|{9=nagI6=$@zp?r+X`^Mr=G z5b-dp2Sfmy!Fo-Pk>6ak)M2xJHC$04*Z$k4B|yw2cogWaJ#|U024O5BFsP^j7>-Uy z_|Nfg+sOJGQ)AGC8Um>}C8{ex&u@MekBMj4#u zkKdS3 zNZG*;BUFeXd;o^rE3#3~tBQZcHn*i&I~euFS^eh&{SftZKg*8Wy6pS|kpU;*8_hlz zPGHWz!1fFQe#_jfMd-tTtrt?M`iVX*AL!I{Ve8cKhHt2XcnmJ4v5)klxqXAq({cN) z;d6zxclI&+#lslVt$e~welC4H6^s*0j4vyyV`X8>j{W{K#jyj{PgN+a2n&}SOOO3H z?R2m3DjCH0jL-#w#BVGB9?^?>NwEdutDz=hC!BaQx)%V+sF^c7+)ytA5GDK-IixbY zYG1qbjGt`S!tpjg#r%qlxBTiP;xd!%FCDY=-Kt_D)qp*&y0CdfRoUYU$p%OmM1V22Ww;NWJb5=mKANGUYV9M2 zfiQn`f2KE^Kkglgrs2M4X$E7Vh*uXe8&F<+vMymuq3rDPn@l66vScPOA3%vc{<2N) z_#A$yY5T);#^$^;@ry9JX6kvSpmTWrq` zh*4`?1uoS}@rgQ>LZYopEk3_3>+dgGq6SMln!D&m59w%e=V$xnZXi6*m#8@$uAE@l zey=i8WJzSftit4KWr)v`LM)NSl@-~+;O`eeE{S1AU5tt_lg!#{vKe58lj>tt zY6o^2Y+h=_tdCLRWIELR9x#_KzrG2i3jiA)&f0rV1`$b+wHzGy-MKC%3i|yQumYH3 zQsV6)Z2miH7bLJ}PjX;9#;AN_lU4FXJhwpnNL&Ll>4A4c_S%QjHxDfO7E-30I7z|dXlde6Cghz#N z3v6cIE>CWHBTXm7^7Cz8l>j%b1yK6IBTg_L9eKQ$w>`~OFB{{$;femYY@0|v%T}Vx-(wzj-5q@0SgSci zXe%&dwl1Yoo*ZS!%@~W9PSAk2pc1LFYLAJq_mT zJL~W;S7iK}5Cf1}4`71I>J}j1sll{38lC$Lyt%a{294>at>0t>dJmrZIbItFWIL0< zS7AN)9w5oDBG9Hj!!L<122yHe(qJRL!mFEt>{}?xH33S4v~EApO(Oo{$-=#+%DFDe zb!H^J>cCeXxnK;V@)fx-;~Kzs!w?zrC8`9#&_C&*kcV_1)s_@Q#R~+{J6bQNx4L$u zg0!g`H^De?%;jfB_@%j6WF}$FYZ07Nw=mnyWQjBMyU=jFJAbYIC1;<(OE4QDHKh;J zXtxIIftXH;(CqagUVY1iS(vD%bsro;QNp+qqBDmit7t$nZ=W6_p z>jq(z^9EQMWyd;#EGol{3Fx+MkyULx+y4~Bwt)X7I>>Gng*+TDW~-gVk<*jFwBYsK3 z4zphfH|^6-HLN!$BBiPt2Ou+L7@j=($qCW&{RC+gbrsN~#M=ODscN4?&Ay9ro4*@xz0?+{Igx3J2^vb$Xnk|;a`Eut@?$_!t z*B{KtPNGv7Z=jbAJP7rh1?!ENedwEvgjt4VbzLJe22y8L8{##jMb%ZUE?UQ(N zIH;@}fg|@49;YbdrZJY)&}WpCHaF3)510(nYWE6X;!lv&u~H~v4Khs)w!~)R1i7k& zE)TaLqNix1&dzBW1KR`}Fw0AP*A!v19k=btQoLRg=|0%Dx2tU16Yg(vC0L?O6PmB= ztN@yfe+X=;<+CXKhtp}^H8<8(b&w7EbHV|;6Vj>i#3nm6*i8s+ml^;9ofR1`<+~A; zxws+1FY=3J7XE1P21e85j(n-Gc;V$n7L)<4|58{D7GHCLde)0%r^i(vDHha}m}-tG zi3^h)0GxIvxmxhRacKG^UJnG;2&Sm5@dN^K0zIlp56+V0>usUEgbQ#9dn!13{?akE zNat5|fLrv6ssnl>GU_BpES?BCwwW_pkl6J-SEN0KIHd84!TODWofDVhTOX;YwfEF5 
z<3>S$j!)7tH8U!7T#BJY-&1B)tI`1bFaJZG_Iu@@F!L-mv`tlBF2O8f=4%RVNMRJC zCW-VlwwSpXMxi*o?@`mpe$^-PSbzUf%H%kypG(A|Yd+&r?6vx3**eKNqJ%@Mkp73| z^V7jToZ$4cl=38Hc$odhN`uarX9vVL_WBt4AM3xkLG?WHaStF4ouiT7 zQ|g=t53)Y&hj@lBPA-zFcPB@LP+hx(lBQInLA=(@0+I^`qB1;r?tp+Ri?vUh1#M{p zB-BY`I6D$SLk0fyQPpb1rqoqm9Jf8fz+QD=5PYQu0}%3EeG8H0_%Rp7?^08$Y**EU zv!wq>Yh(8j1Kp*GDxpS9ZDkODZSLZN-PovJyw_p=<(whz6^V}1HwTzZPP<;czt%?n zwqFI4Gv*+;Pmeop=*f}7xG)n(_BruV2-SKdfV<6KHqi7Ni0S(DzcJmLXh6uFm4`kngO>W{6gg ztS`-O=s2nmjC{s`Q=q`re`kdC5yxg#GZRoR97@feKt5f|N==y`MYT6Z{S%WY+nA02 zUR8tm+e6>@Q@qs2VT&%vD4X#)E!gYxN{Ga4h-da0Sp)#U;=0@EQUM~gc}=P6H4s<6 zVg0z}tD<+|3Kl$0mH0aF+K;l}80q59$8nY-TC{AF+5LgJ>fuxo5J*eU!2qr_Tf!K5 z=uEa+9i0MUoT3QCVLwPgA)cs=B*!McauyoN{zFA->6)H#1dReBKSGG*18U_@v`0=r-Q z#hm)RxkI;^nxbn?ZvJKW zp6z~Bo zY&tUta%6UY3ozn#5^wGsJCUQsd|nO3*n`x(Ol?n#3+_VYZ_Gp3&wCFqD4_45gf%%z`32CH9LNRFCn}_! z8-LOG;nA))dSeb6d1W&k9Tzcq9T92~C5N&!fYK@wcRbzbqmK`5Y)6C>S;*LPP9Jdw zT;bE4!8;Xi`ldJ0Ih^;HmUI#pIaqVz^R8_ZrlL60FC%2O&|J6&DVaPrC zyO_P(7ramro9CHsP+JJBJYBJc@y>zAns6M%5*iq`Ugn2VY{k7}SjKjF%sbGZ{lqnB z1XVP8SmRs_zMpq6OZz^N>OCziZ+YN}+b(^SL7q&fd?67-OOkT&hH|;l1i!lPRRJOt6YpQaX27co8A*cIqZj1*sH6uINeTO`(Q9>%>D=9`1OVxPS{qZ z@AuqkAy_R`TSE7`9&BG-+JRdYH-81HiliNtNA#^|vz7X}XrB23p~Hx>Q$-GdtGt-h(()0W zAATRK@chXlJQmgiQVm-+KQ~-{#8-~kcT?IB434&3|AIZ#mk*$PQNam_cQ*O^`v+=( zh!JGUiUt)&Mqz&F7QMnQ*-zgwt@i_tU2%z`t$P4WShgi61C;kCt0d_FdLHOL^LAgmv65G1rgATOklB5nJ`p zLMn!XaFpfZh*aa!DCnwubtbH3Noc9Vgc?a@F@!$0ZI2cP4rQyxe6EX08rupY6d?Y8oHlL=?7TbFluG6MfOwjggUuf^jIi5d|yH!R)!K)v!Ca{l?T$% zd%5M2raez-c}rY~H%8vN)as_t^18RTA^{SVO#r4{tHDt+4yI*pu;Fl^aa)X(2mecQ zg*$>SC_nHSC%QrF=N7gxz!%(^5RFVud*XJUbX_Y06Jc<;lWMaEL&5P2pQZYOP*{#c z+_MT3&-&0OprlN8Ui=ErJCr_rmHX>*Qc%+T1#dO zAVE*30Ad0#J9r$05ryxA-VtK*P4J9)bPnUyxk?}HU`$quA>c!R9M15^m;PJ`PXK6% z)M_B&?9Cz)GLbJZYy`SR ztJcmO%Hdr>mXOHI6b`7CJXP->1qiyifm3sSEsX9_mlOSlp2o3Py^=pr%Q-zOHwg&G zs{Va9ZX12t{;D3ZPWWTOSmeqjwYR2`?y^8qk zQ3-@p9ZaAU=NCoQc9HGmMBSzK3lMN<&VMN_0Aw89ByXMubqnbjt-M@c9hs018Ie;b z^4J=Ry8%)w1@Jo?5ki-yk{76~3d{_&hRIWF+b{1Tr8-7y&&XV7w9D=18xI`8Q5gal 
zukVQ2YqGj3S3D$vD@&&*>~3hX%Dlc@K+JOCkVACkK?z#|OGw)iZIJ>pkr_Acnw?5`QQfx_a^tJ zLh`JYSIGARzz}cZh_CNXAq!RvE>)t?NUJcQu^`!~SE}UJuw{o}_u_lQH{1BWFdm6_ zbtOCE*Qd2O>ziN;e&0#`dI1MD4L0&wmRRu*)5W12L&Zlze~fE1P<@IZkS8JT6HH@a z=DZbjBSD}7Jay|d#Mm~zZQk1D!5zRB{b+G0K9O3s?B>lv8vTPB^>z=5W?+!7FFeLc zHR&|l#*>@b9H(!b!Cq%`{kS+aMsEGOAWKxM6i8bltcJd|s8}JpFR2>G`7O!o4vN}& zqSl>~4tAV)q7cM^{0(PFI@$3~KLE$seepDtZx!|6>G+z|%#rq@tVmzOFc+fGz?&uB3# zRI?>o8p&qX!GaBXcYBC(x-Cl%pzdILenw?;X`Fyio4lwc!fuD8X#G*iZ#9&HzUd`n z>Si8uGhiWR`V>6K&!w6NCY&4fP+@z1NSKvZZP%O8nb`?7k)tdwg7#tGewxLMh5>W1exEh(bdKkaM1cK#!gGAx zNN{*`O2y0>)TVzyZA|Bcj^=+C%Z_(aX-Z6&jI5(1#)b>fLqFr;2Gg zi!LFK2WIE-maoW><7kh-v>n)+(;@5Ijon0_xMBOre3#g0-}nmSUfb+zX@VbZHAF8v z&P9&Q&WNd&w#?mlxyPlu4sw>?R8GmNDYI3w`0=RSadhbhn4HD8ib{Yh)iAEC(N0VGE37_}Y>?$pMx@*IRpZ56?^>aE_(_iOo zenf}bUK*n}`>^Bn1({>5u$T%!X6mBoTPx!A)>{d${i4y_D0BsQbU$;x2L&eoK|X+3 zn5sTgWji;=?Bg{7a6YmEeu`U`mb zgO^eG5U40UHI`+bJLFz_++74hydnDwu~4?8m%Lma*`Qv3d5C4S3Y`7CD*GDyNzF#5 z3J8?>LxBU@J}w|?F3);IXOSw7{e|`?1b{i8yK#IWydf@RBThIjmV@&YO}}KI>Wo~> z<335RD>*(q8~JA)5`1T}ZN{XgM6EjX@N8`L^EGqcivD%AC&s(qmq^-tRVWI@+R!|7 zm&kF9|DhPs6*l;&>)D_uk-62N&xuxpKFK{ddK`>_COmRVyxc4&z1J#g{8-=A#Ue;} zYVe3Yv3-X)ShFp&UXAtSTpl`dZ+0wN`52Gsgm7GgKH)-$3j6y=2C+YwRbOw0fSnGT z`6KP(C7i1|8RF@M@vbAjYzv+#vtA%AlCFz23lhwJv6u69q(UNjcJGgpW-$InpUG#( z3%(sqZ%$6Wo2SvIEBMoUJ7;l8fvb6x*XVN@*l50VNow}3YGZKpE@=UA# zMpTdi#hHoPD#ow#lRORo>}?xQ;5pcm)dbFDyH_h<$;Uz&=Bs7wBvcS}XUpCXFx=65 zQC|!uRUc4U-a>YrHMdt?CNsX?_F~hddA!pHd%&F?KxvUD7EoYKg51tYtS73X=)T~q zFT{!7k+R9D3*>NLDA1bdd3@HJZ6DN|)Q;dKDl#i2n!v6H&oO$UqRCRWL*!Ui{R}vj zO}q{2y3hn-mIj#D?~eu=H3FttFjrD?V}FumyuZ-l+08yD$E(i7)FBi#0eS_Fw+FB1 zB8*!OC9=@)jOIK*thEp3F?inlIJ`gh&C{W$%*-E5U$_g;ezN48n5{EHfS8d}Tgs1= z>rY5)g$)4TEs!SaM)D7(KT&pcPz*bcV00_`RkQ@^gM#o$uyB{Hu_zSNxy z0DNTdwQvHEu$56gVCpMilhuyXR#^jSAkX6o{Nhx=lx=yx6=0dF)y{cprq5OR z_<%XPkh1wedRc0gR@_sO4J~$UoC=|D`YXl1 zPH!f-W%RL>I&MPXb9VKm!ow*hxM7IlN5s4wBR(2lbc*K3oy{7d?B6mA@{`qX|wcp>|uma9#(ZCu~bQ(KYG zIaS&9b19mkGt+jFMQWmHTvW;tQbpy7OU|T(wDW>uptCUX`5W`#pL6}>@(dX4RCk)0 
zc+Pp9{^y}N@PoIi3cd~!E~`GF{(R?{Wtd+1#CP$K=t{LzFv$ZAdZ%!rfsZN zj>3#vW&=J=NjxR1=&z6QZIWv@m?d!<@2QR?cW!$`NIq^`{h+tkl{z`j(t&ZUIf7K$ z*;V2qrhte{r$oPdV(#U_h%7Lqxd9WSnw_fXxrTA~QZo9Bi6sCmNzKf7j*xyfIdEZz zYQZtG!dgRKlL1!g0%y+bVV7Qp=v0XmBJpwZX8(m3T8nQJoMM4;#r1Xe!a{=6+lENp z)F?reMK;oITtb7+eYK6`YEP+il5}@+mDQ6o4&!fn8Mt}x17%IY_}mlYmXa8u)EA5- zo%McHls!HtGy)W<<+UHhDpW#KK@e3|O{ec;sf}Cr;O7ryb>S99G-A9D`c!u-(Prfm zM@u_MXNUAEPsB*IMPiCR)Yo%+-_G)xo=a+)_>f}BcGp1WNSCA#FpD@osPlb!F@fs+ z3~NlR4q|xewfd1RViI~OGUklMS&I>9dPzbpR5lJ7o%Q`vgmSYxgIsVt-MmvRqmhyT z8A)`F3KwCiY;SpUfHqRO-N(;Z1WUyOgiCk>HPePBREac!(4SC;Zpj%i9w&% zi^Z*${o++AP<{3nmaOgn0qFp)pENY%+}tF}z4YPzkBO|P@ZS(n-?6m8)Go=)d5MPk zI@enPC&Q<}RC zBvqm06|hxfBHiiJyvm&Ci0$m2h{Fb@yz`s5K9Saz@$2ums*`A!tGMh>=N{!s^KsxS_bcOn=U3C$uW%wg(CBD` z`XC+uw|$R!Zckn*v#$8bWN<}cQh3)Xj-9W>F(c|irwb)9L_0m4iSM7pj`I-A?`a0* zOB?o%`T(#1shqKSe65YJ86vs8(}cz3@nJKgH6wcZn(8s{GiFRv%VoY8d2y3Z+Pp0! zFP(mHN+fOOogt7sY!f_d6uvOu69TBXw8f31f>M=obCrWR&!v?1>i3;uas1u|rJca;}2-b~Yn=qT%{0 zZ@itwj_wU+k5LqI^K>RY$Vj;PJiNi4OyVx+>~z+Cv6WC*isg9-^}xd}>t2e{L3(Rz(c&#F z$yQp@p9N@f{o!y;L44`lH6clDAc*0SDVI?P3896;}&=s%2WXz(L;eRdQwcPHz z@sKewAGc4GyMK27te`_^>!?^rzXv_JE+l{~-|y#f?VkApy;$@$m7)~motw!%iO-~1 zmH8bf9HIY-9S$pn7K*(TocQYK>-2Ynt#z7#gmZ(gUbVfMK;gYJ+xZX1LU5A7aHG(` zMpHHB=a+YqRPEc|$qV9l?C$1FK9{}W4Ds=a8Vh?0f&H0hs^mExGzCx7owXBWD3{#z zz+#|poc9<<7F|DN4g`&X9F6db(z9-(UV)qT;N6982b?>O67jz9M?|#q7I4fqJvGZQ zx^*P}c*?!05`5P4x%IHyTKoNDj$AxF zpRuxu6)NWE`!LZD1dadM4b7Uzo0?jiDFCERRz3JXkg z8JDR*n|{vh9cfDf1|Xl1F3uZ%t$be!UaAOi=ZRR@1x5 zFtpXyn*EOG8!Noc&NRcE{H`exhALh)#UKVIvLbFRyGjBr#kCgN$ z%yWn_e;kq`Rq4AX;dgyB1Z6B|4uiHjal){!oZGpxol9&fA%|=}VA9y#u z&?}eldf1_^>&E^p!ldR%h;^7rKrC??{aJ=ju8-gQY20_*JLIc!;4r6WZmq#7hnzmk zxzQ3`_V#ayUmUK5c+0+{nMzXV7asHUTJXjkCvxp}HRJMLRw>$_cH(2ibDP`yTdq01 zVsbUybe{fl9FRm+dy(6h(`9wWUuGpN*b+asBl{qUpjdx(S%WU55cJ1M^s1B3H$*ce z*wd6M4ZO8*r_Kdx3$86%A%!z42-6tL$?nOgsA>>VMJ8)HZ!Oz+!_p3zEUxnyNsd*M z+ZuJcHObxpbC{t8NqUUA|RaY#FyR~+4#)tP$xMnGO`%c;_HjbrnVmMP8&t!2@HXV+UZiHwv-CaS)*{pnHdgG=)ExfE 
zs4L4J20_kOBJyPaYcKkqyo^lEeI~__zoGABil^R3y-c59%GCoSL}Z#haxUkfLtFpm zN2;C0{3^_~44-G??k5r?7!?bi(F<(l%)Wb7;*DY|&GEeqlzLy!#J=--pkh{j_mD^< z>nSVMXXb%67qS3^nVKam;QoT^5uh1#`34@m^CvR1X6RI)fm*>`XF1RA)ARJ*Gz1~k z{#`$SR)G}SbgsS9OHj2AJmJ^p<=X=uT(`}j8+XM+!6EM?6}F=eUi$lOT%sXndE@sf zO_AyK+pBwN+%Hz&XzElR7Ce{`XI20NC1>&N@2vB_@>ZU*_7NA6X=>vqeOGB|oi%AQ z`Q$oprDI$#<=AN5mV{2M6=OVhqOYlM`dlk@imq{`;oizaM!~xVUWs!z_{H*tzqi&b zF5~PSJ~!pLeVc0|rl;JEN33S-NBa3p%Ygklc*_u(!P~R8^~GtO-u1n6W!biSIIw~s zU8A4&a4ent2vR>~q6yOBdsUY&Nh@%woEeD^&CpK>hJeH}= zH<@IZ%J{2b@wlL+%wR*)xmUGr=~-WdwRVn?(38)!-Fu*IW?#)~(>5P#;~e6M;p+|e z+(;i!mHM^9Qc7|(&eS0$SgbCk{LIm_7J_&2Y|R0WExmZ=f7~r;*rF~ApoFv zEw3rP>M5;QUR#_TeJ_(~SrdZ)ahmkB2ZkP-H{Zz_5jCVsn<;2Fzp(|P_|dRHGJV^R z>>Ge*{oz@06|*x;TgmAKl?<%Z5})( z3r7REXsjKdZSNl!X6W%g`sOn?$t(v$oOh}Urx1f37q_?ooo4rypzWr&L}sM!=x-@+ zWb+J73xPR?3>E9cU!IQbCZr3?wk$}!ThW^m*QlRelKhdkH|#M-iK`*2@mD?H@-kVQ z;M|SZ6vNB04>zfl?DkxEd#XYN+AKZ>9#yf}H*KAMsE~~If|hRMakUFE!uY5QWo`wT zWiCZu!-h0YMhd<>svDYOQ)&8WZ%;&RLk!l$RNXOZh`Zffo@H?+azRD<#99-o+yVMF_yyh9uDd`FJNUEN#bqY5q%?`*Dl9O^oq{p<*zVPtlJ z2X*Cf>yt)G8Y5r4(pt7NNvImX8)~)8C?7;9?WQHf-#TcI=g;JC@}1|%g~}w+yRdU} z{V17po=VkcQ__vEt%V>Zrhb5E&(%+`N%&T&o3SY4>0N@IhotLA<$R=+uGvA802y#~ zY4DbU`*yiyu&z*yPwVPxF5n)u=AP%3%5COX$}AGB^XiQvhb12isQq9{lJlNn9wm30 zK9iS{H~o=w_8#xvM7+zE`@m6rytWLZzJZ&q2AdB~WS%99iUZ{VK_}gol6&rTV?V6g zXy5Zj)mNI?)XTgPr76EO0IXUa9;PQ~rAW z*KId5hrA(2elrM(?S0G?-(7|cTaKp4sj)sxj0m}Mm)BS0mJ0rHh^;~Gh<$zFcDAfM z2oAkUP74)1*{j4VD{wq4)KWQS@{REUMFC(jR8u&6<`xBYz3oViNA-5S*}>a+O)s7c zv)xaq6KOJP-M#;FxYb#g0p>YC=0CrCSBbSlhveY46V1nFbr_Nu$Q$>(QPi-i&!4da z`k3*ga_Vlup7R^IS_p4FztVeyKEDm}tZF6~#OqsRZ7=`0s$#`#lER9 zG|C`{_D{?_74;}&swcm?zYkOyQzsZ|y|5Sf-!0734>i?FaZ6viHN59R`rAl19)Fqw zBhSMUdT2Ck!?#BvpjMpqxgFaN&z;$!J8x1cc^mp!QnE!NN+s+{+D|wNNOR9UylK7h zS<15$=MLAV=+2?B3jN(?YO^Zm^Ug|dWNelUaK2!kcd3apZ{9pF?CrOSEyi2~SoOxz zJ%=;st!8P@XKYC2X|s81bY9WHUnoo-YwjcTj)HZJiLI$O;!=YjS0#U~Rauj%_sP^B z$}$cm=0=?8Q`-)0V*%cPwr-WYUl%NJm}%rcHfC 
zXVTMXLa)D7skGQ7O-y_eI>0ss-lNX0TRH_efBzRB0o(nLF|0>~m&lovLFIvd2*f8bq@rai@ML!+RU~K+vM8P)m zhw7Xl@i~H`OwG*Dh4Cnd=Yml6bC^`i@uVERl^m;>e#UO+r-RzK)knPZoZ$2!VePg~ zk&v)bJ{{<#Dl4zq?tZoRT@C%yWWftjW8;0o^lxf)-rY(;gv8MSH$IHfxI6?$?8rXI z+Y*w0@6|twZ0q?g@_vIUs@EU$b-{B+vMaE9=WpFCO=G}ikrlcrJm!J5`N&6Va>%aA zOn)bL@_QPeRr$uVoUe(Uy9KcX=3l-EGPG67E(Ya@B~^;f2wxhmk+o4QVJE(@Vf+z9 zb1XEJUC`JrJQv(}L1YXwZw>YElpJf~GUU0Y24M}IW@&a&%|fZGEPB}G%O=wh#Os@{ z%itj5@2?_wpPV6NYqSaZ2i`#P%8KP(*ei5s?_&JXovoN0d6PGI+xu;>oC4$Pj_ z)Vw*}VPnu=$9rGDXoQzsbDZv4COA*Kh`PIA=#kz!lp?BF*K@1>xzG;BmrYL|cGu`? z>NjzwR^TB3nZo6#PZoa8an46<1mAHgLPEhH-kHMjpXqs8td6Uj1spDe3 z_F~*uGpacQ(lrG%iOWv5yG=ZwcvRUaT9pdQP+zR`{>VfhqM+HU#nuL+bum;emiylA zckI!l(Q(}yx$l#!k^Ua7$O^7f_5t>=8);Aw9d_={O{4PAowf*_Q{TgDcv;5S$R%$o zXGo8>O>6_F5Z8gdTct$4blfYJ7pMC_&fYqps&)AvmJsP~knWHYr36$`Ktu$jy96Y~ z4FZC6i?kr6AP6WeB_Q1)5)w)&t(#Q9_|3A8=bn4-_rC96M?E{%Uh8@0nVC<_NLJAmzXHzo{R6oA}h`HS zVmo*IbNe$`<@3{LR!z7Ab4yb8Gk<(?1%GkH#ZUd$8z|5=en8_7 z0+>xM>ZtUInZBbNmZL9k!WrNtf2qkGD@n8&vs(rRk~TwSZ(;+j6)993_~cILDfNp!1qn%4zi8@6Wp-{;pt&7a3anD2@UA*3ea!3kjj-AxB8!d1k{6IrmGt4CW=xuF#_4Cj* z+*Q(HS3w!T4Dg>W!)N=>R#cLYN>O|5L8c}2R?#|SG5T9-xE#^v4!N%z%eo(na8|s< z2eEPHWZknS9rps`{pV_)dre8{IP}E*Oi0xInUBLfOxw#=afn1?)i_G>|9jp2>U{#aM?9k}Rv|2G6)hq<1`zs*$*nGgO91V)E3{}s*z=!?)!S|>+(yt4)F>(Pp})ANI{m> zYCXgo|Rg1#09+P4*Jip z?4#!5VqG749Okmg`hZqHT}LM%SvN(Z{H3lrQ-nVn?Gg_**i%lb$KFYqIC4Z&tdi#yVGaTl8KT z*?FWZc`uA1?4b92MUSX=EG(+m(s@-X-xC;)3-L`La5WKYpo){Btby*ynO8y*47@jB z=%*ux;k1lQ>kNkObkJc6w52WN!u%=w@m4-VU3&SMYgFHQ#!M3QWRz(n{T;E zh#j~DXj8Y%Zud6ROod^`n-`CtC%8VjQ|1Wg1&R9GxFUD%o^wg7Rlm+QB=fq<+4O-$ zfxnAp!amK~L|}7@b?-Fkf&C4ygU{3jmt@HbXR^73jCCbm-;a@em*RgLUrSkEOEIZ0 z0DnH%gU+0AIQP8x$R(|7d#8FXr4OACkn4)Q^+}`c!#?J#N2>Y!Om(^@UTyAxnje6OZvl1 zIaO!sp=Um9t^4F+0X9=g%#{tw%K9Qz>dtNX%6eOlVe07+wbA2D&^os>*5LkLcnh** z;UVGftpB0r8t+Elo_aI2$z}pItted4xr&>6J*IPgXBLA=-1=$m_w_dEC@hvm zExCN|SSu?rd=dj?%TAZhJ35O|b(ZCg!q^QdX&^GBYKR`qk(4>dcIT{r%khVRBc3@sFdFZ9#5---c29+FR@OIMBzDUxYn3W_K$6Ls 
z#yFX(B~e53RIP^0*avS#P6vVFdXo{Jymc)sugq7w?fUWquvEs}Lto;k{wTMuP^b{d zUU_(ox`!ssXD{*w8UO~#m)`k0y%rmnWF+QhN#4>7BxY{Hk|vO8?bRGY5t0;=zDq3i zpWK2PtVlhHXC0rvh_rZeBQ*4s;u>3q#ROhMij`Cvo7YP2Y^0xptqcc*V|Nbq~;**yJl894TOZo**Pzs9=$eouVyd(m!yoOADp1as90{5*K*d-rnR*R zP7AdBb(rTJwB7PPd?@$pmRl&VHYr}7*rp9nhE_FilOP1roq$${PRkq6cC2_lBf`FMZ{MuM z#Mpo=Gb5rbLqo>qdDsO^Oa z>JVz8#{&sjx3}{bg4ASI=k)Qln^d7EXU&>_rJ+Tj>Uuua_0}ug7uGC!_vP|h zwszk@tFk^nq=l}Ip`yHs7kz8+yaKE(zP z|Ho5aSyP=@SlFySJK?B^Q0J-6uPb_u_%f(T5E8g6GF2X8E92dt31OC5<|Qxb(@Rbm z(9`p&>7t1VqA@lL=-}C8cYJhdv$3I%qRc8$9H@jJ36#)hyuPrxVcKm0%-|E{!bA0j-g?kdVh$fiI*8U~*A)%+C7Lhh zIml<}=P|RMSyNz_hy~?JCK|ekif77mz9h`MBdPZhCKSuy2l@hj8FWeeSho5@t(Q2~ znJ`yt77!9VeT%A!qb3j9gvL-VX)yzni!xKdO2X`jmD;r65)xzsn6ggjEwtZs1lj*a;KOzmRHOZ3Ta%ctSZ6lc&(RYvf#Gd2;=2__*_JeEAR zqRu;%1BqQ7N8}aPNRo~tH1&cky3SjKNd-)i6B^z( z`RDJ6IYd2J%Na<`HqSMv(%-*OJ&3EM6{?E4%<4RSnq|}%d#;-cf(7=NOB_cVu$-=wt6>g z^oovbLL1>wcU4{D$JE0UPULvotXZhXB*fQ$;;zp$%_4a7!W6i0jY+K&_0a+!<1;^V zkO!mD(V?DwUvZxof27cqAu|C=n98A+6B*r1K+JDymVMl{XR(HWqCWPC*AqT3#&MIQ7Amc`}jAb4y5E#ZB&cGrgVtp;; zJ%`27U}n2Q#pJKe)#2F1nXGexfTfalMXFmh6^4qfR zf$nF&(~eZh7Y6Ed(qpcTD*FbJjJ`!wlPV+SvS;Q#(lu5Z=wQga+K|cD*)?!lgvUb& zE7<$MCTZh>Bt*8nkoGMMrA-68iK; zaCN;fo|n#AAL)p&RLXp4(l;R+$mb#he(CdcE*hIR9WE%NwKCQ*&4j1x<*kxK34R5^ZG!BdXkl^{WR z-XT=GO0yn{OL=Dx$1Uy6PUS>`-brjFuP?`iWCnEa&*Dy!-pJb$TgB?AjPuNODkWi; z?Xk zJUY4TU?$zoW3dI;nCbRu@3@MtD~}b`uKc(DEGzw!tvn6ndnOiB;~7b zEoDHy;=aP!6D7pH-*S2sHZ%hCT7e^4`)i@Q5^R zj&4ux$bNND#2SDOUz74huc}Wxgcx|Y!s?GJw%q(!_vWyT2G8Aw_~q{Hd)~veq9f*U zwT&P~P`p!lst-v<@p$%b7de_E{q*BKXLV&|tNZYst!Yi(E<(H2DBcIvNk|oquks=! 
z)tcq#=%X*9!`Yk9&5e$+JxA5OmU{BxD>Pm6lPl>>ZPXvztPJAJQ_fS3-SLQvUK3kw zMzEAVDM+oZ%sNe}5ZECCN;@0c_O-IMO$^uE%~wbAty}LU1TX3<)b#E?_d9v4cj986 ziBGRTx4O+VZ~PM~`^MeMC*oEiuJNx#mk9K6Xw^QI+`XpjQ!Ly+EIO|%XjmMv8nUWW zIB~K4{K+ZXq=F~*p(h}}I(zVG()(x9K!=%^_y;8Z^N(2{@hGpd-bzubUoi4|n;2j} zT84jRv8R9f`kPbfsJZYj5rXV5Kdn38WY{#>-W8jg^Y>J@mVoS`qB4v7%>> zvZJEw9n6$oBpvy(DEF;kqSc+NywwQ!pd%%s3~xTaJ6YI;T)9;Y(Fk@yG+e*f%!m^p zIfTGK@ZhVu;h70YNps?9f2BIJ2|g{`*=BAoD{5ASR*I8q-x^_pJ!VFfDM+aOxz7;J zIK7A5g%Fu0Zyg-BU~8Tfd+x1R0H-s~N%v6LC!zkyHSe-6E`G;3fUS7atL&ORQP{JbQm@PX_J=b=tdsP)p7J?A z+tEdVcT}NJ??bRwkZ+vH-9zTrr#&%gg)cD@>hf=8r{&cKeiXBjb$oB=I7^MGLmkOk18C8v(MNyLy-|S^3w@dLV~b$bINK$@SD1uBOxu-^!=K~} za8I=lWCn1f1oBcP^XSjn#Wz3CGc$A(R8GhU-H!o4tLuEmMc+_;d5#Z`Rabj z-cv1L#K45e^FoS|9A>2c35f5m9mGd}Ze(Pasd)QvDTekjm3903tLUtb%ddEz89F}` zF9I{G*;UhT-j`uVV0GZ$u%>;`(_h=KF1xp|Y*llRzRDOng-xVD_;Tk;*5s+%A5)1K zxylJ$G2;n0b|-y)>^In7!qvE!4S)abCp2+eo`!53Kxz6%M6VUDLN({-MPuk#>Stbw znSvu)0}>q3bhmEaS%^D$U}!*ql92)fuV!_zz66K7wFFC4a`TBfx+p(*97Xg7dGOjb zAG&PZLx@h72gj92yMnbM9CPLKEa!1dl2!uLvB-E3<$m6<$reA;!tntJ>by}*r=-$B zjDbmFqD8b#(qOhpj08-o0DQT zaUDuxDt(yqt zr~WQCHgTQ3>uY4mE_g5D%YB|D!I~$tkO)%8^=Uu{3vooQJngG)PPm1uAo&C#*6{Q} zVo%xt$co~vX^oT9$MN|O9!7N*=R4KQQ){Tz!k!G(so*%}tB4`~bwfWcYL{6#KjsdYe3l8O1A4U7*xDs=|; zu@8E(%}ag(;+SZzb2qb1qBhjU4B5RhdFq=GM@iOYRgbL4dRWu%18N7>+~ud@PT$D#rhJkwt=0SW2>uQ~m>iSpze9t0~Ny>5X_or=!~Eva z0%-pD*n&>dGM4cgaPsT1B2q7kjW+w$3lIxKnct5-fxpqSE9AEkIcxZ4U)NkZOu=Hb zh^(JA%Wzfui3m#LjCv=fn|fPF%CVkLpf4`O#8%+JmDM%%-9bam^?HziDb5S8Yl=BKLH_p@`6@+OBmgqRH$^? 
z7OwWKgM-b9LFf6y7*o((p+juq{CRAX^lk8P8PQD2Z$a-lunFq%d*L~+1-r%ZY80w#e- zD_ZUz+{JuT_WI7T{=*{2zOWt(+=#UAA-g!c7bV+OX%l;D7Z=Kmo9RVp1Tpmy7yObB z0qwrdSd&znhmME$%A%+x<0AGLR}9VE_4-IEC&pVhAMY5&=VRS^m*drw$3M*F$i6mT z+vAzKY(jQ-m`lMNH>3xhRkYBIO5Wa0idx-1@sTM#cG<1wjcVw~gfd~>;28qNk6uw5 zl`rz+;=CQ!SS(=~l}a$%8k8Tu%XmQlPAEWVv0x0$uK2d}Q7<7Oz)vukcZ zrPmv7XTa3~JecW2l7V*UMY@aO8y6GtAJs)3%DS3EkJlM{vRpznHMs`Y1;rbMX6cV5 zB`NI%K{F*DCE3IpG|>Z+H^UONyXtXO;}0sUx-^;3doSD<`nET4 zlNiZLn(9b1+DKDOT-&H-L5{G@jc${*pc?H8E}eN0e;yxtVOH{oLT)F2wr&+DJ=J<= z!}GqwN+62pLkw^Lzph-ddft0<|yStM;SVdQoZ1Y@`>9YgZm&1Q4;gwcbMw zsXr`ys`*u-59lxRWp%!N6@^9&cXkB${oXh|on~VdWzr-&&E-TjohLMtTV}Ukw%o9p z_yjvBPI1fiRPLJ@x(x^-0(Xl|Yq!TIgihZ(Bx{o&|6qN{W!|WlFYEww|AMZO3GKX{ ziO~>R`hE5NyB;i$^`-~kpMPVk8&Jm@a0ecABDRei-Zm44A;-`3;g1AR{sToxFp_Od zDEsriq^HU>hSQFf_trM~|BHi+)LRQR^)cc~Z8uL^97Q zz39HeG6@2;M`h*+DByDw9wz(Y&=aPB{RX>`4v~$s#3bw;y}lj~2d@l+N8zF;W{5sU z<;$MNJ$DDAuOUlwqI=>Pzf@=0P)rG&sN8qyD~?u`JF=ahSM=4Lk`HAiZRR(~_I^M< zw1_69IikmxI2dMzg+Zj|$nES4RrBF`&h`ylR?uZXiiRqm2{%=S#D} z(Wdd(etRb%b3+2(xs{+2Y`qI%0%7n%rOmJpB*sIdIw!VFR^R3Y4$t9^9 z&mstzmOQWT%Zgon3XCC6==2GQ3?n={FyYkr-VJJ+;l5CC3FjueW5Dl9{kFUzo}Cfq zGxbBxr-3=7d{)B;pyjg{WljQ06bi<1C#(up@{ls&%N-4oHp{URu zs@-_dqdsoNVzP*U5IF8ng%~n$ma*3PFoX*1Cn5D!AInbLC9_1VL~C>=AnS0SpP-7P zd|7p$P{zEf<*HyBI53(QFP(A4y`Q>SySAMl95trW`el*MHl1uG+Wi-9+1cf*_ zQhXGd>RQz?ayeO17ymKUwXV+G=_n2jRF#3NDE&Y3+5lqmnddW*6f&n=2`mUY1}$?J z$k&fFNHu`LP0q4id(2UWtAHZ7ywp;@R$@cV8St3JjPrNN4{Q7ar*IcXe30mHv98$p zDOxcb^+uWus4O%)H$}xnyoSGR<&8|*-H(=9p%b{;DId0I15NwtPj=CIw*gWTn7F#b z-0Fs8uZ1=a8U?xoC*Rq6EY z@QT|wBr%8~iCwapC1iIa-g&_fN5ZHbg?&Bh+MltFSa!D{`3Ju}Wk?fkoMFZAoE&2# z@5A)6Hr=$79hJq#%@zvgMyq94R8e*NXI_g77cyZTt%6V!48;_$&$60ncQO7V(qy>J z@Fpe~~*FdyS_p-i2!hk?)lgq1fN z2lXD$q`aO*czK+yllID_dx{XA$bs;bl(PJt@RX&DH!gsj->gp9U|KFhH;~z2 z`YUL(xLEEy(%F@7a}_jzGi>tWXXrsupkF3hr@Kk}LOkz~4;kWq0=&Kpnl8o#mzwdo z(+7th^+Va}B;@>Y`nq$^sS74;ub4IT?$5n$F zS+FN=E~%f}XEgfa1179{HU+s!-IUGNBaxOqz)J{zn`8)qAKwrf7cKBDTH`he1^vwZ 
zRQrzUgjy7qdi8H2#1|RP@QoIi=#AnqM_%;1xNz4N>q!U-Gf>UsuIMZX*u4>lx*n@# z)C7Bs1Y3mxwg(jBV^m5zbvRY|Zz~@MC`QPqO!hZKC^vvnJImcYc1@#{$K0>`!Hl}d z!==EtJ%x?V?f}q8Z4P^F`|7KR&|70V)0x6yJW1=)gPlY)cDNELcJwTQ5jsROX;K zC90pXY@{H##n_68@^g4_FN`%G>WDW+Piw(2@j5={lupb zs<7+%HY^%Vt-22F4{P4sANJQPx}$$Yz$^D69(XIAt%gX1+3}c)>Xk*LVDkVlyKBGH zQCU}ZE5QrYHT4IW<~S8x73)ul1Rh#^4(=yL1CGUzjaM z0WFfY@(NT>diB^T&!SCx78+lB&5A5s3n&Ca4&Hh|!Nynv$4v{;>xo&Z^=-{Z#DQun z-PK_a1@?90LWluzFnPw8c+ReJFhVr>vPC#LkohwUeFv$L4jbO+olS3cO78N(<1$tr%Yi|CpD+FXtv3zr!K}QXn3qfTmd4` z%-swrX?WUI1%?yT;Rn4|b<;`mT8R4T4qs+J z%1xah4mgsb2;2IxQLGZXatt9-W$14BHiLZxHR;Qly(dK~n3@44en=bdBXm9dKi!Yr z(7aBHHK@BXu+eAv*?v=&e|+W|juyt3g8b`jCYL^nT(S%mB^4i4dExOAx-3T}Ji}fi ze%~*&aolC!H-7sX@U^cZ*!mRu_{1TZ}IudcbjXU(5uJbeh4qB-(dI(L}Y zhKFR2KsjaK^~0*CfAs=J?|8gSEZvS%X2Mgy7a3Yu7DDHI{?yLPrj-7TwzSBlGnqAF<2sVu22`pa_kCL+vI z+^MoJ7HgR$v750hf>_{1YktOqUfR@5Mnip1XmbSCm}mKK8C9(zO{(V;eOK zwH+%VI|&7VS6a#srFQ00&{BxEi$IL*{*p_-%Q_t1~HEb9cXnjnOF_-|1FqcZOoa9MU(|W0Bt4uYsCF zvs9RD?Vf*41qzYKv z_dG6gR3G#?`%&DB4LUi|$lyeBhOOUrf2r_E1ARWhBJTX*otsb`w7_5qQXrI)lIBIx zisaa6p}c&ctH*wtsslrdqa(#{VDEyMdvqscd*XB58}CxwUJp7i)oq*a(+;z-0d0<9 zT^`jU8J(3Kvdr@c=x+UF``mkEO2)u5h|^QV)d~^`@R~icA(!zIHF+LW%G4f}8dVp@MkX|DD>)#M zr)PXkM_xFbr6_MUs?+)A!DJln-LI$L+`sw!j?+GaZ`f8^gUYizT%@&wS6>V&(k@Q!+qiT!3#Q~`d(B<?MBxqF|c>&(ADk3`{W(TGOnz+ z40JuS_&tbmc)R>#EP=}XbB^pLPY=AQl3VVc@KCF(38~tWjo+M>cNKrNQstb9^ z=zl$a1lj#>$Jke)6`Ffosq0%=Ie6JUn}|&+Q@ouJ0J^^Ssy-(-{h956+qY;^>!%b_ zZp~mIJ96WE=u)2gJ?^{3d(5-VCFU(T_O<8|y_b{v8uzcU*<)cc$hfeInw?g!gz+jb zcP_T8EvJwOO`BhQSUqJI!<>O4rDCnkW`dy$_>lP-H?88oUJKE%8YYZK`ek2CsmY7uqNs zlT@U^S6c8-39`R@9X{Is@*TfuTHtBm9zlDzsU4ibnM}uZJ#;O+gK$L-P{Rdp#@<5W zda~DUjHHUXe9_GF9&~`LZ?<=O1)2pls4)YcuolQ|(q*SC<(%rP5QQXn@*Kfv%;&@% z#6x^<=SCtL4ws91GN|^XoC12J;(7@GJO)$aT}fq!k-!roqb;=AWVhHSVsL|Hd%TqH zU3q245#vUew&9}6{OHn^j||_}_>!&BL)tpCwhbZoK9*DSV)NwllpU!j+^3ceD;R_{ zUlzwzjdY6vVr#IoJSx$z+OCZqZKPMNIpf`35w{aVu*lbuB-@L6K7MK8bHSyF8`0)I zxo<>vPJW5@>p<4%Sd~G?t=~&>XY*y*m>uKCurO+}O_5`XrGY9pf~jCJ&6@VC>_Z|Mg@3_@4pI*l3xK0 
zM}98S-{iEfMtWyJ6Bs}Cx6l4jeEr5m{`v;tPFw+iZDtYuT>dBmYMtHd4`|k0kmuePMj9cA~1&NfoxuQ#$E$y<7P1h z;Iz5#^wR%yVfgbFLVL@M;6GmK^ywJ{c}sIFQ|;?!NtXu2RWnCC&-DVlZWZ`kVGQN? z=6v;^iSq5HEN(Nb|M*$I7o6Dza)=PKtrVJsg+*oO8uxF$bRBTk} z^&#ZpM8nb=$xE6pY+in7>MolS21Y<(GyQaRNHQ{d%py1eOt^8s3G#gBF+oVQ<*j6h^1PzO|1d-Nfc>Lr1I z0pIuT=7B^f!W7(rbzsDibN15MD}&2-0Ybh6-XRQp$`hVd;r=$ie<_x zW?MGTODWky#OOGYZKO5Hj1-PA8d6=$E3jDFJ zD)7dCGEIJ$*2rV~=N4Q+T30l2aQ6eQa+`hm=R+X><2U`;2M>4+i<@)M9)gLK#?;H}bOK$My+RumdPa&@j4D_%34w?S-Gm!85D**j*pTGZYaY8 zP=D=%R&1j`1l`{QF5HK~18m3a-#b44G&mL!Nhtj!(&JEItp2?X_x){u#Oc3XS4E8c z(}+cbu`_}{-~ZD#n5 z;o2u_##}-&eq;@67vsR+x9Rs^jfajPGfAlB`qWa~wSV62Kc3}}g$$z)QO6Fq8vo0x zeJ0;@;@9%|z)?+vWr?Z29De|tn?STe{hv2_i3xL5;XiHMpS#kI9lqTusL=k`cXpHhWlr$NefT2;|M81zL*bWu z9wk`+{}oIJkW{KXbTs7m#tqoS?)}S*=Jyx<+r@{*gDcN;K|4E>m+1U)rN938-<}xq znu#;8;SEc!T>A44MUx9>{{10-U!ni}_0#h3?vsp%S6Ko~xXdPhU+RB)%Dze}DNuulM`cATKwML>9Zq zh3{G0-mSk~{` z|DW@vUyQT>IO?uubs9eYeUY>g*pB7D|Mu6-{Ns{;JuzAic;G~Lu&l-F;{7ZCJi!0j z$v+Fmzdi@KxSy$K2$Ff6-Kw6v@@u0a1+ZFE(*KnDT=0jVj(J-hj#m#cjMe|I@jalQ z0(Pp`yQE)L)sJf?yoOnYoF~7R-H$*1eZ}}=*Ji+%KbBL_g)(zV2NQwrKTUt|qk-Yu zrtV(AcWefjhXLVXT@%FT|8Z??c%MN~Kw{YuLxj+ACBRRs8GI9%8$f5Eyb59_5rEF= zW|igkwOj-97%8!d5N}4rqGT54sFA?b2j5SVZ@!MoX)~Ou)?<|O^XWV2-Z(rTIl2e!+n9-y zFv$_!&Hp3_kYCn%fQ4tla|0il>o{Ee8DK67a8r>}bn-@es0UMWkV#AmHlAdXi|a!W zrN8NtRQS%iXn7!TPHGWjSl=jbWjuB79(07&D5G{A$y=Na_GTL6uwI0h^`fAk7s8;4lF zsdX1vW0N66PIb33%3hzc75$p_=CjRrvky{Vo5V+pvo~v z++e`*{$>hX;{AhZ_+?@;pl{UA+YkP-5qp!+rR6C3GQdK&j)jA+X-}g6CwbK5L~N29|5u`~$KFj^kGf zi$zb>PlJ{DEI?3u$F0|?GUl#=I=G~ZD=i2nmp<{F1CgL=O{3t-7h={o;J9RPIsIg5 zmfC?)J0pK^4^n_x6CO7}7i&M-ocyF#ox9Wl+vYBgd z%k$@z=IGbkb%-4iVozm@6sGwz{wQXRG}ojs4dbyVM1Gv#0n?KYsPdQlpz?d4DNEM; zY#rwQCgSoogxfB%6~IRM>Q7TwYS}@pnvoV#HM8>`nnK+gmOs|y?_2x*JjoX#=;I{4JH>m z9x7Sa5%<%3makTl>@;DgT4AFJ-GJLmTxMYSqxPZ2$M)MwdlRI6UXl}rMa!B4j7*Fu z8Y*nQA#Ej?e7>F4G|0bBOIcEE#fz!VZNPUnycKcUYk~dql3ST9+aQxL^ZgVE46>63 zpJ+XCPk90!i?(tH`n0FKkE8rrx6eXyLQETo{0c%POoVla 
zLcI4ZM~ac-Du9}7Q2iwN_gR@^`G|OtC5G$ebDVm#r?oefk$Vt^FVo5m6Whc{Sk%`oHR1Y2(1DkVST~!C5Q0ET^Qq)=SbtU zhKN0m7c6rOX8N=Kv=1@|XNkJ{DtB8`x#r z?Y{t?&oStgQJVu&7ny`0Fy;V46Mv5>;PpRg5bDV=)&Pl_J|-|l%nS+Afe3#v{tZrt zVcW*uenb-r%P;$#H^RI-zRTFz%2W$aO+s{@B4RAf80fNzl-8_kE!iTekDTlBKd`!-CNAUc{BkhgA)ay4e}FG>)3cnP0ttizZ=rUdy0rYG9EEKLMhLIm~`02HPNA zju)SRoAT;mcKxA;MH(kwam-MKSC}45j5Wg)C|ZPXD{DV2jp%leBb9AH6=7mxHQz?a zs=d&XF-M>jP_funBWxB(h|>BEP~X6-5EsdR#(_16X+4-x9R;CYiqT{6^ei~>;Q=XX1PI|@;;PlM* zKB8Ke<5??PXiGL*zbe*sK7~=^*b-dvtmkV%tIi%rvy=Dac7_ka&;&s^!M=KQM`@N5 z!U**E>h`#8dD%35Y6rmnJ#NOlR2;hatQioZlOYM)MKJWpoj zx$8>&?C?F;`SxJA?Ltkd^UYq0)5EuT@~A;0bdxsPsUmH6ka@`=>Zvda1Yfe16F1vv z`&^}9Z2SGvTi(y_iWAZev`5hm%ULR^6t=e`Ar=`ugCSdyCab|bgAxfT4e0_vg&H}+ zgdx;MO>7g@0yUr+h@9ktJMHh@eBFJjRV#7o%K@;mv`qsGo_R(a7g(IBn~W16sRF=A z&0{^Pk^X{TEb(Q2;&H2}mEp}ZAh2X~@br)r-z|?-c-{FPZ``cgd zs2y95Z+K1Y%aj+&=%(WFTCA!${V+@i5uP>i_ZnkQG+;)ul?qLE1^XpT3Dx-`nkJ56#3lQs^x|q_oX$f; zl3bhoIQb;qKq!AyFylcWiL8V$4%hsu6FX)v#pXzlL;atz6k`0}+YEyO5dZtgk0ZHv z05NHbV_6rQkV3tk;KeD(8Vxi;^zJi*RqBXu5H(P9Jhv^rg4KCgvsZXDX)w(o41lrI z`P8h|iICOqo)JEmvLBd5kll&E%U+suZ`}-7p7;2eu=#zs@rD4HM(P~&Y6MzmqHn%* z1=eW#Loz-2aL93jIFU>K9$YvTFRe9AtfYy-Cp__v3H_$y*RyV5(#A%_waAA}u@ z?&_C;|AMvLMf0%BVC&8{B2D|Q_UPS0d>@1b^`!*Ok!prRW&()c^9-AO?zkP7HrdY zhX)D>a)=e!gk^wsg!zTi=E1|pve>1lujgxSerjQ6W2IqZ2ucOYtErC4IoW=^{u|YU zAr&b&bzn}68-e$n%x;tX|{3PNw5a40D_fM9PRZOWVvpvb%4U zQr&X-?15zf>-yXJocG(dyOPXg8IF+j$E8r+sIauyU6S*S-bmU6Rl^P%SWHs*n%`KD zcHm(MSswN@>!S~evPHP3QihqnD{96|V!}aq9RjIg4Y!KDE!pQ`wYh zIg#MtuBkLap?A3TTo`Y>b^By`PuLxVe%eGHx>(E`>9KL<+dEpDTw6v9nJYWvPyGZTV!r#e*7UyI%wFL_GIYLP3^iYA5))*Fi^co@-!Q zNj+F4UY10=H7@;Qx{xc^xUfxA&<%|CQJ9S&Jq1-7s}0>l!LoN#vog89jM|$nHx;C5 z320Zz4%v7Sb2$xJck@0ZEbkRndoPNZXSes}?5c$NeR54zsX$6jX-(G5K z>S(_Uzgvih@7N@HFUm5FxUh1-^<5x|!ricQ%j{7-Eukb&5{pe4#~%9y;CxS?z*Ii( z-*;R*^a-d2uD!x2l|r(^D`be>m4l+*{2WbF{dg}0GkVh#R#~jf3l^7|;Hx9@SFl`F z_N_&P*u4tEE^s8uTu)BEi@F>hEK!3e7d>Nr3X>9>GZSaTSgRd5fKYK6=Rbq*5j#2S zoGRB*J%`L~hoVv?`K*LQ;#tO=!}UuS{B-Zu3&)HgV!W?Vu6$J61gduEwNZU;^39#d 
zA_{+8i~wB|(bgm1{nxKe4ST7oF)0WxSEHCqG1v|%5V>9k3Pb~GKcz-yB5?p-GZ*$% z{SAHdFeSuv&hK@PN+c?9caY{4*uLS8+dsU=CS-tX@WPAKSIb5!-M3$Sr3`u`I0|e* zBZj1t!p5dUTS0yiwyhP@25(s}oOTK1iD^jePka5)l)X04IL`99SBqB~=M}Zdt7Edh zq25-n9)1}1Qz%lDY&n!o!NgVWc!EOdrsK$XXJwYw`>;gxOv(uLL?)F9SCl*Ny#`}T zz1nnCqw#v!da|+m*V&zD+1tI5xZxY@zy=?_NhbfL@!Q95m}7h~UW~~m^UEiWj>fnO z8!J8Boo$sE?X1K(eROOjH_3qS!QR-Tn?n-s4M`c;GmhPT=qUA=#%tavsTj%6eO3+M zfX#Pznv0{bWCmMrbPI#AyIfL#WR~pFq^FHg<||md0fl6a=ACZ`jo%{c$%Nqe_ic7q z9~`^V#_+?F?T>2VnPSuE&omsw(9k_)nCcjqfWF(=s3uTHn&!8O^&QtvATHG_PJVzp zgEmUwE{Tlt*|)u;DqClTzLh~!U+H-BuAqBx35;3NZ_bgx5<-)`K5n2?tAQG8aVKQE zRUIYjFwv{FRm8iFRu6udC%XH>ej_NJ@cuMP=MkryO5VUkKZV`KE&5nKeXlD)I``2T zV&H+a;#dD%#PV((sxeMH;~9OIGE=K2YT2?b?(nRs=o_d47`?dFLlR#pr3Z{Pcw`_1 z$fGZvGw0t3l>}p;uAC#lrzN>UHKcA@WPOq@w1gziL9|GFa{bE9QzUNIlO0~8-KC}a z*~BL0XoHli%HHZ~9W<{r3Ct(NJx3m?Nl$1M*5|mQ+%-(jnt}_#U41f@xzqd~YDsO+ z-=K9Elb}r=>k=O;1~O;kN@Dp?8683SzR&-w?7Rc9>fb+}O*Un3(Xf+|y+@QZWOYYI z_6T)TWM_{wNJc_RR#_F1Eo5b6Jysb}?y@5KT^}v=Jm2r{`Ti$5=f2N5pL2b#@xETy zoy9AkY3xTG_c3?hcYMiNa~IlDk<?w>bNuV(2dYA|GuJz>r`N}( zi}m$IkZv`ZV(i!D?3)QUl3PzN&lQWbO1RPfN~nCe-4etFQ}oTpqRLV~&>Nm5arT@N zB#i^wNxr+wlsAU!?%f*}UQauMk1)qCN^U=9vy^vf&EBTCb?&2rFa?g~=tAvkWG<~T z*Xe_Aor^j79VH%l15oYQBi^ceJ2|rXoyLd<5+n;F)h%yYH`|_AWtKABOD@c;C7(Du z0o&c=(P#Gxf3f&IM_H9hV)D~ zt))xmyo|)#YqWBg=ufm?`dDbs7O{15m{acY*}-YESW3p)wXHL(k})DUw@hLf0Y_a#%!R%ZF9NUdP~f=p zIw*)5qxWh2LxNb6rVZuIARJ3|MfJ=xXAYrp-lu_tNUJ~1Nh zqtcS6cP5e0Vy;insd=dP-z;YS&E`00ZLG4e6^*vooSawpxhlPOH)s{>tL7V^9kr9ZpP<~--EDvYFUFG;NK9(Lxf9YX%ASU!l2UJ4Aes z2V~M)mj(ifcuLP}UGNc#5faz#78CA!kHW%Zd#>p|b>$kG2#dO*u3#2AyJ>hx0^w%O z!bZm4FA5O+n70lbyVoI*os`pgi3-| zfI)9}EE%QljN>CR?EL~cdv;@ejgC0h&WO`bk2gWseOMw$DslQyQly|WpKf?D#=Ri&5$!s5w7|%2 z1Y1~TVk9W@rmX3vQ_-;uy3qZXrFCqYJYVdOm2j&4e8s*@k$>q7j9eLaQtgk3wse^24O%nS z5$M;?zNImG1tW-1IU2h2O^;q3(NQ;_35aOp*CSMinE8eDSB(hGb;>>X2SFrPsml=-yP<~upe?K z1gFI_J!^G3`4b)Ju=gUg!acf%Ybe#ijE0W#Yb>mOZQzY2#5{a7%x8lV6QmfV3G}f= zNlYK}7Q(}lBAEp^=-kP_|Malb^ov)qq%X2|9?cM|oq<(qBz4Ka-=OFnQtn%>!~G2H 
z&iUd7n6_grd_hTFm3Vz2<-=k^;`P?P_uKP&ktA}hs*ic2c@j9dIX1Kzokc2gJ<|nZ zuR~BhfDLeUBhXJv6(SFOu38(@$FL<3OiV$?N|J8!WA>#CX*%u~sNEzVl*4K|Y#93% z!v;f|jozNCWydG5mJXvS@Oo>$Df&f330XP=-44^|aw6jn z$?QgnPIE{;Qj@OE1z!#koMoqC)g{+R>dh^gfoRt6o|JZU$$r8cq?m`(3*hGmdwoTX zIlj^{PpS$Gyhb6-y0)tk%qL^cyS3Z72rGbvw09L$(>RM?TPJF3?sF`K`G*_dop(!;)o z_fgN@;bhcK*(Rq?yq0C$F2g!s0;}=P+%o=H+Ef!3e3q7~cG7E2q@>zS$6}Cvb8zoz z>PsH<-CAw1hK#mN3_&T35viH-0WU!$$FI-fKZPil+i=Y{W^@W1>+Ztl87M~2SQjoW z=N#D^RltQogY0(M%GT#*`N~`}?uwGBRcl8|+e%)6s?lAkbtwoU&hoSppL`p`?%;SI~X!Fgt3w<*ha}!VwaYnsxTdx(OPFe)5*|z^KIkPUG%D(7bvF|W@?*H{3cg*}#4>;axI=F18Wg<0FL)iGl8vsZ2s$pw-VejqU z3=>>7#?@)3J|#1h{cTO5B<9b$F$?O*p|yL9D&C-0La_Uo>-6TyxZHo6^sZsQj?$YCriDRydcJ~Hvd`v@D7dbN zKLnxD-H9H4G8&=>fQvMT)?qSjKL*#vpxvC|WlEX@`(ULIXYvKOb!d38=0OJ^-&THa zVA4|`t>>9jX$X$QH#!@~CA#ah1(1=*VVE#hiQ>gJ*O0&~8n)-lqD!iidacBaxrc3ZAN%?ocAc|WF2>$ zLG@*}q^Cnrs>M&H+TSzTg91HYMht&=rk>ZUl~-#0CjeJ!ql68!d4Mqw!JsKNsM$W4 z$3lQ-Bt7*`^4f!CgXqY{y^|zJi>ov*he(&1?=3IZvhKi4%-u@RO(}}Joc&>eREXHAEOFz&n>hPZEoz}!gli#hapU2K z_tRc`tXv5R&YUf0V!6HX3vegd=6OZGY8QD@N9iQ#AY*K@pFFUI(tsPUQ_T^=`_ zf86RVOqKgocp7&V_b|60l&rK{&vZjm_G|mHx$gUUbNQ?d4?oU5l_LC`)4@V$ygjIX zSB+as-|AvX1v@;}N)u9%IL`!@enB3!J5(WU4dRfI)x;F{<`&hA?kBV=o5{I$U7FC( zm3hV)N-yol0z$AT^U-jDZbVSMY=X>HvR2u;H77`<)p{-1WgeVmV!bDVpM*NThLv#-n}o@6hPtLD zg0mb?pJXJt%A+0^BP?D-wMn^kndXkrlP}NIJR+Uzy_yP zk)=WBrJmGk&UEO#h8Tv!VnpmCMb{14aIlYe9WaP&qNCLprsz0&Vf(#Lw3=+Rb8|r@ z%w@L6$zQy#6hF#V1o8mM`6EoRT%&4)0tGqaEP1MpYia#hc9?S)fybwh7Zi9C!ZbwU++G+{@=Q$N;7Le-Bl zlR=l?1WHD#wA7}JayZGO@AN0e-R)gXb3ACx54NwX>{nlr9vaq4G@VnsKG51Hd|Fl@ z#hlJ&p_jBB6%-K0(c#UIuYNH=i_Q!CxH|kEe5))x3%+ zd}+%2IV9D_tAnrnX+cVfsH1@E)GAR`nBXg+MT4-C8XnJ%C0Zj+Q_I)N!# z(vF_ueyG#UH;cD}nbEl(Q$~|kYhKhnwbw4g(PJQ9%tGbMUg_e1T6@%a-5L!O&pTfp&!Mxvl+_t%*tC^-z?4Z*_;rUnys6=(j zq9kNZ&u1P^ix{bAAIuZ^82&0a>AW9;QdOBtw=PrDL#OL4Y{`2sP!{AS))D@(2%(fx zMfkG~2jDcIIuyV}BDtE4JYJwSA_|m38IC{%hcYk4&tllZuO`tSr>t|+^f)tG2ct=L zuIH>NCLaOq&%}eIxz66S&I=!_w&(GsSlfFAH8@UX+;~}7y)9L7r9E2Z4u``W{xwNc 
z;9`i?v(Y03L7SGWG=na6_@HfOLhx}N28{2ghw?gY0ZM2rqB#su*>jSC89L-M2N-*5 z?%pW6t>-!iiU-g2Z_N0BI=&=(SHaq$$dl~$ZCtyuOzNIPi73m~7*tdCCs|De zO!N%~{bgc?5gQSC#Uodq;Ub*_5n>532wO~P!p}tJsS}PJ<6#gvtJvM}DhEb3?y*kG z(8sY=l{YB)DGrF^%qvk*RpB0t0Omv>sW7Zngt#!w(Zyxj|5(6#Ntynx6Pe_PluMu6 zzCvUoQuzoG;w;|l9n41DihR*(L<&cnTBUn3k}&AJ*k!R}|4CbnLS|vu*c?+}%$}iE zllm$eH9B>*(}-M^e9vbDWk4(PAL5}YPWy_c@krQW5+r;YpEU1n3Pg1=HfT}KsC9}~ zT*q?uqd0k=1e3_El7X{L$QjIs5nuD#M=d8{dRW*MtAhoM>Q|Sm3--edp(-fmF=%vTKKcXYF9kqkw zS@eY#*jKXjWDuP{%_evj!+Xq$LBG3bPt#jaRovPxmsPd=-Yb3lM=l_L7%a2M@iR52 zQ5_B#6MTCa<=$C`jyxV>+`VhR{2VB{40h_aLBk4x~<?DX$Or-RV94myN zsOw<#u6F?Hq1OIz3QMpbJg_?t<2Wo~1a4ke(Pgc`-7ZG)JPwE^izI!hTwc}cv-rpb zhWikfLE|$|6l40v#Hcs2N3TK_-f08@CzzlzXmGtbf8WlJN0xxCq=;0L=8)cuC{tVm z%|S$_Skt~Mw9C-Ip?({x$>$4mmq>_lKzmILMD%`OS1OW{GF(DL4bV68?Kl-&zlf-s zmEo?_a?mk12fG1rS-B~MjnI|mv`gZeJ@a$5haYM@8jZ)wKY%vq1N{QjM}szmC;ei7_12L+v&wMnG2R>MSid?CehfuM>uFcQVHgBoZ;$;f5J zFXpM#uMQ6Ik%i4SrCE%Dc`_NgkHjuqH_M@lXl(a%(BcH-kyPu#=1Z*Wp7W$>y*wKD zhP0@Z*flcea{MQmsm29?Q=bo%3+jJCfZL1Z5%|O-oHesKxsn!pIKiKWAIfCY|81H75|v9wP1yc}`00fYLQ~XFRsDC_ixTQ^(XI0MK%1`)GM6T@8jjIvW(8DY6`{giIUyp1Yqafx z*@z1M)50OQkyY04&q3ZV-t7^XJN|q6}g;Q=V z5(EHI3E~j63 z7i5zHdj*{2hNGPFX&nLlEh;`%W)et6mdZ#bT67}2KS&g$4*c0Sg?P?o^Cy@3xsbKj)wE21h15vu zlX7sk4N5*qqLB^x@Y<5poJX31Z}KHKM23DvEyk2QLetIX$aPkyqIstS>}I6862ZMR z7aHX}`*jl->eryC87-A*{v9wMj=Pc zP0S0B^BPMJ2PNHqg;k~&^5AEop^dV%CNx~@Eo9d2Ha`vs5%%_OxL*yrLa_K|=1A<= zZ@dWKxU1d=wqpDxOAW%X#x-N$%desl(gL0m!r`@$N?|IkxW{idRtILVzup4F8s1U( zt=zGpFQcZ(C;QD=QaTSwz#eu8-Zy%{b2)f@FG~lS=Qwo3g72$mOHRF_@=zg`=scQl zZ1)0oF?p`_EaGtoIba&c_BIdTJBcR|j(*d5<;jJv7X`7q z#rt<3a9EhXG0XDw$!Z9-!>o)L)md++69SM?`+R>2413;_!EDx$h5px)WmEjKXUJY% z!Qsvs%auUtsa9X22Yc%lcwj5(wsoJ_f=esx7cqFUMGP}zEyV^bV>{Z$2pzulN^R_Y zKW*Md*G1}1a5UdFR@>b{^`*k-Eu;G}RK;n8Wxnb1Trh7mMncFY;bTXq=;;0&-F0Kz zIDeT;yPRvqx@?^@!O z3=BN*`gRiO+5xKre22om&K*PA$Gb=NJ!{%`-6}K9u`GS4rLnMa*6yTSk?Yf$k{3=H z9eo7_Az7vEPwoauM)rk%Dzcvmn>~|v`J4@`JH)(njEPebkV`49j>Git`O6oXTscI! 
zq8t~z&gV@jFzqRv6#dR4mU;&b`CWU1`JqpzzOI~*u`fhOiooL+Qr+ zq!E*`x|E+q3(VkIvT~! zU_8R^1e-av(HDdf4+K9l-4>5MIW=vo-j_$P+Rs*z$rXIpeIu-``BB!QZ`*z~-y zt$48nMBX)t?JMBTMRzZ(r^~m-aSn?So_)CR8)1B$e7 zE~TbqSGezYz6AWkEESc#CV9<1Z=^H0@d6K^0fo{a zmDK4;W_*f}1-m|eoQH&+vzAs}c(VxPwAQBU_7x!hVzr2F4;bl~`QWPcf_jeryju~z z&0KO?BX~r~jf99Mu9O;2A@HQ7$!gji^snRA2@oMx!5r+i{4I3PNcW85XMK(KnZFam zAW(x_=qCLj-Hk*=Hwvx4fCXSjm1AG70_&=X24HHYp4v5A1|S6ddYH|av;~v)b$q8H z8PJBa<2okO2+sk;ARlz6+YWMM@XDt(+w2^pu6kUknGs=ma(m7Gm!CUA8m=oOv9V6w z)wR&V#%df{RA)T7ZJLa^;#1OxZDJ>64rbTqc5G+zXP#u8Wh226o`Wu?8+1k7K2=SO z9!;`4bcBkIRqT>WhMe$8JnoYf4IHGy#t|;h*bi!l;~)$kQ>o?hI=FNEX6qHI=p|hD z&BdEKY-7~#ymaJoY@Z+NrHbtH$_t{L4v7B@YsK-!C!u1~Zx&;MwD)(`$+yY^F|!|W zja$HjAOqAqt;9%YHZ5O6REBN3&w(#KihP;}lfhN|3F$?uJ^=hf?iLoN#_{->U`$3| zK+&)a?|B!vT_=`n7bXB>=K?BMb^k1{)3Ps=N!{C-EY-56nXsW>K_UOAi>R4Z{J&?0ZO(}@F0eia0P_wB;_!y(6_3Jkm zL&l8aKq=Poq5uHkCn(UA%IQN#DdJGL{Q!E+{C1ZxD?aS=qX|=p%{`y9SYwL|W^|s& zF_Ay_=2{%%kgn`p=J9uIZ-=;y-h=nc>ABE6s{5o&Ecb|QPKnM7=N@S-S;+yHY8SAj zB;7L;Liw5$;gh!t7NKb;Y&pGiOkd19s?HYN`(iD*m+_U$l?&8Q*%pbmq zXuIPSMGkh@3)!OHd!@$?cNu5KYco1Cm%z~-)8(IXw_1dRz;cX&6lk3%DvLNUwkCl& z;1DA=@yqR9ymM#Rb_$i*imWB9-a+A?nnQB98eh>Fn7T|Ja*xKH z`$?tSHvD)w6VH({(KqT+R!DG<=SsXt>ukmuOW`m40ntUWb z!0Rtls^zzr6B~pqGverlU?#`P!z>uAUM^SbcwxFRP#7M8n?p7C=8eM8XPPN)87_)- zG_BKYS{lQ2CVrZ}#ytz^uTqmmYZN?3BG04t3DLu?s71l=nLE&iulj`QFzcRQ0fVdA z8Q=aPzeU9;q>cj+GIKDRgm@I?kUj=_LdZk+oI3;>D(ms<+4L?Ia$Z2wyaY~uAhf;%;bGLD3n)wXjE*Wun6qInH?`b6HO(c&%v+-a#^9+k3mwj-6UzC?~_ zY6y=$-)s%X1pR0lPgjQS3e$wb!nic=jzH?>0?kv=j@(&XYpA(Tni09jSVLN~?aSP{)_mbDL{m)ZI%VKM)(NY&IQ?GN54ZS(3B z%-nmoJ#Xp}8$qAWAd{PPD~%G*@HmrL5XaQ09NQR_a)o7XFx@cBI@Qe-t`(?IEta|t zgM6rOarSxx1uji&1=4{&4l>5Pgtrx3#^f%u)?an}Xiomzn0jTT&4Kpz1MS;Uf3AOz%x1=5Xr_1o{y#ZF9dy5TvVT_s z8AX0;WDO)aC8wUbD`j+?NNwoV@rk0Wr@wHkHPCe(=CwSDCR;u#Tpzn$X;>+(>hqE7 z#dO^5VX7XoKHotY{{%GgiVj-tvpLaCX+y74_L3oab`>h!(}XTH$%*-nH9Xcceotif zTq1Eb3OZ-boDN(6I>kG1=lS)syW)Zzt0sFQEKDs{CSFoALmkV#^?-b7CRbIH-kK3b zl}=rL8fICcY3llqSd5+9?C)J+esoa0y5`$;P&)_8Ty3(1G~8T^Hz7@|a>TSfieZ$D 
zoLXDgsP3w@ZgtQS17_bS&H}k(rG|Zekx)_+pBrTs;Xs)%CWVXG5F$oO*01}EeoOMWAZo^F|3ZJc&S#}bjJ{212MU*vQxY;l5I z9jel5OkX}>cJw?VS1l_XzDA@>H1uXJP($8iWPwnhmuDBo#v(E3A&t&3EwQuS!gPGZ z-6Sk)O(*AUJ%`BfehlgL7kxT?!jCAWqD1Qb$HKGx6#_NyYLx0bN2xa30um}R(5vtf z?S;Z1w-hg)tnn}KVttz7elljNG^fUy(JB^96IDL1f=!qopSWl(H~b}4;Z^tw_jPqs zjhplSyROM}axhE>#RNQyIn?CydVXA6xG?;Q*jtxm?px}*bz%oH!_k<5@z_w;(4duC z*P?Ee81XTanvxezy1Y`E@QKZd?Pzi14LBOCf}BxZvzBI(9lOp)E1|186&j@)iq7s1 z>E02$Z|%JG=vE%vr!otzQ)U~lMYfMC)6`Ud#Oqs{*uPGfH}Hf9YSg<#9Tcd~EzK?J zZYd7<6z&!PFGFdASRQ~B-GYOj8D#s^bbgiQVB^gFgj+}!OPWI%^y{HFUFVNSnF!QB zFGYiQ2gP03GpQdQJ}+Hs+hM;#HMYf;RwZL>%scRMgatSi-t&0RA^Pf(zW-z$+}`b> zW!ZRqQf95Vh&08ovt&we-Y_ zjIVrxXox?c|6z{)Z=u>ZH|vK6jLXlDSG2KmyJO>~q=9OLZws1(Cdx%PSE}Mz)TYEA42_CsOjSP+*UJ@YY{C*BYOfL%5AyoorzBQbooPndtr88lf!r=e4TqjipooYs<{s^%+QpmrF%oL+kPih$R z<7MAC=iv~D*i_DVK_6v%yO5t{fY1P}CZe-F zi3HbHq6!51r=*e)`0Fj1jq*QaPqx$Q&OHXz1kQfF!6n zbZXf4MG+HI`=K&08^jBbwIOrF=~angz!6fTt|HXkd;bv zNmuYnFDOwkN<6FMlWp|KW`ncv1nsTk9WIL^s`}2F}#pBx|FEebefO@X22Z zVx+H2m%cJugOD7xwO&S*KtU)6AYdsqX$GD4po0ir&L6iH1B zVUimmln0830<}SS2Xs*aS-{EOyGSm78mu%7Rw6_^laQ*n!b7r0>f{9v7VB+y}g4V80 z=l}8Jch-`PINr0|^UTaZ>zd*r_>FT0r?0GBz+i5qSOSBgyZe}#wR=%%R8Rg2ozkS_r=lbXC7p)|mv+?6kG!?N3+ANoPjof~HNxwt%`EtMh`SW5nedq7z3r2I~Ro@;4 zLo&b9Eab<%-Sj;+-{wuv^ScRXV}j!&UB!Q=CyI&^3LgIld$h4bZ9WMt>>)D#uT{{! 
z8mARC{TIi4KO8+t7f}R-Vv4@$v5Eb*@yucf1x^0Hb5+tH_8TdiW#TuwP!So^uLt<^ z;r+bNKY#OW0Rjc^q0q5kQ)GU;tsD26K(CVJ&j zh%WV?@lZczKwh7Mn3$8ft!#T%vA!3$b z(T{&s>Ad_Nr!q-}{)NxQo8XIJl$7+A{P;htT8(`+-vBh)E&0`@Xch9GIT12}q|C#A z*fD~Bl5vqsf0{O>iIqwS{)cYvEIVmd>(lwzhu7YyB#qjp+Q;(m+(&mE&dJ~j-jm$P z@BAlZ4Q?-y_VWCz^}F#@e%>QT;5p|y+DL37t8)20B-H(JkAK<@*#G`So5d)V&`f4w zoxpKZHQai7Lc#w8CNnu;2HFtYd*uc!pFTSY*xKp*zTCd0dD@imlP)TB=;HCaP|)G2AiE_O+F4RtyenO;ZCB&Hn}4Ahe?9$P5@JnGCg@DMqn9IlCEBOv z(Mg8llf;@8AEiBS{SCs2)4}M8Py9V2A1i)k_}44@Yl`6&W(R^ z4U!C$=>71D+GYsKnunR~q(tQUtqP*4S`?;TmrDD;(LR;zSAFH)*#24uN?#S)rY3mV z8CyL*1#tergxQeeMg9+bzknzuTiigZ{q604Sc8q%Xdj;V&pZ9o9&UDI%(6hrBK7Km z@9cLq0|h01@sZ!3Dq25(?t9b%_!yCS95dT*qvjP320?sts zBx{4G{Kxlx674JT&&}gMaD7TQkof8iNyGGyNJCVb)KKSNK7PL;B+Mv@I+JaDhy7n3 z^V@#(2ag#oXbOJ(?tioI!+$MA!yCH`UR@zH)={$m4fk Date: Sat, 21 Feb 2026 03:19:58 +0000 Subject: [PATCH 07/13] feat: update dependencies and enhance configuration structure - Updated `apache-tvm-ffi` version to `0.1.8.post2` in both `pyproject.toml` files. - Added `pyzmq` to the optional `cuda` dependencies in `pymllm`. - Introduced `pymllm-server` script for server launch functionality. - Refactored configuration imports in `pymllm/configs/__init__.py` to streamline access to model and quantization configurations. - Created new configuration files for model and quantization settings to support enhanced model management. 
--- mllm-kernel/pyproject.toml | 2 +- pymllm/configs/__init__.py | 15 +- pymllm/configs/global_config.py | 606 +++++++++--------- pymllm/configs/model_config.py | 31 + pymllm/configs/quantization_config.py | 18 + pymllm/configs/server_config.py | 266 ++------ pymllm/engine/launch.py | 116 +++- .../scheduler.py => executor/eager_runner.py} | 0 pymllm/orchestrator/async_disk_io_process.py | 3 + pymllm/orchestrator/detokenizer_process.py | 3 + pymllm/orchestrator/model_runner_process.py | 3 + pymllm/orchestrator/parallel_state.py | 122 ++-- .../orchestrator/request_response_process.py | 10 + pymllm/orchestrator/scheduler_process.py | 3 + pymllm/orchestrator/tokenizer_process.py | 3 + pymllm/server/launch.py | 17 + pymllm/tests/test_vocab_parallel_embedding.py | 24 +- pyproject.toml | 5 +- 18 files changed, 624 insertions(+), 623 deletions(-) rename pymllm/{orchestrator/scheduler.py => executor/eager_runner.py} (100%) create mode 100644 pymllm/orchestrator/async_disk_io_process.py create mode 100644 pymllm/orchestrator/detokenizer_process.py create mode 100644 pymllm/orchestrator/model_runner_process.py create mode 100644 pymllm/orchestrator/request_response_process.py create mode 100644 pymllm/orchestrator/scheduler_process.py create mode 100644 pymllm/orchestrator/tokenizer_process.py diff --git a/mllm-kernel/pyproject.toml b/mllm-kernel/pyproject.toml index a8dbd98e..77340b29 100644 --- a/mllm-kernel/pyproject.toml +++ b/mllm-kernel/pyproject.toml @@ -18,7 +18,7 @@ dependencies = [ "packaging", "torch", "torch-c-dlpack-ext", - "apache-tvm-ffi == 0.1.8", + "apache-tvm-ffi == 0.1.8.post2", ] [project.optional-dependencies] diff --git a/pymllm/configs/__init__.py b/pymllm/configs/__init__.py index 86af57be..a23de035 100644 --- a/pymllm/configs/__init__.py +++ b/pymllm/configs/__init__.py @@ -1,21 +1,14 @@ """Configuration module for pymllm.""" -from pymllm.configs.global_config import ( - CacheConfig, - GlobalConfig, - ModelConfig, - RuntimeConfig, - get_global_config, 
-) +from pymllm.configs.global_config import GlobalConfig, get_global_config +from pymllm.configs.model_config import ModelConfig +from pymllm.configs.quantization_config import QuantizationConfig from pymllm.configs.server_config import ServerConfig __all__ = [ - # Main singleton "GlobalConfig", "get_global_config", - # Sub configs "ServerConfig", "ModelConfig", - "RuntimeConfig", - "CacheConfig", + "QuantizationConfig", ] diff --git a/pymllm/configs/global_config.py b/pymllm/configs/global_config.py index 43783e94..1761697b 100644 --- a/pymllm/configs/global_config.py +++ b/pymllm/configs/global_config.py @@ -1,349 +1,321 @@ -"""Global configuration singleton with all server, model and runtime configs.""" +"""Global configuration singleton aggregating all sub-configs.""" from __future__ import annotations -from dataclasses import dataclass, field +import argparse +import types +from dataclasses import MISSING, dataclass, field, fields from pathlib import Path -from typing import Any, Dict, Literal, Optional, TYPE_CHECKING +from typing import ( + Any, + Callable, + Literal, + Optional, + Sequence, + Union, + get_args, + get_origin, + get_type_hints, +) -if TYPE_CHECKING: - from transformers import PretrainedConfig +from pymllm.configs.server_config import ServerConfig +from pymllm.configs.model_config import ModelConfig +from pymllm.configs.quantization_config import QuantizationConfig @dataclass -class ModelConfig: - """Model-specific configuration parsed from HF config. - - This is a lightweight wrapper around HuggingFace config with - additional derived fields for efficiency. 
- """ - # Original HF config (populated after loading) - hf_config: Optional[Any] = field(default=None, repr=False) - hf_text_config: Optional[Any] = field(default=None, repr=False) - - # Model architecture - model_type: str = "unknown" - architectures: list[str] = field(default_factory=list) - - # Dimensions - hidden_size: int = 0 - num_hidden_layers: int = 0 - num_attention_heads: int = 0 - num_key_value_heads: Optional[int] = None - intermediate_size: int = 0 - vocab_size: int = 0 - - # Context length - max_position_embeddings: int = 0 - context_length: int = 0 # effective context length - - # Normalization - rms_norm_eps: float = 1e-6 - tie_word_embeddings: bool = False - - # RoPE - rope_theta: float = 10000.0 - rope_scaling: Optional[Dict[str, Any]] = None - - # Quantization - quantization: Optional[str] = None - - def __post_init__(self): - """Set default kv heads if not specified.""" - if self.num_key_value_heads is None: - self.num_key_value_heads = self.num_attention_heads - - -@dataclass -class RuntimeConfig: - """Runtime state that changes during execution.""" - - # Distributed state - tp_rank: int = 0 - tp_size: int = 1 - dp_rank: int = 0 - dp_size: int = 1 - pp_rank: int = 0 - pp_size: int = 1 - world_rank: int = 0 - world_size: int = 1 - local_rank: int = 0 - - # Device - device: str = "cuda" - - # Memory pools - max_num_seqs: int = 0 - max_model_len: int = 0 - - # Scheduler state (mutable during runtime) - num_running_reqs: int = 0 - num_waiting_reqs: int = 0 - num_swapped_reqs: int = 0 - +class GlobalConfig: + """Singleton that holds every sub-config pymllm needs. 
-@dataclass -class CacheConfig: - """KV cache configuration.""" - - block_size: int = 16 - num_gpu_blocks: int = 0 - num_cpu_blocks: int = 0 - - # Cache dtype - cache_dtype: Literal["auto", "float16", "bfloat16", "fp8_e4m3", "fp8_e5m2"] = "auto" - - # Sliding window - sliding_window: Optional[int] = None - - # Prefix caching - enable_prefix_caching: bool = False + Usage:: + from pymllm.configs import get_global_config -@dataclass -class GlobalConfig: - """Global configuration singleton containing all configs. - - This is the single source of truth for all configuration in pymllm. - It aggregates ServerConfig, ModelConfig, RuntimeConfig, and CacheConfig. - - Usage: - >>> from pymllm.configs import get_global_config - >>> config = get_global_config() - >>> - >>> # Access server config - >>> config.server.model_path - >>> config.server.tp_size - >>> - >>> # Access model config - >>> config.model.hidden_size - >>> config.model.vocab_size - >>> - >>> # Access runtime config (mutable) - >>> config.runtime.tp_rank - >>> config.runtime.device - >>> - >>> # Access cache config - >>> config.cache.block_size - >>> - >>> # Update with new server config - >>> config.load_server_config(server_config) - >>> - >>> # Update with HF model config - >>> config.load_hf_config(hf_config) + cfg = get_global_config() + cfg.model.model_path + cfg.model.hidden_size + cfg.quantization.method + cfg.server.host """ - - # Sub-configs - server: "ServerConfig" = field(default=None, repr=False) + + server: "ServerConfig" = field(default=None, repr=False) # type: ignore[assignment] model: ModelConfig = field(default_factory=ModelConfig) - runtime: RuntimeConfig = field(default_factory=RuntimeConfig) - cache: CacheConfig = field(default_factory=CacheConfig) - - # Additional metadata + quantization: QuantizationConfig = field(default_factory=QuantizationConfig) + _initialized: bool = field(default=False, repr=False) - + def __new__(cls): - if not hasattr(cls, '_instance') or cls._instance is None: + 
if not hasattr(cls, "_instance") or cls._instance is None: cls._instance = super().__new__(cls) return cls._instance - + def __post_init__(self): - # Lazy import to avoid circular dependency if self.server is None: - from pymllm.configs.server_config import ServerConfig - self.server = ServerConfig( - model_path=Path("."), # placeholder - ) - + self.server = ServerConfig(model_path=None) + @classmethod def get_instance(cls) -> "GlobalConfig": - """Get the singleton instance.""" - if not hasattr(cls, '_instance') or cls._instance is None: + if not hasattr(cls, "_instance") or cls._instance is None: cls._instance = cls() return cls._instance - - def load_server_config(self, server_config: "ServerConfig") -> None: - """Load server configuration and sync related fields.""" - self.server = server_config - - # Sync tp/dp/pp sizes to runtime - self.runtime.tp_size = server_config.tp_size - self.runtime.dp_size = server_config.dp_size - self.runtime.pp_size = server_config.pp_size - self.runtime.device = "cuda" if server_config.base_gpu_id >= 0 else "cpu" - - self._initialized = True - - def load_hf_config(self, hf_config: "PretrainedConfig") -> None: - """Load HuggingFace model configuration.""" - from transformers import PretrainedConfig - - # Store original - self.model.hf_config = hf_config - - # Get text config (for multimodal models) - if hasattr(hf_config, "text_config"): - self.model.hf_text_config = hf_config.text_config - text_config = hf_config.text_config - else: - text_config = hf_config - self.model.hf_text_config = hf_config - - # Extract fields - self.model.model_type = getattr(text_config, "model_type", "unknown") - self.model.architectures = getattr(text_config, "architectures", []) - - self.model.hidden_size = getattr(text_config, "hidden_size", 0) - self.model.num_hidden_layers = getattr(text_config, "num_hidden_layers", 0) - self.model.num_attention_heads = getattr(text_config, "num_attention_heads", 0) - self.model.num_key_value_heads = 
getattr(text_config, "num_key_value_heads", None) - self.model.intermediate_size = getattr(text_config, "intermediate_size", 0) - self.model.vocab_size = getattr(text_config, "vocab_size", 0) - - # Context length - self.model.max_position_embeddings = getattr( - text_config, "max_position_embeddings", 0 + + @classmethod + def reset(cls) -> None: + """Destroy the singleton (useful in tests).""" + cls._instance = None + + +def _parse_bool(value: Any) -> bool: + """Convert common CLI boolean spellings into ``bool``. + + This helper is intentionally permissive because CLI users often provide + booleans in different forms (for example ``true``, ``1``, ``yes``, + ``false``, ``0``, ``no``). The function raises ``argparse.ArgumentTypeError`` + to integrate naturally with ``argparse`` validation and error reporting. + """ + + if isinstance(value, bool): + return value + if value is None: + return True + + lowered = str(value).strip().lower() + if lowered in {"1", "true", "t", "yes", "y", "on"}: + return True + if lowered in {"0", "false", "f", "no", "n", "off"}: + return False + raise argparse.ArgumentTypeError( + f"Invalid boolean value: {value!r}. Expected one of true/false, 1/0, yes/no." + ) + + +def _unwrap_optional(annotation: Any) -> tuple[Any, bool]: + """Return ``(inner_type, is_optional)`` for Optional/Union annotations.""" + + origin = get_origin(annotation) + if origin not in (Union, types.UnionType): + return annotation, False + + args = [arg for arg in get_args(annotation) if arg is not type(None)] + if len(args) == 1 and len(get_args(annotation)) == 2: + return args[0], True + return annotation, False + + +def _converter_for_annotation(annotation: Any) -> Optional[Callable[[str], Any]]: + """Map a type annotation to an ``argparse`` converter. + + Only scalar, CLI-friendly annotations are supported. 
Complex runtime fields + (for example nested dict/object handles) are intentionally excluded from the + generated CLI surface to keep the interface predictable and safe. + """ + + inner, _ = _unwrap_optional(annotation) + origin = get_origin(inner) + if origin is not None: + if origin is Literal: + literal_values = get_args(inner) + if literal_values: + return type(literal_values[0]) + return str + return None + + if inner in (str, int, float): + return inner + if inner is Path: + return Path + return None + + +def _is_bool_annotation(annotation: Any) -> bool: + """Return ``True`` if annotation represents a bool/Optional[bool] field.""" + + inner, _ = _unwrap_optional(annotation) + return inner is bool + + +def _format_default_for_help(value: Any) -> str: + """Create a concise, readable default string for CLI help text.""" + + if value is MISSING: + return "" + if value is None: + return "None" + if isinstance(value, Path): + return str(value) + return repr(value) + + +def make_args( + parser: Optional[argparse.ArgumentParser] = None, +) -> argparse.ArgumentParser: + """Create an ``argparse`` parser with two-level GlobalConfig CLI options. + + The generated options follow the naming pattern ``--

.`` so + each sub-config can be configured independently: + + - ``server`` options map to :class:`ServerConfig` fields. + - ``model`` options map to :class:`ModelConfig` fields. + - ``quantization`` options map to :class:`QuantizationConfig` fields. + + Examples + -------- + - ``--server.host 0.0.0.0`` + - ``--server.port 8080`` + - ``--server.sleep_on_idle`` (implicit true) + - ``--server.sleep_on_idle false`` (explicit false) + - ``--quantization.method awq`` + + Design notes + ------------ + - Options are generated from dataclass metadata, which keeps the CLI surface + synchronized with config definitions and avoids manual drift. + - Parser defaults are suppressed (``argparse.SUPPRESS``), so ``read_args`` + can reliably detect whether a value was explicitly provided by the user. + - Only CLI-friendly scalar fields are exposed; runtime-only fields are + skipped automatically. + """ + + if parser is None: + parser = argparse.ArgumentParser( + prog="pymllm", + description="CLI options for configuring pymllm GlobalConfig.", ) - self.model.context_length = self._get_context_length(text_config) - - # Normalization - self.model.rms_norm_eps = getattr(text_config, "rms_norm_eps", 1e-6) - self.model.tie_word_embeddings = getattr( - text_config, "tie_word_embeddings", False + + cfg = GlobalConfig.get_instance() + sections: list[tuple[str, Any]] = [ + ("server", cfg.server), + ("model", cfg.model), + ("quantization", cfg.quantization), + ] + + for section_name, section_obj in sections: + section_group = parser.add_argument_group( + f"{section_name} config", + f"Options for the '{section_name}' section of GlobalConfig.", ) - - # RoPE - self.model.rope_theta = getattr(text_config, "rope_theta", 10000.0) - self.model.rope_scaling = getattr(text_config, "rope_scaling", None) - - # Sync to cache config - self.cache.sliding_window = getattr(text_config, "sliding_window", None) - - def _get_context_length(self, config: "PretrainedConfig") -> int: - """Extract effective context 
length from config.""" - # Try various fields - for key in ["max_position_embeddings", "n_positions", "seq_length"]: - if hasattr(config, key): - value = getattr(config, key) - if isinstance(value, int) and value > 0: - return value - return 2048 # default - - def update_runtime(self, **kwargs) -> None: - """Update runtime configuration.""" - for key, value in kwargs.items(): - if hasattr(self.runtime, key): - setattr(self.runtime, key, value) - else: - raise AttributeError(f"RuntimeConfig has no attribute '{key}'") - - def update_cache(self, **kwargs) -> None: - """Update cache configuration.""" - for key, value in kwargs.items(): - if hasattr(self.cache, key): - setattr(self.cache, key, value) - else: - raise AttributeError(f"CacheConfig has no attribute '{key}'") - - def temp(self, **kwargs): - """Context manager for temporary config changes. - - Usage: - # Modify runtime config temporarily - with config.temp(runtime=config.runtime): - config.runtime.tp_size = 2 - # ... do something with tp_size=2 - # runtime restored to original values - """ - return _TempGlobalConfig(self, **kwargs) - - def to_dict(self) -> Dict[str, Any]: - """Serialize all configs to dictionary.""" - return { - "server": self.server.to_dict() if self.server else {}, - "model": self._model_to_dict(), - "runtime": self._runtime_to_dict(), - "cache": self._cache_to_dict(), - } - - def _model_to_dict(self) -> Dict[str, Any]: - """Convert model config to dict.""" - return { - "model_type": self.model.model_type, - "architectures": self.model.architectures, - "hidden_size": self.model.hidden_size, - "num_hidden_layers": self.model.num_hidden_layers, - "num_attention_heads": self.model.num_attention_heads, - "num_key_value_heads": self.model.num_key_value_heads, - "intermediate_size": self.model.intermediate_size, - "vocab_size": self.model.vocab_size, - "context_length": self.model.context_length, - } - - def _runtime_to_dict(self) -> Dict[str, Any]: - """Convert runtime config to dict.""" - 
return { - "tp_rank": self.runtime.tp_rank, - "tp_size": self.runtime.tp_size, - "world_rank": self.runtime.world_rank, - "world_size": self.runtime.world_size, - "device": self.runtime.device, - } - - def _cache_to_dict(self) -> Dict[str, Any]: - """Convert cache config to dict.""" - return { - "block_size": self.cache.block_size, - "num_gpu_blocks": self.cache.num_gpu_blocks, - "cache_dtype": self.cache.cache_dtype, - } + type_hints = get_type_hints(type(section_obj)) + for dc_field in fields(section_obj): + if dc_field.name.startswith("_"): + continue + + annotation = type_hints.get(dc_field.name, dc_field.type) + option = f"--{section_name}.{dc_field.name}" + dest = f"{section_name}__{dc_field.name}" + default_value = getattr(section_obj, dc_field.name) + if _is_bool_annotation(annotation): + section_group.add_argument( + option, + dest=dest, + nargs="?", + const=True, + type=_parse_bool, + default=argparse.SUPPRESS, + help=( + f"{section_name}.{dc_field.name} (bool, default: " + f"{_format_default_for_help(default_value)}). " + "Can be provided as a flag for true or with an explicit value." + ), + ) + continue -class _TempGlobalConfig: - """Context manager for temporary global config changes. - - Supports nested keys like "runtime.tp_size" to modify sub-configs. + converter = _converter_for_annotation(annotation) + if converter is None: + # Skip non-scalar or runtime-only fields (e.g. arbitrary objects). + continue + + section_group.add_argument( + option, + dest=dest, + type=converter, + default=argparse.SUPPRESS, + help=( + f"{section_name}.{dc_field.name} (default: " + f"{_format_default_for_help(default_value)})." + ), + ) + + return parser + + +def read_args( + argv: Optional[Sequence[str]] = None, + parser: Optional[argparse.ArgumentParser] = None, +) -> GlobalConfig: + """Parse CLI args and apply overrides to the singleton ``GlobalConfig``. + + Parameters + ---------- + argv + Optional argument vector. 
If ``None``, ``argparse`` reads from + ``sys.argv`` (standard CLI behavior). + parser + Optional parser to use. When omitted, this function builds one through + :func:`make_args`. + + Returns + ------- + GlobalConfig + The singleton config instance after CLI overrides have been applied. + + Behavior + -------- + 1. Parse all generated ``--section.field`` options. + 2. Apply only explicitly provided options (no accidental overwrite by parser + defaults). + 3. Rebuild ``ServerConfig`` when server fields change so validation in + ``ServerConfig.__post_init__`` and ``_validate`` remains enforced. + 4. Keep ``server.model_path`` and ``model.model_path`` aligned when only one + side is explicitly overridden (the same precedence used by runtime config + loading conventions). """ - - def __init__(self, config: GlobalConfig, **kwargs): - self.config = config - self.temp_values = kwargs - self.old_values = {} - - def _get_nested_attr(self, key: str): - """Get attribute, supporting dot notation for nested access.""" - if "." in key: - parts = key.split(".") - obj = self.config - for part in parts[:-1]: - obj = getattr(obj, part) - return getattr(obj, parts[-1]) - return getattr(self.config, key) - - def _set_nested_attr(self, key: str, value): - """Set attribute, supporting dot notation for nested access.""" - if "." 
in key: - parts = key.split(".") - obj = self.config - for part in parts[:-1]: - obj = getattr(obj, part) - setattr(obj, parts[-1], value) - else: - setattr(self.config, key, value) - - def __enter__(self): - for key, value in self.temp_values.items(): - self.old_values[key] = self._get_nested_attr(key) - self._set_nested_attr(key, value) - return self.config - - def __exit__(self, exc_type, exc_val, exc_tb): - for key, value in self.old_values.items(): - self._set_nested_attr(key, value) - return False + + if parser is None: + parser = make_args() + + namespace = parser.parse_args(argv) + parsed = vars(namespace) + cfg = GlobalConfig.get_instance() + + # Server: reconstruct to preserve validation behavior. + from pymllm.configs.server_config import ServerConfig + + server_updates: dict[str, Any] = {} + for dc_field in fields(cfg.server): + key = f"server__{dc_field.name}" + if key in parsed: + server_updates[dc_field.name] = parsed[key] + if server_updates: + server_values = { + dc_field.name: getattr(cfg.server, dc_field.name) + for dc_field in fields(cfg.server) + } + server_values.update(server_updates) + cfg.server = ServerConfig(**server_values) + + # Model / Quantization: in-place updates are sufficient. + for section_name, section_obj in ( + ("model", cfg.model), + ("quantization", cfg.quantization), + ): + for dc_field in fields(section_obj): + key = f"{section_name}__{dc_field.name}" + if key in parsed: + setattr(section_obj, dc_field.name, parsed[key]) + + # Keep model path synchronized when only one side is explicitly overridden. 
+ server_model_overridden = "server__model_path" in parsed + model_model_overridden = "model__model_path" in parsed + if server_model_overridden and not model_model_overridden: + cfg.model.model_path = cfg.server.model_path + elif model_model_overridden and not server_model_overridden: + cfg.server.model_path = cfg.model.model_path + + cfg._initialized = True + return cfg -# Convenience function def get_global_config() -> GlobalConfig: - """Get the global config singleton instance.""" + """Return the global config singleton.""" return GlobalConfig.get_instance() diff --git a/pymllm/configs/model_config.py b/pymllm/configs/model_config.py index e69de29b..c23dff1d 100644 --- a/pymllm/configs/model_config.py +++ b/pymllm/configs/model_config.py @@ -0,0 +1,31 @@ +"""Lightweight model configuration: path + HuggingFace config handle.""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import Any, Optional + + +@dataclass +class ModelConfig: + """Minimal model config wrapping a HuggingFace PretrainedConfig. 
+ + Attributes on ``hf_config`` are flattened onto this object:: + + cfg = get_global_config().model + cfg.hidden_size # -> hf_config.hidden_size + cfg.vocab_size # -> hf_config.vocab_size + cfg.text_config # -> hf_config.text_config (multimodal) + """ + + # Populated at runtime via ``transformers.AutoConfig.from_pretrained`` + hf_config: Optional[Any] = field(default=None, repr=False) + + def __getattr__(self, name: str) -> Any: + hf = object.__getattribute__(self, "hf_config") + if hf is not None and hasattr(hf, name): + return getattr(hf, name) + raise AttributeError( + f"'{type(self).__name__}' has no attribute '{name}' " + f"(also not found on hf_config)" + ) diff --git a/pymllm/configs/quantization_config.py b/pymllm/configs/quantization_config.py index e69de29b..850ea82b 100644 --- a/pymllm/configs/quantization_config.py +++ b/pymllm/configs/quantization_config.py @@ -0,0 +1,18 @@ +"""Quantization settings for model weights and KV cache.""" + +from __future__ import annotations + +from dataclasses import dataclass +from typing import Literal, Optional + + +@dataclass +class QuantizationConfig: + """Quantization configuration for weights and KV cache.""" + + # Weight quantization method (e.g. "awq", "gptq", "fp8", None for no quant) + method: Optional[str] = None + # KV cache data type override + kv_cache_dtype: Literal[ + "auto", "float16", "bfloat16", "fp8_e4m3", "fp8_e5m2" + ] = "auto" diff --git a/pymllm/configs/server_config.py b/pymllm/configs/server_config.py index 56be4fc4..7cda9c3b 100644 --- a/pymllm/configs/server_config.py +++ b/pymllm/configs/server_config.py @@ -2,266 +2,118 @@ from pathlib import Path from typing import Any, Literal, Optional -from dataclasses import asdict, dataclass, field +from dataclasses import dataclass, field @dataclass class ServerConfig: - """ - Centralized runtime configuration for the MLLM server. 
+ """Centralized runtime configuration for the MLLM server.""" - The fields are grouped by operational concern so that: - - CLI args can map directly to this dataclass. - - YAML/JSON config files can be loaded and validated in one place. - - future extensions can follow a predictable structure. - """ - - # ------------------------------------------------------------------------- - # Model and tokenizer settings - # ------------------------------------------------------------------------- - # Required path to the model checkpoint directory or model identifier. - model_path: Path - # Optional tokenizer path; when omitted we fall back to `model_path`. + # --------------------------------------------------------------------- # + # Model and tokenizer configuration + # --------------------------------------------------------------------- # + model_path: Optional[Path] = None tokenizer_path: Optional[Path] = None - # Tokenizer bootstrap strategy: - # - "auto": infer tokenizer mode from model type. - # - "slow"/"fast": force a specific tokenizer implementation. tokenizer_mode: Literal["auto", "slow", "fast"] = "auto" - # Number of worker threads/processes used by tokenizer service. - tokenizer_worker_num: int = 1 - # Skip tokenizer initialization at startup to reduce cold-start latency. - skip_tokenizer_init: bool = False - # Model loading format hint for loader backends. - load_format: Literal["auto", "pt", "safetensors", "gguf"] = "auto" - # Allow loading custom model code from remote repositories. + load_format: Literal["auto", "safetensors"] = "auto" trust_remote_code: bool = False - # Explicit context length; `None` means infer from model config. + download_dir: Optional[Path] = None context_length: Optional[int] = None - # Model precision policy for weights and activations. dtype: Literal["auto", "float16", "bfloat16", "float32"] = "auto" - # Quantization algorithm to apply at load time. 
- quantization: Optional[str] = None - # KV cache dtype; can differ from model dtype for better memory trade-offs. - kv_cache_dtype: Literal["auto", "float16", "bfloat16", "fp8_e4m3", "fp8_e5m2"] = ( - "auto" - ) - # HuggingFace revision/commit/tag for deterministic model resolution. - revision: Optional[str] = None - # Optional custom directory used to cache downloaded model artifacts. - download_dir: Optional[Path] = None - # ------------------------------------------------------------------------- - # HTTP / API server settings - # ------------------------------------------------------------------------- - # Host address the HTTP server binds to. + # --------------------------------------------------------------------- # + # HTTP / API server + # --------------------------------------------------------------------- # host: str = "127.0.0.1" - # TCP port exposed by the HTTP server. port: int = 30000 - # Optional FastAPI root path when running behind a reverse proxy. fastapi_root_path: str = "" - # API key required by client-facing endpoints. api_key: Optional[str] = None - # Admin API key for privileged management endpoints. admin_api_key: Optional[str] = None - # Public model name returned in OpenAI-compatible API responses. served_model_name: Optional[str] = None - # Path used for server-side file uploads or temporary user artifacts. file_storage_path: Path = Path("mllm_storage") - # ------------------------------------------------------------------------- - # Runtime and scheduling behavior - # ------------------------------------------------------------------------- - # Fraction of total GPU memory reserved for static allocations - # (primarily model weights + KV cache). + # --------------------------------------------------------------------- # + # Scheduling and memory + # --------------------------------------------------------------------- # mem_fraction_static: Optional[float] = None - # Maximum number of requests concurrently executing in scheduler. 
- max_running_requests: Optional[int] = None - # Maximum queued requests waiting for execution. + max_running_requests: Optional[int] = 1 max_queued_requests: Optional[int] = None - # Hard cap of total active tokens across all in-flight requests. max_total_tokens: Optional[int] = None - # Prefill chunk size used to trade throughput vs memory pressure. chunked_prefill_size: Optional[int] = None - # Upper bound for tokens accepted in a single prefill pass. - max_prefill_tokens: int = 16384 - # Scheduling policy: - # - "fcfs": first-come-first-served fairness. - # - "lpm": longest-prefix-match style cache locality optimization. - schedule_policy: Literal["fcfs", "lpm"] = "fcfs" - # Conservative multiplier for scheduler admission decisions. - # Values > 1.0 are safer for OOM avoidance but may reduce utilization. + max_prefill_tokens: Optional[int] = None + schedule_policy: Literal["auto", "fcfs"] = "fcfs" schedule_conservativeness: float = 1.0 - # Enable low-power sleep while idle to reduce background GPU usage. sleep_on_idle: bool = False - # Stream partial output every N decode steps when streaming is enabled. stream_interval: int = 1 - # Enable token streaming in generation responses. stream_output: bool = True - # ------------------------------------------------------------------------- - # Parallelism and distributed deployment - # ------------------------------------------------------------------------- - # Tensor parallel size (intra-layer sharding). - tp_size: int = 1 - # Data parallel size (replicated model workers). - dp_size: int = 1 - # Expert parallel size for MoE-style models. - ep_size: int = 1 - # Pipeline parallel size (inter-layer partitioning). - pp_size: int = 1 - # Number of nodes participating in distributed serving. - nnodes: int = 1 - # Rank of current node in multi-node topology. - node_rank: int = 0 - # Torch distributed init address, e.g. "host:port". - dist_init_addr: Optional[str] = None - # Optional NCCL communication port override. 
- nccl_port: Optional[int] = None - # Timeout in seconds for distributed collectives. - dist_timeout: Optional[int] = None - # Base GPU index used for process-to-device mapping. + # --------------------------------------------------------------------- # + # Threads + # --------------------------------------------------------------------- # + enable_disk_io_async: bool = False + disk_io_async_thread_count: int = 1 + + # --------------------------------------------------------------------- # + # Device + # --------------------------------------------------------------------- # base_gpu_id: int = 0 - # Step size between logical workers when assigning GPU IDs. - gpu_id_step: int = 1 - # ------------------------------------------------------------------------- - # Backend and acceleration toggles - # ------------------------------------------------------------------------- - # Attention kernel backend selection. - attention_backend: Optional[str] = None - # Sampling backend selection. + # --------------------------------------------------------------------- # + # Backend / acceleration + # --------------------------------------------------------------------- # + attention_backend: Literal["auto", "flashinfer"] = "auto" sampling_backend: Optional[str] = None - # Grammar-constrained decoding backend. - grammar_backend: Optional[str] = None - # Disable CUDA graph capture for debugging/compatibility. disable_cuda_graph: bool = False - # Enable `torch.compile` acceleration path. - enable_torch_compile: bool = False - # Maximum batch size considered by `torch.compile` profiles. + enable_torch_compile: bool = True torch_compile_max_bs: int = 32 - # Enable deterministic inference behavior where possible. - enable_deterministic_inference: bool = False - # Random seed for reproducible sampling and initialization. 
- random_seed: Optional[int] = None + random_seed: Optional[int] = 42 - # ------------------------------------------------------------------------- - # Logging, metrics, and observability - # ------------------------------------------------------------------------- - # Global log level for server components. + # --------------------------------------------------------------------- # + # Logging and observability + # --------------------------------------------------------------------- # log_level: Literal["debug", "info", "warning", "error", "critical"] = "info" - # HTTP access log level; if None, inherits global log level. - log_level_http: Optional[str] = None - # Log each request payload/metadata for debugging. - log_requests: bool = False - # Verbosity level for request logging, larger means more detail. - log_requests_level: int = 2 - # Toggle built-in Prometheus/metrics endpoint. enable_metrics: bool = False - # Include latency/time-cost summaries in logs. show_time_cost: bool = False - # Optional OpenTelemetry traces endpoint ("host:port"). - otlp_traces_endpoint: str = "localhost:4317" - # Enable tracing export to OTLP collector. - enable_trace: bool = False - - # ------------------------------------------------------------------------- - # Feature switches and advanced decoding options - # ------------------------------------------------------------------------- - # Enable LoRA adapter serving support. - enable_lora: bool = False - # Maximum number of LoRA adapters loaded simultaneously. - max_loaded_loras: Optional[int] = None - # Maximum LoRA adapters that can be mixed in one batch. - max_loras_per_batch: int = 8 - # LoRA backend implementation. - lora_backend: Literal["triton", "csgmv", "torch_native"] = "csgmv" - # Enable multimodal processing pipeline. - enable_multimodal: bool = False - # Max concurrent multimodal tool calls. - mm_max_concurrent_calls: int = 32 - # Timeout (seconds) for each multimodal call. 
- mm_per_request_timeout: float = 10.0 - # Speculative decoding algorithm name (e.g. "eagle", "ngram"). - speculative_algorithm: Optional[str] = None - # Draft model path used in speculative decoding. - speculative_draft_model_path: Optional[Path] = None - # Number of speculative steps per target decode iteration. - speculative_num_steps: Optional[int] = None - # Number of proposed draft tokens per speculation step. - speculative_num_draft_tokens: Optional[int] = None - # ------------------------------------------------------------------------- - # Internal bookkeeping (not usually set by users directly) - # ------------------------------------------------------------------------- - # Additional arbitrary key-value options for forward compatibility. + # --------------------------------------------------------------------- # + # Feature switches + # --------------------------------------------------------------------- # + # enable_lora: bool = False + # max_loaded_loras: Optional[int] = None + # max_loras_per_batch: int = 8 + # lora_backend: Literal["triton", "csgmv", "torch_native"] = "csgmv" + # enable_multimodal: bool = False + # speculative_algorithm: Optional[str] = None + # speculative_draft_model_path: Optional[Path] = None + # speculative_num_steps: Optional[int] = None + # speculative_num_draft_tokens: Optional[int] = None + + # --------------------------------------------------------------------- # + # Extra + # --------------------------------------------------------------------- # extra_options: dict[str, Any] = field(default_factory=dict) def __post_init__(self) -> None: - """Normalize defaults and validate constraints after dataclass initialization.""" if self.tokenizer_path is None: self.tokenizer_path = self.model_path if self.served_model_name is None: self.served_model_name = str(self.model_path) + self._validate() - self._validate_basic_constraints() - self._validate_parallelism_constraints() - self._validate_scheduler_constraints() - - def 
_validate_basic_constraints(self) -> None: - """Validate scalar ranges and common invariants.""" + def _validate(self) -> None: if self.port <= 0 or self.port > 65535: raise ValueError("`port` must be in range [1, 65535].") - if self.max_prefill_tokens <= 0: - raise ValueError("`max_prefill_tokens` must be greater than 0.") + if self.max_prefill_tokens is not None and self.max_prefill_tokens <= 0: + raise ValueError("`max_prefill_tokens` must be > 0.") if self.stream_interval <= 0: - raise ValueError("`stream_interval` must be greater than 0.") + raise ValueError("`stream_interval` must be > 0.") if self.mem_fraction_static is not None and not ( 0.0 < self.mem_fraction_static < 1.0 ): - raise ValueError("`mem_fraction_static` must be in range (0.0, 1.0).") - - def _validate_parallelism_constraints(self) -> None: - """Validate distributed and parallel topology settings.""" - for key, value in { - "tp_size": self.tp_size, - "dp_size": self.dp_size, - "ep_size": self.ep_size, - "pp_size": self.pp_size, - "nnodes": self.nnodes, - }.items(): - if value <= 0: - raise ValueError(f"`{key}` must be greater than 0.") - - if self.node_rank < 0 or self.node_rank >= self.nnodes: - raise ValueError("`node_rank` must satisfy 0 <= node_rank < nnodes.") - - def _validate_scheduler_constraints(self) -> None: - """Validate scheduler-related soft limits.""" + raise ValueError("`mem_fraction_static` must be in (0.0, 1.0).") if self.max_running_requests is not None and self.max_running_requests <= 0: - raise ValueError("`max_running_requests` must be greater than 0 when set.") + raise ValueError("`max_running_requests` must be > 0 when set.") if self.max_queued_requests is not None and self.max_queued_requests < 0: raise ValueError("`max_queued_requests` must be >= 0 when set.") - if self.max_total_tokens is not None and self.max_total_tokens <= 0: - raise ValueError("`max_total_tokens` must be greater than 0 when set.") - if self.chunked_prefill_size is not None and 
self.chunked_prefill_size <= 0: - raise ValueError("`chunked_prefill_size` must be greater than 0 when set.") if self.schedule_conservativeness <= 0: - raise ValueError("`schedule_conservativeness` must be greater than 0.") - - def to_dict(self) -> dict[str, Any]: - """ - Serialize config to a plain dictionary. - - Path values are converted to string for easier JSON/YAML serialization. - """ - data = asdict(self) - for key in [ - "model_path", - "tokenizer_path", - "download_dir", - "file_storage_path", - "speculative_draft_model_path", - ]: - if data.get(key) is not None: - data[key] = str(data[key]) - return data + raise ValueError("`schedule_conservativeness` must be > 0.") diff --git a/pymllm/engine/launch.py b/pymllm/engine/launch.py index 7ce1be5e..25ada7c7 100644 --- a/pymllm/engine/launch.py +++ b/pymllm/engine/launch.py @@ -1 +1,115 @@ -import multiprocessing as mp +import logging +from pathlib import Path +from typing import Optional + +import zmq +import torch +import torch.multiprocessing as mp +from transformers import AutoConfig +from huggingface_hub import snapshot_download +from pymllm.configs import get_global_config +from pymllm.orchestrator.tokenizer_process import TokenizerProcess +from pymllm.orchestrator.detokenizer_process import DetokenizerProcess +from pymllm.orchestrator.model_runner_process import ModelRunnerProcess +from pymllm.orchestrator.async_disk_io_process import AsyncDiskIoProcess +from pymllm.orchestrator.request_response_process import RequestResponseProcess + +logger = logging.getLogger(__name__) + + +class Engine: + def __init__(self): + self._config_logging() + self._set_default_torch_dtype() + self._check_model_and_tokenizer() + + # Orchestrator, shall we start the music here? 
+ self._launch_processes() + + def _launch_processes(self): + """ + TODO issue processes here + """ + + # RR process is the main process + self._rr_process = RequestResponseProcess() + + def _set_default_torch_dtype(self): + """Set the default torch dtype based on the server configuration.""" + dtype = get_global_config().server.dtype + if dtype == "auto": + dtype = "bfloat16" if torch.cuda.is_available() else "float32" + dtype_map = { + "float16": torch.float16, + "bfloat16": torch.bfloat16, + "float32": torch.float32, + } + torch_dtype = dtype_map.get(dtype) + if torch_dtype is None: + raise ValueError(f"Unsupported dtype for torch default dtype: {dtype!r}") + torch.set_default_dtype(torch_dtype) + + def _config_logging(self): + """Configure logging level from server configuration.""" + level_name = get_global_config().server.log_level.upper() + level = getattr(logging, level_name, logging.INFO) + root_logger = logging.getLogger() + if not root_logger.handlers: + logging.basicConfig( + level=level, + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", + ) + else: + root_logger.setLevel(level) + logging.getLogger("pymllm").setLevel(level) + + def _check_model_and_tokenizer(self): + cfg = get_global_config() + if cfg.server.model_path is None or cfg.server.tokenizer_path is None: + logger.error("Model path or tokenizer path is not set") + raise ValueError("Model path or tokenizer path is not set") + model_path = cfg.server.model_path + tokenizer_path = cfg.server.tokenizer_path + download_dir = cfg.server.download_dir + trust_remote_code = cfg.server.trust_remote_code + + shared_path = model_path == tokenizer_path + + model_path = self._maybe_download(model_path, download_dir) + cfg.server.model_path = model_path + + if shared_path: + cfg.server.tokenizer_path = model_path + else: + cfg.server.tokenizer_path = self._maybe_download( + tokenizer_path, download_dir + ) + + cfg.model.hf_config = AutoConfig.from_pretrained( + str(model_path), + 
trust_remote_code=trust_remote_code, + ) + logger.info("Loaded model config: %s", cfg.model.hf_config.__class__.__name__) + + @staticmethod + def _maybe_download(path: Path, download_dir: Optional[Path] = None) -> Path: + """Return a local directory for *path*, downloading if necessary.""" + if path.is_dir(): + return path + + repo_id = str(path) + logger.info("Downloading '%s' ...", repo_id) + + kwargs = {} + if download_dir is not None: + kwargs["local_dir"] = str(download_dir / path.name) + + downloaded = snapshot_download(repo_id=repo_id, **kwargs) + logger.info("Downloaded '%s' to '%s'", repo_id, downloaded) + return Path(downloaded) + + def generate(self, stream: bool = True): + pass + + async def generate_async(self, stream: bool = True): + pass diff --git a/pymllm/orchestrator/scheduler.py b/pymllm/executor/eager_runner.py similarity index 100% rename from pymllm/orchestrator/scheduler.py rename to pymllm/executor/eager_runner.py diff --git a/pymllm/orchestrator/async_disk_io_process.py b/pymllm/orchestrator/async_disk_io_process.py new file mode 100644 index 00000000..598d93eb --- /dev/null +++ b/pymllm/orchestrator/async_disk_io_process.py @@ -0,0 +1,3 @@ +class AsyncDiskIoProcess: + def __init__(self): + pass diff --git a/pymllm/orchestrator/detokenizer_process.py b/pymllm/orchestrator/detokenizer_process.py new file mode 100644 index 00000000..47c1c595 --- /dev/null +++ b/pymllm/orchestrator/detokenizer_process.py @@ -0,0 +1,3 @@ +class DetokenizerProcess: + def __init__(self): + pass diff --git a/pymllm/orchestrator/model_runner_process.py b/pymllm/orchestrator/model_runner_process.py new file mode 100644 index 00000000..45091b59 --- /dev/null +++ b/pymllm/orchestrator/model_runner_process.py @@ -0,0 +1,3 @@ +class ModelRunnerProcess: + def __init__(self): + pass diff --git a/pymllm/orchestrator/parallel_state.py b/pymllm/orchestrator/parallel_state.py index 545c74a8..9fb20876 100644 --- a/pymllm/orchestrator/parallel_state.py +++ 
b/pymllm/orchestrator/parallel_state.py @@ -1,21 +1,31 @@ -"""Parallel state management for tensor and pipeline parallelism.""" +"""Minimal parallel state for single-GPU serving. + +pymllm targets single-GPU, high-concurrency inference. This module keeps +the TP / DP / PP scaffolding so the rest of the codebase can query ranks +and groups uniformly, but the default (and expected) case is world_size=1. +""" import logging +from typing import Optional + import torch import torch.distributed as dist -from typing import Optional -from pymllm.configs.global_config import get_global_config from pymllm.orchestrator.group_coordinator import GroupCoordinator logger = logging.getLogger(__name__) - -# Global groups _TP_GROUP: Optional[GroupCoordinator] = None _DP_GROUP: Optional[GroupCoordinator] = None _PP_GROUP: Optional[GroupCoordinator] = None +_TP_RANK: int = 0 +_TP_SIZE: int = 1 +_DP_RANK: int = 0 +_DP_SIZE: int = 1 +_PP_RANK: int = 0 +_PP_SIZE: int = 1 + def initialize_model_parallel( tensor_model_parallel_size: int = 1, @@ -23,15 +33,12 @@ def initialize_model_parallel( pipeline_model_parallel_size: int = 1, backend: str = "nccl", ) -> None: - """Initialize model parallel groups. 
- - Args: - tensor_model_parallel_size: Number of GPUs for tensor parallelism - data_parallel_size: Number of GPUs for data parallelism - pipeline_model_parallel_size: Number of stages for pipeline parallelism - backend: Communication backend (nccl for GPU, gloo for CPU) - """ global _TP_GROUP, _DP_GROUP, _PP_GROUP + global _TP_RANK, _TP_SIZE, _DP_RANK, _DP_SIZE, _PP_RANK, _PP_SIZE + + _TP_SIZE = tensor_model_parallel_size + _DP_SIZE = data_parallel_size + _PP_SIZE = pipeline_model_parallel_size if not dist.is_initialized(): return @@ -40,29 +47,6 @@ def initialize_model_parallel( world_rank = dist.get_rank() local_rank = int(torch.cuda.current_device()) if torch.cuda.is_available() else 0 - config = get_global_config() - - # Update runtime config - config.runtime.world_size = world_size - config.runtime.world_rank = world_rank - config.runtime.local_rank = local_rank - config.runtime.tp_size = tensor_model_parallel_size - config.runtime.dp_size = data_parallel_size - config.runtime.pp_size = pipeline_model_parallel_size - - # Logging - logger.info( - "Model parallel runtime config set: world_size=%s, world_rank=%s, " - "local_rank=%s, tp_size=%s, dp_size=%s, pp_size=%s", - config.runtime.world_size, - config.runtime.world_rank, - config.runtime.local_rank, - config.runtime.tp_size, - config.runtime.dp_size, - config.runtime.pp_size, - ) - - # Validate parallelism setup assert ( tensor_model_parallel_size * data_parallel_size * pipeline_model_parallel_size == world_size @@ -71,13 +55,22 @@ def initialize_model_parallel( f"PP({pipeline_model_parallel_size}) != World({world_size})" ) - # Create TP groups (intra-layer sharding) + logger.info( + "Parallel init: world=%d rank=%d tp=%d dp=%d pp=%d", + world_size, + world_rank, + tensor_model_parallel_size, + data_parallel_size, + pipeline_model_parallel_size, + ) + if tensor_model_parallel_size > 1: num_tp_groups = world_size // tensor_model_parallel_size for i in range(num_tp_groups): ranks = list( range( - i * 
tensor_model_parallel_size, (i + 1) * tensor_model_parallel_size + i * tensor_model_parallel_size, + (i + 1) * tensor_model_parallel_size, ) ) if world_rank in ranks: @@ -86,13 +79,9 @@ def initialize_model_parallel( local_rank=local_rank, backend=backend, ) - config.runtime.tp_rank = _TP_GROUP.rank_in_group + _TP_RANK = _TP_GROUP.rank_in_group break - else: - _TP_GROUP = None - config.runtime.tp_rank = 0 - # Create DP groups (data replication) if data_parallel_size > 1: num_dp_groups = world_size // data_parallel_size for i in range(num_dp_groups): @@ -103,13 +92,9 @@ def initialize_model_parallel( local_rank=local_rank, backend=backend, ) - config.runtime.dp_rank = _DP_GROUP.rank_in_group + _DP_RANK = _DP_GROUP.rank_in_group break - else: - _DP_GROUP = None - config.runtime.dp_rank = 0 - # Create PP groups (inter-layer partitioning) if pipeline_model_parallel_size > 1: num_pp_groups = world_size // pipeline_model_parallel_size for i in range(num_pp_groups): @@ -121,67 +106,60 @@ def initialize_model_parallel( local_rank=local_rank, backend=backend, ) - config.runtime.pp_rank = _PP_GROUP.rank_in_group + _PP_RANK = _PP_GROUP.rank_in_group break - else: - _PP_GROUP = None - config.runtime.pp_rank = 0 + + +# ---- group accessors ------------------------------------------------------ def get_tp_group() -> Optional[GroupCoordinator]: - """Get the tensor model parallel group.""" return _TP_GROUP def get_dp_group() -> Optional[GroupCoordinator]: - """Get the data parallel group.""" return _DP_GROUP def get_pp_group() -> Optional[GroupCoordinator]: - """Get the pipeline parallel group.""" return _PP_GROUP -# Convenience functions for tensor parallelism +# ---- rank / size helpers -------------------------------------------------- + + def get_tensor_model_parallel_rank() -> int: - """Get current tensor model parallel rank.""" - return get_global_config().runtime.tp_rank + return _TP_RANK def get_tensor_model_parallel_world_size() -> int: - """Get tensor model parallel 
world size.""" - return get_global_config().runtime.tp_size + return _TP_SIZE def get_data_parallel_rank() -> int: - """Get current data parallel rank.""" - return get_global_config().runtime.dp_rank + return _DP_RANK def get_data_parallel_world_size() -> int: - """Get data parallel world size.""" - return get_global_config().runtime.dp_size + return _DP_SIZE def get_pipeline_model_parallel_rank() -> int: - """Get current pipeline parallel rank.""" - return get_global_config().runtime.pp_rank + return _PP_RANK def get_pipeline_model_parallel_world_size() -> int: - """Get pipeline parallel world size.""" - return get_global_config().runtime.pp_size + return _PP_SIZE def model_parallel_is_initialized() -> bool: - """Check if model parallel is initialized.""" return _TP_GROUP is not None or _DP_GROUP is not None or _PP_GROUP is not None -# Communication helpers +# ---- communication helpers ------------------------------------------------ + + def tensor_model_parallel_all_reduce(tensor: torch.Tensor) -> torch.Tensor: - """All-reduce across TP group.""" group = get_tp_group() if group is None: return tensor @@ -192,7 +170,6 @@ def tensor_model_parallel_all_gather( tensor: torch.Tensor, dim: int = 0, ) -> torch.Tensor: - """All-gather across TP group.""" group = get_tp_group() if group is None: return tensor @@ -200,7 +177,6 @@ def tensor_model_parallel_all_gather( def data_parallel_all_reduce(tensor: torch.Tensor) -> torch.Tensor: - """All-reduce across DP group.""" group = get_dp_group() if group is None: return tensor diff --git a/pymllm/orchestrator/request_response_process.py b/pymllm/orchestrator/request_response_process.py new file mode 100644 index 00000000..998c2655 --- /dev/null +++ b/pymllm/orchestrator/request_response_process.py @@ -0,0 +1,10 @@ +""" +This module contains the request and response threads for the orchestrator. + +NOTE: This RR(request and response) threads can only be used as the main thread of the orchestrator. 
+""" + + +class RequestResponseProcess: + def __init__(self): + pass diff --git a/pymllm/orchestrator/scheduler_process.py b/pymllm/orchestrator/scheduler_process.py new file mode 100644 index 00000000..7a7783d5 --- /dev/null +++ b/pymllm/orchestrator/scheduler_process.py @@ -0,0 +1,3 @@ +class SchedulerProcess: + def __init__(self): + pass diff --git a/pymllm/orchestrator/tokenizer_process.py b/pymllm/orchestrator/tokenizer_process.py new file mode 100644 index 00000000..0dca2155 --- /dev/null +++ b/pymllm/orchestrator/tokenizer_process.py @@ -0,0 +1,3 @@ +class TokenizerProcess: + def __init__(self): + pass diff --git a/pymllm/server/launch.py b/pymllm/server/launch.py index e69de29b..83a222f7 100644 --- a/pymllm/server/launch.py +++ b/pymllm/server/launch.py @@ -0,0 +1,17 @@ +from pymllm.engine.launch import Engine +from pymllm.configs.global_config import make_args, read_args + + +def _prepare_args(): + parser = make_args() + read_args(parser=parser) + + +def main(): + _prepare_args() + engine = Engine() + engine.launch() + + +if __name__ == "__main__": + main() diff --git a/pymllm/tests/test_vocab_parallel_embedding.py b/pymllm/tests/test_vocab_parallel_embedding.py index e22b52a5..44148f98 100644 --- a/pymllm/tests/test_vocab_parallel_embedding.py +++ b/pymllm/tests/test_vocab_parallel_embedding.py @@ -12,10 +12,11 @@ import torch.multiprocessing as mp from typing import Callable -from pymllm.configs import get_global_config from pymllm.layers import VocabParallelEmbedding -from pymllm.orchestrator import ( - initialize_model_parallel, +from pymllm.orchestrator import initialize_model_parallel +from pymllm.orchestrator.parallel_state import ( + get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, ) # Show runtime init logs during test execution. 
@@ -91,10 +92,11 @@ def embedding_forward_tp8_worker_cuda(rank: int, local_rank: int, world_size: in local_rank: Local rank within this node (for logging/debugging) world_size: Total world size """ - config = get_global_config() + tp_size = get_tensor_model_parallel_world_size() + tp_rank = get_tensor_model_parallel_rank() - assert config.runtime.tp_size == 8, f"Rank {rank}: tp_size should be 8" - assert config.runtime.tp_rank == rank, f"Rank {rank}: tp_rank mismatch" + assert tp_size == 8, f"Rank {rank}: tp_size should be 8" + assert tp_rank == rank, f"Rank {rank}: tp_rank mismatch" vocab_size = 1024 embed_dim = 64 @@ -281,12 +283,12 @@ class TestVocabParallelEmbeddingCUDA: @pytest.fixture(autouse=True) def setup_config(self): - config = get_global_config() - config.runtime.tp_size = 1 - config.runtime.tp_rank = 0 + import pymllm.orchestrator.parallel_state as ps + ps._TP_SIZE = 1 + ps._TP_RANK = 0 yield - config.runtime.tp_size = 1 - config.runtime.tp_rank = 0 + ps._TP_SIZE = 1 + ps._TP_RANK = 0 def test_cuda_forward(self): layer = VocabParallelEmbedding(1000, 512).cuda() diff --git a/pyproject.toml b/pyproject.toml index 160341ba..d417b579 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,7 +21,7 @@ dependencies=[ "packaging", "pytest", "pytest-html", - "apache-tvm-ffi == 0.1.8", + "apache-tvm-ffi == 0.1.8.post2", "pyyaml >= 6.0.2", "openai", "modelscope", @@ -33,12 +33,13 @@ dependencies=[ ] [project.optional-dependencies] -cuda = ["tilelang", "flashinfer-python"] +cuda = ["tilelang", "flashinfer-python", "pyzmq"] [project.scripts] pymllm = "pymllm.__main__:main" mllm-convertor = "pymllm.mobile.utils.mllm_convertor:main" mllm-service = "pymllm.mobile.service.tools:cli_app" +pymllm-server = "pymllm.server.launch:main" [tool.setuptools.exclude-package-data] "*" = ["*.pyc"] From 6c4aa444971841693fa682422882edd9ce789797 Mon Sep 17 00:00:00 2001 From: chenghuaWang <2923277184@qq.com> Date: Sat, 21 Feb 2026 15:05:35 +0000 Subject: [PATCH 08/13] feat: implement 
store_cache functionality and related components - Added a new `store_cache` CUDA kernel for efficient key/value tensor storage in a cache. - Introduced Python interface for the `store_cache` kernel, enabling its use in PyTorch. - Created benchmarks to compare `store_cache` performance against standard PyTorch indexing. - Updated `.gitignore` to stop ignoring the `.claude` directory and added `.pytest_cache` to `mllm-kernel`. - Added tests for `store_cache` functionality to ensure correctness and performance. - Refactored memory management in `KVPool` to utilize the new `store_cache` kernel when applicable. --- .claude/skills/update-codeowners/SKILL.md | 44 + .gitignore | 1 - mllm-kernel/.gitignore | 1 + mllm-kernel/benchmarks/bench_store_cache.py | 164 ++++ .../mllm_kernel/cuda/csrc/store_cache.cuh | 202 +++++ mllm-kernel/mllm_kernel/cuda/jit/__init__.py | 3 +- .../mllm_kernel/cuda/jit/store_cache.py | 127 +++ mllm-kernel/tests/test_store_cache.py | 66 ++ pymllm/engine/io_struct.py | 196 +++++ pymllm/engine/launch.py | 308 ++++++- pymllm/mem_cache/__init__.py | 37 + pymllm/mem_cache/memory_pool.py | 480 +++++++++++ pymllm/mem_cache/param_disk_cache.py | 0 pymllm/mem_cache/radix_cache.py | 794 ++++++++++++++++++ pymllm/orchestrator/async_disk_io_process.py | 83 +- pymllm/orchestrator/detokenizer_process.py | 113 ++- pymllm/orchestrator/ipc_utils.py | 70 ++ pymllm/orchestrator/model_runner_process.py | 113 ++- .../orchestrator/request_response_process.py | 148 +++- pymllm/orchestrator/scheduler_process.py | 247 +++++- pymllm/orchestrator/tokenizer_process.py | 101 ++- 21 files changed, 3264 insertions(+), 34 deletions(-) create mode 100644 .claude/skills/update-codeowners/SKILL.md create mode 100644 mllm-kernel/benchmarks/bench_store_cache.py create mode 100644 mllm-kernel/mllm_kernel/cuda/csrc/store_cache.cuh create mode 100644 mllm-kernel/mllm_kernel/cuda/jit/store_cache.py create mode 100644 mllm-kernel/tests/test_store_cache.py create mode 100644
pymllm/engine/io_struct.py create mode 100644 pymllm/mem_cache/memory_pool.py delete mode 100644 pymllm/mem_cache/param_disk_cache.py create mode 100644 pymllm/orchestrator/ipc_utils.py diff --git a/.claude/skills/update-codeowners/SKILL.md b/.claude/skills/update-codeowners/SKILL.md new file mode 100644 index 00000000..28666704 --- /dev/null +++ b/.claude/skills/update-codeowners/SKILL.md @@ -0,0 +1,44 @@ +--- +name: update-codeowners +description: Updates CODEOWNERS entries safely with consistent path and owner formatting. Use when the user asks to add, remove, or modify CODEOWNERS rules, ownership mappings, reviewers, or module maintainers. +--- + +# Update CODEOWNERS + +## Goal +Maintain `CODEOWNERS` accurately while preserving the repository's existing section/comment style. + +## Workflow +1. Read the current `CODEOWNERS` file before editing. +2. Identify requested changes as one of: + - Add new path rule + - Modify owners for existing path rule + - Remove obsolete path rule + - Reorganize section comments (only if requested) +3. Update rules in place instead of creating duplicates for the same path. +4. Keep existing section headers and comment style unless the user asks to refactor structure. +5. Return a concise changelog describing which paths were added, changed, or removed. + +## Rule Format +- Use one rule per line: ` ...` +- Owners must be GitHub handles prefixed with `@`. +- Keep path style consistent with the file (in this repo, path patterns typically start with `/`). +- Do not leave rules with empty owner lists. + +## Editing Guidelines +- Prefer minimal edits near related sections. +- If a path already exists, update that line instead of adding a second conflicting line. +- If a new rule logically belongs to an existing section, place it in that section. +- Preserve human-readable grouping and blank lines. +- Keep comments intact unless they are clearly outdated and the user asked for cleanup. 
+ +## Validation Checklist +- [ ] Every non-comment, non-empty line has at least one owner. +- [ ] Every owner token starts with `@`. +- [ ] No accidental duplicate rule for the exact same path pattern. +- [ ] Existing comments/sections were preserved unless explicitly changed. + +## Example Requests +- "Add `/mllm/models/new_model/ @alice @bob` under models." +- "Change `/core/Storage` owner to `@team-core`." +- "Remove ownership rule for deprecated path `/legacy/`." diff --git a/.gitignore b/.gitignore index 7397d6ec..cdafc270 100644 --- a/.gitignore +++ b/.gitignore @@ -4,7 +4,6 @@ .cache/ .tmp/ compile_commands.json -.claude/ # MLLM Team Specific tasks/mllmteam* diff --git a/mllm-kernel/.gitignore b/mllm-kernel/.gitignore index df61d0fa..3eefc8fb 100644 --- a/mllm-kernel/.gitignore +++ b/mllm-kernel/.gitignore @@ -3,3 +3,4 @@ build-py/ .vscode/settings.json compile_commands.json .clangd +.pytest_cache/ diff --git a/mllm-kernel/benchmarks/bench_store_cache.py b/mllm-kernel/benchmarks/bench_store_cache.py new file mode 100644 index 00000000..b96fa608 --- /dev/null +++ b/mllm-kernel/benchmarks/bench_store_cache.py @@ -0,0 +1,164 @@ +"""Benchmark store_cache vs torch index with torch.profiler. 
+ +Example: +python benchmarks/bench_store_cache.py --warmup 20 --iters 200 --batch-size 512 --num-slots 8192 +""" + +import argparse + +import torch +from torch.profiler import ProfilerActivity, profile + +from mllm_kernel.cuda.jit import can_use_store_cache, store_cache + + +def _run_store_cache_once( + k: torch.Tensor, + v: torch.Tensor, + k_cache: torch.Tensor, + v_cache: torch.Tensor, + indices: torch.Tensor, +): + store_cache(k, v, k_cache, v_cache, indices) + + +def _run_torch_index_once( + k: torch.Tensor, + v: torch.Tensor, + k_cache: torch.Tensor, + v_cache: torch.Tensor, + indices: torch.Tensor, +): + k_cache[indices] = k + v_cache[indices] = v + + +def _profile_path( + name: str, + fn, + *, + warmup: int, + iters: int, + row_limit: int, + trace_path: str | None, +): + for _ in range(warmup): + fn() + torch.cuda.synchronize() + + with profile( + activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], + record_shapes=False, + profile_memory=False, + with_stack=False, + ) as prof: + for _ in range(iters): + fn() + torch.cuda.synchronize() + + events = prof.key_averages() + # torch profiler times are in microseconds. + # PyTorch versions vary between *cuda* and *device* naming. 
+ time_attr = ( + "self_cuda_time_total" + if events and hasattr(events[0], "self_cuda_time_total") + else "self_device_time_total" + ) + sort_key = ( + "self_cuda_time_total" + if time_attr == "self_cuda_time_total" + else "self_device_time_total" + ) + total_self_device_us = sum(float(getattr(evt, time_attr, 0.0)) for evt in events) + avg_self_device_us = total_self_device_us / max(iters, 1) + + print(f"\n=== {name} ===") + print( + prof.key_averages().table( + sort_by=sort_key, + row_limit=row_limit, + ) + ) + print(f"{name} total self device time: {total_self_device_us:.2f} us") + print(f"{name} avg self device time/iter: {avg_self_device_us:.2f} us") + + if trace_path: + prof.export_chrome_trace(trace_path) + print(f"{name} trace exported: {trace_path}") + + return avg_self_device_us + + +def main() -> None: + parser = argparse.ArgumentParser( + description="Benchmark store_cache vs torch index using torch.profiler" + ) + parser.add_argument("--batch-size", type=int, default=1024) + parser.add_argument("--num-slots", type=int, default=16384) + parser.add_argument("--head-num", type=int, default=8) + parser.add_argument("--head-dim", type=int, default=128) + parser.add_argument( + "--dtype", + type=str, + default="float16", + choices=["float16", "bfloat16", "float32"], + ) + parser.add_argument("--warmup", type=int, default=50) + parser.add_argument("--iters", type=int, default=200) + parser.add_argument("--row-limit", type=int, default=20) + parser.add_argument("--export-trace-dir", type=str, default="") + parser.add_argument("--seed", type=int, default=0) + args = parser.parse_args() + + if not torch.cuda.is_available(): + raise RuntimeError("CUDA is required for this benchmark") + + torch.manual_seed(args.seed) + device = torch.device("cuda") + dtype = getattr(torch, args.dtype) + + row_dim = args.head_num * args.head_dim + row_bytes = row_dim * torch.tensor([], dtype=dtype).element_size() + if not can_use_store_cache(row_bytes): + raise 
RuntimeError(f"store_cache is unavailable for row_bytes={row_bytes}") + + k = torch.randn(args.batch_size, row_dim, device=device, dtype=dtype) + v = torch.randn(args.batch_size, row_dim, device=device, dtype=dtype) + # Use unique indices to avoid write conflicts. + indices = torch.randperm(args.num_slots, device=device)[: args.batch_size].to( + torch.int64 + ) + k_cache = torch.zeros(args.num_slots, row_dim, device=device, dtype=dtype) + v_cache = torch.zeros_like(k_cache) + print("=== store_cache profiler benchmark ===") + print( + f"shape: batch={args.batch_size}, row_dim={row_dim}, slots={args.num_slots}, dtype={dtype}" + ) + print(f"warmup={args.warmup}, iters={args.iters}, row_limit={args.row_limit}") + + trace_dir = args.export_trace_dir.strip() + store_trace = f"{trace_dir}/store_cache_trace.json" if trace_dir else None + torch_trace = f"{trace_dir}/torch_index_trace.json" if trace_dir else None + + store_avg_us = _profile_path( + "store_cache", + lambda: _run_store_cache_once(k, v, k_cache, v_cache, indices), + warmup=args.warmup, + iters=args.iters, + row_limit=args.row_limit, + trace_path=store_trace, + ) + torch_avg_us = _profile_path( + "torch_index", + lambda: _run_torch_index_once(k, v, k_cache, v_cache, indices), + warmup=args.warmup, + iters=args.iters, + row_limit=args.row_limit, + trace_path=torch_trace, + ) + speedup = torch_avg_us / max(store_avg_us, 1e-12) + print(f"\nSpeedup: {speedup:.3f}x") + + +if __name__ == "__main__": + main() diff --git a/mllm-kernel/mllm_kernel/cuda/csrc/store_cache.cuh b/mllm-kernel/mllm_kernel/cuda/csrc/store_cache.cuh new file mode 100644 index 00000000..05daabee --- /dev/null +++ b/mllm-kernel/mllm_kernel/cuda/csrc/store_cache.cuh @@ -0,0 +1,202 @@ +// Copyright SGLang Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Store KV cache kernel: efficiently scatter key/value tensors into a +// pre-allocated KV cache pool using warp-level vectorized copies. +// +// Reference: sglang jit_kernel/csrc/elementwise/kvcache.cuh + +#pragma once + +#include +#include +#include + +#include +#include + +#include + +namespace { + +// ─────────────────────────────────────────────────────────────── +// Parameter block passed to the kernel via __grid_constant__ +// ─────────────────────────────────────────────────────────────── + +struct StoreKVCacheParams { + const void* __restrict__ k; + const void* __restrict__ v; + void* __restrict__ k_cache; + void* __restrict__ v_cache; + const void* __restrict__ indices; + int64_t stride_k_bytes; + int64_t stride_v_bytes; + int64_t stride_cache_bytes; + int64_t stride_indices; + uint32_t batch_size; +}; + +constexpr uint32_t kNumWarps = 4; +constexpr uint32_t kThreadsPerBlock = kNumWarps * device::kWarpThreads; + +// ─────────────────────────────────────────────────────────────── +// Vectorized warp-level KV copy +// ─────────────────────────────────────────────────────────────── +// +// Each warp copies kElementBytes of K data and kElementBytes of V +// data using the widest possible aligned vector type (uint4 = 16B, +// uint2 = 8B, or uint32_t = 4B). 
+ +namespace detail { + +template +__device__ __forceinline__ void warp_copy_bytes(const void* __restrict__ src, void* __restrict__ dst, int64_t num_vecs) { + const int lane = threadIdx.x % device::kWarpThreads; + const auto* s = static_cast(src); + auto* d = static_cast(dst); + for (int64_t i = lane; i < num_vecs; i += device::kWarpThreads) { d[i] = s[i]; } +} + +} // namespace detail + +template +__device__ __forceinline__ void copy_kv_warp(const void* __restrict__ k_src, const void* __restrict__ v_src, + void* __restrict__ k_dst, void* __restrict__ v_dst) { + static_assert(kElementBytes > 0 && kElementBytes % 4 == 0, "Element size must be a positive multiple of 4 bytes"); + + // Pick the widest aligned vector type the element size supports. + if constexpr (kElementBytes % 16 == 0) { + constexpr int64_t N = kElementBytes / 16; + detail::warp_copy_bytes(k_src, k_dst, N); + detail::warp_copy_bytes(v_src, v_dst, N); + } else if constexpr (kElementBytes % 8 == 0) { + constexpr int64_t N = kElementBytes / 8; + detail::warp_copy_bytes(k_src, k_dst, N); + detail::warp_copy_bytes(v_src, v_dst, N); + } else { + constexpr int64_t N = kElementBytes / 4; + detail::warp_copy_bytes(k_src, k_dst, N); + detail::warp_copy_bytes(v_src, v_dst, N); + } +} + +// ─────────────────────────────────────────────────────────────── +// Main kernel +// ─────────────────────────────────────────────────────────────── +// +// Template parameters: +// kElementBytes total bytes per token row (head_num * head_dim * dtype_size) +// kSplit how many warps collaborate on one element (1, 2, or 4) +// kUsePDL whether to emit PDL synchronisation instructions +// T index dtype (int32_t or int64_t) + +template +__global__ void store_kvcache(const __grid_constant__ StoreKVCacheParams params) { + using namespace device; + constexpr auto kSplitSize = kElementBytes / kSplit; + + const uint32_t warp_id = blockIdx.x * kNumWarps + threadIdx.x / kWarpThreads; + const uint32_t item_id = warp_id / kSplit; + const 
uint32_t split_id = warp_id % kSplit; + + const auto& [k_input, v_input, k_cache, v_cache, indices, stride_k, stride_v, stride_cache, stride_indices, batch_size] = + params; + + if (item_id >= batch_size) return; + + const auto index_ptr = static_cast(indices) + item_id * stride_indices; + PDLWaitPrimary(); + + const auto index = *index_ptr; + const auto k_src = pointer::offset(k_input, item_id * stride_k, split_id * kSplitSize); + const auto v_src = pointer::offset(v_input, item_id * stride_v, split_id * kSplitSize); + const auto k_dst = pointer::offset(k_cache, index * stride_cache, split_id * kSplitSize); + const auto v_dst = pointer::offset(v_cache, index * stride_cache, split_id * kSplitSize); + + copy_kv_warp(k_src, v_src, k_dst, v_dst); + PDLTriggerSecondary(); +} + +template +struct StoreKVCacheKernel { + static_assert(kElementBytes > 0 && kElementBytes % 4 == 0); + + template + static constexpr auto store_kernel = store_kvcache; + + template + static auto get_kernel(int num_split) { + using namespace mllm_kernel::host; + if constexpr (kElementBytes % (4 * 128) == 0) { + if (num_split == 4) return store_kernel<4, T>; + } + if constexpr (kElementBytes % (2 * 128) == 0) { + if (num_split == 2) return store_kernel<2, T>; + } + if (num_split == 1) return store_kernel<1, T>; + Panic("Unsupported num_split ", num_split, " for element size ", kElementBytes); + } + + static void run(tvm::ffi::TensorView k, tvm::ffi::TensorView v, tvm::ffi::TensorView k_cache, tvm::ffi::TensorView v_cache, + tvm::ffi::TensorView indices, int num_split) { + using namespace mllm_kernel::host; + + auto B = SymbolicSize{"batch_size"}; + auto D = SymbolicSize{"element_size"}; + auto KS = SymbolicSize{"k_stride"}; + auto VS = SymbolicSize{"v_stride"}; + auto S = SymbolicSize{"cache_stride"}; + auto I = SymbolicSize{"indices_stride"}; + auto dtype = SymbolicDType{}; + auto device = SymbolicDevice{}; + auto indice_dtype = SymbolicDType{}; + device.set_options(); + + // k, v: [B, D] with 
strides [KS, 1] + (void)TensorMatcher({B, D}).with_strides({KS, 1}).with_dtype(dtype).with_device(device).verify(k); + (void)TensorMatcher({B, D}).with_strides({VS, 1}).with_dtype(dtype).with_device(device).verify(v); + + // k_cache, v_cache: [*, D] with strides [S, 1] + (void)TensorMatcher({-1, D}).with_strides({S, 1}).with_dtype(dtype).with_device(device).verify(k_cache).verify(v_cache); + + // indices: [B] with strides [I] + (void)TensorMatcher({B}).with_strides({I}).with_dtype(indice_dtype).with_device(device).verify(indices); + + const int64_t dtype_size = dtype_bytes(dtype.unwrap()); + const uint32_t num_elements = static_cast(B.unwrap()); + RuntimeCheck(kElementBytes == dtype_size * D.unwrap(), "Element size mismatch: expected ", kElementBytes, " but got ", + dtype_size * D.unwrap()); + + const auto params = StoreKVCacheParams{ + .k = k.data_ptr(), + .v = v.data_ptr(), + .k_cache = k_cache.data_ptr(), + .v_cache = v_cache.data_ptr(), + .indices = indices.data_ptr(), + .stride_k_bytes = KS.unwrap() * dtype_size, + .stride_v_bytes = VS.unwrap() * dtype_size, + .stride_cache_bytes = S.unwrap() * dtype_size, + .stride_indices = I.unwrap(), + .batch_size = num_elements, + }; + + const auto use_int32 = indice_dtype.is_type(); + const auto kernel = use_int32 ? 
get_kernel(num_split) : get_kernel(num_split); + const auto num_blocks = div_ceil(num_elements * num_split, kNumWarps); + + LaunchKernel(num_blocks, kThreadsPerBlock, device.unwrap()).enable_pdl(kUsePDL)(kernel, params); + } +}; + +} // namespace diff --git a/mllm-kernel/mllm_kernel/cuda/jit/__init__.py b/mllm-kernel/mllm_kernel/cuda/jit/__init__.py index 696e73ea..202ff3b3 100644 --- a/mllm-kernel/mllm_kernel/cuda/jit/__init__.py +++ b/mllm-kernel/mllm_kernel/cuda/jit/__init__.py @@ -1,3 +1,4 @@ from .add_constant import add_constant +from .store_cache import can_use_store_cache, store_cache -__all__ = ["add_constant"] +__all__ = ["add_constant", "can_use_store_cache", "store_cache"] diff --git a/mllm-kernel/mllm_kernel/cuda/jit/store_cache.py b/mllm-kernel/mllm_kernel/cuda/jit/store_cache.py new file mode 100644 index 00000000..96a73f5e --- /dev/null +++ b/mllm-kernel/mllm_kernel/cuda/jit/store_cache.py @@ -0,0 +1,127 @@ +# Copyright (c) MLLM Team. +# Licensed under the MIT License. +# +# Python interface for the store_cache CUDA kernel. +# Efficiently scatters key/value tensors into a pre-allocated KV cache pool. 
+ +from __future__ import annotations + +import logging +import torch +from mllm_kernel.jit_utils import jit +from mllm_kernel.jit_utils.compile import cache_once, make_cpp_args + + +logger = logging.getLogger(__name__) + + +@cache_once +def _is_arch_support_pdl() -> bool: + if not torch.cuda.is_available(): + return False + major, minor = torch.cuda.get_device_capability() + # PDL requires sm_90a (Hopper) or later + return major > 9 or (major == 9 and minor >= 0) + + +def _make_store_cache_kernel(row_bytes: int): + """Create a JIT-compiled store_cache kernel for the given row_bytes.""" + pdl = _is_arch_support_pdl() + cpp_args = make_cpp_args(row_bytes, pdl) + + @jit( + args=[row_bytes, pdl], + device="cuda", + cuda_files=["store_cache.cuh"], + cpp_wrappers=[], + cuda_wrappers=[ + ("store_cache", f"StoreKVCacheKernel<{cpp_args}>::run"), + ], + func_name="store_cache", + ) + def _kernel( + compiled_module, + k: torch.Tensor, + v: torch.Tensor, + k_cache: torch.Tensor, + v_cache: torch.Tensor, + indices: torch.Tensor, + num_split: int, + ) -> None: + compiled_module.store_cache(k, v, k_cache, v_cache, indices, num_split) + + return _kernel + + +_KERNEL_CACHE: dict[int, object] = {} + + +def _get_kernel(row_bytes: int): + if row_bytes not in _KERNEL_CACHE: + _KERNEL_CACHE[row_bytes] = _make_store_cache_kernel(row_bytes) + return _KERNEL_CACHE[row_bytes] + + +@cache_once +def can_use_store_cache(row_bytes: int) -> bool: + """Check whether the JIT store_cache kernel supports the given row size. + + Returns ``False`` if *row_bytes* is not a multiple of 4 or if the JIT + compilation fails for any reason. 
+ """ + if row_bytes % 4 != 0: + logger.warning( + "Unsupported row_bytes=%d for JIT store_cache kernel: " + "must be multiple of 4", + row_bytes, + ) + return False + try: + _get_kernel(row_bytes) + return True + except Exception as e: + logger.warning( + "Failed to load JIT store_cache kernel with row_bytes=%d: %s", + row_bytes, + e, + ) + return False + + +def store_cache( + k: torch.Tensor, + v: torch.Tensor, + k_cache: torch.Tensor, + v_cache: torch.Tensor, + indices: torch.Tensor, + *, + row_bytes: int = 0, + num_split: int = 0, +) -> None: + """Store key and value tensors into a KV cache at specified indices. + + Each row of *k* (and *v*) is scattered into *k_cache* (and *v_cache*) + at the location given by the corresponding entry in *indices*. + + Args: + k: Key tensor, shape ``(batch_size, head_num * head_dim)``. + v: Value tensor, shape ``(batch_size, head_num * head_dim)``. + k_cache: Key cache, shape ``(num_slots, head_num * head_dim)``. + v_cache: Value cache, shape ``(num_slots, head_num * head_dim)``. + indices: Index tensor, shape ``(batch_size,)``, dtype int32 or int64. + row_bytes: Bytes per row. Auto-detected from *k* when 0. + num_split: Number of warps that cooperate on each element (1, 2, or 4). + When 0 the best value is chosen automatically based on alignment. 
+ """ + row_bytes = row_bytes or k.shape[-1] * k.element_size() + kernel = _get_kernel(row_bytes) + + if num_split <= 0: + if row_bytes % 2048 == 0: + num_split = 4 + elif row_bytes % 1024 == 0: + num_split = 2 + else: + num_split = 1 + + kernel(k, v, k_cache, v_cache, indices, num_split) diff --git a/mllm-kernel/tests/test_store_cache.py b/mllm-kernel/tests/test_store_cache.py new file mode 100644 index 00000000..5e4f1bcc --- /dev/null +++ b/mllm-kernel/tests/test_store_cache.py @@ -0,0 +1,66 @@ +from __future__ import annotations + +import pytest +import torch + +from mllm_kernel.cuda.jit import can_use_store_cache, store_cache + + +def _make_inputs( + *, + batch_size: int, + num_slots: int, + row_dim: int, + dtype: torch.dtype, + index_dtype: torch.dtype, + seed: int = 0, +): + torch.manual_seed(seed) + device = "cuda" + k = torch.randn(batch_size, row_dim, device=device, dtype=dtype) + v = torch.randn(batch_size, row_dim, device=device, dtype=dtype) + # Use unique indices to avoid write conflicts on the same cache slot. 
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is required")
@pytest.mark.parametrize("dtype", [torch.float16, torch.float32])
@pytest.mark.parametrize("index_dtype", [torch.int32, torch.int64])
def test_store_cache_matches_torch_index(dtype: torch.dtype, index_dtype: torch.dtype):
    """The JIT kernel must write exactly what PyTorch advanced indexing writes."""
    n_rows, n_slots = 257, 4096
    width = 8 * 128  # 1024 elements -> row_bytes=2048 at fp16
    bytes_per_row = width * torch.tensor([], dtype=dtype).element_size()

    assert can_use_store_cache(bytes_per_row), (
        f"store_cache unavailable for row_bytes={bytes_per_row}"
    )

    k, v, k_cache, v_cache, slots = _make_inputs(
        batch_size=n_rows,
        num_slots=n_slots,
        row_dim=width,
        dtype=dtype,
        index_dtype=index_dtype,
        seed=2026,
    )

    # Build the reference result with plain tensor indexing.
    expect_k = k_cache.clone()
    expect_v = v_cache.clone()
    expect_k[slots] = k
    expect_v[slots] = v

    store_cache(k, v, k_cache, v_cache, slots)
    torch.cuda.synchronize()

    assert torch.equal(k_cache, expect_k)
    assert torch.equal(v_cache, expect_v)
@dataclass
class BaseBatchReq:
    """Base class for batched requests identified by a list of request ids."""

    # One id per request in the batch.
    rids: List[str]

    def regenerate_rids(self) -> List[str]:
        """Replace every id with a fresh UUID4 hex string; return the new list."""
        fresh = [uuid.uuid4().hex for _ in self.rids]
        self.rids = fresh
        return fresh
+ is_single: bool = field(default=True, init=False) + batch_size: int = field(default=1, init=False) + + def normalize_batch_and_arguments(self) -> None: + self._validate_inputs() + self._determine_batch_size() + + def _validate_inputs(self) -> None: + has_text = self.text is not None + has_input_ids = self.input_ids is not None + if has_text == has_input_ids: + raise ValueError("Exactly one of `text` or `input_ids` must be provided.") + + def _determine_batch_size(self) -> None: + if self.text is not None: + if isinstance(self.text, str): + self.is_single = True + self.batch_size = 1 + else: + if len(self.text) == 0: + raise ValueError("`text` cannot be an empty list.") + self.is_single = False + self.batch_size = len(self.text) + return + + assert self.input_ids is not None + if len(self.input_ids) == 0: + raise ValueError("`input_ids` cannot be empty.") + if isinstance(self.input_ids[0], int): + self.is_single = True + self.batch_size = 1 + else: + self.is_single = False + self.batch_size = len(self.input_ids) + + def __getitem__(self, i: int) -> "GenerateReqInput": + if i < 0 or i >= self.batch_size: + raise IndexError(f"index {i} out of range for batch size {self.batch_size}") + if self.batch_size == 1: + return self + return GenerateReqInput( + rid=self._pick(self.rid, i), + text=self._pick(self.text, i), + input_ids=self._pick(self.input_ids, i), + sampling_params=self._pick(self.sampling_params, i), + return_logprob=self._pick(self.return_logprob, i), + logprob_start_len=self._pick(self.logprob_start_len, i), + top_logprobs_num=self._pick(self.top_logprobs_num, i), + stream=self.stream, + image_data=self._pick(self.image_data, i), + video_data=self._pick(self.video_data, i), + audio_data=self._pick(self.audio_data, i), + lora_path=self._pick(self.lora_path, i), + session_params=self._pick(self.session_params, i), + extra_options=self.extra_options.copy(), + ) + + @staticmethod + def _pick(value: Any, i: int) -> Any: + if isinstance(value, list): + return 
value[i] + return value + + def to_request_dict(self) -> Dict[str, Any]: + payload: Dict[str, Any] = {} + for key, value in { + "rid": self.rid, + "text": self.text, + "input_ids": self.input_ids, + "sampling_params": self.sampling_params, + "return_logprob": self.return_logprob, + "logprob_start_len": self.logprob_start_len, + "top_logprobs_num": self.top_logprobs_num, + "stream": self.stream, + "image_data": self.image_data, + "video_data": self.video_data, + "audio_data": self.audio_data, + "lora_path": self.lora_path, + "session_params": self.session_params, + }.items(): + if value is not None: + payload[key] = value + payload.update(self.extra_options) + return payload + + +@dataclass +class TokenizedGenerateReqInput(BaseReq): + input_text: str = "" + input_ids: List[int] = field(default_factory=list) + sampling_params: Dict[str, Any] = field(default_factory=dict) + stream: bool = False + return_logprob: bool = False + logprob_start_len: int = -1 + top_logprobs_num: int = 0 + lora_path: Optional[str] = None + session_params: Optional[Dict[str, Any]] = None + + +@dataclass +class BatchTokenizedGenerateReqInput(BaseBatchReq): + reqs: List[TokenizedGenerateReqInput] + + def __len__(self) -> int: + return len(self.reqs) + + def __getitem__(self, i: int) -> TokenizedGenerateReqInput: + return self.reqs[i] + + def __iter__(self) -> Iterator[TokenizedGenerateReqInput]: + return iter(self.reqs) + + +@dataclass +class BatchTokenIDOutput(BaseBatchReq): + finished_reasons: List[Optional[str]] + decode_ids: List[int] + read_offsets: List[int] + output_ids: Optional[List[int]] + skip_special_tokens: List[bool] + prompt_tokens: List[int] + completion_tokens: List[int] + input_token_logprobs_val: List[float] = field(default_factory=list) + input_token_logprobs_idx: List[int] = field(default_factory=list) + output_token_logprobs_val: List[float] = field(default_factory=list) + output_token_logprobs_idx: List[int] = field(default_factory=list) + input_top_logprobs_val: 
@dataclass
class BatchStrOutput(BaseBatchReq):
    """Detokenized (string-level) output for a batch of requests.

    The lists are parallel: index *i* in every list presumably refers to the
    request ``rids[i]`` from :class:`BaseBatchReq` — confirm with the
    detokenizer producer.
    """

    # Finish reason per request; None presumably while still running — TODO confirm.
    finished_reasons: List[Optional[str]]
    # Decoded output text per request.
    output_strs: List[str]
    # Raw output token ids; optional, so callers may omit id reporting.
    output_ids: Optional[List[int]]
    # Per-request token counts.
    prompt_tokens: List[int]
    completion_tokens: List[int]
    # Log-probability payloads; default to empty lists (only filled when
    # logprob reporting was requested — see GenerateReqInput.return_logprob).
    input_token_logprobs_val: List[float] = field(default_factory=list)
    input_token_logprobs_idx: List[int] = field(default_factory=list)
    output_token_logprobs_val: List[float] = field(default_factory=list)
    output_token_logprobs_idx: List[int] = field(default_factory=list)
    input_top_logprobs_val: List[List[float]] = field(default_factory=list)
    input_top_logprobs_idx: List[List[int]] = field(default_factory=list)
    output_top_logprobs_val: List[List[float]] = field(default_factory=list)
    output_top_logprobs_idx: List[List[int]] = field(default_factory=list)
    def _launch_processes(self) -> None:
        """Spawn all subprocess workers and wire up ZMQ IPC channels.

        Pipeline topology (each arrow is a ZMQ IPC socket):
        request/response -> tokenizer -> scheduler <-> model runner,
        scheduler -> detokenizer -> request/response, plus an optional
        scheduler -> async disk-io channel. Each worker signals readiness
        over a one-way multiprocessing Pipe before requests are accepted.
        """
        # "spawn" avoids forking a process that may already hold CUDA state.
        mp.set_start_method("spawn", force=True)
        # PID-derived uid keeps IPC endpoint names unique per engine instance.
        uid = str(os.getpid())

        # IPC addresses for ZMQ communication between processes
        addr_request_response_to_tokenizer: str = make_ipc_address(
            "request_response_to_tokenizer", uid
        )
        addr_tokenizer_to_scheduler: str = make_ipc_address(
            "tokenizer_to_scheduler", uid
        )
        addr_scheduler_to_model_runner: str = make_ipc_address(
            "scheduler_to_model_runner", uid
        )
        addr_model_runner_to_scheduler: str = make_ipc_address(
            "model_runner_to_scheduler", uid
        )
        addr_scheduler_to_detokenizer: str = make_ipc_address(
            "scheduler_to_detokenizer", uid
        )
        addr_detokenizer_to_request_response: str = make_ipc_address(
            "detokenizer_to_request_response", uid
        )
        addr_scheduler_to_disk_io: str = make_ipc_address("scheduler_to_disk_io", uid)

        # Record all subprocesses as (process, readiness-pipe reader, name).
        procs_and_readers: List[tuple] = []

        # Tokenizer
        tokenizer_reader, tokenizer_writer = mp.Pipe(duplex=False)
        tokenizer_proc = mp.Process(
            target=run_tokenizer_process,
            args=(
                addr_request_response_to_tokenizer,
                addr_tokenizer_to_scheduler,
                tokenizer_writer,
            ),
            daemon=True,
        )
        procs_and_readers.append((tokenizer_proc, tokenizer_reader, "tokenizer"))

        # Scheduler
        scheduler_reader, scheduler_writer = mp.Pipe(duplex=False)
        scheduler_proc = mp.Process(
            target=run_scheduler_process,
            args=(
                addr_tokenizer_to_scheduler,
                addr_scheduler_to_model_runner,
                addr_model_runner_to_scheduler,
                addr_scheduler_to_detokenizer,
                scheduler_writer,
            ),
            daemon=True,
        )
        procs_and_readers.append((scheduler_proc, scheduler_reader, "scheduler"))

        # Model Runner
        model_runner_reader, model_runner_writer = mp.Pipe(duplex=False)
        model_runner_proc = mp.Process(
            target=run_model_runner_process,
            args=(
                addr_scheduler_to_model_runner,
                addr_model_runner_to_scheduler,
                model_runner_writer,
            ),
            daemon=True,
        )
        procs_and_readers.append(
            (model_runner_proc, model_runner_reader, "model_runner")
        )

        # Detokenizer
        detokenizer_reader, detokenizer_writer = mp.Pipe(duplex=False)
        detokenizer_proc = mp.Process(
            target=run_detokenizer_process,
            args=(
                addr_scheduler_to_detokenizer,
                addr_detokenizer_to_request_response,
                detokenizer_writer,
            ),
            daemon=True,
        )
        procs_and_readers.append((detokenizer_proc, detokenizer_reader, "detokenizer"))

        # Async Disk I/O (optional, gated by server config)
        if get_global_config().server.enable_disk_io_async:
            disk_io_reader, disk_io_writer = mp.Pipe(duplex=False)
            disk_io_proc = mp.Process(
                target=run_async_disk_io_process,
                args=(addr_scheduler_to_disk_io, disk_io_writer),
                daemon=True,
            )
            procs_and_readers.append((disk_io_proc, disk_io_reader, "async_disk_io"))

        # Start all subprocesses before waiting, so they initialise in parallel.
        for proc, _, name in procs_and_readers:
            proc.start()
            self._subprocesses.append(proc)
            logger.info("Started %s process (pid=%s)", name, proc.pid)

        # Wait for readiness signals.
        # NOTE(review): reader.recv() blocks with no timeout — a hung worker
        # stalls startup forever; consider Pipe.poll(timeout) here.
        for _, reader, name in procs_and_readers:
            try:
                msg = reader.recv()
            except EOFError:
                # The writer end was closed without a message: the child died.
                raise RuntimeError(f"{name} process died before signalling readiness")
            if msg.get("status") != "ready":
                raise RuntimeError(f"{name} process failed to initialise: {msg}")
            logger.info("%s process ready", name)

        # RR Process is current main process (no subprocess for it).
        self._rr_process = RequestResponseProcess(
            send_to_tokenizer_addr=addr_request_response_to_tokenizer,
            recv_from_detokenizer_addr=addr_detokenizer_to_request_response,
        )

        # Reuse the caller's running loop when present; otherwise own a new one
        # (sync callers like generate() drive it via run_until_complete).
        try:
            self._loop = asyncio.get_running_loop()
        except RuntimeError:
            self._loop = asyncio.new_event_loop()
            asyncio.set_event_loop(self._loop)

        self._rr_process.start(self._loop)
        logger.info("RequestResponseProcess started in main process")
+ video_data=video_data, + lora_path=lora_path, + session_params=session_params, + extra_options=kwargs, + ) + request.normalize_batch_and_arguments() + + async def _run() -> Dict[str, Any]: + state = await self._rr_process.add_request(request) + if isinstance(rid, list): + raise ValueError("Synchronous `generate` currently supports single request.") + return await self._wait_for_final_result(rid, state) + + return self._loop.run_until_complete(_run()) + + async def generate_async( + self, + prompt: Optional[Union[List[str], str]] = None, + sampling_params: Optional[Union[List[Dict[str, Any]], Dict[str, Any]]] = None, + input_ids: Optional[Union[List[List[int]], List[int]]] = None, + image_data: Optional[Any] = None, + audio_data: Optional[Any] = None, + video_data: Optional[Any] = None, + return_logprob: Optional[Union[List[bool], bool]] = None, + logprob_start_len: Optional[Union[List[int], int]] = None, + top_logprobs_num: Optional[Union[List[int], int]] = None, + lora_path: Optional[Union[List[Optional[str]], str]] = None, + session_params: Optional[Union[List[Dict[str, Any]], Dict[str, Any]]] = None, + stream: bool = False, + rid: Optional[Union[List[str], str]] = None, + **kwargs, + ) -> AsyncIterator[Dict[str, Any]]: + """Asynchronous generation entry point. + + When *stream* is ``False`` (default) the returned async iterator + yields a **single** final result dict. When *stream* is ``True`` + every incremental chunk from the detokenizer is yielded as it + arrives, following the ``Event + out_list`` pattern. 
""" + if rid is None: + rid = uuid.uuid4().hex + request = GenerateReqInput( + rid=rid, + text=prompt, + input_ids=input_ids, + sampling_params=sampling_params, + return_logprob=return_logprob, + logprob_start_len=logprob_start_len, + top_logprobs_num=top_logprobs_num, + stream=stream, + image_data=image_data, + audio_data=audio_data, + video_data=video_data, + lora_path=lora_path, + session_params=session_params, + extra_options=kwargs, + ) + request.normalize_batch_and_arguments() + state = await self._rr_process.add_request(request) + + try: + if isinstance(rid, list): + raise ValueError("`generate_async` currently supports single request only.") + if stream: + async for chunk in self._stream_results(rid, state): + yield chunk + else: + yield await self._wait_for_final_result(rid, state) + finally: + self._rr_process.remove_state(rid) + + @staticmethod + async def _wait_for_final_result(rid: str, state: ReqState) -> Dict[str, Any]: + """Block until the request is finished and return the last output.""" + while True: + await state.event.wait() + if state.finished: + return state.out_list[-1] + state.event.clear() + + @staticmethod + async def _stream_results( + rid: str, state: ReqState + ) -> AsyncIterator[Dict[str, Any]]: + """Yield incremental chunks as they arrive, until finished.""" + while True: + await state.event.wait() + for item in state.out_list: + yield item + state.out_list.clear() + if state.finished: + return + state.event.clear() - # RR process is the main process - self._rr_process = RequestResponseProcess() + def shutdown(self) -> None: + """Terminate all subprocesses.""" + if self._rr_process is not None: + try: + self._loop.run_until_complete(self._rr_process.shutdown()) + except Exception: + pass + for proc in self._subprocesses: + if proc.is_alive(): + proc.terminate() + proc.join(timeout=5) + if proc.is_alive(): + proc.kill() + self._subprocesses.clear() + logger.info("All subprocesses shut down") def _set_default_torch_dtype(self): """Set 
the default torch dtype based on the server configuration.""" @@ -93,23 +365,13 @@ def _check_model_and_tokenizer(self): @staticmethod def _maybe_download(path: Path, download_dir: Optional[Path] = None) -> Path: - """Return a local directory for *path*, downloading if necessary.""" if path.is_dir(): return path - repo_id = str(path) logger.info("Downloading '%s' ...", repo_id) - kwargs = {} if download_dir is not None: kwargs["local_dir"] = str(download_dir / path.name) - downloaded = snapshot_download(repo_id=repo_id, **kwargs) logger.info("Downloaded '%s' to '%s'", repo_id, downloaded) return Path(downloaded) - - def generate(self, stream: bool = True): - pass - - async def generate_async(self, stream: bool = True): - pass diff --git a/pymllm/mem_cache/__init__.py b/pymllm/mem_cache/__init__.py index e69de29b..c2ce06eb 100644 --- a/pymllm/mem_cache/__init__.py +++ b/pymllm/mem_cache/__init__.py @@ -0,0 +1,37 @@ +from pymllm.mem_cache.memory_pool import ( + KVPool, + ReqToTokenPool, + TokenToKVPoolAllocator, + make_full_attention_net_mem_pool, + make_req_to_token_pool, +) +from pymllm.mem_cache.radix_cache import ( + EvictResult, + InsertResult, + MatchResult, + RadixCache, + RadixKey, + TreeNode, + hash_bytes, + hash_to_int64, + hash_token_ids, +) + +__all__ = [ + # memory_pool + "KVPool", + "TokenToKVPoolAllocator", + "ReqToTokenPool", + "make_full_attention_net_mem_pool", + "make_req_to_token_pool", + # radix_cache + "RadixCache", + "RadixKey", + "TreeNode", + "MatchResult", + "InsertResult", + "EvictResult", + "hash_token_ids", + "hash_to_int64", + "hash_bytes", +] diff --git a/pymllm/mem_cache/memory_pool.py b/pymllm/mem_cache/memory_pool.py new file mode 100644 index 00000000..0721fd71 --- /dev/null +++ b/pymllm/mem_cache/memory_pool.py @@ -0,0 +1,480 @@ +"""Lightweight KV-cache memory pools + +Three-layer architecture:: + + ReqToTokenPool maps (req_slot, position) → kv_index + TokenToKVPoolAllocator manages a free-list of integer indices + KVPool holds the 
class KVPool:
    """GPU (or CPU) storage for per-layer key and value caches.

    Layout per layer::

        JIT:
            k_buffer[layer][slot, k_head_num * k_head_dim]
            v_buffer[layer][slot, v_head_num * v_head_dim]

        PyTorch:
            k_buffer[layer][slot, k_head_num, k_head_dim]
            v_buffer[layer][slot, v_head_num, v_head_dim]

    K and V may have **independent** head counts and head dimensions, which
    covers standard MHA, GQA / MQA, and architectures like MLA where value
    projection uses a different dimensionality.

    ``size`` usable slots are numbered ``[1, size]``. Slot 0 is a dummy
    padding slot that absorbs writes from padded tokens.

    Parameters
    ----------
    size : int
        Number of usable token slots (total buffer length = ``size + 1``).
    layer_num : int
        Number of transformer layers (one K buffer + one V buffer per layer).
    k_head_num : int
        Number of key heads.
    k_head_dim : int
        Dimension of each key head.
    device : str | torch.device
        Target device (``"cuda"``, ``"cpu"``, …).
    dtype : torch.dtype
        Storage data type.
    v_head_num : int, optional
        Number of value heads. Defaults to *k_head_num*.
    v_head_dim : int, optional
        Dimension of each value head. Defaults to *k_head_dim*.
    pin_memory : bool, optional
        Whether to use pinned (page-locked) memory. Only applied when the
        pool lives on the CPU; ignored for other devices. Defaults to True.
    """

    def __init__(
        self,
        size: int,
        layer_num: int,
        k_head_num: int,
        k_head_dim: int,
        device: Union[str, torch.device] = "cuda",
        dtype: torch.dtype = torch.float16,
        v_head_num: Optional[int] = None,
        v_head_dim: Optional[int] = None,
        pin_memory: bool = True,
    ):
        self.size = size
        self.layer_num = layer_num
        self.k_head_num = k_head_num
        self.k_head_dim = k_head_dim
        self.v_head_num = v_head_num if v_head_num is not None else k_head_num
        self.v_head_dim = v_head_dim if v_head_dim is not None else k_head_dim
        self.device = torch.device(device)
        self.dtype = dtype

        buf_len = size + 1  # slot 0 is padding

        if buf_len % 8 != 0:
            logger.warning(
                "KVPool buffer length is not divisible by 8, padding to the next multiple of 8"
            )
            buf_len = (buf_len + 7) & ~7  # round up to a multiple of 8

        k_row_dim = self.k_head_num * self.k_head_dim
        v_row_dim = self.v_head_num * self.v_head_dim
        self._same_kv_dim = k_row_dim == v_row_dim
        self._row_bytes = k_row_dim * torch.tensor([], dtype=dtype).element_size()
        # The fused JIT kernel requires CUDA, identical K/V row widths, and a
        # row size the kernel supports.
        self._use_jit = (
            self.device.type == "cuda"
            and self._same_kv_dim
            and can_use_store_cache(self._row_bytes)
        )
        if not self._use_jit:
            logger.warning(
                f"Fallback to PyTorch index for KVPool, which is slower than the mllm-kernel's implementation, same_kv_dim={self._same_kv_dim}, row_bytes={self._row_bytes}"
            )

        # Bug fix: torch can only pin host (CPU) memory. With the previous
        # unconditional pin_memory=pin_memory, the default configuration
        # (device="cuda", pin_memory=True) raised a RuntimeError from
        # torch.zeros. Pin only when the buffers actually live on the CPU.
        pin = pin_memory and self.device.type == "cpu"

        self.k_buffer: List[torch.Tensor] = [
            torch.zeros(
                (buf_len, self.k_head_num, self.k_head_dim),
                dtype=dtype,
                device=self.device,
                pin_memory=pin,
            )
            for _ in range(layer_num)
        ]
        self.v_buffer: List[torch.Tensor] = [
            torch.zeros(
                (buf_len, self.v_head_num, self.v_head_dim),
                dtype=dtype,
                device=self.device,
                pin_memory=pin,
            )
            for _ in range(layer_num)
        ]

        # Pre-computed 2D views for the JIT store_cache kernel.
        # Zero-copy: same underlying storage as k_buffer / v_buffer.
        if self._use_jit:
            self._k_buffer_2d = [b.view(buf_len, -1) for b in self.k_buffer]
            self._v_buffer_2d = [b.view(buf_len, -1) for b in self.v_buffer]

        logger.info(
            "KVPool allocated: %d layers, %d slots, K=[%d,%d] V=[%d,%d], %.2f GB",
            layer_num,
            size,
            self.k_head_num,
            self.k_head_dim,
            self.v_head_num,
            self.v_head_dim,
            self._mem_bytes() / (1 << 30),
        )

    def get_key_buffer(self, layer_id: int) -> torch.Tensor:
        """Return the 3-D key buffer of *layer_id*."""
        return self.k_buffer[layer_id]

    def get_value_buffer(self, layer_id: int) -> torch.Tensor:
        """Return the 3-D value buffer of *layer_id*."""
        return self.v_buffer[layer_id]

    def get_kv_buffer(self, layer_id: int) -> Tuple[torch.Tensor, torch.Tensor]:
        """Return ``(k_buffer, v_buffer)`` of *layer_id*."""
        return self.k_buffer[layer_id], self.v_buffer[layer_id]

    def set_kv_buffer(
        self,
        layer_id: int,
        indices: torch.Tensor,
        k: torch.Tensor,
        v: torch.Tensor,
    ) -> None:
        """Write K/V vectors into the cache at the given *indices*.

        ``k`` / ``v`` can be any shape as long as the trailing dimensions
        multiply to ``head_num * head_dim`` (the row dimension). All leading
        dimensions are treated as the batch axis and must match ``indices``
        after flattening. Typical shapes::

            k: [num_tokens, head_num, head_dim]     indices: [num_tokens]
            k: [batch, seq_len, head_num, head_dim] indices: [batch, seq_len]
            k: [num_tokens, head_num * head_dim]    indices: [num_tokens]
        """
        if self._use_jit:
            row_dim = self.k_head_num * self.k_head_dim
            store_cache(
                k.reshape(-1, row_dim),
                v.reshape(-1, row_dim),
                self._k_buffer_2d[layer_id],
                self._v_buffer_2d[layer_id],
                indices.reshape(-1),
                row_bytes=self._row_bytes,
            )
        else:
            # Fallback: PyTorch advanced-indexing scatter.
            self.k_buffer[layer_id][indices] = k
            self.v_buffer[layer_id][indices] = v

    def _mem_bytes(self) -> int:
        """Total bytes held by all K and V buffers."""
        total = 0
        for buf in self.k_buffer + self.v_buffer:
            total += buf.nelement() * buf.element_size()
        return total
    def __init__(
        self,
        size: int,
        device: Union[str, torch.device] = "cuda",
        page_size: int = 1,
        need_sort: bool = True,
    ):
        # Total number of allocatable token slots; must match KVPool.size.
        self.size = size
        # page_size > 1 switches the allocator into page-aligned mode.
        self.page_size = page_size
        self.device = torch.device(device)
        # When True, merge_and_sort_free() keeps the free-list sorted so
        # lower indices are handed out first (better memory locality).
        self.need_sort = need_sort
        # clear() creates free_slots / release_slots and the batch-free state.
        self.clear()
    def alloc(self, need_size: int) -> Optional[torch.Tensor]:
        """Allocate *need_size* token indices.

        Returns a 1-D ``int64`` tensor on success, or ``None`` if the pool is
        exhausted.
        """
        if self.page_size == 1:
            # Lazy merge: fold the deferred release list into free_slots only
            # when free_slots alone cannot satisfy the request.
            if need_size > len(self.free_slots):
                self.merge_and_sort_free()
            if need_size > len(self.free_slots):
                return None
            out = self.free_slots[:need_size]
            self.free_slots = self.free_slots[need_size:]
            return out

        # Page-aligned mode: free_slots holds *page ids*, not token indices.
        num_pages = (need_size + self.page_size - 1) // self.page_size  # ceil div
        if num_pages > len(self.free_slots):
            self.merge_and_sort_free()
        if num_pages > len(self.free_slots):
            return None
        pages = self.free_slots[:num_pages]
        self.free_slots = self.free_slots[num_pages:]
        # Expand each page id into its page_size consecutive token indices.
        offsets = torch.arange(self.page_size, device=self.device)
        out = (pages[:, None] * self.page_size + offsets).reshape(-1)
        # NOTE(review): page ids start at 1 (see clear()), so token indices run
        # from page_size up to about size + page_size - 1, which can exceed a
        # KV buffer of length size + 1 when page_size > 1 — confirm intended
        # buffer sizing for paged mode. The truncated tail of the last page is
        # recoverable because free() deduplicates indices by page.
        return out[:need_size]
+ + Typical usage:: + + pool = ReqToTokenPool(max_reqs=256, max_context_len=4096) + + # --- on new request arrival --- + [slot] = pool.alloc(1) # slot = req_pool_idx + kv_indices = kv_allocator.alloc(seq_len) # from TokenToKVPoolAllocator + pool.write((slot, slice(0, seq_len)), kv_indices) + + # --- read back (caller tracks seq_len) --- + kv_indices = pool.req_to_token[slot, :seq_len] + + # --- on request completion --- + kv_allocator.free(pool.req_to_token[slot, :seq_len]) + pool.free(slot) + + Parameters + ---------- + max_reqs : int + Maximum number of concurrent requests (number of rows). + max_context_len : int + Maximum sequence length any single request can reach (number of cols). + device : str | torch.device + Target device for the mapping tensor. + """ + + def __init__( + self, + max_reqs: int, + max_context_len: int, + device: Union[str, torch.device] = "cuda", + ): + self.size = max_reqs + self.max_context_len = max_context_len + self.device = torch.device(device) + + self.req_to_token = torch.zeros( + (max_reqs, max_context_len), dtype=torch.int64, device=self.device + ) + self._free_slots: List[int] = list(range(max_reqs)) + + def available_size(self) -> int: + return len(self._free_slots) + + def alloc(self, n: int = 1) -> Optional[List[int]]: + """Allocate *n* request slots. Returns a list of slot indices.""" + if n > len(self._free_slots): + return None + out = self._free_slots[:n] + self._free_slots = self._free_slots[n:] + return out + + def free(self, slot: int) -> None: + """Return a single request slot to the pool.""" + self._free_slots.append(slot) + + def write(self, index: Tuple, values: torch.Tensor) -> None: + """Write KV indices into the mapping table. + + ``index`` is typically ``(req_pool_idx, slice(start, end))``. 
+ """ + self.req_to_token[index] = values + + def clear(self) -> None: + self._free_slots = list(range(self.size)) + self.req_to_token.zero_() + + +def make_full_attention_net_mem_pool( + size: int, + layer_num: int, + k_head_num: int, + k_head_dim: int, + v_head_num: int, + v_head_dim: int, + device: Union[str, torch.device] = "cuda", + dtype: torch.dtype = torch.float16, + page_size: int = 1, + need_sort: bool = True, + pin_memory: bool = True, +) -> Tuple[KVPool, TokenToKVPoolAllocator]: + """Create a :class:`KVPool` and its :class:`TokenToKVPoolAllocator` for a + full-attention (non-SWA) model. + + Parameters + ---------- + size : int + Number of usable token slots in the KV cache. + layer_num : int + Number of transformer layers. + k_head_num / k_head_dim : int + Key head count and dimension. + v_head_num / v_head_dim : int + Value head count and dimension. + device : str | torch.device + Target device. + dtype : torch.dtype + Storage data type for the KV buffers. + page_size : int + Allocator page size (1 = per-token, >1 = page-aligned). + need_sort : bool + Whether the allocator sorts on merge for memory locality. + pin_memory : bool + Whether to use pinned memory for the KV buffers. 
+ + Returns + ------- + (KVPool, TokenToKVPoolAllocator) + """ + pool = KVPool( + size=size, + layer_num=layer_num, + k_head_num=k_head_num, + k_head_dim=k_head_dim, + device=device, + dtype=dtype, + v_head_num=v_head_num, + v_head_dim=v_head_dim, + pin_memory=pin_memory, + ) + allocator = TokenToKVPoolAllocator( + size=size, + device=device, + page_size=page_size, + need_sort=need_sort, + ) + return pool, allocator + + +def make_req_to_token_pool( + max_reqs: int, + max_context_len: int, + device: Union[str, torch.device] = "cuda", +) -> ReqToTokenPool: + return ReqToTokenPool(max_reqs, max_context_len, device) diff --git a/pymllm/mem_cache/param_disk_cache.py b/pymllm/mem_cache/param_disk_cache.py deleted file mode 100644 index e69de29b..00000000 diff --git a/pymllm/mem_cache/radix_cache.py b/pymllm/mem_cache/radix_cache.py index e69de29b..997790ea 100644 --- a/pymllm/mem_cache/radix_cache.py +++ b/pymllm/mem_cache/radix_cache.py @@ -0,0 +1,794 @@ +"""Lightweight radix-tree KV cache with SWA and multimodal support. + + +Supports: + - Multi-batch serving on a single GPU + - Sliding Window Attention (SWA) via tombstone mechanism + - Multimodal namespace isolation via ``extra_key`` + - SHA256 position-aware hashing + - Page-aligned operations (page_size >= 1) + - LRU leaf eviction +""" + +from __future__ import annotations + +import hashlib +import heapq +import logging +import time +from collections import defaultdict +from dataclasses import dataclass +from typing import Any, Dict, Iterator, List, Optional, Tuple, Union + +import torch + +logger = logging.getLogger(__name__) + + +def hash_token_ids( + token_ids: List[Union[int, Tuple[int, ...]]], + prior_hash: Optional[str] = None, +) -> str: + """SHA-256 hash of a token-id page with optional chain-hash. + + Each token is encoded as a 4-byte little-endian unsigned integer; + tuples (bigram / EAGLE) hash each element in order. 
When *prior_hash* + is supplied the digest is seeded with the raw bytes of the previous + hash, making the result position-aware. + """ + hasher = hashlib.sha256() + if prior_hash: + hasher.update(bytes.fromhex(prior_hash)) + for t in token_ids: + if isinstance(t, tuple): + for elem in t: + hasher.update(elem.to_bytes(4, byteorder="little", signed=False)) + else: + hasher.update(t.to_bytes(4, byteorder="little", signed=False)) + return hasher.hexdigest() + + +def hash_to_int64(hex_str: str) -> int: + """Convert a hex digest to a signed 64-bit integer (first 16 hex chars).""" + val = int(hex_str[:16], 16) + return val - (1 << 64) if val >= (1 << 63) else val + + +def hash_bytes(data: bytes) -> int: + """SHA-256 → unsigned 64-bit int. Useful for multimodal embedding keys.""" + return int.from_bytes(hashlib.sha256(data).digest()[:8], "big", signed=False) + + +class RadixKey: + """Compound lookup key: token-id sequence + optional namespace tag. + + ``extra_key`` isolates independent namespaces so that sequences with + identical leading tokens but different adapters / LoRA ids / multimodal + context hashes never share prefix nodes. + """ + + __slots__ = ("token_ids", "extra_key") + + def __init__( + self, + token_ids: List[Union[int, Tuple[int, ...]]], + extra_key: Optional[str] = None, + ): + self.token_ids = token_ids + self.extra_key = extra_key + + def __len__(self) -> int: + return len(self.token_ids) + + def __iter__(self) -> Iterator: + return iter(self.token_ids) + + def __getitem__(self, idx: Union[int, slice]) -> RadixKey: + if isinstance(idx, slice): + return RadixKey(self.token_ids[idx], self.extra_key) + return RadixKey([self.token_ids[idx]], self.extra_key) + + def __repr__(self) -> str: + preview = self.token_ids[:10] + tail = "..." 
if len(self.token_ids) > 10 else "" + return f"RadixKey(extra={self.extra_key!r}, toks={preview}{tail})" + + +_node_counter: int = 0 + + +def _next_node_id() -> int: + global _node_counter + _node_counter += 1 + return _node_counter + + +class TreeNode: + """A single node in the radix tree. + + ``value`` holds a 1-D ``int64`` tensor of KV-pool indices (one per token + in ``key``). When the node has been evicted, ``value`` is ``None``. + """ + + __slots__ = ( + "children", + "parent", + "key", + "value", + "lock_ref", + "swa_lock_ref", + "swa_tombstone", + "swa_boundary_id", + "last_access_time", + "hit_count", + "hash_values", + "id", + ) + + def __init__(self) -> None: + self.children: Dict[Any, TreeNode] = defaultdict(TreeNode) + self.parent: Optional[TreeNode] = None + self.key: Optional[RadixKey] = None + self.value: Optional[torch.Tensor] = None + + self.lock_ref: int = 0 + self.swa_lock_ref: int = 0 + self.swa_tombstone: bool = False + self.swa_boundary_id: Optional[int] = None + + self.last_access_time: float = time.monotonic() + self.hit_count: int = 0 + self.hash_values: Optional[List[str]] = None + self.id: int = _next_node_id() + + @property + def evicted(self) -> bool: + return self.value is None + + def __lt__(self, other: TreeNode) -> bool: + return self.last_access_time < other.last_access_time + + +def _key_match(key0: RadixKey, key1: RadixKey, page_size: int) -> int: + """Return the length of the common prefix (page-aligned when *page_size* > 1).""" + if key0.extra_key != key1.extra_key: + return 0 + if page_size == 1: + i = 0 + for a, b in zip(key0.token_ids, key1.token_ids): + if a != b: + break + i += 1 + return i + min_len = min(len(key0), len(key1)) + i = 0 + while i < min_len: + if key0.token_ids[i : i + page_size] != key1.token_ids[i : i + page_size]: + break + i += page_size + return i + + +def _child_key(key: RadixKey, page_size: int) -> Any: + """Derive the dict key used in ``node.children``.""" + plain = key.token_ids[0] if page_size == 
1 else tuple(key.token_ids[:page_size]) + return (key.extra_key, plain) if key.extra_key is not None else plain + + +@dataclass +class MatchResult: + """Returned by :meth:`RadixCache.match_prefix`.""" + + indices: torch.Tensor + last_node: TreeNode + prefix_len: int = 0 + + +@dataclass +class InsertResult: + """Returned by :meth:`RadixCache.insert`.""" + + prefix_len: int = 0 + + +@dataclass +class EvictResult: + """Returned by :meth:`RadixCache.evict`.""" + + full_evicted: int = 0 + swa_evicted: int = 0 + + +class RadixCache: + """Lightweight radix tree for KV-cache prefix sharing. + + Parameters + ---------- + page_size: + Number of tokens per KV-pool page. Keys and values are aligned to + this granularity. + sliding_window_size: + If set, enables SWA mode. The cache tracks which nodes have had + their SWA KV freed (tombstoned) and constrains prefix matching + so that the sliding-window invariant is maintained. + disable: + When *True* every public method is a no-op (useful for ablation). + token_to_kv_pool_allocator: + Optional pool allocator with ``free(indices)`` (and ``free_swa`` for + SWA mode). When *None*, index tensors are simply discarded. 
+ """ + + def __init__( + self, + page_size: int = 1, + sliding_window_size: Optional[int] = None, + disable: bool = False, + token_to_kv_pool_allocator: Any = None, + ): + self.page_size = page_size + self.sliding_window_size = sliding_window_size + self.disable = disable + self.pool = token_to_kv_pool_allocator + + if self.pool is not None and hasattr(self.pool, "device"): + self.device = self.pool.device + else: + self.device = torch.device("cpu") + + self._swa_boundary_counter: int = 0 + self.reset() + + @property + def supports_swa(self) -> bool: + return self.sliding_window_size is not None + + def evictable_size(self) -> int: + return self._evictable_size + + def swa_evictable_size(self) -> int: + return self._swa_evictable_size + + def protected_size(self) -> int: + return self._protected_size + + def swa_protected_size(self) -> int: + return self._swa_protected_size + + def reset(self) -> None: + """Clear all cached state and re-initialise the root node.""" + self.root_node = TreeNode() + self.root_node.key = RadixKey([]) + self.root_node.value = torch.tensor([], dtype=torch.int64) + self.root_node.lock_ref = 1 + self.root_node.swa_lock_ref = 1 + self._evictable_size: int = 0 + self._swa_evictable_size: int = 0 + self._protected_size: int = 0 + self._swa_protected_size: int = 0 + + def match_prefix(self, key: RadixKey) -> MatchResult: + """Find the longest cached prefix of *key*. + + For SWA mode the match is further constrained: the path from the + returned ``last_node`` to root must have at least + ``sliding_window_size`` non-tombstone tokens (or be entirely + tombstone-free back to root). + + Accessing a prefix refreshes LRU timestamps along the matched path. 
+ """ + empty = MatchResult( + indices=torch.empty(0, dtype=torch.int64, device=self.device), + last_node=self.root_node, + ) + if self.disable or len(key) == 0: + return empty + + key = self._page_align_key(key) + if len(key) == 0: + return empty + + if self.supports_swa: + values, last_node, best_count = self._match_swa(key) + values = values[:best_count] + else: + values, last_node = self._match_normal(key) + + cat = ( + torch.cat(values) + if values + else torch.empty(0, dtype=torch.int64, device=self.device) + ) + return MatchResult(indices=cat, last_node=last_node, prefix_len=len(cat)) + + def insert( + self, + key: RadixKey, + value: Optional[torch.Tensor] = None, + *, + prev_prefix_len: int = 0, + swa_evicted_seqlen: int = 0, + ) -> InsertResult: + """Insert *key*/*value* into the tree. + + Returns how many leading tokens were already present (the prefix + length). The caller is responsible for freeing duplicate KV indices + in the range ``[cache_protected_len, prefix_len)``. + + Parameters + ---------- + prev_prefix_len: + (SWA mode) tokens before this offset are already protected and + should not have their values overwritten. + swa_evicted_seqlen: + (SWA mode) the sequence length up to which SWA KV has been + previously evicted. Used to decide whether a tombstoned node can + be un-tombstoned with the incoming value. + """ + if self.disable: + return InsertResult() + if value is None: + value = torch.tensor(key.token_ids, dtype=torch.int64) + if self.supports_swa: + plen = self._insert_swa( + self.root_node, key, value, prev_prefix_len, swa_evicted_seqlen + ) + else: + plen = self._insert_normal(self.root_node, key, value) + return InsertResult(prefix_len=plen) + + def evict(self, num_tokens: int, swa_num_tokens: int = 0) -> EvictResult: + """Evict up to *num_tokens* (full) and *swa_num_tokens* (SWA) tokens. + + Full eviction removes leaf nodes entirely; SWA eviction tombstones + internal nodes (freeing SWA KV but retaining full-attn KV). 
+ """ + if self.disable: + return EvictResult() + + full_evicted = 0 + swa_evicted = 0 + + # Phase 1: full leaf eviction + if num_tokens > 0: + leaves = self._collect_evictable_leaves() + heap: List[Tuple[float, TreeNode]] = [ + (n.last_access_time, n) for n in leaves + ] + heapq.heapify(heap) + + while full_evicted < num_tokens and heap: + _, node = heapq.heappop(heap) + if node.evicted or node.lock_ref > 0: + continue + n = len(node.value) + self._free_indices(node.value) + full_evicted += n + swa_evicted += n + self._delete_leaf(node) + + p = node.parent + if ( + p is not None + and p != self.root_node + and len(p.children) == 0 + and p.lock_ref == 0 + ): + if self.supports_swa and p.swa_tombstone: + self._free_indices(p.value) + full_evicted += len(p.value) + self._delete_leaf(p) + else: + heapq.heappush(heap, (p.last_access_time, p)) + + # Phase 2: SWA tombstone eviction (internal nodes) + if self.supports_swa and swa_evicted < swa_num_tokens: + candidates = self._collect_swa_evictable() + heap2: List[Tuple[float, TreeNode]] = [ + (n.last_access_time, n) for n in candidates + ] + heapq.heapify(heap2) + + while swa_evicted < swa_num_tokens and heap2: + _, node = heapq.heappop(heap2) + if node.swa_tombstone or node.swa_lock_ref > 0 or node.evicted: + continue + n = len(node.value) + if len(node.children) == 0 and node.lock_ref == 0: + self._free_indices(node.value) + full_evicted += n + swa_evicted += n + self._delete_leaf(node) + elif len(node.children) > 0: + self._free_swa_indices(node.value) + swa_evicted += n + self._tombstone_node(node) + + return EvictResult(full_evicted=full_evicted, swa_evicted=swa_evicted) + + def inc_lock_ref(self, node: TreeNode) -> Optional[int]: + """Lock nodes from *node* up to root (prevents eviction). + + Returns ``swa_boundary_id`` that must be passed back to + :meth:`dec_lock_ref`. In non-SWA mode, returns ``None``. 
+ """ + if self.disable or node is None: + return None + + swa_locked = 0 + swa_boundary_id: Optional[int] = None + cur = node + while cur != self.root_node: + if cur.lock_ref == 0: + self._evictable_size -= len(cur.key) + self._protected_size += len(cur.key) + cur.lock_ref += 1 + + if ( + self.supports_swa + and swa_locked < self.sliding_window_size + and not cur.swa_tombstone + ): + if cur.swa_lock_ref == 0: + self._swa_evictable_size -= len(cur.key) + self._swa_protected_size += len(cur.key) + cur.swa_lock_ref += 1 + swa_locked += len(cur.key) + if swa_locked >= self.sliding_window_size: + if cur.swa_boundary_id is None: + self._swa_boundary_counter += 1 + cur.swa_boundary_id = self._swa_boundary_counter + swa_boundary_id = cur.swa_boundary_id + + cur = cur.parent + return swa_boundary_id + + def dec_lock_ref( + self, node: TreeNode, swa_boundary_id: Optional[int] = None + ) -> None: + """Unlock nodes from *node* up to root.""" + if self.disable or node is None: + return + + dec_swa = True + cur = node + while cur != self.root_node: + if cur.lock_ref == 1: + self._evictable_size += len(cur.key) + self._protected_size -= len(cur.key) + cur.lock_ref -= 1 + + if self.supports_swa and dec_swa and not cur.swa_tombstone: + if cur.swa_lock_ref == 1: + self._swa_evictable_size += len(cur.key) + self._swa_protected_size -= len(cur.key) + cur.swa_lock_ref -= 1 + if swa_boundary_id and cur.swa_boundary_id == swa_boundary_id: + dec_swa = False + + cur = cur.parent + + def total_size(self) -> int: + """Total number of cached tokens (including tombstoned).""" + total = 0 + stack: List[TreeNode] = [self.root_node] + while stack: + n = stack.pop() + if n.value is not None: + total += len(n.value) + stack.extend(c for c in n.children.values() if not c.evicted) + return total + + def compute_node_hash(self, node: TreeNode) -> List[str]: + """Compute position-aware SHA-256 hashes for *node* (one per page). + + Lazily computed and cached on ``node.hash_values``. 
+ """ + if node.hash_values is not None: + return node.hash_values + + parent_hash: Optional[str] = None + if ( + node.parent is not None + and node.parent.hash_values is not None + and len(node.parent.key) > 0 + and len(node.parent.hash_values) > 0 + ): + parent_hash = node.parent.hash_values[-1] + + hashes: List[str] = [] + for start in range(0, len(node.key), self.page_size): + page = node.key.token_ids[start : start + self.page_size] + if not page: + continue + h = hash_token_ids(page, prior_hash=parent_hash) + hashes.append(h) + parent_hash = h + + node.hash_values = hashes + return hashes + + def pretty_print(self) -> None: + """Print the tree structure to stdout.""" + self._print_helper(self.root_node, 0) + print( + f"total={self.total_size()} evictable={self._evictable_size}" + + ( + f" swa_evictable={self._swa_evictable_size}" + if self.supports_swa + else "" + ) + ) + + def _match_normal(self, key: RadixKey) -> Tuple[List[torch.Tensor], TreeNode]: + node = self.root_node + now = time.monotonic() + node.last_access_time = now + values: List[torch.Tensor] = [] + + while len(key) > 0: + ck = _child_key(key, self.page_size) + if ck not in node.children: + break + child = node.children[ck] + child.last_access_time = now + child.hit_count += 1 + plen = _key_match(child.key, key, self.page_size) + if plen < len(child.key): + new_node = self._split_node(child.key, child, plen) + values.append(new_node.value) + node = new_node + break + values.append(child.value) + node = child + key = key[plen:] + + return values, node + + def _match_swa(self, key: RadixKey) -> Tuple[List[torch.Tensor], TreeNode, int]: + """SWA-aware match. Returns *(values, last_node, best_value_count)*. + + ``best_value_count`` is the number of value tensors from *values* + that form a valid SWA-safe prefix (enough non-tombstone tokens within + the sliding window, or a tombstone-free path to root). 
+ """ + node = self.root_node + values: List[torch.Tensor] = [] + non_tomb_len: float = float("inf") + best_count = 0 + best_node = node + + while len(key) > 0: + ck = _child_key(key, self.page_size) + if ck not in node.children: + break + child = node.children[ck] + + if child.swa_tombstone: + if non_tomb_len >= self.sliding_window_size: + best_count = len(values) + best_node = node + non_tomb_len = 0 + + plen = _key_match(child.key, key, self.page_size) + if plen < len(child.key): + new_node = self._split_node(child.key, child, plen) + values.append(new_node.value) + if not new_node.swa_tombstone: + non_tomb_len += len(new_node.value) + node = new_node + break + values.append(child.value) + if not child.swa_tombstone: + non_tomb_len += len(child.value) + node = child + key = key[plen:] + + if non_tomb_len >= self.sliding_window_size: + best_count = len(values) + best_node = node + + return values, best_node, best_count + + def _insert_normal(self, node: TreeNode, key: RadixKey, value: torch.Tensor) -> int: + now = time.monotonic() + node.last_access_time = now + if len(key) == 0: + return 0 + + total_prefix = 0 + while len(key) > 0: + ck = _child_key(key, self.page_size) + if ck not in node.children: + break + node = node.children[ck] + node.last_access_time = now + plen = _key_match(node.key, key, self.page_size) + if plen < len(node.key): + self._split_node(node.key, node, plen) + total_prefix += plen + key = key[plen:] + value = value[plen:] + + if len(key) > 0: + self._add_leaf(node, key, value) + + return total_prefix + + def _insert_swa( + self, + node: TreeNode, + key: RadixKey, + value: torch.Tensor, + prev_prefix_len: int, + swa_evicted_seqlen: int, + ) -> int: + """Insert with SWA tombstone awareness. + + When an existing node is tombstoned and the incoming *value* carries + fresh SWA KV (i.e. beyond *swa_evicted_seqlen*), the node is + un-tombstoned and its value is replaced. 
+ """ + now = time.monotonic() + node.last_access_time = now + if len(key) == 0: + return 0 + + total_prefix = 0 + while len(key) > 0: + ck = _child_key(key, self.page_size) + if ck not in node.children: + break + node = node.children[ck] + node.last_access_time = now + plen = _key_match(node.key, key, self.page_size) + + if plen < len(node.key): + self._split_node(node.key, node, plen) + + beyond_protected = prev_prefix_len < total_prefix + plen + if beyond_protected and node.swa_tombstone: + if swa_evicted_seqlen <= total_prefix: + self._free_indices(node.value[:plen]) + node.value = value[:plen].clone() + node.swa_tombstone = False + self._swa_evictable_size += len(node.value) + else: + self._free_indices(value[:plen]) + elif beyond_protected: + self._free_indices(value[:plen]) + + total_prefix += plen + key = key[plen:] + value = value[plen:] + + if len(key) > 0: + if ( + swa_evicted_seqlen > total_prefix + and swa_evicted_seqlen < total_prefix + len(key) + ): + tomb_len = swa_evicted_seqlen - total_prefix + self._add_leaf( + node, key[:tomb_len], value[:tomb_len], swa_tombstone=True + ) + node = node.children[_child_key(key, self.page_size)] + key = key[tomb_len:] + value = value[tomb_len:] + + if len(key) > 0: + self._add_leaf(node, key, value, swa_tombstone=False) + + return total_prefix + + def _add_leaf( + self, + parent: TreeNode, + key: RadixKey, + value: torch.Tensor, + swa_tombstone: bool = False, + ) -> TreeNode: + new_node = TreeNode() + new_node.parent = parent + new_node.key = key + new_node.value = value.clone() + new_node.swa_tombstone = swa_tombstone + parent.children[_child_key(key, self.page_size)] = new_node + self._evictable_size += len(key) + if self.supports_swa and not swa_tombstone: + self._swa_evictable_size += len(key) + return new_node + + def _split_node(self, key: RadixKey, child: TreeNode, split_len: int) -> TreeNode: + """Split *child* at *split_len*, returning the new parent node.""" + new_node = TreeNode() + 
new_node.children[_child_key(key[split_len:], self.page_size)] = child + new_node.parent = child.parent + new_node.lock_ref = child.lock_ref + new_node.swa_lock_ref = child.swa_lock_ref + new_node.swa_tombstone = child.swa_tombstone + new_node.swa_boundary_id = child.swa_boundary_id + child.swa_boundary_id = None + new_node.key = child.key[:split_len] + new_node.value = child.value[:split_len].clone() + + # Split hash values if they exist + if child.hash_values is not None: + pages = split_len // self.page_size if self.page_size > 1 else split_len + new_node.hash_values = child.hash_values[:pages] + child.hash_values = child.hash_values[pages:] + else: + new_node.hash_values = None + + child.parent = new_node + child.key = child.key[split_len:] + child.value = child.value[split_len:].clone() + new_node.parent.children[_child_key(key, self.page_size)] = new_node + return new_node + + def _delete_leaf(self, node: TreeNode) -> None: + ck = _child_key(node.key, self.page_size) + node.parent.children.pop(ck, None) + self._evictable_size -= len(node.key) + if self.supports_swa and not node.swa_tombstone: + self._swa_evictable_size -= len(node.key) + + def _tombstone_node(self, node: TreeNode) -> None: + node.swa_tombstone = True + self._swa_evictable_size -= len(node.key) + + def _collect_evictable_leaves(self) -> List[TreeNode]: + leaves: List[TreeNode] = [] + stack: List[TreeNode] = [self.root_node] + while stack: + n = stack.pop() + if n.evicted: + continue + has_live_child = False + for c in n.children.values(): + if not c.evicted: + has_live_child = True + stack.append(c) + if not has_live_child and n.lock_ref == 0 and n != self.root_node: + leaves.append(n) + return leaves + + def _collect_swa_evictable(self) -> List[TreeNode]: + nodes: List[TreeNode] = [] + stack: List[TreeNode] = [self.root_node] + while stack: + n = stack.pop() + if n.evicted: + continue + if n != self.root_node and not n.swa_tombstone and n.swa_lock_ref == 0: + nodes.append(n) + stack.extend(c 
for c in n.children.values() if not c.evicted) + return nodes + + def _page_align_key(self, key: RadixKey) -> RadixKey: + if self.page_size == 1: + return key + aligned = len(key) // self.page_size * self.page_size + return key[:aligned] + + def _free_indices(self, indices: torch.Tensor) -> None: + if self.pool is not None and len(indices) > 0: + self.pool.free(indices) + + def _free_swa_indices(self, indices: torch.Tensor) -> None: + if self.pool is not None and len(indices) > 0: + if hasattr(self.pool, "free_swa"): + self.pool.free_swa(indices) + else: + self.pool.free(indices) + + def _print_helper(self, node: TreeNode, indent: int) -> None: + stack = [(node, indent)] + while stack: + n, ind = stack.pop() + toks = n.key.token_ids[:10] if n.key else [] + klen = len(n.key) if n.key else 0 + flags = f"lock={n.lock_ref}" + if self.supports_swa: + flags += f" swa={n.swa_lock_ref} tomb={n.swa_tombstone}" + print(f"{' ' * ind}[{klen}] {toks} {flags}") + for c in n.children.values(): + stack.append((c, ind + 1)) diff --git a/pymllm/orchestrator/async_disk_io_process.py b/pymllm/orchestrator/async_disk_io_process.py index 598d93eb..ef3fd5f0 100644 --- a/pymllm/orchestrator/async_disk_io_process.py +++ b/pymllm/orchestrator/async_disk_io_process.py @@ -1,3 +1,84 @@ +""" +AsyncDiskIoProcess -- optional subprocess for asynchronous disk I/O. + +Handles weight loading, checkpoint saving, or other heavy disk operations +without blocking the scheduler or model runner. +""" + +import logging +from multiprocessing.connection import Connection +from typing import Any, Dict, Optional + +import zmq + +from pymllm.orchestrator.ipc_utils import create_zmq_socket + +logger = logging.getLogger(__name__) + + class AsyncDiskIoProcess: - def __init__(self): + """Runs inside a subprocess. 
Performs disk I/O on behalf of the scheduler.""" + + def __init__(self, recv_addr: str): + self._recv_addr = recv_addr + + self._zmq_ctx: Optional[zmq.Context] = None + self._recv_sock: Optional[zmq.Socket] = None + + # ------------------------------------------------------------------ + # Lifecycle + # ------------------------------------------------------------------ + + def init_sockets(self) -> None: + self._zmq_ctx = zmq.Context() + self._recv_sock = create_zmq_socket( + self._zmq_ctx, zmq.PULL, self._recv_addr, bind=True, + ) + + def event_loop(self) -> None: + """Infinite loop: recv I/O request -> execute -> (optionally reply).""" + logger.info("AsyncDiskIoProcess event loop started") + while True: + io_request: Dict[str, Any] = self._recv_sock.recv_pyobj() + self._handle(io_request) + + # ------------------------------------------------------------------ + # I/O handling (placeholder) + # ------------------------------------------------------------------ + + def _handle(self, io_request: Dict[str, Any]) -> None: + """Dispatch an I/O request. + + TODO: implement weight loading, checkpoint save, etc. 
+ """ + kind = io_request.get("kind") + logger.debug("AsyncDiskIoProcess received request kind=%s", kind) + + # ------------------------------------------------------------------ + # Cleanup + # ------------------------------------------------------------------ + + def shutdown(self) -> None: + if self._recv_sock is not None: + self._recv_sock.close() + if self._zmq_ctx is not None: + self._zmq_ctx.term() + + +def run_async_disk_io_process( + recv_addr: str, + pipe_writer: Connection, +) -> None: + """Entry point for ``torch.multiprocessing.Process(target=...)``.""" + proc = AsyncDiskIoProcess(recv_addr) + proc.init_sockets() + + pipe_writer.send({"status": "ready", "process": "async_disk_io"}) + pipe_writer.close() + + try: + proc.event_loop() + except KeyboardInterrupt: pass + finally: + proc.shutdown() diff --git a/pymllm/orchestrator/detokenizer_process.py b/pymllm/orchestrator/detokenizer_process.py index 47c1c595..e9d5184b 100644 --- a/pymllm/orchestrator/detokenizer_process.py +++ b/pymllm/orchestrator/detokenizer_process.py @@ -1,3 +1,114 @@ +""" +DetokenizerProcess -- subprocess that converts token IDs back to text. + +Receives ``BatchTokenIDOut``-style dicts from the SchedulerProcess, +detokenizes them, and forwards the decoded strings to the +RequestResponseProcess. +""" + +import logging +from multiprocessing.connection import Connection +from typing import Any, Dict, List, Optional + +import zmq + +from pymllm.orchestrator.ipc_utils import create_zmq_socket + +logger = logging.getLogger(__name__) + + class DetokenizerProcess: - def __init__(self): + """Runs inside a subprocess. 
Detokenizes finished outputs.""" + + def __init__( + self, + recv_from_scheduler_addr: str, + send_to_rr_addr: str, + ): + self._recv_from_scheduler_addr = recv_from_scheduler_addr + self._send_to_rr_addr = send_to_rr_addr + + self._zmq_ctx: Optional[zmq.Context] = None + self._recv_from_scheduler: Optional[zmq.Socket] = None + self._send_to_rr: Optional[zmq.Socket] = None + + # TODO: initialise the tokenizer (needed for decode) + self._tokenizer = None + + # ------------------------------------------------------------------ + # Lifecycle + # ------------------------------------------------------------------ + + def init_sockets(self) -> None: + self._zmq_ctx = zmq.Context() + self._recv_from_scheduler = create_zmq_socket( + self._zmq_ctx, + zmq.PULL, + self._recv_from_scheduler_addr, + bind=False, + ) + self._send_to_rr = create_zmq_socket( + self._zmq_ctx, + zmq.PUSH, + self._send_to_rr_addr, + bind=False, + ) + + def event_loop(self) -> None: + """Infinite loop: recv token IDs -> detokenize -> send text to RR.""" + logger.info("DetokenizerProcess event loop started") + while True: + token_id_out = self._recv_from_scheduler.recv_pyobj() + str_out = self._detokenize(token_id_out) + self._send_to_rr.send_pyobj(str_out) + + # ------------------------------------------------------------------ + # Detokenization (placeholder) + # ------------------------------------------------------------------ + + def _detokenize(self, token_id_out: Dict[str, Any]) -> Dict[str, Any]: + """Convert token IDs to text. + + TODO: replace with real tokenizer.decode() call and incremental + detokenization logic. 
+ """ + output_ids: List[int] = token_id_out.get("output_token_ids", []) + # placeholder: join ids as string + text = "" # TODO: self._tokenizer.decode(output_ids) + return { + "rid": token_id_out.get("rid"), + "text": text, + "output_token_ids": output_ids, + } + + # ------------------------------------------------------------------ + # Cleanup + # ------------------------------------------------------------------ + + def shutdown(self) -> None: + if self._recv_from_scheduler is not None: + self._recv_from_scheduler.close() + if self._send_to_rr is not None: + self._send_to_rr.close() + if self._zmq_ctx is not None: + self._zmq_ctx.term() + + +def run_detokenizer_process( + recv_from_scheduler_addr: str, + send_to_rr_addr: str, + pipe_writer: Connection, +) -> None: + """Entry point for ``torch.multiprocessing.Process(target=...)``.""" + proc = DetokenizerProcess(recv_from_scheduler_addr, send_to_rr_addr) + proc.init_sockets() + + pipe_writer.send({"status": "ready", "process": "detokenizer"}) + pipe_writer.close() + + try: + proc.event_loop() + except KeyboardInterrupt: pass + finally: + proc.shutdown() diff --git a/pymllm/orchestrator/ipc_utils.py b/pymllm/orchestrator/ipc_utils.py new file mode 100644 index 00000000..faaf7a6d --- /dev/null +++ b/pymllm/orchestrator/ipc_utils.py @@ -0,0 +1,70 @@ +"""ZMQ IPC utilities for inter-process communication. + +Provides helpers to generate unique IPC addresses and create pre-configured +ZMQ sockets so that every process uses the same conventions. +""" + +import os +import tempfile +from typing import Optional + +import zmq + + +_IPC_DIR = os.path.join(tempfile.gettempdir(), "pymllm_ipc") + + +def _ensure_ipc_dir() -> None: + os.makedirs(_IPC_DIR, exist_ok=True) + + +def make_ipc_address(name: str, unique_id: Optional[str] = None) -> str: + """Return an ``ipc://`` address for *name*, optionally scoped by *unique_id*. + + Parameters + ---------- + name + Logical channel name, e.g. ``"rr_to_tokenizer"``. 
+ unique_id + Per-engine identifier (typically ``str(os.getpid())``) to avoid + collisions when multiple engines run on the same host. + """ + _ensure_ipc_dir() + suffix = f"_{unique_id}" if unique_id else "" + return f"ipc://{_IPC_DIR}/pymllm_{name}{suffix}" + + +def create_zmq_socket( + ctx: zmq.Context, + socket_type: int, + address: str, + bind: bool, +) -> zmq.Socket: + """Create a ZMQ socket, bind or connect it, and return it. + + Parameters + ---------- + ctx + A ``zmq.Context`` shared within the process. + socket_type + One of ``zmq.PUSH``, ``zmq.PULL``, ``zmq.PAIR``, etc. + address + The ``ipc://`` address string. + bind + If ``True`` the socket calls ``bind``; otherwise ``connect``. + """ + sock = ctx.socket(socket_type) + sock.setsockopt(zmq.LINGER, 0) + if bind: + sock.bind(address) + else: + sock.connect(address) + return sock + + +def close_zmq_socket(sock: zmq.Socket) -> None: + """Close a ZMQ socket, ignoring errors.""" + try: + sock.close() + except zmq.ZMQError: + pass diff --git a/pymllm/orchestrator/model_runner_process.py b/pymllm/orchestrator/model_runner_process.py index 45091b59..4b28645e 100644 --- a/pymllm/orchestrator/model_runner_process.py +++ b/pymllm/orchestrator/model_runner_process.py @@ -1,3 +1,114 @@ +""" +ModelRunnerProcess -- subprocess that executes model forward passes. + +Receives batches from the SchedulerProcess, runs the model forward + sampling, +and returns the results (logits, next_token_ids) back to the scheduler. +""" + +import logging +from multiprocessing.connection import Connection +from typing import Any, Dict, Optional + +import zmq + +from pymllm.orchestrator.ipc_utils import create_zmq_socket + +logger = logging.getLogger(__name__) + + class ModelRunnerProcess: - def __init__(self): + """Runs inside a subprocess. 
Owns the model and performs forward passes.""" + + def __init__( + self, + recv_from_scheduler_addr: str, + send_to_scheduler_addr: str, + ): + self._recv_from_scheduler_addr = recv_from_scheduler_addr + self._send_to_scheduler_addr = send_to_scheduler_addr + + self._zmq_ctx: Optional[zmq.Context] = None + self._recv_from_scheduler: Optional[zmq.Socket] = None + self._send_to_scheduler: Optional[zmq.Socket] = None + + # TODO: initialise model, attention backend, memory pool, etc. + self._model = None + + # ------------------------------------------------------------------ + # Lifecycle + # ------------------------------------------------------------------ + + def init_sockets(self) -> None: + self._zmq_ctx = zmq.Context() + self._recv_from_scheduler = create_zmq_socket( + self._zmq_ctx, zmq.PULL, self._recv_from_scheduler_addr, bind=False, + ) + self._send_to_scheduler = create_zmq_socket( + self._zmq_ctx, zmq.PUSH, self._send_to_scheduler_addr, bind=False, + ) + + def event_loop(self) -> None: + """Infinite loop: recv batch -> forward -> sample -> send result.""" + logger.info("ModelRunnerProcess event loop started") + while True: + batch = self._recv_from_scheduler.recv_pyobj() + result = self._forward_batch(batch) + self._send_to_scheduler.send_pyobj(result) + + # ------------------------------------------------------------------ + # Forward pass (placeholder) + # ------------------------------------------------------------------ + + def _forward_batch(self, batch: Dict[str, Any]) -> Dict[str, Any]: + """Run the model forward pass and sampling for *batch*. + + TODO: implement real forward pass, logits processing, and sampling. + """ + requests = batch.get("requests", []) + finished = [] + unfinished = [] + + for req in requests: + # TODO: actual model forward, logits -> next_token_ids + next_token_ids = [] # placeholder + req["output_token_ids"] = req.get("output_token_ids", []) + next_token_ids + # TODO: check EOS / max_tokens to decide finished vs. 
unfinished + finished.append(req) + + return { + "batch_id": batch.get("batch_id"), + "finished": finished, + "unfinished": unfinished, + } + + # ------------------------------------------------------------------ + # Cleanup + # ------------------------------------------------------------------ + + def shutdown(self) -> None: + if self._recv_from_scheduler is not None: + self._recv_from_scheduler.close() + if self._send_to_scheduler is not None: + self._send_to_scheduler.close() + if self._zmq_ctx is not None: + self._zmq_ctx.term() + + +def run_model_runner_process( + recv_from_scheduler_addr: str, + send_to_scheduler_addr: str, + pipe_writer: Connection, +) -> None: + """Entry point for ``torch.multiprocessing.Process(target=...)``.""" + proc = ModelRunnerProcess(recv_from_scheduler_addr, send_to_scheduler_addr) + proc.init_sockets() + + pipe_writer.send({"status": "ready", "process": "model_runner"}) + pipe_writer.close() + + try: + proc.event_loop() + except KeyboardInterrupt: pass + finally: + proc.shutdown() diff --git a/pymllm/orchestrator/request_response_process.py b/pymllm/orchestrator/request_response_process.py index 998c2655..74335428 100644 --- a/pymllm/orchestrator/request_response_process.py +++ b/pymllm/orchestrator/request_response_process.py @@ -1,10 +1,150 @@ """ -This module contains the request and response threads for the orchestrator. +RequestResponseProcess -- the main-process entry point for user requests. -NOTE: This RR(request and response) threads can only be used as the main thread of the orchestrator. +This process is **not** a subprocess; it lives in the engine's main process. +Incoming requests are placed into an ``asyncio.Queue`` and forwarded to the +TokenizerProcess via ZMQ. Decoded results arrive back from the +DetokenizerProcess and are dispatched to the waiting callers. 
+ +The request-tracking model uses ``ReqState`` pattern: each request +gets an ``asyncio.Event`` + output list so that streaming (multiple incremental +chunks) and one-shot responses are both supported. """ +import asyncio +import dataclasses +import logging +from typing import Any, Dict, List, Optional + +import zmq +import zmq.asyncio + +from pymllm.engine.io_struct import GenerateReqInput +from pymllm.orchestrator.ipc_utils import create_zmq_socket, close_zmq_socket + +logger = logging.getLogger(__name__) + + +@dataclasses.dataclass +class ReqState: + """Per-request state that supports both streaming and one-shot responses. + + ``ReqState`` (Event + out_list). + + The recv loop appends results to *out_list* and signals *event*; + callers ``await event.wait()`` in a loop, consuming results until + *finished* is ``True``. + """ + + out_list: List[Dict[str, Any]] = dataclasses.field(default_factory=list) + finished: bool = False + event: asyncio.Event = dataclasses.field(default_factory=asyncio.Event) + class RequestResponseProcess: - def __init__(self): - pass + """Sits in the main process; bridges user-facing API and subprocess pipeline.""" + + def __init__( + self, + send_to_tokenizer_addr: str, + recv_from_detokenizer_addr: str, + ): + self._send_to_tokenizer_addr: str = send_to_tokenizer_addr + self._recv_from_detokenizer_addr: str = recv_from_detokenizer_addr + + # asyncio queue that buffers incoming user requests + self._request_queue: asyncio.Queue[Dict[str, Any]] = asyncio.Queue() + + # rid -> ReqState (replaces the old rid -> Future dict) + self._rid_to_state: Dict[str, ReqState] = {} + + # ZMQ (async context, sockets created lazily in the event loop) + self._zmq_ctx: Optional[zmq.asyncio.Context] = None + self._send_to_tokenizer: Optional[zmq.asyncio.Socket] = None + self._recv_from_detokenizer: Optional[zmq.asyncio.Socket] = None + + self._loop_task: Optional[asyncio.Task] = None + + def start(self, loop: asyncio.AbstractEventLoop) -> None: + """Kick 
off the background send/recv tasks on *loop*.""" + self._zmq_ctx = zmq.asyncio.Context() + self._send_to_tokenizer = create_zmq_socket( + self._zmq_ctx, + zmq.PUSH, + self._send_to_tokenizer_addr, + bind=True, + ) + self._recv_from_detokenizer = create_zmq_socket( + self._zmq_ctx, + zmq.PULL, + self._recv_from_detokenizer_addr, + bind=True, + ) + self._loop_task = loop.create_task(self._run()) + + async def add_request(self, request: GenerateReqInput) -> ReqState: + """Enqueue a request and return its :class:`ReqState`. + + Callers should ``await state.event.wait()`` in a loop, consuming + ``state.out_list`` entries until ``state.finished`` is ``True``. + """ + if not isinstance(request.rid, str): + raise ValueError("RequestResponseProcess currently accepts single requests only.") + rid = request.rid + state = ReqState() + self._rid_to_state[rid] = state + await self._request_queue.put(request.to_request_dict()) + return state + + def remove_state(self, rid: str) -> None: + """Remove the ``ReqState`` for *rid* (called by the caller once done).""" + self._rid_to_state.pop(rid, None) + + async def abort_request(self, rid: str) -> None: + """Cancel a pending request and notify downstream processes.""" + state = self._rid_to_state.pop(rid, None) + if state is not None and not state.finished: + state.finished = True + state.out_list.append({"rid": rid, "error": "aborted", "finished": True}) + state.event.set() + await self._send_to_tokenizer.send_pyobj({"rid": rid, "abort": True}) + + async def shutdown(self) -> None: + if self._loop_task is not None: + self._loop_task.cancel() + if self._send_to_tokenizer is not None: + close_zmq_socket(self._send_to_tokenizer) + if self._recv_from_detokenizer is not None: + close_zmq_socket(self._recv_from_detokenizer) + if self._zmq_ctx is not None: + self._zmq_ctx.term() + + # ------------------------------------------------------------------ + # Internal loops + # ------------------------------------------------------------------ + 
+ async def _run(self) -> None: + """Main loop: forward requests to tokenizer, receive results from detokenizer.""" + send_task = asyncio.create_task(self._send_loop()) + recv_task = asyncio.create_task(self._recv_loop()) + await asyncio.gather(send_task, recv_task) + + async def _send_loop(self) -> None: + """Drain the asyncio queue and push requests to the TokenizerProcess.""" + while True: + request = await self._request_queue.get() + await self._send_to_tokenizer.send_pyobj(request) + + async def _recv_loop(self) -> None: + """Receive decoded results from DetokenizerProcess and dispatch to ReqStates.""" + while True: + result = await self._recv_from_detokenizer.recv_pyobj() + rid = result.get("rid") + state = self._rid_to_state.get(rid) + if state is None: + logger.warning("Received result for unknown rid=%s", rid) + continue + state.out_list.append(result) + if result.get("finished", False): + state.finished = True + state.event.set() diff --git a/pymllm/orchestrator/scheduler_process.py b/pymllm/orchestrator/scheduler_process.py index 7a7783d5..e7394dab 100644 --- a/pymllm/orchestrator/scheduler_process.py +++ b/pymllm/orchestrator/scheduler_process.py @@ -1,3 +1,248 @@ +""" +SchedulerProcess -- the central scheduling hub. + +Receives tokenized requests from the TokenizerProcess, organises them into +batches, dispatches batches to the ModelRunnerProcess for forward passes, +collects results, and streams finished token IDs to the DetokenizerProcess. 
+ +The main ``event_loop`` scheduler flow:: + + while True: + recv_requests() + process_input_requests() + batch = get_next_batch_to_run() + if batch: + run_batch(batch) + process_batch_result(batch) + stream_output() +""" + +import logging +import time +from collections import deque +from multiprocessing.connection import Connection +from typing import Any, Deque, Dict, List, Optional + +import zmq + +from pymllm.orchestrator.ipc_utils import create_zmq_socket + +logger = logging.getLogger(__name__) + + class SchedulerProcess: - def __init__(self): + """Runs inside a subprocess. Central hub that drives the inference loop.""" + + def __init__( + self, + recv_from_tokenizer_addr: str, + send_to_model_runner_addr: str, + recv_from_model_runner_addr: str, + send_to_detokenizer_addr: str, + ): + # ZMQ addresses + self._recv_from_tokenizer_addr = recv_from_tokenizer_addr + self._send_to_model_runner_addr = send_to_model_runner_addr + self._recv_from_model_runner_addr = recv_from_model_runner_addr + self._send_to_detokenizer_addr = send_to_detokenizer_addr + + # ZMQ runtime objects (initialised in init_sockets) + self._zmq_ctx: Optional[zmq.Context] = None + self._recv_from_tokenizer: Optional[zmq.Socket] = None + self._send_to_model_runner: Optional[zmq.Socket] = None + self._recv_from_model_runner: Optional[zmq.Socket] = None + self._send_to_detokenizer: Optional[zmq.Socket] = None + self._poller: Optional[zmq.Poller] = None + + # Request management + self._waiting_queue: Deque[Dict[str, Any]] = deque() + self._running_batch: Optional[Dict[str, Any]] = None + self._finished: List[Dict[str, Any]] = [] + + # ------------------------------------------------------------------ + # Lifecycle + # ------------------------------------------------------------------ + + def init_sockets(self) -> None: + self._zmq_ctx = zmq.Context() + + self._recv_from_tokenizer = create_zmq_socket( + self._zmq_ctx, + zmq.PULL, + self._recv_from_tokenizer_addr, + bind=False, + ) + 
self._send_to_model_runner = create_zmq_socket( + self._zmq_ctx, + zmq.PUSH, + self._send_to_model_runner_addr, + bind=True, + ) + self._recv_from_model_runner = create_zmq_socket( + self._zmq_ctx, + zmq.PULL, + self._recv_from_model_runner_addr, + bind=True, + ) + self._send_to_detokenizer = create_zmq_socket( + self._zmq_ctx, + zmq.PUSH, + self._send_to_detokenizer_addr, + bind=True, + ) + + # Poller for non-blocking recv from tokenizer + self._poller = zmq.Poller() + self._poller.register(self._recv_from_tokenizer, zmq.POLLIN) + + def event_loop(self) -> None: + """Infinite scheduling loop.""" + logger.info("SchedulerProcess event loop started") + while True: + self.recv_requests() + self.process_input_requests() + batch = self.get_next_batch_to_run() + if batch is not None: + result = self.run_batch(batch) + self.process_batch_result(batch, result) + self.stream_output() + + # ------------------------------------------------------------------ + # Step 1: receive tokenized requests (non-blocking) + # ------------------------------------------------------------------ + + def recv_requests(self) -> None: + """Non-blocking receive of tokenized requests from TokenizerProcess. + + Uses ``zmq.Poller`` with a short timeout so the scheduler is never + stuck waiting when there are batches to run. + """ + while True: + events = dict(self._poller.poll(timeout=0)) # non-blocking + if self._recv_from_tokenizer not in events: + break + req = self._recv_from_tokenizer.recv_pyobj(zmq.NOBLOCK) + self._waiting_queue.append(req) + + # ------------------------------------------------------------------ + # Step 2: process input requests + # ------------------------------------------------------------------ + + def process_input_requests(self) -> None: + """Pre-process and validate requests sitting in ``_waiting_queue``. + + TODO: attach sampling params, allocate KV-cache slots, etc. 
+ """ pass + + # ------------------------------------------------------------------ + # Step 3: build the next batch + # ------------------------------------------------------------------ + + def get_next_batch_to_run(self) -> Optional[Dict[str, Any]]: + """Select requests from ``_waiting_queue`` and form a batch. + + TODO: implement real batching / scheduling policy. + """ + if not self._waiting_queue: + return None + + batch_requests: List[Dict[str, Any]] = [] + # TODO: respect max_running_requests, memory budget, etc. + while self._waiting_queue: + batch_requests.append(self._waiting_queue.popleft()) + + batch = { + "requests": batch_requests, + "batch_id": id(batch_requests), + "created_at": time.time(), + } + return batch + + # ------------------------------------------------------------------ + # Step 4: run the batch via ModelRunnerProcess + # ------------------------------------------------------------------ + + def run_batch(self, batch: Dict[str, Any]) -> Dict[str, Any]: + """Send *batch* to ModelRunnerProcess and wait for the result. + + This is a **blocking** call: the scheduler is synchronous with the + model runner for simplicity. Overlap scheduling can be added later. + """ + self._send_to_model_runner.send_pyobj(batch) + result = self._recv_from_model_runner.recv_pyobj() + return result + + # ------------------------------------------------------------------ + # Step 5: process batch result + # ------------------------------------------------------------------ + + def process_batch_result( + self, batch: Dict[str, Any], result: Dict[str, Any] + ) -> None: + """Handle the result returned by the ModelRunnerProcess. + + TODO: check completion status (EOS, max_tokens), manage KV-cache, + split finished vs. unfinished requests. 
+ """ + finished_requests = result.get("finished", []) + unfinished_requests = result.get("unfinished", []) + + self._finished.extend(finished_requests) + + # Put unfinished requests back for the next iteration + for req in unfinished_requests: + self._waiting_queue.appendleft(req) + + # ------------------------------------------------------------------ + # Step 6: stream output to DetokenizerProcess + # ------------------------------------------------------------------ + + def stream_output(self) -> None: + """Send finished token-ID outputs to the DetokenizerProcess.""" + while self._finished: + item = self._finished.pop(0) + self._send_to_detokenizer.send_pyobj(item) + + # ------------------------------------------------------------------ + # Cleanup + # ------------------------------------------------------------------ + + def shutdown(self) -> None: + for sock in ( + self._recv_from_tokenizer, + self._send_to_model_runner, + self._recv_from_model_runner, + self._send_to_detokenizer, + ): + if sock is not None: + sock.close() + if self._zmq_ctx is not None: + self._zmq_ctx.term() + + +def run_scheduler_process( + recv_from_tokenizer_addr: str, + send_to_model_runner_addr: str, + recv_from_model_runner_addr: str, + send_to_detokenizer_addr: str, + pipe_writer: Connection, +) -> None: + """Entry point for ``torch.multiprocessing.Process(target=...)``.""" + proc = SchedulerProcess( + recv_from_tokenizer_addr, + send_to_model_runner_addr, + recv_from_model_runner_addr, + send_to_detokenizer_addr, + ) + proc.init_sockets() + + pipe_writer.send({"status": "ready", "process": "scheduler"}) + pipe_writer.close() + + try: + proc.event_loop() + except KeyboardInterrupt: + pass + finally: + proc.shutdown() diff --git a/pymllm/orchestrator/tokenizer_process.py b/pymllm/orchestrator/tokenizer_process.py index 0dca2155..852fac11 100644 --- a/pymllm/orchestrator/tokenizer_process.py +++ b/pymllm/orchestrator/tokenizer_process.py @@ -1,3 +1,102 @@ +""" +TokenizerProcess -- 
subprocess that tokenizes incoming raw requests. + +Receives raw requests from RequestResponseProcess via ZMQ, tokenizes them, +and forwards the tokenized payloads to the SchedulerProcess. +""" + +import logging +from multiprocessing.connection import Connection +from typing import Any, Dict, List + +import zmq + +from pymllm.orchestrator.ipc_utils import create_zmq_socket + +logger = logging.getLogger(__name__) + + class TokenizerProcess: - def __init__(self): + """Runs inside a subprocess spawned by ``torch.multiprocessing``.""" + + def __init__( + self, + recv_from_rr_addr: str, + send_to_scheduler_addr: str, + ): + self._recv_from_rr_addr = recv_from_rr_addr + self._send_to_scheduler_addr = send_to_scheduler_addr + + self._zmq_ctx: zmq.Context = None + self._recv_from_rr: zmq.Socket = None + self._send_to_scheduler: zmq.Socket = None + + # TODO: initialise the actual tokenizer (HuggingFace / custom) + self._tokenizer = None + + # ------------------------------------------------------------------ + # Lifecycle + # ------------------------------------------------------------------ + + def init_sockets(self) -> None: + self._zmq_ctx = zmq.Context() + self._recv_from_rr = create_zmq_socket( + self._zmq_ctx, zmq.PULL, self._recv_from_rr_addr, bind=False, + ) + self._send_to_scheduler = create_zmq_socket( + self._zmq_ctx, zmq.PUSH, self._send_to_scheduler_addr, bind=True, + ) + + def event_loop(self) -> None: + """Infinite loop: recv raw request -> tokenize -> send to scheduler.""" + logger.info("TokenizerProcess event loop started") + while True: + raw_request: Dict[str, Any] = self._recv_from_rr.recv_pyobj() + tokenized = self._tokenize(raw_request) + self._send_to_scheduler.send_pyobj(tokenized) + + # ------------------------------------------------------------------ + # Tokenization (placeholder) + # ------------------------------------------------------------------ + + def _tokenize(self, raw_request: Dict[str, Any]) -> Dict[str, Any]: + """Tokenize a single raw 
request and return the tokenized payload. + + TODO: replace with real tokenizer call. + """ + text = raw_request.get("text", "") + # placeholder: produce fake token ids + input_ids: List[int] = [] # TODO: self._tokenizer.encode(text) + return { + **raw_request, + "input_ids": input_ids, + } + + def shutdown(self) -> None: + if self._recv_from_rr is not None: + self._recv_from_rr.close() + if self._send_to_scheduler is not None: + self._send_to_scheduler.close() + if self._zmq_ctx is not None: + self._zmq_ctx.term() + + +def run_tokenizer_process( + recv_from_rr_addr: str, + send_to_scheduler_addr: str, + pipe_writer: Connection, +) -> None: + """Entry point for ``torch.multiprocessing.Process(target=...)``.""" + proc = TokenizerProcess(recv_from_rr_addr, send_to_scheduler_addr) + proc.init_sockets() + + # Signal readiness to the parent process + pipe_writer.send({"status": "ready", "process": "tokenizer"}) + pipe_writer.close() + + try: + proc.event_loop() + except KeyboardInterrupt: pass + finally: + proc.shutdown() From e5e1b789fe249c229e51a90cb5e1ea888bbbdd32 Mon Sep 17 00:00:00 2001 From: chenghuaWang <2923277184@qq.com> Date: Sat, 21 Feb 2026 15:05:58 +0000 Subject: [PATCH 09/13] refactor: improve socket initialization in TokenizerProcess - Enhanced readability by formatting socket creation parameters across multiple lines in the `init_sockets` method of `TokenizerProcess`. - Maintained functionality while improving code clarity for future maintenance. 
--- pymllm/orchestrator/tokenizer_process.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/pymllm/orchestrator/tokenizer_process.py b/pymllm/orchestrator/tokenizer_process.py index 852fac11..53714bb6 100644 --- a/pymllm/orchestrator/tokenizer_process.py +++ b/pymllm/orchestrator/tokenizer_process.py @@ -41,10 +41,16 @@ def __init__( def init_sockets(self) -> None: self._zmq_ctx = zmq.Context() self._recv_from_rr = create_zmq_socket( - self._zmq_ctx, zmq.PULL, self._recv_from_rr_addr, bind=False, + self._zmq_ctx, + zmq.PULL, + self._recv_from_rr_addr, + bind=False, ) self._send_to_scheduler = create_zmq_socket( - self._zmq_ctx, zmq.PUSH, self._send_to_scheduler_addr, bind=True, + self._zmq_ctx, + zmq.PUSH, + self._send_to_scheduler_addr, + bind=True, ) def event_loop(self) -> None: From 65f00b4eda03ec1d86db16ae8d89b17fd4684e86 Mon Sep 17 00:00:00 2001 From: chenghuaWang <2923277184@qq.com> Date: Fri, 27 Feb 2026 11:23:17 +0000 Subject: [PATCH 10/13] feat(engine): support batch generation and enable shared memory queue for IPC - Add enable_shared_queue config option to server configuration - Implement shared memory queue for fast IPC between tokenizer and scheduler - Refactor Engine.generate and generate_async to support single and batch requests - Add colorful ASCII art banner on engine startup if dependencies are available - Add _make_rids utility to auto-generate request IDs for batch and single requests - Implement TokenizedGenerateReqInput with multimodal inputs support - Refactor RequestResponseProcess to handle batch requests and return list of ReqStates - Enhance SchedulerProcess to receive requests from shared queue or legacy ZMQ - Introduce SharedMemoryManager for managing metadata in shared memory segments - Create TensorQueue to support fast IPC of tensors via shared memory and queues - Add CUDA IPC Transport module for zero-copy GPU tensor sharing with workspace buffer - Refactor ModelRunnerProcess to handle batch requests 
with actual output structure placeholders - Improve resource management and error handling in shared memory IPC utilities --- pymllm/configs/server_config.py | 1 + .../normal.py => engine/__init__.py} | 0 pymllm/engine/forward_batch.py | 0 pymllm/engine/io_struct.py | 9 + pymllm/engine/launch.py | 181 +++++++-- pymllm/layers/attention/attention_backend.py | 0 pymllm/layers/attention/flashinfer_backend.py | 0 pymllm/layers/attention/radix_attention.py | 0 pymllm/orchestrator/cuda_ipc_transport.py | 373 ++++++++++++++++++ pymllm/orchestrator/model_runner_process.py | 49 ++- .../orchestrator/request_response_process.py | 40 +- pymllm/orchestrator/scheduler_process.py | 97 ++++- pymllm/orchestrator/shared_memory_queue.py | 190 +++++++++ pymllm/orchestrator/tokenizer_process.py | 311 ++++++++++++++- pyproject.toml | 2 + 15 files changed, 1184 insertions(+), 69 deletions(-) rename pymllm/{layers/attention/normal.py => engine/__init__.py} (100%) create mode 100644 pymllm/engine/forward_batch.py create mode 100644 pymllm/layers/attention/attention_backend.py create mode 100644 pymllm/layers/attention/flashinfer_backend.py create mode 100644 pymllm/layers/attention/radix_attention.py create mode 100644 pymllm/orchestrator/cuda_ipc_transport.py create mode 100644 pymllm/orchestrator/shared_memory_queue.py diff --git a/pymllm/configs/server_config.py b/pymllm/configs/server_config.py index 7cda9c3b..9e399d62 100644 --- a/pymllm/configs/server_config.py +++ b/pymllm/configs/server_config.py @@ -78,6 +78,7 @@ class ServerConfig: # --------------------------------------------------------------------- # # Feature switches # --------------------------------------------------------------------- # + enable_shared_queue: bool = False # Use shared memory queue for fast IPC # enable_lora: bool = False # max_loaded_loras: Optional[int] = None # max_loras_per_batch: int = 8 diff --git a/pymllm/layers/attention/normal.py b/pymllm/engine/__init__.py similarity index 100% rename from 
pymllm/layers/attention/normal.py rename to pymllm/engine/__init__.py diff --git a/pymllm/engine/forward_batch.py b/pymllm/engine/forward_batch.py new file mode 100644 index 00000000..e69de29b diff --git a/pymllm/engine/io_struct.py b/pymllm/engine/io_struct.py index 777186e2..06c8d78d 100644 --- a/pymllm/engine/io_struct.py +++ b/pymllm/engine/io_struct.py @@ -135,8 +135,17 @@ def to_request_dict(self) -> Dict[str, Any]: @dataclass class TokenizedGenerateReqInput(BaseReq): + # The decoded text passed to the tokenizer (empty string if only input_ids + # were provided by the caller). input_text: str = "" + # Token IDs produced by the tokenizer. input_ids: List[int] = field(default_factory=list) + # Multimodal inputs (processor output, e.g. pixel_values, or raw image / + # audio / video data when no processor is available). ``None`` means the + # request is text-only. + mm_inputs: Optional[Dict[str, Any]] = None + # Raw sampling parameters dict (parsed into a SamplingParams object by the + # model runner when needed). 
sampling_params: Dict[str, Any] = field(default_factory=dict) stream: bool = False return_logprob: bool = False diff --git a/pymllm/engine/launch.py b/pymllm/engine/launch.py index edad97af..2200d7f3 100644 --- a/pymllm/engine/launch.py +++ b/pymllm/engine/launch.py @@ -11,6 +11,14 @@ from transformers import AutoConfig from huggingface_hub import snapshot_download +try: + from pyfiglet import figlet_format + from termcolor import colored + + HAS_BANNER_LIBS = True +except ImportError: + HAS_BANNER_LIBS = False + from pymllm.configs import get_global_config from pymllm.engine.io_struct import GenerateReqInput from pymllm.orchestrator.ipc_utils import make_ipc_address @@ -18,6 +26,7 @@ ReqState, RequestResponseProcess, ) +from pymllm.orchestrator.shared_memory_queue import TensorQueue from pymllm.orchestrator.tokenizer_process import run_tokenizer_process from pymllm.orchestrator.scheduler_process import run_scheduler_process from pymllm.orchestrator.model_runner_process import run_model_runner_process @@ -68,6 +77,26 @@ def _launch_processes(self) -> None: # Record all subprocesses procs_and_readers: List[tuple] = [] + # Config dict for the tokenizer subprocess (must be picklable). + cfg = get_global_config() + enable_shared_queue = cfg.server.enable_shared_queue + + # Create shared queue if enabled + shared_queue = None + if enable_shared_queue: + # TODO: WCH init CUDA IPC things. 
+ shared_queue = TensorQueue(maxsize=1000) # Configurable max size + logger.info("Shared memory queue enabled for fast IPC") + + tokenizer_cfg: Dict[str, Any] = { + "tokenizer_path": str(cfg.server.tokenizer_path), + "tokenizer_mode": cfg.server.tokenizer_mode, + "trust_remote_code": cfg.server.trust_remote_code, + "context_length": cfg.server.context_length, + "hf_config": cfg.model.hf_config, + "enable_shared_queue": enable_shared_queue, + } + # Tokenizer tokenizer_reader, tokenizer_writer = mp.Pipe(duplex=False) tokenizer_proc = mp.Process( @@ -76,6 +105,8 @@ def _launch_processes(self) -> None: addr_request_response_to_tokenizer, addr_tokenizer_to_scheduler, tokenizer_writer, + tokenizer_cfg, + shared_queue, # Pass shared queue ), daemon=True, ) @@ -91,6 +122,8 @@ def _launch_processes(self) -> None: addr_model_runner_to_scheduler, addr_scheduler_to_detokenizer, scheduler_writer, + shared_queue, # Pass shared queue + enable_shared_queue, # Pass flag ), daemon=True, ) @@ -165,6 +198,29 @@ def _launch_processes(self) -> None: self._rr_process.start(self._loop) logger.info("RequestResponseProcess started in main process") + # Print colorful gradient ASCII art banner + if HAS_BANNER_LIBS: + try: + text = figlet_format("pymllm", font="slant") + fired_up = figlet_format("FIRED UP!", font="slant") + + # Apply blue-purple gradient + lines = text.strip().split("\n") + colors_cycle = ["blue", "cyan", "blue", "magenta", "magenta"] + for i, line in enumerate(lines): + color = colors_cycle[i % len(colors_cycle)] + print(colored(line, color, attrs=["bold"])) + + # Print "FIRED UP!" in bright magenta + for line in fired_up.strip().split("\n"): + print(colored(line, "magenta", attrs=["bold"])) + print() + except Exception as e: + logger.debug(f"Failed to print banner: {e}") + print("🚀 pymllm FIRED UP! 🚀\n") + else: + print("🚀 pymllm FIRED UP! 
🚀\n") + def generate( self, prompt: Optional[Union[List[str], str]] = None, @@ -181,10 +237,14 @@ def generate( stream: bool = False, rid: Optional[Union[List[str], str]] = None, **kwargs, - ) -> Dict[str, Any]: - """Synchronous, non-streaming generation entry point.""" - if rid is None: - rid = uuid.uuid4().hex + ) -> Union[Dict[str, Any], List[Dict[str, Any]]]: + """Synchronous, non-streaming generation entry point. + + Accepts a single prompt (``str``) or a batch (``List[str]``). Returns a + single result dict for single inputs and a list of result dicts for batch + inputs, preserving the input order. + """ + rid = self._make_rids(rid, prompt, input_ids) request = GenerateReqInput( rid=rid, text=prompt, @@ -203,11 +263,18 @@ def generate( ) request.normalize_batch_and_arguments() - async def _run() -> Dict[str, Any]: - state = await self._rr_process.add_request(request) - if isinstance(rid, list): - raise ValueError("Synchronous `generate` currently supports single request.") - return await self._wait_for_final_result(rid, state) + async def _run() -> Union[Dict[str, Any], List[Dict[str, Any]]]: + result = await self._rr_process.add_request(request) + if request.is_single: + single_rid = rid if isinstance(rid, str) else rid[0] + return await self._wait_for_final_result(single_rid, result) # type: ignore[arg-type] + # Batch: wait for every sub-request concurrently. + rids_list: List[str] = rid if isinstance(rid, list) else [rid] # type: ignore[assignment] + states: List[ReqState] = result # type: ignore[assignment] + outputs = await asyncio.gather( + *(self._wait_for_final_result(r, s) for r, s in zip(rids_list, states)) + ) + return list(outputs) return self._loop.run_until_complete(_run()) @@ -230,13 +297,14 @@ async def generate_async( ) -> AsyncIterator[Dict[str, Any]]: """Asynchronous generation entry point. - When *stream* is ``False`` (default) the returned async iterator - yields a **single** final result dict. 
When *stream* is ``True`` - every incremental chunk from the detokenizer is yielded as it - arrives, following the ``Event + out_list`` pattern. + For a **single** request and ``stream=False`` yields one final result + dict; with ``stream=True`` yields incremental chunks. + + For a **batch** request the iterator yields the final result for each + sub-request as it completes (order not guaranteed); streaming mode yields + incremental chunks from all sub-requests interleaved. """ - if rid is None: - rid = uuid.uuid4().hex + rid = self._make_rids(rid, prompt, input_ids) request = GenerateReqInput( rid=rid, text=prompt, @@ -254,18 +322,55 @@ async def generate_async( extra_options=kwargs, ) request.normalize_batch_and_arguments() - state = await self._rr_process.add_request(request) + result = await self._rr_process.add_request(request) - try: - if isinstance(rid, list): - raise ValueError("`generate_async` currently supports single request only.") - if stream: - async for chunk in self._stream_results(rid, state): - yield chunk - else: - yield await self._wait_for_final_result(rid, state) - finally: - self._rr_process.remove_state(rid) + if request.is_single: + single_rid = rid if isinstance(rid, str) else rid[0] # type: ignore[index] + state: ReqState = result # type: ignore[assignment] + try: + if stream: + async for chunk in self._stream_results(single_rid, state): + yield chunk + else: + yield await self._wait_for_final_result(single_rid, state) + finally: + self._rr_process.remove_state(single_rid) + else: + rids_list: List[str] = rid if isinstance(rid, list) else [rid] # type: ignore[assignment] + states: List[ReqState] = result # type: ignore[assignment] + try: + if stream: + # Merge streams from all sub-requests using an asyncio queue. 
+ queue: asyncio.Queue = asyncio.Queue() + + async def _forward(r: str, s: ReqState) -> None: + async for chunk in self._stream_results(r, s): + await queue.put(chunk) + await queue.put(None) # sentinel + + tasks = [ + asyncio.create_task(_forward(r, s)) + for r, s in zip(rids_list, states) + ] + done_count = 0 + while done_count < len(tasks): + item = await queue.get() + if item is None: + done_count += 1 + else: + yield item + await asyncio.gather(*tasks) + else: + for coro in asyncio.as_completed( + [ + self._wait_for_final_result(r, s) + for r, s in zip(rids_list, states) + ] + ): + yield await coro + finally: + for r in rids_list: + self._rr_process.remove_state(r) @staticmethod async def _wait_for_final_result(rid: str, state: ReqState) -> Dict[str, Any]: @@ -290,6 +395,30 @@ async def _stream_results( return state.event.clear() + @staticmethod + def _make_rids( + rid: Optional[Union[str, List[str]]], + prompt: Optional[Union[str, List[str]]], + input_ids: Optional[Union[List[int], List[List[int]]]], + ) -> Union[str, List[str]]: + """Return rids, auto-generating UUIDs when *rid* is ``None``. + + The helper infers whether the call is a batch from *prompt* / *input_ids* + so callers don't have to handle this case themselves. + """ + if rid is not None: + return rid + # Determine batch size from the text/input_ids argument. 
+ is_batch = isinstance(prompt, list) or ( + isinstance(input_ids, list) + and len(input_ids) > 0 + and isinstance(input_ids[0], list) + ) + if is_batch: + n = len(prompt) if prompt is not None else len(input_ids) # type: ignore[arg-type] + return [uuid.uuid4().hex for _ in range(n)] + return uuid.uuid4().hex + def shutdown(self) -> None: """Terminate all subprocesses.""" if self._rr_process is not None: diff --git a/pymllm/layers/attention/attention_backend.py b/pymllm/layers/attention/attention_backend.py new file mode 100644 index 00000000..e69de29b diff --git a/pymllm/layers/attention/flashinfer_backend.py b/pymllm/layers/attention/flashinfer_backend.py new file mode 100644 index 00000000..e69de29b diff --git a/pymllm/layers/attention/radix_attention.py b/pymllm/layers/attention/radix_attention.py new file mode 100644 index 00000000..e69de29b diff --git a/pymllm/orchestrator/cuda_ipc_transport.py b/pymllm/orchestrator/cuda_ipc_transport.py new file mode 100644 index 00000000..7052f0e8 --- /dev/null +++ b/pymllm/orchestrator/cuda_ipc_transport.py @@ -0,0 +1,373 @@ +""" +CUDA IPC Transport for zero-copy tensor sharing between processes. + +This module implements CUDA IPC with workspace buffer management +to avoid PyTorch's memory leak issue when sharing IPC handles. + +1. Create a workspace buffer on GPU (pre-allocated memory pool) +2. Copy tensor data to a chunk in the workspace +3. Get CUDA IPC handle for the chunk +4. Send handle + metadata (shape, dtype, offset) to another process +5. Reconstruct tensor in target process from IPC handle +6. Copy to local tensor and mark chunk as reusable + +Key Problem Solved: + PyTorch never releases tensors whose IPC handles are shared until process ends. + Solution: Use a fixed-size workspace buffer and recycle chunks. 
+""" + +import logging +import struct +import uuid +from dataclasses import dataclass +from multiprocessing import Queue +from multiprocessing.shared_memory import SharedMemory +from typing import Any, Dict, List, Optional, Tuple + +import torch +import torch.cuda as cuda + +logger = logging.getLogger(__name__) + + +@dataclass +class MemoryChunk: + """Represents a chunk in the workspace buffer.""" + + offset: int # Offset in bytes from workspace start + size: int # Size in bytes + in_use: bool # Whether the chunk is currently occupied + sync_shm_name: Optional[str] = None # Shared memory name for sync flag + + +class WorkspaceBuffer: + """GPU memory pool for storing multimodal tensors temporarily. + + This prevents the PyTorch IPC handle memory leak by using a fixed-size + pre-allocated buffer and recycling chunks. + """ + + def __init__(self, size_gb: float = 4.0, device: int = 0): + """Initialize workspace buffer. + + Args: + size_gb: Total size of workspace in GB + device: CUDA device ID + """ + self.device = device + self.total_size = int(size_gb * 1024 * 1024 * 1024) # Convert GB to bytes + + # Allocate workspace on GPU + with torch.cuda.device(device): + self.workspace = torch.empty( + self.total_size // 4, # Divide by 4 because we use float32 + dtype=torch.float32, + device=f"cuda:{device}", + ) + + # Initialize chunk management + self.chunks: List[MemoryChunk] = [ + MemoryChunk(offset=0, size=self.total_size, in_use=False) + ] + + # Container for reusable sync buffers + self.sync_buffer_pool: List[str] = [] + + logger.info( + f"WorkspaceBuffer initialized: {size_gb}GB on cuda:{device}, " + f"ptr={self.workspace.data_ptr():#x}" + ) + + def allocate(self, size_bytes: int) -> Optional[Tuple[int, str]]: + """Allocate a chunk from the workspace. 
+ + Args: + size_bytes: Required size in bytes + + Returns: + Tuple of (offset, sync_shm_name) if successful, None if no space + """ + # Find a free chunk that's large enough + for i, chunk in enumerate(self.chunks): + if not chunk.in_use and chunk.size >= size_bytes: + # Mark chunk as in use + chunk.in_use = True + + # Get or create sync buffer + if self.sync_buffer_pool: + sync_shm_name = self.sync_buffer_pool.pop() + # Reset sync flag to 0 (not ready) + self._reset_sync_buffer(sync_shm_name) + else: + sync_shm_name = self._create_sync_buffer() + + chunk.sync_shm_name = sync_shm_name + + # If chunk is larger than needed, split it + if chunk.size > size_bytes: + # Create a new free chunk for the remaining space + new_chunk = MemoryChunk( + offset=chunk.offset + size_bytes, + size=chunk.size - size_bytes, + in_use=False, + ) + chunk.size = size_bytes + self.chunks.insert(i + 1, new_chunk) + + logger.debug( + f"Allocated chunk: offset={chunk.offset}, size={size_bytes}, " + f"sync_shm={sync_shm_name}" + ) + return chunk.offset, sync_shm_name + + logger.warning(f"WorkspaceBuffer: No space for {size_bytes} bytes") + return None + + def release(self, offset: int) -> None: + """Release a chunk back to the pool. 
+ + Args: + offset: Offset of the chunk to release + """ + for i, chunk in enumerate(self.chunks): + if chunk.offset == offset and chunk.in_use: + chunk.in_use = False + + # Return sync buffer to pool + if chunk.sync_shm_name: + self.sync_buffer_pool.append(chunk.sync_shm_name) + chunk.sync_shm_name = None + + # Try to merge with adjacent free chunks + self._merge_chunks() + + logger.debug(f"Released chunk: offset={offset}") + return + + logger.warning(f"Attempted to release unknown chunk at offset {offset}") + + def _merge_chunks(self) -> None: + """Merge adjacent free chunks to reduce fragmentation.""" + i = 0 + while i < len(self.chunks) - 1: + current = self.chunks[i] + next_chunk = self.chunks[i + 1] + + if not current.in_use and not next_chunk.in_use: + # Merge chunks + current.size += next_chunk.size + + # Keep first chunk's sync buffer, return second to pool + if next_chunk.sync_shm_name: + self.sync_buffer_pool.append(next_chunk.sync_shm_name) + + self.chunks.pop(i + 1) + else: + i += 1 + + def _create_sync_buffer(self) -> str: + """Create a new shared memory sync buffer (8 bytes, initialized to 0).""" + shm_name = f"pymllm_sync_{uuid.uuid4().hex[:12]}" + shm = SharedMemory(name=shm_name, create=True, size=8) + # Initialize to 0 (not ready) + shm.buf[:8] = struct.pack("Q", 0) + shm.close() + logger.debug(f"Created sync buffer: {shm_name}") + return shm_name + + def _reset_sync_buffer(self, shm_name: str) -> None: + """Reset sync buffer to 0 (not ready).""" + try: + shm = SharedMemory(name=shm_name, create=False) + shm.buf[:8] = struct.pack("Q", 0) + shm.close() + except Exception as e: + logger.warning(f"Failed to reset sync buffer {shm_name}: {e}") + + def copy_tensor_to_workspace(self, tensor: torch.Tensor, offset: int) -> None: + """Copy tensor data to workspace at given offset. 
+ + Args: + tensor: Source tensor (must be on same CUDA device) + offset: Byte offset in workspace + """ + if not tensor.is_cuda or tensor.device.index != self.device: + raise ValueError(f"Tensor must be on cuda:{self.device}") + + size_bytes = tensor.numel() * tensor.element_size() + + # Get view of workspace at offset + offset_elements = offset // 4 # Workspace is float32 + num_elements = (size_bytes + 3) // 4 # Round up + + workspace_view = self.workspace[ + offset_elements : offset_elements + num_elements + ] + + # Copy tensor data (flatten and cast to float32 view) + tensor_flat = tensor.flatten().view(torch.uint8) + workspace_flat = workspace_view.view(torch.uint8)[: tensor_flat.numel()] + workspace_flat.copy_(tensor_flat) + + logger.debug(f"Copied tensor {tensor.shape} to workspace offset {offset}") + + def get_ipc_handle(self) -> bytes: + """Get CUDA IPC handle for the workspace buffer. + + Returns: + CUDA IPC handle as bytes + """ + # Get IPC handle using torch.cuda API + # Note: This requires CUDA-capable device with IPC support + handle = cuda.cudart().cudaIpcGetMemHandle(self.workspace.data_ptr()) + return bytes(handle) + + def cleanup(self) -> None: + """Cleanup all sync buffers.""" + all_shm_names = set() + for chunk in self.chunks: + if chunk.sync_shm_name: + all_shm_names.add(chunk.sync_shm_name) + all_shm_names.update(self.sync_buffer_pool) + + for shm_name in all_shm_names: + try: + shm = SharedMemory(name=shm_name, create=False) + shm.close() + shm.unlink() + except FileNotFoundError: + pass + except Exception as e: + logger.warning(f"Failed to cleanup sync buffer {shm_name}: {e}") + + logger.info("WorkspaceBuffer cleaned up") + + +@dataclass +class TensorMetadata: + """Metadata for reconstructing a tensor from CUDA IPC handle.""" + + shape: Tuple[int, ...] 
+ dtype: torch.dtype + offset: int # Byte offset in workspace + size_bytes: int + sync_shm_name: str # Shared memory name for sync flag + + +class CudaIPCTransport: + """Transport for sharing CUDA tensors via IPC handles.""" + + def __init__( + self, + workspace_size_gb: float = 4.0, + device: int = 0, + ): + """Initialize CUDA IPC transport. + + Args: + workspace_size_gb: Size of workspace buffer in GB + device: CUDA device ID + """ + self.device = device + self.workspace = WorkspaceBuffer(workspace_size_gb, device) + self.ipc_handle = self.workspace.get_ipc_handle() + self.queue: Queue = Queue() + + def send_tensor(self, rid: str, tensor: torch.Tensor) -> bool: + """Send a tensor via CUDA IPC. + + Args: + rid: Request ID + tensor: Tensor to send (must be on CUDA) + + Returns: + True if sent via CUDA IPC, False if fallback needed + """ + if not tensor.is_cuda: + logger.debug(f"Tensor for {rid} not on CUDA, skipping IPC") + return False + + size_bytes = tensor.numel() * tensor.element_size() + + # Try to allocate from workspace + result = self.workspace.allocate(size_bytes) + if result is None: + logger.warning( + f"WorkspaceBuffer full, falling back to shared queue for {rid}" + ) + return False + + offset, sync_shm_name = result + + # Copy tensor to workspace + self.workspace.copy_tensor_to_workspace(tensor, offset) + + # Create metadata + metadata = TensorMetadata( + shape=tuple(tensor.shape), + dtype=tensor.dtype, + offset=offset, + size_bytes=size_bytes, + sync_shm_name=sync_shm_name, + ) + + # Send metadata through queue + self.queue.put((rid, metadata, self.ipc_handle)) + + logger.debug(f"Sent tensor {tensor.shape} for {rid} via CUDA IPC") + return True + + def receive_tensor( + self, timeout: float = 0.0001 + ) -> Optional[Tuple[str, torch.Tensor]]: + """Receive a tensor via CUDA IPC. 
+ + Args: + timeout: Timeout for queue.get + + Returns: + Tuple of (rid, tensor) or None if queue empty + """ + try: + rid, metadata, ipc_handle = self.queue.get(timeout=timeout) + except Exception: + return None + + # Open IPC memory handle + # Note: This creates a tensor view into the remote process's workspace + with torch.cuda.device(self.device): + # Reconstruct tensor from IPC handle + # This is a view into remote memory, we need to copy it locally + + # For now, use a simpler approach: signal to copy later + # In production, you'd use cuda.cudart().cudaIpcOpenMemHandle + + logger.warning( + "CUDA IPC receive not fully implemented - requires cudaIpcOpenMemHandle" + ) + # TODO: Implement actual IPC handle opening + + # Create local tensor and signal copy completion + tensor = torch.empty( + metadata.shape, dtype=metadata.dtype, device=f"cuda:{self.device}" + ) + + # Mark chunk as ready for reuse by setting sync flag + self._mark_chunk_reusable(metadata.sync_shm_name) + + return rid, tensor + + def _mark_chunk_reusable(self, sync_shm_name: str) -> None: + """Mark a chunk as reusable by setting sync flag to 1.""" + try: + shm = SharedMemory(name=sync_shm_name, create=False) + shm.buf[:8] = struct.pack("Q", 1) # Set to 1 (ready for reuse) + shm.close() + logger.debug(f"Marked chunk reusable: {sync_shm_name}") + except Exception as e: + logger.error(f"Failed to mark chunk reusable {sync_shm_name}: {e}") + + def cleanup(self) -> None: + """Cleanup resources.""" + self.workspace.cleanup() + self.queue.close() diff --git a/pymllm/orchestrator/model_runner_process.py b/pymllm/orchestrator/model_runner_process.py index 4b28645e..b60966dd 100644 --- a/pymllm/orchestrator/model_runner_process.py +++ b/pymllm/orchestrator/model_runner_process.py @@ -7,7 +7,7 @@ import logging from multiprocessing.connection import Connection -from typing import Any, Dict, Optional +from typing import Any, Dict, List, Optional import zmq @@ -41,10 +41,16 @@ def __init__( def 
init_sockets(self) -> None: self._zmq_ctx = zmq.Context() self._recv_from_scheduler = create_zmq_socket( - self._zmq_ctx, zmq.PULL, self._recv_from_scheduler_addr, bind=False, + self._zmq_ctx, + zmq.PULL, + self._recv_from_scheduler_addr, + bind=False, ) self._send_to_scheduler = create_zmq_socket( - self._zmq_ctx, zmq.PUSH, self._send_to_scheduler_addr, bind=False, + self._zmq_ctx, + zmq.PUSH, + self._send_to_scheduler_addr, + bind=False, ) def event_loop(self) -> None: @@ -62,18 +68,41 @@ def event_loop(self) -> None: def _forward_batch(self, batch: Dict[str, Any]) -> Dict[str, Any]: """Run the model forward pass and sampling for *batch*. + *batch* is a dict produced by ``SchedulerProcess.get_next_batch_to_run`` + whose ``"requests"`` list contains + :class:`~pymllm.engine.io_struct.TokenizedGenerateReqInput` objects. + + Returns a dict ``{"batch_id": ..., "finished": [...], "unfinished": [...]}`` + where each element of *finished* / *unfinished* is a plain output dict + containing at least ``"rid"`` and ``"output_token_ids"``. + TODO: implement real forward pass, logits processing, and sampling. """ requests = batch.get("requests", []) - finished = [] - unfinished = [] + finished: List[Dict[str, Any]] = [] + unfinished: List[Dict[str, Any]] = [] for req in requests: - # TODO: actual model forward, logits -> next_token_ids - next_token_ids = [] # placeholder - req["output_token_ids"] = req.get("output_token_ids", []) + next_token_ids - # TODO: check EOS / max_tokens to decide finished vs. unfinished - finished.append(req) + # Support both TokenizedGenerateReqInput dataclass (normal path) and + # legacy plain dicts (defensive). 
+ rid: str = req.rid if hasattr(req, "rid") else req.get("rid") + input_ids: List[int] = ( + req.input_ids if hasattr(req, "input_ids") else req.get("input_ids", []) + ) + mm_inputs: Optional[Dict[str, Any]] = ( + req.mm_inputs if hasattr(req, "mm_inputs") else req.get("mm_inputs") + ) + + # TODO: actual model forward; pass input_ids and mm_inputs to the model. + next_token_ids: List[int] = [] # placeholder + + output: Dict[str, Any] = { + "rid": rid, + "output_token_ids": next_token_ids, + "finished": True, + } + # TODO: check EOS / max_tokens to decide finished vs. unfinished. + finished.append(output) return { "batch_id": batch.get("batch_id"), diff --git a/pymllm/orchestrator/request_response_process.py b/pymllm/orchestrator/request_response_process.py index 74335428..fa9d92ec 100644 --- a/pymllm/orchestrator/request_response_process.py +++ b/pymllm/orchestrator/request_response_process.py @@ -14,7 +14,7 @@ import asyncio import dataclasses import logging -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Optional, Union import zmq import zmq.asyncio @@ -82,19 +82,39 @@ def start(self, loop: asyncio.AbstractEventLoop) -> None: ) self._loop_task = loop.create_task(self._run()) - async def add_request(self, request: GenerateReqInput) -> ReqState: - """Enqueue a request and return its :class:`ReqState`. + async def add_request( + self, request: GenerateReqInput + ) -> Union[ReqState, List[ReqState]]: + """Enqueue request(s) and return the corresponding :class:`ReqState`(s). + + * **Single request** (``request.is_single is True``): behaves exactly as + before – registers one ``ReqState`` and enqueues one message. + * **Batch request** (``request.is_single is False``): splits the batch + into *N* individual sub-requests, registers a ``ReqState`` per rid, and + enqueues each sub-request separately so the downstream pipeline sees + independent messages. Returns a ``List[ReqState]`` in the same order + as the input rids. 
Callers should ``await state.event.wait()`` in a loop, consuming ``state.out_list`` entries until ``state.finished`` is ``True``. """ - if not isinstance(request.rid, str): - raise ValueError("RequestResponseProcess currently accepts single requests only.") - rid = request.rid - state = ReqState() - self._rid_to_state[rid] = state - await self._request_queue.put(request.to_request_dict()) - return state + if request.is_single: + rid = request.rid if isinstance(request.rid, str) else request.rid[0] + state = ReqState() + self._rid_to_state[rid] = state + await self._request_queue.put(request.to_request_dict()) + return state + + # Batch path: fan-out into individual sub-requests. + states: List[ReqState] = [] + for i in range(request.batch_size): + sub = request[i] + rid = sub.rid if isinstance(sub.rid, str) else str(sub.rid) + state = ReqState() + self._rid_to_state[rid] = state + await self._request_queue.put(sub.to_request_dict()) + states.append(state) + return states def remove_state(self, rid: str) -> None: """Remove the ``ReqState`` for *rid* (called by the caller once done).""" diff --git a/pymllm/orchestrator/scheduler_process.py b/pymllm/orchestrator/scheduler_process.py index e7394dab..64ea55b0 100644 --- a/pymllm/orchestrator/scheduler_process.py +++ b/pymllm/orchestrator/scheduler_process.py @@ -5,6 +5,10 @@ batches, dispatches batches to the ModelRunnerProcess for forward passes, collects results, and streams finished token IDs to the DetokenizerProcess. +Supports two modes: + 1. Legacy ZMQ path: Receive TokenizedGenerateReqInput via ZMQ recv_pyobj + 2. 
Shared queue fast path: Read rid from shared queue and metadata from shared memory + The main ``event_loop`` scheduler flow:: while True: @@ -18,6 +22,7 @@ """ import logging +import queue as stdlib_queue import time from collections import deque from multiprocessing.connection import Connection @@ -25,7 +30,9 @@ import zmq +from pymllm.engine.io_struct import TokenizedGenerateReqInput from pymllm.orchestrator.ipc_utils import create_zmq_socket +from pymllm.orchestrator.shared_memory_queue import SharedMemoryManager, TensorQueue logger = logging.getLogger(__name__) @@ -39,6 +46,8 @@ def __init__( send_to_model_runner_addr: str, recv_from_model_runner_addr: str, send_to_detokenizer_addr: str, + shared_queue: Optional[TensorQueue] = None, + enable_shared_queue: bool = False, ): # ZMQ addresses self._recv_from_tokenizer_addr = recv_from_tokenizer_addr @@ -46,6 +55,10 @@ def __init__( self._recv_from_model_runner_addr = recv_from_model_runner_addr self._send_to_detokenizer_addr = send_to_detokenizer_addr + # Shared queue configuration + self._shared_queue = shared_queue + self._enable_shared_queue = enable_shared_queue + # ZMQ runtime objects (initialised in init_sockets) self._zmq_ctx: Optional[zmq.Context] = None self._recv_from_tokenizer: Optional[zmq.Socket] = None @@ -55,7 +68,7 @@ def __init__( self._poller: Optional[zmq.Poller] = None # Request management - self._waiting_queue: Deque[Dict[str, Any]] = deque() + self._waiting_queue: Deque[TokenizedGenerateReqInput] = deque() self._running_batch: Optional[Dict[str, Any]] = None self._finished: List[Dict[str, Any]] = [] @@ -97,7 +110,10 @@ def init_sockets(self) -> None: def event_loop(self) -> None: """Infinite scheduling loop.""" - logger.info("SchedulerProcess event loop started") + logger.info( + "SchedulerProcess event loop started (shared_queue=%s)", + self._enable_shared_queue, + ) while True: self.recv_requests() self.process_input_requests() @@ -114,15 +130,80 @@ def event_loop(self) -> None: def 
recv_requests(self) -> None: """Non-blocking receive of tokenized requests from TokenizerProcess. - Uses ``zmq.Poller`` with a short timeout so the scheduler is never - stuck waiting when there are batches to run. + Supports two modes: + 1. Legacy ZMQ: Uses ``zmq.Poller`` with a short timeout + 2. Shared queue: Non-blocking get from multiprocessing.Queue + + Messages are either: + * A :class:`~pymllm.engine.io_struct.TokenizedGenerateReqInput` + dataclass – appended to ``_waiting_queue``. + * A plain abort sentinel dict ``{"rid": ..., "abort": True}`` – handled + inline by removing the matching rid from the waiting queue. """ + if self._enable_shared_queue and self._shared_queue is not None: + self._recv_from_shared_queue() + else: + self._recv_from_zmq() + + def _recv_from_zmq(self) -> None: + """Receive requests via legacy ZMQ path.""" while True: events = dict(self._poller.poll(timeout=0)) # non-blocking if self._recv_from_tokenizer not in events: break - req = self._recv_from_tokenizer.recv_pyobj(zmq.NOBLOCK) - self._waiting_queue.append(req) + msg = self._recv_from_tokenizer.recv_pyobj(zmq.NOBLOCK) + # Abort sentinel: plain dict with "abort" key. 
+ if isinstance(msg, dict) and msg.get("abort"): + rid = msg.get("rid") + logger.debug("Scheduler received abort for rid=%s", rid) + self._waiting_queue = type(self._waiting_queue)( + r for r in self._waiting_queue if r.rid != rid + ) + else: + self._waiting_queue.append(msg) + + def _recv_from_shared_queue(self) -> None: + """Receive requests via shared memory + shared queue fast path.""" + while True: + try: + # Non-blocking get from shared queue + rid, shm_name, mm_inputs = self._shared_queue.get(timeout=0.0001) + + # Read metadata from shared memory (and unlink immediately) + metadata: TokenizedGenerateReqInput = SharedMemoryManager.read_metadata( + shm_name, unlink=True + ) + + # Reconstruct the full TokenizedGenerateReqInput with mm_inputs + full_request = TokenizedGenerateReqInput( + rid=metadata.rid, + input_text=metadata.input_text, + input_ids=metadata.input_ids, + mm_inputs=mm_inputs, # Restored from shared queue + sampling_params=metadata.sampling_params, + stream=metadata.stream, + return_logprob=metadata.return_logprob, + logprob_start_len=metadata.logprob_start_len, + top_logprobs_num=metadata.top_logprobs_num, + lora_path=metadata.lora_path, + session_params=metadata.session_params, + ) + + self._waiting_queue.append(full_request) + logger.debug(f"Received request {rid} from shared queue") + + except stdlib_queue.Empty: + # No more requests available + break + except Exception as e: + logger.error(f"Error receiving from shared queue: {e}", exc_info=True) + # Try to cleanup shared memory if possible + try: + if "shm_name" in locals(): + SharedMemoryManager.cleanup(shm_name) + except: + pass + break # ------------------------------------------------------------------ # Step 2: process input requests @@ -227,6 +308,8 @@ def run_scheduler_process( recv_from_model_runner_addr: str, send_to_detokenizer_addr: str, pipe_writer: Connection, + shared_queue: Optional[TensorQueue] = None, + enable_shared_queue: bool = False, ) -> None: """Entry point for 
``torch.multiprocessing.Process(target=...)``.""" proc = SchedulerProcess( @@ -234,6 +317,8 @@ def run_scheduler_process( send_to_model_runner_addr, recv_from_model_runner_addr, send_to_detokenizer_addr, + shared_queue=shared_queue, + enable_shared_queue=enable_shared_queue, ) proc.init_sockets() diff --git a/pymllm/orchestrator/shared_memory_queue.py b/pymllm/orchestrator/shared_memory_queue.py new file mode 100644 index 00000000..3d26ebf1 --- /dev/null +++ b/pymllm/orchestrator/shared_memory_queue.py @@ -0,0 +1,190 @@ +""" +Shared memory and queue utilities for fast IPC between tokenizer and scheduler. + +This module implements shared-queue fast path to avoid expensive +ZMQ serialization of large multimodal tensors. + +Design: + - Metadata lane: Small tokenized objects stored in shared memory keyed by rid + - Tensor lane: Large tensors made shareable via share_memory_() and passed by handle +""" + +import logging +import pickle +import uuid +from multiprocessing import Queue +from multiprocessing.shared_memory import SharedMemory +from typing import Any, Dict, Optional + +import torch + +logger = logging.getLogger(__name__) + + +class SharedMemoryManager: + """Manages shared memory segments for passing metadata between processes. + + Each tokenized request's metadata is written to a unique shared memory segment + keyed by its request ID (rid). The scheduler reads and immediately unlinks the + segment to prevent memory leaks. + """ + + @staticmethod + def write_metadata(rid: str, metadata: Any) -> str: + """Write metadata to shared memory and return the segment name. 
+ + Args: + rid: Request ID (used as part of the shared memory name) + metadata: Serializable metadata object + + Returns: + str: The shared memory segment name + """ + # Serialize the metadata + data = pickle.dumps(metadata) + size = len(data) + + # Create unique shared memory segment name + shm_name = f"pymllm_meta_{rid}_{uuid.uuid4().hex[:8]}" + + try: + # Create shared memory segment + shm = SharedMemory(name=shm_name, create=True, size=size) + # Write data + shm.buf[:size] = data + shm.close() + logger.debug(f"Wrote {size} bytes to shared memory {shm_name}") + return shm_name + except Exception as e: + logger.error(f"Failed to write metadata to shared memory: {e}") + raise + + @staticmethod + def read_metadata(shm_name: str, unlink: bool = True) -> Any: + """Read metadata from shared memory and optionally unlink it. + + Args: + shm_name: The shared memory segment name + unlink: If True, immediately unlink the segment after reading + + Returns: + The deserialized metadata object + """ + try: + # Open existing shared memory segment + shm = SharedMemory(name=shm_name, create=False) + # Read and deserialize data + data = bytes(shm.buf[:]) + metadata = pickle.loads(data) + shm.close() + + # Unlink to free memory immediately + if unlink: + try: + shm.unlink() + logger.debug(f"Read and unlinked shared memory {shm_name}") + except FileNotFoundError: + # Already unlinked, ignore + pass + + return metadata + except Exception as e: + logger.error(f"Failed to read metadata from shared memory {shm_name}: {e}") + raise + + @staticmethod + def cleanup(shm_name: str) -> None: + """Manually cleanup a shared memory segment (for error recovery).""" + try: + shm = SharedMemory(name=shm_name, create=False) + shm.close() + shm.unlink() + logger.debug(f"Cleaned up shared memory {shm_name}") + except FileNotFoundError: + pass # Already cleaned up + except Exception as e: + logger.warning(f"Failed to cleanup shared memory {shm_name}: {e}") + + +class TensorQueue: + """Queue for 
passing large tensors between processes using shared memory. + + Tensors are made shareable via .share_memory_() and passed through a + multiprocessing.Queue by handle (metadata only, not the actual data). + """ + + def __init__(self, maxsize: int = 0): + """Initialize the tensor queue. + + Args: + maxsize: Maximum queue size (0 for unlimited) + """ + self._queue: Queue = Queue(maxsize=maxsize) + + def put(self, rid: str, shm_name: str, mm_inputs: Optional[Dict[str, Any]]) -> None: + """Put a request with multimodal inputs into the queue. + + Args: + rid: Request ID + shm_name: Shared memory segment name for metadata + mm_inputs: Multimodal inputs dict (can contain torch tensors) + """ + # Make tensors shareable if present + if mm_inputs is not None: + mm_inputs = self._make_tensors_shareable(mm_inputs) + + self._queue.put((rid, shm_name, mm_inputs)) + logger.debug(f"Put request {rid} into tensor queue (shm={shm_name})") + + def get( + self, timeout: Optional[float] = None + ) -> tuple[str, str, Optional[Dict[str, Any]]]: + """Get a request from the queue. + + Args: + timeout: Timeout in seconds (None for blocking indefinitely) + + Returns: + Tuple of (rid, shm_name, mm_inputs) + """ + rid, shm_name, mm_inputs = self._queue.get(timeout=timeout) + logger.debug(f"Got request {rid} from tensor queue (shm={shm_name})") + return rid, shm_name, mm_inputs + + def empty(self) -> bool: + """Check if the queue is empty.""" + return self._queue.empty() + + def qsize(self) -> int: + """Return the approximate size of the queue.""" + try: + return self._queue.qsize() + except NotImplementedError: + return 0 # Some platforms don't support qsize + + def close(self) -> None: + """Close the queue.""" + self._queue.close() + + @staticmethod + def _make_tensors_shareable(data: Any) -> Any: + """Recursively make all torch tensors in a data structure shareable. 
+ + Args: + data: Nested dict/list/tensor structure + + Returns: + The same structure with tensors made shareable via share_memory_() + """ + if isinstance(data, torch.Tensor): + # Make tensor shareable across processes + if not data.is_shared(): + data = data.share_memory_() + return data + elif isinstance(data, dict): + return {k: TensorQueue._make_tensors_shareable(v) for k, v in data.items()} + elif isinstance(data, (list, tuple)): + result = [TensorQueue._make_tensors_shareable(item) for item in data] + return type(data)(result) + else: + return data diff --git a/pymllm/orchestrator/tokenizer_process.py b/pymllm/orchestrator/tokenizer_process.py index 53714bb6..43db5ba0 100644 --- a/pymllm/orchestrator/tokenizer_process.py +++ b/pymllm/orchestrator/tokenizer_process.py @@ -3,15 +3,22 @@ Receives raw requests from RequestResponseProcess via ZMQ, tokenizes them, and forwards the tokenized payloads to the SchedulerProcess. + +Supports two modes: + 1. Legacy ZMQ path: Send TokenizedGenerateReqInput via ZMQ send_pyobj + 2. Shared queue fast path: Write metadata to shared memory and put rid in shared queue """ import logging from multiprocessing.connection import Connection -from typing import Any, Dict, List +from typing import Any, Dict, List, Optional, Union import zmq +from transformers import AutoProcessor, AutoTokenizer +from pymllm.engine.io_struct import TokenizedGenerateReqInput from pymllm.orchestrator.ipc_utils import create_zmq_socket +from pymllm.orchestrator.shared_memory_queue import SharedMemoryManager, TensorQueue logger = logging.getLogger(__name__) @@ -23,16 +30,42 @@ def __init__( self, recv_from_rr_addr: str, send_to_scheduler_addr: str, + tokenizer_cfg: Dict[str, Any], + shared_queue: Optional[TensorQueue] = None, ): + """ + Parameters + ---------- + tokenizer_cfg: + Serialisable dict built by the parent process (``Engine``) before + spawning. Required keys: + + * ``tokenizer_path`` – str, path to the tokenizer directory. 
+ * ``tokenizer_mode`` – ``"auto" | "slow" | "fast"``. + * ``trust_remote_code`` – bool. + * ``context_length`` – Optional[int], explicit cap; inferred from + ``hf_config`` when ``None``. + * ``hf_config`` – Optional HuggingFace PretrainedConfig + (pickled by multiprocessing); used only to infer ``context_length``. + * ``enable_shared_queue`` – bool, whether to use shared memory fast path. + shared_queue: + Optional TensorQueue for shared memory fast path communication. + """ self._recv_from_rr_addr = recv_from_rr_addr self._send_to_scheduler_addr = send_to_scheduler_addr + self._tokenizer_cfg = tokenizer_cfg + self._enable_shared_queue = tokenizer_cfg.get("enable_shared_queue", False) + self._shared_queue = shared_queue - self._zmq_ctx: zmq.Context = None - self._recv_from_rr: zmq.Socket = None - self._send_to_scheduler: zmq.Socket = None + self._zmq_ctx: Optional[zmq.Context] = None + self._recv_from_rr: Optional[zmq.Socket] = None + self._send_to_scheduler: Optional[zmq.Socket] = None - # TODO: initialise the actual tokenizer (HuggingFace / custom) self._tokenizer = None + self._mm_processor = None + self._context_length: Optional[int] = None + + self._init_tokenizers() # ------------------------------------------------------------------ # Lifecycle @@ -55,29 +88,269 @@ def init_sockets(self) -> None: def event_loop(self) -> None: """Infinite loop: recv raw request -> tokenize -> send to scheduler.""" - logger.info("TokenizerProcess event loop started") + logger.info( + "TokenizerProcess event loop started (shared_queue=%s)", + self._enable_shared_queue, + ) while True: raw_request: Dict[str, Any] = self._recv_from_rr.recv_pyobj() tokenized = self._tokenize(raw_request) + + if self._enable_shared_queue and self._shared_queue is not None: + # Shared queue fast path + self._send_via_shared_queue(tokenized) + else: + # Legacy ZMQ path + self._send_to_scheduler.send_pyobj(tokenized) + + def _send_via_shared_queue( + self, tokenized: Union[TokenizedGenerateReqInput, 
Dict[str, Any]] + ) -> None: + """Send tokenized request via shared memory + shared queue fast path. + + Args: + tokenized: Either TokenizedGenerateReqInput dataclass or abort dict + """ + # Handle abort sentinel + if isinstance(tokenized, dict) and tokenized.get("abort"): + # Fallback to ZMQ for abort messages self._send_to_scheduler.send_pyobj(tokenized) + return + + assert isinstance(tokenized, TokenizedGenerateReqInput), ( + f"Expected TokenizedGenerateReqInput, got {type(tokenized)}" + ) + + rid = tokenized.rid + mm_inputs = tokenized.mm_inputs + + # Create a lightweight metadata object (without mm_inputs) + metadata = TokenizedGenerateReqInput( + rid=tokenized.rid, + input_text=tokenized.input_text, + input_ids=tokenized.input_ids, + mm_inputs=None, # Will be passed separately via shared queue + sampling_params=tokenized.sampling_params, + stream=tokenized.stream, + return_logprob=tokenized.return_logprob, + logprob_start_len=tokenized.logprob_start_len, + top_logprobs_num=tokenized.top_logprobs_num, + lora_path=tokenized.lora_path, + session_params=tokenized.session_params, + ) + + # Write metadata to shared memory + shm_name = SharedMemoryManager.write_metadata(rid, metadata) + + # Put (rid, shm_name, mm_inputs) into shared queue + self._shared_queue.put(rid, shm_name, mm_inputs) + + logger.debug(f"Sent request {rid} via shared queue (shm={shm_name})") # ------------------------------------------------------------------ - # Tokenization (placeholder) + # Tokenization and multimodal preprocessing # ------------------------------------------------------------------ - def _tokenize(self, raw_request: Dict[str, Any]) -> Dict[str, Any]: - """Tokenize a single raw request and return the tokenized payload. + def _init_tokenizers(self) -> None: + """Initialise text tokenizer and (optionally) multimodal processor. - TODO: replace with real tokenizer call. 
+ All configuration is read from ``self._tokenizer_cfg`` which was + serialised by the parent process before ``spawn``. No global config + access happens inside the subprocess. """ - text = raw_request.get("text", "") - # placeholder: produce fake token ids - input_ids: List[int] = [] # TODO: self._tokenizer.encode(text) - return { - **raw_request, - "input_ids": input_ids, + cfg = self._tokenizer_cfg + tokenizer_path: str = cfg["tokenizer_path"] + tokenizer_mode: str = cfg.get("tokenizer_mode", "auto") + trust_remote_code: bool = bool(cfg.get("trust_remote_code", False)) + + tokenizer_kwargs: Dict[str, Any] = { + "use_fast": tokenizer_mode != "slow", + "trust_remote_code": trust_remote_code, } + self._tokenizer = AutoTokenizer.from_pretrained( + tokenizer_path, + **tokenizer_kwargs, + ) + + # Default to left padding for generation. + try: + self._tokenizer.padding_side = "left" + except Exception: + pass + + # Context length: explicit config value takes priority; fall back to + # common HF config field names. + context_len: Optional[int] = cfg.get("context_length") + if context_len is None: + hf_cfg = cfg.get("hf_config") + for name in ("max_position_embeddings", "max_sequence_length", "seq_len"): + if hf_cfg is not None and hasattr(hf_cfg, name): + context_len = int(getattr(hf_cfg, name)) + break + self._context_length = context_len + + # Try to load multimodal processor (optional). + try: + self._mm_processor = AutoProcessor.from_pretrained( + tokenizer_path, + trust_remote_code=trust_remote_code, + ) + except Exception: + # Text-only models don't provide a processor; that's fine. + self._mm_processor = None + + def _tokenize( + self, raw_request: Dict[str, Any] + ) -> Union[TokenizedGenerateReqInput, Dict[str, Any]]: + """Tokenize one raw request dict and return a typed object. + + * **Abort** messages (``{"rid": ..., "abort": True}``) are returned as + plain dicts so the scheduler can intercept them without importing the + io_struct. 
+ * Normal requests are returned as a :class:`TokenizedGenerateReqInput` + dataclass instance that carries ``input_ids``, ``mm_inputs``, and all + sampling meta-data in typed fields. + + Each message arriving here corresponds to exactly one sub-request + because batch splitting happens upstream in ``RequestResponseProcess``. + """ + # Abort: propagate as a plain sentinel dict. + if raw_request.get("abort"): + return {"rid": raw_request.get("rid"), "abort": True} + + # ------------------------------------------------------------------ # + # 1. Text tokenization + # ------------------------------------------------------------------ # + if raw_request.get("input_ids") is not None: + # Caller already tokenized – skip text processing. + input_ids: List[int] = list(raw_request["input_ids"]) + raw_text = raw_request.get("text") + input_text: str = ( + str(raw_text[0]) if isinstance(raw_text, list) else str(raw_text or "") + ) + else: + text = raw_request.get("text") + if text is None: + raise ValueError( + "TokenizerProcess expects either `text` or `input_ids`." + ) + # Accept a list for robustness; take the first element. + input_text = str(text[0]) if isinstance(text, list) else str(text) + + encode_kwargs: Dict[str, Any] = { + "add_special_tokens": True, + "return_attention_mask": False, + } + if self._context_length is not None: + encode_kwargs.update( + {"truncation": True, "max_length": self._context_length} + ) + + encoding = self._tokenizer(input_text, **encode_kwargs) + input_ids = encoding["input_ids"] + + # ------------------------------------------------------------------ # + # 2. Multimodal pre-processing + # ------------------------------------------------------------------ # + mm_inputs = self._collect_mm_inputs(raw_request, text=input_text) + + # ------------------------------------------------------------------ # + # 3. 
Pack into the typed dataclass + # ------------------------------------------------------------------ # + return TokenizedGenerateReqInput( + rid=raw_request.get("rid"), + input_text=input_text, + input_ids=input_ids, + mm_inputs=mm_inputs, + sampling_params=raw_request.get("sampling_params") or {}, + stream=bool(raw_request.get("stream", False)), + return_logprob=bool(raw_request.get("return_logprob", False)), + logprob_start_len=int(raw_request.get("logprob_start_len", -1)), + top_logprobs_num=int(raw_request.get("top_logprobs_num", 0)), + lora_path=raw_request.get("lora_path"), + session_params=raw_request.get("session_params"), + ) + + def _normalize_image_input(self, image_data: Any) -> List[Any]: + """Normalise ``image_data`` into a list of image-like objects. + + Supported input forms: + - single PIL.Image / numpy array / torch.Tensor + - path string or bytes + - list/tuple of the above + """ + + def _to_image(obj: Any) -> Any: + # Lazily import Pillow to avoid hard dependency for text-only models. + try: + from PIL import Image # type: ignore + except Exception as exc: # pragma: no cover - optional dependency + raise RuntimeError( + "Pillow is required for image preprocessing in TokenizerProcess" + ) from exc + + if obj is None: + return None + if isinstance(obj, Image.Image): + return obj + if isinstance(obj, (str, bytes)): + return Image.open(obj) + return obj + + if isinstance(image_data, (list, tuple)): + return [ + img for img in (_to_image(x) for x in image_data) if img is not None + ] + return [img for img in (_to_image(image_data),) if img is not None] + + def _collect_mm_inputs( + self, raw_request: Dict[str, Any], text: Optional[str] = None + ) -> Optional[Dict[str, Any]]: + """Pre-process multimodal data and return a consolidated ``mm_inputs`` dict. + + Returns ``None`` for text-only requests. 
Otherwise returns a flat dict + whose keys are ready to be unpacked by the model runner: + + * ``image_inputs`` – output of ``AutoProcessor`` (contains + ``pixel_values``, etc.) when a processor is available. + * ``image_data`` – raw image objects when no processor is available. + * ``audio_data`` – forwarded verbatim (no processor yet). + * ``video_data`` – forwarded verbatim (no processor yet). + """ + image_data = raw_request.get("image_data") + video_data = raw_request.get("video_data") + audio_data = raw_request.get("audio_data") + + if not any(x is not None for x in (image_data, video_data, audio_data)): + return None # text-only request + + mm: Dict[str, Any] = {} + + # Image: prefer AutoProcessor output; fall back to raw data. + if image_data is not None: + if self._mm_processor is not None: + images = self._normalize_image_input(image_data) + try: + processor_inputs = self._mm_processor( + images=images, + text=text if text is not None else raw_request.get("text"), + return_tensors="pt", + ) + mm["image_inputs"] = processor_inputs + except Exception: + mm["image_data"] = image_data + else: + mm["image_data"] = image_data + + # Audio / video forwarded verbatim for now. 
+ if audio_data is not None: + mm["audio_data"] = audio_data + if video_data is not None: + mm["video_data"] = video_data + + return mm + def shutdown(self) -> None: if self._recv_from_rr is not None: self._recv_from_rr.close() @@ -91,9 +364,13 @@ def run_tokenizer_process( recv_from_rr_addr: str, send_to_scheduler_addr: str, pipe_writer: Connection, + tokenizer_cfg: Dict[str, Any], + shared_queue: Optional[TensorQueue] = None, ) -> None: """Entry point for ``torch.multiprocessing.Process(target=...)``.""" - proc = TokenizerProcess(recv_from_rr_addr, send_to_scheduler_addr) + proc = TokenizerProcess( + recv_from_rr_addr, send_to_scheduler_addr, tokenizer_cfg, shared_queue + ) proc.init_sockets() # Signal readiness to the parent process diff --git a/pyproject.toml b/pyproject.toml index d417b579..d752ddc1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,6 +30,8 @@ dependencies=[ "typer", "torch", "torchao", + "pyfiglet", + "termcolor", ] [project.optional-dependencies] From b057360804e0b54a329faf81c73f2c84aceb1082 Mon Sep 17 00:00:00 2001 From: chenghuaWang <2923277184@qq.com> Date: Mon, 2 Mar 2026 06:45:16 +0000 Subject: [PATCH 11/13] feat(mllm-kernel): add high-performance create_kv_indices CUDA kernel and benchmark - Implement CUDA kernel to convert ReqToTokenPool mapping into flat KV index arrays - Use block-per-sequence parallelism for fully coalesced memory access - Validate tensor shapes, dtypes, and devices with TensorMatcher utilities - Provide Python JIT wrapper using mllm-kernel JIT system for easy integration - Add detailed documentation and usage guide for kernel implementation - Create benchmark script comparing kernel against naive PyTorch gather - Support optional start offsets for sliding-window decode scenarios - Ensure robust out-of-bounds checks to prevent segmentation faults - Establish testing and benchmarking patterns for future kernel development --- .claude/skills/impl-jit-kernel/SKILL.md | 486 +++++++++ 
.../benchmarks/bench_create_kv_indices.py | 218 ++++ .../cuda/csrc/create_kv_indices.cuh | 282 +++++ .../mllm_kernel/cuda/csrc/vocab_embedding.cuh | 0 .../mllm_kernel/cuda/jit/create_kv_indices.py | 118 +++ mllm-kernel/pyproject.toml | 2 +- mllm-kernel/tests/test_create_kv_indices.py | 191 ++++ pymllm/configs/server_config.py | 45 + pymllm/engine/__init__.py | 8 + pymllm/engine/forward_batch.py | 182 ++++ pymllm/engine/launch.py | 32 +- pymllm/layers/attention/__init__.py | 25 + pymllm/layers/attention/attention_backend.py | 143 +++ pymllm/layers/attention/flashinfer_backend.py | 964 ++++++++++++++++++ pymllm/layers/attention/radix_attention.py | 171 ++++ pymllm/layers/sampling.py | 0 pymllm/mem_cache/memory_pool.py | 16 +- pymllm/orchestrator/cuda_ipc_transport.py | 859 ++++++++++------ pymllm/orchestrator/scheduler_process.py | 50 +- pymllm/orchestrator/shared_memory_queue.py | 226 ++-- pymllm/orchestrator/tokenizer_process.py | 153 ++- 21 files changed, 3773 insertions(+), 398 deletions(-) create mode 100644 .claude/skills/impl-jit-kernel/SKILL.md create mode 100644 mllm-kernel/benchmarks/bench_create_kv_indices.py create mode 100644 mllm-kernel/mllm_kernel/cuda/csrc/create_kv_indices.cuh create mode 100644 mllm-kernel/mllm_kernel/cuda/csrc/vocab_embedding.cuh create mode 100644 mllm-kernel/mllm_kernel/cuda/jit/create_kv_indices.py create mode 100644 mllm-kernel/tests/test_create_kv_indices.py create mode 100644 pymllm/layers/sampling.py diff --git a/.claude/skills/impl-jit-kernel/SKILL.md b/.claude/skills/impl-jit-kernel/SKILL.md new file mode 100644 index 00000000..39cc02b6 --- /dev/null +++ b/.claude/skills/impl-jit-kernel/SKILL.md @@ -0,0 +1,486 @@ +--- +name: impl-jit-kernel +description: Guide for implementing CUDA or CPU JIT kernels in mllm-kernel. Use when the user asks to create, add, or implement a new kernel in mllm-kernel. 
+--- + +# Implementing a JIT Kernel in mllm-kernel + +## Overview + +mllm-kernel uses a JIT (Just-In-Time) compilation system built on `tvm_ffi`. Kernels are written in C++20 (`.cuh` for CUDA, `.cpp` for CPU), validated at runtime via `TensorMatcher`, and exposed to Python through a `@jit` decorator. No pre-compilation is needed -- kernels compile on first call and are cached at `~/.cache/mllm_kernel/`. + +## File Layout + +For a kernel named `my_kernel`: + +``` +mllm-kernel/ + mllm_kernel/ + cuda/ + csrc/my_kernel.cuh # CUDA kernel implementation + jit/my_kernel.py # Python JIT wrapper + jit/__init__.py # Add export here + cpu/ + csrc/my_kernel.cpp # CPU kernel implementation (Highway SIMD) + include/mllm_kernel/cpu/ + my_kernel.hpp # CPU SIMD body (NO #pragma once) + jit/my_kernel.py # Python JIT wrapper + jit/__init__.py # Add export here + tests/test_my_kernel.py # Pytest correctness tests + benchmarks/bench_my_kernel.py # Profiler benchmark vs PyTorch reference +``` + +--- + +## CUDA Kernel Walkthrough + +### Step 1: Write the `.cuh` kernel + +Create `mllm_kernel/cuda/csrc/my_kernel.cuh`: + +```cpp +#pragma once + +#include // TensorMatcher, SymbolicSize, SymbolicDevice, SymbolicDType +#include // RuntimeCheck, Panic, div_ceil +#include // LaunchKernel, fp16_t, bf16_t, PDL helpers + +#include +#include + +#include + +namespace { + +// --------------------------------------------------------------------------- +// 1. Parameter struct (trivially copyable, passed to kernel by value) +// --------------------------------------------------------------------------- +struct MyKernelParams { + const float* __restrict__ input; + float* __restrict__ output; + int32_t num_elements; +}; + +// --------------------------------------------------------------------------- +// 2. 
CUDA kernel
+// ---------------------------------------------------------------------------
+__global__ void my_kernel(const MyKernelParams params) {
+  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx >= params.num_elements) return;
+  params.output[idx] = params.input[idx] * 2.0f;
+}
+
+// ---------------------------------------------------------------------------
+// 3. Host-side launcher (entry point for TVM FFI binding)
+// ---------------------------------------------------------------------------
+struct MyKernel {
+  static void run(tvm::ffi::TensorView input, tvm::ffi::TensorView output) {
+    using namespace mllm_kernel::host;
+
+    // --- Validate tensors ---
+    SymbolicSize N{"num_elements"};
+    SymbolicDevice device;
+
+    (void)TensorMatcher({N})
+        .with_dtype<float>()
+        .with_device(device)
+        .verify(input);
+
+    (void)TensorMatcher({N})
+        .with_dtype<float>()
+        .with_device(device)
+        .verify(output);
+
+    const int64_t n = N.unwrap();
+    RuntimeCheck(n > 0, "num_elements must be positive, got ", n);
+
+    // --- Build params ---
+    MyKernelParams params{
+        .input = static_cast<const float*>(input.data_ptr()),
+        .output = static_cast<float*>(output.data_ptr()),
+        .num_elements = static_cast<int32_t>(n),
+    };
+
+    // --- Launch ---
+    constexpr int kBlock = 256;
+    const int grid = static_cast<int>(div_ceil(n, kBlock));
+    LaunchKernel(grid, kBlock, device.unwrap())(my_kernel, params);
+  }
+};
+
+}  // namespace
+```
+
+**Key rules:**
+
+- **Always wrap in `namespace {}`** (anonymous namespace).
+- **Entry point** is a `static void run(tvm::ffi::TensorView ...)` method.
+- **Validate every tensor** with `TensorMatcher` before reading `.data_ptr()`.
+- **Never dereference device pointers on host** -- `data_ptr()` returns a GPU pointer.
+- **Use `LaunchKernel`** to launch -- it handles stream resolution and error checking.
+ +### Step 2: Write the Python JIT wrapper + +Create `mllm_kernel/cuda/jit/my_kernel.py`: + +```python +"""JIT wrapper for my_kernel CUDA kernel.""" + +import torch +from mllm_kernel.jit_utils import jit + + +@jit( + args=[], + device="cuda", + cuda_files=["my_kernel.cuh"], + cpp_wrappers=[], + cuda_wrappers=[("my_kernel", "MyKernel::run")], + func_name="my_kernel", +) +def _kernel(compiled_module, input: torch.Tensor, output: torch.Tensor) -> None: + compiled_module.my_kernel(input, output) + + +def my_kernel(input: torch.Tensor) -> torch.Tensor: + """Double every element in *input*. + + Parameters + ---------- + input : torch.Tensor + 1-D float32 tensor on CUDA. + + Returns + ------- + torch.Tensor + Same shape and dtype as *input*. + """ + output = torch.empty_like(input) + _kernel(input, output) + return output +``` + +### Step 3: Export in `__init__.py` + +Edit `mllm_kernel/cuda/jit/__init__.py` and add: + +```python +from mllm_kernel.cuda.jit.my_kernel import my_kernel +``` + +### Step 4: Clear JIT cache after editing `.cuh` + +Any time you modify the `.cuh` file, delete the cached `.so`: + +```bash +rm -rf ~/.cache/mllm_kernel/cuda_my_kernel* +``` + +The next Python call will trigger recompilation automatically. + +--- + +## Template-Parameterized CUDA Kernels + +When the kernel takes compile-time constants (e.g. 
block size, dtype), use `make_cpp_args`: + +```python +from mllm_kernel.jit_utils import jit, make_cpp_args + +def _make_kernel(block_size: int, use_pdl: bool): + cpp_args = make_cpp_args(block_size, use_pdl) # -> "256, true" + + @jit( + args=[block_size, use_pdl], + device="cuda", + cuda_files=["my_kernel.cuh"], + cpp_wrappers=[], + cuda_wrappers=[("my_kernel", f"MyKernel<{cpp_args}>::run")], + func_name="my_kernel", + ) + def _kernel(compiled_module, input, output): + compiled_module.my_kernel(input, output) + return _kernel +``` + +`make_cpp_args` converts Python types to C++ literals: +- `int/float` -> string literal +- `bool` -> `"true"` / `"false"` +- `torch.dtype` -> C++ type (`torch.float32` -> `"fp32_t"`, `torch.float16` -> `"fp16_t"`, `torch.bfloat16` -> `"bf16_t"`, `torch.int32` -> `"int32_t"`, etc.) + +--- + +## CPU Kernel Walkthrough + +CPU kernels use **Google Highway** for portable SIMD. The key difference: the `.hpp` body is included **multiple times** by Highway's `foreach_target` dispatch, so it must NOT have `#pragma once`. + +### Step 1: Write the SIMD body (`.hpp`) + +Create `mllm_kernel/cpu/include/mllm_kernel/cpu/my_kernel.hpp`: + +```cpp +// NOTE: NO #pragma once -- this file is included multiple times by Highway. 
+
+#include <hwy/highway.h>
+
+HWY_BEFORE_NAMESPACE();
+namespace mllm_kernel::cpu {
+namespace HWY_NAMESPACE {
+namespace hn = hwy::HWY_NAMESPACE;
+
+template <int Constant>
+inline void my_kernel_impl(float* HWY_RESTRICT dst,
+                           const float* HWY_RESTRICT src,
+                           size_t count) {
+  const hn::ScalableTag<float> d;
+  const size_t lanes = hn::Lanes(d);
+  const auto vc = hn::Set(d, static_cast<float>(Constant));
+  size_t i = 0;
+  for (; i + lanes <= count; i += lanes) {
+    const auto v = hn::Load(d, src + i);
+    hn::Store(hn::Add(v, vc), d, dst + i);
+  }
+  for (; i < count; ++i) {
+    dst[i] = src[i] + static_cast<float>(Constant);
+  }
+}
+
+// Named entry points for HWY_EXPORT
+static HWY_NOINLINE HWY_MAYBE_UNUSED void my_kernel_1(float* d, const float* s, size_t n) {
+  my_kernel_impl<1>(d, s, n);
+}
+
+}  // namespace HWY_NAMESPACE
+}  // namespace mllm_kernel::cpu
+HWY_AFTER_NAMESPACE();
+```
+
+### Step 2: Write the `.cpp` source
+
+Create `mllm_kernel/cpu/csrc/my_kernel.cpp`:
+
+```cpp
+#include 
+#include 
+#include 
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "../csrc/my_kernel.cpp"
+#include <hwy/foreach_target.h>
+
+#include <hwy/highway.h>
+
+#if HWY_ONCE
+#include <hwy/targets.h>
+#endif
+
+namespace mllm_kernel::cpu {
+#if HWY_ONCE
+
+HWY_EXPORT(my_kernel_1);
+
+template <int Constant>
+void my_kernel(tvm::ffi::TensorView dst, tvm::ffi::TensorView src) {
+  using namespace mllm_kernel::host;
+  SymbolicSize N{"num_elements"};
+  SymbolicDevice device_;
+  (void)TensorMatcher({N})
+      .with_dtype<float>()
+      .with_device(device_)
+      .verify(dst)
+      .verify(src);
+  const size_t n = N.unwrap();
+  auto* dst_ptr = static_cast<float*>(dst.data_ptr());
+  const auto* src_ptr = static_cast<const float*>(src.data_ptr());
+  HWY_DYNAMIC_DISPATCH(my_kernel_1)(dst_ptr, src_ptr, n);
+}
+
+// Explicit instantiation
+template void my_kernel<1>(tvm::ffi::TensorView, tvm::ffi::TensorView);
+
+#endif
+}  // namespace mllm_kernel::cpu
+```
+
+### Step 3: Write the Python JIT wrapper
+
+Create `mllm_kernel/cpu/jit/my_kernel.py`:
+
+```python
+import torch
+from mllm_kernel.jit_utils import jit
+
+@jit(
+    args=1,
+    device="cpu",
+    
cpp_files=["my_kernel.cpp"], + cpp_wrappers=[("my_kernel", "mllm_kernel::cpu::my_kernel<1>")], + func_name="my_kernel", +) +def _kernel_1(compiled_module, dst, src): + compiled_module.my_kernel(dst, src) + +def my_kernel(src: torch.Tensor) -> torch.Tensor: + dst = torch.empty_like(src) + _kernel_1(dst, src) + return dst +``` + +**Key CPU differences from CUDA:** + +| Aspect | CUDA | CPU | +|--------|------|-----| +| Source file | `.cuh` in `cuda/csrc/` | `.cpp` + `.hpp` in `cpu/csrc/` and `cpu/include/` | +| Namespace | Anonymous `namespace {}` | `mllm_kernel::cpu` | +| Device check | `with_device` | `with_device` | +| Launch | `LaunchKernel(grid, block, device)(...)` | Direct function call via `HWY_DYNAMIC_DISPATCH` | +| SIMD | CUDA warps | Highway `ScalableTag` | +| Wrapper fields | `cuda_files`, `cuda_wrappers` | `cpp_files`, `cpp_wrappers` | +| Wrapper name | `"MyKernel::run"` | `"mllm_kernel::cpu::my_kernel<1>"` (fully qualified) | + +--- + +## TensorMatcher Reference + +`TensorMatcher` validates shape, dtype, device, and strides of `tvm::ffi::TensorView` arguments. 
+ +```cpp +using namespace mllm_kernel::host; + +// Symbolic dimensions -- bind on first .verify(), check consistency on subsequent calls +SymbolicSize B{"batch"}, N{"seq_len"}, D{"dim"}; +SymbolicSize Stride0{"stride0"}; +SymbolicDType dtype; +SymbolicDevice device; + +// Shape [B, N, D], contiguous, float32, on CUDA +(void)TensorMatcher({B, N, D}) + .with_dtype(dtype) + .with_device(device) + .verify(tensor_a); + +// Shape [B, N, D], same dtype and device (already bound) +(void)TensorMatcher({B, N, D}) + .with_dtype(dtype) + .with_device(device) + .verify(tensor_b); + +// Shape [B, D] with explicit strides (non-contiguous OK) +(void)TensorMatcher({B, D}) + .with_strides({Stride0, 1}) + .with_dtype() + .with_device(device) + .verify(indices); + +// Multiple acceptable dtypes +SymbolicDType flex_dtype; +(void)TensorMatcher({N}) + .with_dtype(flex_dtype) + .with_device(device) + .verify(mixed_tensor); + +// Extract bound values +int64_t batch = B.unwrap(); +int64_t dim = D.unwrap(); +DLDevice dev = device.unwrap(); +``` + +--- + +## LaunchKernel Reference + +```cpp +using namespace mllm_kernel::host; + +// Basic launch (resolves CUDA stream from DLDevice) +DLDevice dev = device.unwrap(); +LaunchKernel(grid_dim, block_dim, dev)(kernel_func, param_struct); + +// With shared memory +LaunchKernel(grid, block, dev, shared_mem_bytes)(kernel, params); + +// With PDL (Programmatic Dependent Launch, sm_90+) +LaunchKernel(grid, block, dev).enable_pdl(true)(kernel, params); +``` + +--- + +## Utility Reference (`mllm_kernel::host`) + +| Function | Description | +|----------|-------------| +| `RuntimeCheck(cond, msg...)` | Throws `PanicError` if `cond` is false | +| `Panic(msg...)` | Always throws (unreachable code) | +| `div_ceil(a, b)` | Integer ceiling division | +| `dtype_bytes(DLDataType)` | Byte size of a DLPack dtype | + +CUDA-only (`mllm_kernel::device`): + +| Symbol | Value | +|--------|-------| +| `kWarpThreads` | 32 | +| `kFullMask` | 0xffffffff | +| `fp16_t` | 
`__half` | +| `bf16_t` | `__nv_bfloat16` | + +--- + +## Testing Pattern + +Create `tests/test_my_kernel.py`: + +```python +import pytest +import torch +from mllm_kernel.cuda.jit.my_kernel import my_kernel + +@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA required") +@pytest.mark.parametrize("n", [1, 128, 1024, 65536]) +def test_my_kernel(n): + x = torch.randn(n, dtype=torch.float32, device="cuda") + result = my_kernel(x) + torch.cuda.synchronize() + expected = x * 2.0 + assert torch.allclose(result, expected) +``` + +Run: +```bash +pytest tests/test_my_kernel.py -v +``` + +--- + +## Benchmark Pattern + +Create `benchmarks/bench_my_kernel.py`. Use `torch.profiler.profile` with `ProfilerActivity.CPU` and `ProfilerActivity.CUDA`. Compare the JIT kernel against a naive PyTorch implementation and report speedup. + +Run: +```bash +python benchmarks/bench_my_kernel.py --num-elements 1000000 +``` + +--- + +## Checklist for a New Kernel + +- [ ] `.cuh` / `.cpp` + `.hpp` kernel source created +- [ ] `TensorMatcher` validates all tensor arguments (shape, dtype, device) +- [ ] No host-side dereference of device pointers +- [ ] Python `@jit` wrapper created with correct `cuda_wrappers` or `cpp_wrappers` +- [ ] Public API function added (allocates output, calls internal `_kernel`) +- [ ] Exported in `jit/__init__.py` +- [ ] JIT cache cleared after `.cuh` edits (`rm -rf ~/.cache/mllm_kernel/cuda_*`) +- [ ] Pytest test with `@pytest.mark.parametrize` and PyTorch reference +- [ ] Benchmark with `torch.profiler` (optional but recommended) + +--- + +## Common Pitfalls + +1. **Segfault from dereferencing device pointer on host** -- `tensor.data_ptr()` returns a GPU pointer for CUDA tensors. Never read its contents in host code. Use `TensorMatcher` for validation instead. +2. **Stale JIT cache** -- After editing `.cuh`, delete `~/.cache/mllm_kernel/cuda_*/`. The old `.so` will be reused otherwise. +3. 
**Missing `#include `** -- CPU kernels must include this inside `#if HWY_ONCE` to provide `GetChosenTarget` for the JIT-built module. +4. **`#pragma once` in Highway `.hpp`** -- Highway's `foreach_target` includes the file multiple times for different SIMD targets. `#pragma once` breaks this. +5. **Wrong wrapper name** -- CUDA uses short names (`"MyKernel::run"`); CPU uses fully qualified names (`"mllm_kernel::cpu::my_kernel<1>"`). +6. **Generator device mismatch in tests** -- `torch.randperm` needs a CUDA generator on CUDA; `torch.randint` only accepts CPU generators. Use separate generators. diff --git a/mllm-kernel/benchmarks/bench_create_kv_indices.py b/mllm-kernel/benchmarks/bench_create_kv_indices.py new file mode 100644 index 00000000..f570e66d --- /dev/null +++ b/mllm-kernel/benchmarks/bench_create_kv_indices.py @@ -0,0 +1,218 @@ +"""Benchmark create_kv_indices vs naive torch gather using torch.profiler. + +Example: + python benchmarks/bench_create_kv_indices.py --batch-size 512 --max-reqs 2048 --max-ctx 4096 +""" + +from __future__ import annotations + +import argparse + +import torch +from torch.profiler import ProfilerActivity, profile + +from mllm_kernel.cuda.jit.create_kv_indices import create_kv_indices + + +def _make_batch( + *, + max_reqs: int, + max_ctx: int, + batch_size: int, + use_start_offsets: bool, + device: torch.device, + seed: int, +): + g_cuda = torch.Generator(device=device).manual_seed(seed) + g_cpu = torch.Generator(device="cpu").manual_seed(seed) + + req_to_token = torch.arange( + max_reqs * max_ctx, dtype=torch.int32, device=device + ).reshape(max_reqs, max_ctx) + + assert batch_size <= max_reqs + req_pool_indices = torch.randperm(max_reqs, generator=g_cuda, device=device)[ + :batch_size + ].to(torch.int32) + + page_kernel_lens_list = [] + kv_start_idx_list = [] + for _ in range(batch_size): + L = int(torch.randint(1, max_ctx, (1,), generator=g_cpu).item()) + if use_start_offsets: + start_max = max_ctx - L + start = 
int(torch.randint(0, max(start_max, 1), (1,), generator=g_cpu).item()) + else: + start = 0 + page_kernel_lens_list.append(L) + kv_start_idx_list.append(start) + + page_kernel_lens = torch.tensor( + page_kernel_lens_list, dtype=torch.int32, device=device + ) + kv_start_idx = torch.tensor(kv_start_idx_list, dtype=torch.int32, device=device) + + kv_indptr = torch.empty(batch_size + 1, dtype=torch.int32, device=device) + kv_indptr[0] = 0 + kv_indptr[1:] = torch.cumsum(page_kernel_lens, dim=0) + + kv_indices = torch.empty( + int(kv_indptr[-1].item()), dtype=torch.int32, device=device + ) + + return ( + req_to_token, + req_pool_indices, + page_kernel_lens, + kv_indptr, + kv_start_idx, + kv_indices, + ) + + +def _profile( + name: str, fn, *, warmup: int, iters: int, row_limit: int, trace_path: str | None +): + for _ in range(warmup): + fn() + torch.cuda.synchronize() + + with profile( + activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], + record_shapes=False, + profile_memory=False, + with_stack=False, + ) as prof: + for _ in range(iters): + fn() + torch.cuda.synchronize() + + events = prof.key_averages() + time_attr = ( + "self_cuda_time_total" + if events and hasattr(events[0], "self_cuda_time_total") + else "self_device_time_total" + ) + sort_key = ( + "self_cuda_time_total" + if time_attr == "self_cuda_time_total" + else "self_device_time_total" + ) + total_us = sum(float(getattr(evt, time_attr, 0.0)) for evt in events) + avg_us = total_us / max(iters, 1) + + print(f"\n=== {name} ===") + print( + prof.key_averages().table( + sort_by=sort_key, + row_limit=row_limit, + ) + ) + print(f"{name} total self device time: {total_us:.2f} us") + print(f"{name} avg self device time/iter: {avg_us:.2f} us") + + if trace_path: + prof.export_chrome_trace(trace_path) + print(f"{name} trace exported: {trace_path}") + + return avg_us + + +def main() -> None: + parser = argparse.ArgumentParser( + description="Benchmark create_kv_indices vs naive torch gather", + ) + 
parser.add_argument("--batch-size", type=int, default=512) + parser.add_argument("--max-reqs", type=int, default=2048) + parser.add_argument("--max-ctx", type=int, default=4096) + parser.add_argument("--warmup", type=int, default=50) + parser.add_argument("--iters", type=int, default=200) + parser.add_argument("--row-limit", type=int, default=20) + parser.add_argument("--export-trace-dir", type=str, default="") + parser.add_argument("--seed", type=int, default=0) + parser.add_argument( + "--use-start-offsets", + action="store_true", + help="Enable non-zero kv_start_idx to emulate sliding-window decode", + ) + args = parser.parse_args() + + if not torch.cuda.is_available(): + raise RuntimeError("CUDA is required for this benchmark") + + torch.manual_seed(args.seed) + device = torch.device("cuda") + + ( + req_to_token, + req_pool_indices, + page_kernel_lens, + kv_indptr, + kv_start_idx, + kv_indices, + ) = _make_batch( + max_reqs=args.max_reqs, + max_ctx=args.max_ctx, + batch_size=args.batch_size, + use_start_offsets=args.use_start_offsets, + device=device, + seed=args.seed, + ) + + print("=== create_kv_indices profiler benchmark ===") + print( + f"batch_size={args.batch_size}, max_reqs={args.max_reqs}, max_ctx={args.max_ctx}, " + f"use_start_offsets={args.use_start_offsets}" + ) + print(f"warmup={args.warmup}, iters={args.iters}, row_limit={args.row_limit}") + + trace_dir = args.export_trace_dir.strip() + kernel_trace = f"{trace_dir}/create_kv_indices_trace.json" if trace_dir else None + torch_trace = f"{trace_dir}/torch_gather_trace.json" if trace_dir else None + + def _run_kernel_once(): + create_kv_indices( + req_to_token, + req_pool_indices, + page_kernel_lens, + kv_indptr, + kv_start_idx, + kv_indices, + ) + + def _run_torch_once(): + # Torch reference implementation on device: gather per-sequence ranges + # from req_to_token into a flat buffer. 
+ out = [] + for i in range(args.batch_size): + req = req_pool_indices[i].item() + start = kv_start_idx[i].item() if args.use_start_offsets else 0 + L = page_kernel_lens[i].item() + row = req_to_token[req, start : start + L] + out.append(row) + torch.cat(out, out=kv_indices) + + kernel_avg_us = _profile( + "create_kv_indices", + _run_kernel_once, + warmup=args.warmup, + iters=args.iters, + row_limit=args.row_limit, + trace_path=kernel_trace, + ) + + torch_avg_us = _profile( + "torch_reference", + _run_torch_once, + warmup=args.warmup, + iters=args.iters, + row_limit=args.row_limit, + trace_path=torch_trace, + ) + + speedup = torch_avg_us / max(kernel_avg_us, 1e-12) + print(f"\nSpeedup: {speedup:.3f}x") + + +if __name__ == "__main__": + main() diff --git a/mllm-kernel/mllm_kernel/cuda/csrc/create_kv_indices.cuh b/mllm-kernel/mllm_kernel/cuda/csrc/create_kv_indices.cuh new file mode 100644 index 00000000..0b9e4c88 --- /dev/null +++ b/mllm-kernel/mllm_kernel/cuda/csrc/create_kv_indices.cuh @@ -0,0 +1,282 @@ +// High-performance CUDA kernel to build FlashInfer KV index arrays from +// pymllm's ReqToTokenPool mapping table. +// +// This is the CUDA-C equivalent of the Triton kernel +// `_create_kv_indices_triton` previously defined in +// `pymllm/layers/attention/flashinfer_backend.py`. +// +// Motivation +// ---------- +// FlashInfer's paged KV attention API expects a *flat* buffer of KV indices +// (`kv_indices`) together with a prefix-sum pointer array (`kv_indptr`). +// +// * `kv_indices` is a 1-D int32 array that stores, for every token of every +// sequence in a batch, the corresponding *slot index* in the KV cache. +// * `kv_indptr` (length = batch_size + 1) stores prefix sums over the +// per-sequence token counts. 
For sequence `i` we have tokens in: +// +// kv_indices[kv_indptr[i] : kv_indptr[i + 1]] +// +// In pymllm, the mapping from (request_slot, position_in_sequence) to KV slot +// index is stored in a 2-D tensor `req_to_token` owned by `ReqToTokenPool`: +// +// req_to_token[req_slot, position] -> kv_index (int32) +// +// For each batch we also know: +// * which request slots we are serving: `req_pool_indices[bs]` +// * how many tokens to use from each sequence: `page_kernel_lens[bs]` +// * the starting position inside each sequence: `kv_start_idx[bs]` (optional, +// used for sliding-window / partial-context attention) +// +// This kernel converts that 2-D layout into the flat `(kv_indptr, kv_indices)` +// layout in a single, highly parallel CUDA pass: +// +// For each sequence i in the batch: +// - let req = req_pool_indices[i] +// - let len = page_kernel_lens[i] +// - let start = kv_start_idx[i] (or 0 if not provided) +// - let offset = kv_indptr[i] +// - for j in [0, len): +// kv_indices[offset + j] = req_to_token[req, start + j] +// +// Requirements / invariants +// ------------------------- +// * `req_to_token` is int32 (aligned with sglang). +// * All tensors must reside on the same CUDA device. +// * The kernel is designed for extremely high throughput: +// - a block is assigned per sequence (batch element), +// - threads cooperate within the block to copy the token range with +// coalesced loads/stores. +// * Shape and dtype checks are performed at runtime via mllm_kernel's +// TensorMatcher utilities, so misuse is caught with clear error messages. +// +// Integration +// ----------- +// The exported entry point is `CreateKvIndicesKernel::run(...)`. The Python +// wrapper in `mllm_kernel/cuda/jit/create_kv_indices.py` JIT-compiles this +// kernel and exposes a `create_kv_indices(...)` function which is then called +// by `pymllm.layers.attention.flashinfer_backend`. 
+ +#pragma once + +#include // TensorMatcher, SymbolicSize, SymbolicDevice, SymbolicDType +#include // div_ceil, RuntimeCheck, Panic +#include // LaunchKernel + +#include +#include + +#include + +namespace { + +// --------------------------------------------------------------------------- +// Parameter block passed to the CUDA kernel +// --------------------------------------------------------------------------- +// +// We keep this struct trivially-copyable so it can be passed via +// `__grid_constant__` if desired. Each field is carefully documented to make +// the data flow explicit. + +struct CreateKvIndicesParams { + // Pointer to ReqToTokenPool mapping table: + // req_to_token[req_slot, position] -> kv_index (int32) + // shape: [max_reqs, max_context_len] + const int32_t* __restrict__ req_to_token; + + // Request slots participating in this batch. + // shape: [batch_size] + const int32_t* __restrict__ req_pool_indices; + + // Number of tokens to copy for each sequence in the batch. + // shape: [batch_size] + const int32_t* __restrict__ page_kernel_lens; + + // Prefix sums over per-sequence token counts. + // kv_indptr[i] is the starting offset in kv_indices for sequence i. + // shape: [batch_size + 1] + const int32_t* __restrict__ kv_indptr; + + // Optional starting position inside each request's sequence. When nullptr, + // we assume start = 0 for all sequences. When non-null, shape is + // [batch_size]. + const int32_t* __restrict__ kv_start_idx; + + // Output flat KV index buffer (int32). Length must be at least + // kv_indptr[batch_size]. + int32_t* __restrict__ kv_indices; + + // Stride of the first dimension of req_to_token, i.e. the number of + // positions per request (max_context_len). + int32_t req_to_token_stride; + + // Number of sequences in the batch. + uint32_t batch_size; + + // Whether kv_start_idx is valid (1) or should be ignored (0). 
+ uint32_t has_kv_start; +}; + +// We use a fixed block size chosen to balance occupancy and per-sequence +// parallelism. Each block is mapped to a single sequence and threads within +// the block cooperate to copy its token range. +constexpr int kBlockSize = 256; + +// --------------------------------------------------------------------------- +// Core CUDA kernel +// --------------------------------------------------------------------------- +// +// Grid mapping: +// * blockIdx.x -> sequence index `i` in [0, batch_size) +// * threadIdx.x -> intra-sequence worker; threads stride over the token +// range [0, len) with step `blockDim.x`. +// +// This design has several advantages: +// * No inter-block synchronisation is required. +// * Memory accesses are fully coalesced because each thread block walks a +// contiguous segment of the `req_to_token` and `kv_indices` arrays. +// * It handles variable-length sequences naturally; sequences with more +// tokens simply iterate more in the inner loop. + +__global__ void create_kv_indices_kernel(const CreateKvIndicesParams params) { + const uint32_t seq_id = blockIdx.x; // which sequence in the batch + if (seq_id >= params.batch_size) { return; } + + // Resolve the request slot for this sequence. + const int32_t req_slot = params.req_pool_indices[seq_id]; + + // Compute the output range [out_offset, out_offset + len) in kv_indices. + const int32_t out_offset = params.kv_indptr[seq_id]; + const int32_t len = params.page_kernel_lens[seq_id]; + + // Compute the starting position inside the original sequence. + int32_t start = 0; + if (params.has_kv_start && params.kv_start_idx != nullptr) { start = params.kv_start_idx[seq_id]; } + + // Base pointers for this sequence. + const int32_t* __restrict__ row = params.req_to_token + static_cast(req_slot) * params.req_to_token_stride; + int32_t* __restrict__ out = params.kv_indices + out_offset; + + // Each thread in the block handles a strided subset of [0, len). 
+ for (int32_t t = threadIdx.x; t < len; t += blockDim.x) { + // Guard against out-of-bounds reads if (start + t) exceeds the + // configured context length. Under normal conditions upstream + // invariants guarantee `start + len <= req_to_token_stride`, but + // this check makes the kernel robust against misconfigured inputs + // and prevents rare segmentation faults observed during testing. + const int32_t pos = start + t; + if (pos < 0 || pos >= params.req_to_token_stride) { continue; } + + out[t] = row[pos]; + } +} + +// --------------------------------------------------------------------------- +// Host-side launcher used by the JIT wrapper +// --------------------------------------------------------------------------- +// +// `CreateKvIndicesKernel::run(...)` is the C++ entry point that will be bound +// to a TVM FFI function and called from Python via the JIT utility. It is +// responsible for: +// 1. Validating tensor shapes / dtypes / devices. +// 2. Extracting symbolic sizes and strides. +// 3. Building the parameter block. +// 4. Launching the CUDA kernel using mllm_kernel::host::LaunchKernel. + +struct CreateKvIndicesKernel { + static void run(tvm::ffi::TensorView req_to_token, tvm::ffi::TensorView req_pool_indices, + tvm::ffi::TensorView page_kernel_lens, tvm::ffi::TensorView kv_indptr, tvm::ffi::TensorView kv_start_idx, + tvm::ffi::TensorView kv_indices) { + using namespace mllm_kernel::host; + + // --------------------------------------------------------------------- + // 1. 
Validate input tensors + // --------------------------------------------------------------------- + // req_to_token: [max_reqs, max_context_len], int32, CUDA + SymbolicSize MaxReqs{"max_reqs"}; + SymbolicSize MaxCtx{"max_context_len"}; + SymbolicSize ReqStride{"req_stride"}; + SymbolicDType req_dtype; + SymbolicDevice device; + + (void)TensorMatcher({MaxReqs, MaxCtx}) + .with_strides({ReqStride, 1}) + .with_dtype(req_dtype) + .with_device(device) + .verify(req_to_token); + + // req_pool_indices: [B], int32, CUDA + SymbolicSize B{"batch_size"}; + SymbolicSize ReqPoolStride{"req_pool_stride"}; + (void)TensorMatcher({B}).with_strides({ReqPoolStride}).with_dtype().with_device(device).verify(req_pool_indices); + + // page_kernel_lens: [B], int32, same device + SymbolicSize PageStride{"page_stride"}; + (void)TensorMatcher({B}).with_strides({PageStride}).with_dtype().with_device(device).verify(page_kernel_lens); + + // kv_indptr: [Nind], int32, same device (we later require Nind >= B + 1) + SymbolicSize Nind{"indptr_len"}; + (void)TensorMatcher({Nind}).with_dtype().with_device(device).verify(kv_indptr); + + // kv_start_idx: either [B] or [0]; int32, same device + SymbolicSize StartLen{"start_len"}; + SymbolicSize StartStride{"start_stride"}; + (void)TensorMatcher({StartLen}).with_strides({StartStride}).with_dtype().with_device(device).verify(kv_start_idx); + + // kv_indices: [Nidx], int32, same device + SymbolicSize Nidx{"num_indices"}; + (void)TensorMatcher({Nidx}).with_dtype().with_device(device).verify(kv_indices); + + // Extract concrete sizes. + const int64_t batch_size = B.unwrap(); + const int64_t indptr_len = Nind.unwrap(); + const int64_t req_stride = ReqStride.unwrap(); + + // Basic consistency checks. 
+ RuntimeCheck(batch_size > 0, "batch_size must be positive, got ", batch_size); + RuntimeCheck(indptr_len >= batch_size + 1, "kv_indptr length (", indptr_len, ") must be at least batch_size+1 (", + batch_size + 1, ")"); + + // NOTE: We intentionally do NOT read kv_indptr[batch_size] on the host to + // validate that kv_indices is large enough. kv_indptr resides in device + // memory and dereferencing it from host code would be an illegal memory + // access (segfault). Callers are responsible for ensuring that + // kv_indices.numel() >= kv_indptr[batch_size]. + + // kv_start_idx is optional; when StartLen == 0 we treat it as absent. + RuntimeCheck(StartLen.unwrap() == 0 || StartLen.unwrap() == batch_size, + "kv_start_idx must have length 0 or batch_size; got ", StartLen.unwrap(), " vs batch_size=", batch_size); + + const bool has_kv_start = (StartLen.unwrap() == batch_size); + + // --------------------------------------------------------------------- + // 2. Build parameter block + // --------------------------------------------------------------------- + CreateKvIndicesParams params{ + .req_to_token = static_cast(req_to_token.data_ptr()), + .req_pool_indices = static_cast(req_pool_indices.data_ptr()), + .page_kernel_lens = static_cast(page_kernel_lens.data_ptr()), + .kv_indptr = static_cast(kv_indptr.data_ptr()), + .kv_start_idx = has_kv_start ? static_cast(kv_start_idx.data_ptr()) : nullptr, + .kv_indices = static_cast(kv_indices.data_ptr()), + .req_to_token_stride = static_cast(req_stride), + .batch_size = static_cast(batch_size), + .has_kv_start = has_kv_start ? 1u : 0u, + }; + + const DLDevice dl_device = device.unwrap(); + + // --------------------------------------------------------------------- + // 3. Launch the CUDA kernel + // --------------------------------------------------------------------- + // We launch one block per sequence so that each sequence can be processed + // independently with fully coalesced memory accesses. 
The per-thread + // inner loop runs over the token range [0, len) with stride = blockDim.x. + + const int grid_size = static_cast(batch_size); + + LaunchKernel(grid_size, kBlockSize, dl_device)(create_kv_indices_kernel, params); + } +}; + +} // namespace diff --git a/mllm-kernel/mllm_kernel/cuda/csrc/vocab_embedding.cuh b/mllm-kernel/mllm_kernel/cuda/csrc/vocab_embedding.cuh new file mode 100644 index 00000000..e69de29b diff --git a/mllm-kernel/mllm_kernel/cuda/jit/create_kv_indices.py b/mllm-kernel/mllm_kernel/cuda/jit/create_kv_indices.py new file mode 100644 index 00000000..565686a4 --- /dev/null +++ b/mllm-kernel/mllm_kernel/cuda/jit/create_kv_indices.py @@ -0,0 +1,118 @@ +"""High-performance CUDA JIT wrapper for create_kv_indices. + +This module exposes a single function: + + create_kv_indices(req_to_token, req_pool_indices, + page_kernel_lens, kv_indptr, + kv_start_idx, kv_indices) + +which is a Python binding around the C++/CUDA kernel defined in +`mllm_kernel/cuda/csrc/create_kv_indices.cuh`. + +The kernel transforms pymllm's 2-D ReqToTokenPool mapping table into the flat +`(kv_indptr, kv_indices)` layout expected by FlashInfer's paged KV attention +wrappers. It is carefully written for maximum throughput and is intended to +replace the Triton implementation `_create_kv_indices_triton` in +`pymllm.layers.attention.flashinfer_backend`. +""" + +from __future__ import annotations + +import torch + +from mllm_kernel.jit_utils import cache_once, jit + + +@cache_once +def _make_create_kv_indices_kernel(): + """JIT-compile the CUDA kernel and return a callable wrapper. + + The JIT system will: + * locate `create_kv_indices.cuh` under the mllm-kernel CUDA csrc tree, + * compile it into a TVM FFI module, + * expose `CreateKvIndicesKernel::run` as `compiled_module.create_kv_indices`. 
+ """ + + @jit( + args=[], + device="cuda", + cuda_files=["create_kv_indices.cuh"], + cpp_wrappers=[], + cuda_wrappers=[ + ("create_kv_indices", "CreateKvIndicesKernel::run"), + ], + func_name="create_kv_indices", + ) + def _kernel( + compiled_module, + req_to_token: torch.Tensor, + req_pool_indices: torch.Tensor, + page_kernel_lens: torch.Tensor, + kv_indptr: torch.Tensor, + kv_start_idx: torch.Tensor, + kv_indices: torch.Tensor, + ) -> None: + compiled_module.create_kv_indices( + req_to_token, + req_pool_indices, + page_kernel_lens, + kv_indptr, + kv_start_idx, + kv_indices, + ) + + return _kernel + + +def create_kv_indices( + req_to_token: torch.Tensor, + req_pool_indices: torch.Tensor, + page_kernel_lens: torch.Tensor, + kv_indptr: torch.Tensor, + kv_start_idx: torch.Tensor | None, + kv_indices: torch.Tensor, +) -> None: + """Fill a flat KV-index buffer from the ReqToTokenPool mapping. + + This is a thin Python wrapper that forwards to the JIT-compiled CUDA + kernel. All tensors must be placed on the same CUDA device. + + Args + ---- + req_to_token: + Mapping tensor from ReqToTokenPool, shape + ``[max_reqs, max_context_len]``, dtype ``torch.int32``. + req_pool_indices: + Request slots participating in this batch, shape ``[batch_size]``, + dtype ``torch.int32``. + page_kernel_lens: + Per-sequence token counts (how many tokens to attend), shape + ``[batch_size]``, dtype ``torch.int32``. + kv_indptr: + Prefix sums over per-sequence token counts, shape ``[batch_size + 1]``, + dtype ``torch.int32``. ``kv_indptr[i]`` is the starting offset in + ``kv_indices`` for sequence ``i``. + kv_start_idx: + Optional starting positions inside each sequence, shape + ``[batch_size]`` or ``[0]``, dtype ``torch.int32``. When + ``None``, the kernel assumes 0 for all sequences. + kv_indices: + Output flat KV-index buffer, shape ``[N]``, dtype ``torch.int32``. + ``N`` must be at least ``kv_indptr[batch_size]``. 
+ """ + if kv_start_idx is None: + # Use an empty tensor to signal "no start offsets". The C++ launcher + # treats length==0 as "no kv_start" and will pass a nullptr into the + # parameter block, which is slightly cheaper than materialising a + # full zero tensor on every call. + kv_start_idx = req_pool_indices.new_empty(0, dtype=torch.int32) + + kernel = _make_create_kv_indices_kernel() + kernel( + req_to_token, + req_pool_indices, + page_kernel_lens, + kv_indptr, + kv_start_idx, + kv_indices, + ) diff --git a/mllm-kernel/pyproject.toml b/mllm-kernel/pyproject.toml index 77340b29..13147f06 100644 --- a/mllm-kernel/pyproject.toml +++ b/mllm-kernel/pyproject.toml @@ -55,7 +55,7 @@ logging.level = "INFO" # Wheel configuration - include the Python package wheel.packages = ["mllm_kernel"] -wheel.install-dir = "mllm_kernel" +wheel.install-dir = "" # Install directories for cmake targets wheel.cmake = true diff --git a/mllm-kernel/tests/test_create_kv_indices.py b/mllm-kernel/tests/test_create_kv_indices.py new file mode 100644 index 00000000..e8bf770a --- /dev/null +++ b/mllm-kernel/tests/test_create_kv_indices.py @@ -0,0 +1,191 @@ +from __future__ import annotations + +import pytest +import torch + +from mllm_kernel.cuda.jit.create_kv_indices import create_kv_indices + + +def _make_batch( + *, + max_reqs: int, + max_ctx: int, + batch_size: int, + use_start_offsets: bool, + seed: int = 0, +): + """Construct a random-but-bounded test batch for create_kv_indices. + + The constraints ensure that for every sequence i: + 0 <= kv_start_idx[i] + 0 < page_kernel_lens[i] + kv_start_idx[i] + page_kernel_lens[i] <= max_ctx + so the kernel never reads beyond the ReqToTokenPool row. + """ + # Use a CUDA generator for randperm (which requires matching device) + # and a separate CPU generator for randint (which only accepts CPU). 
+ g_cuda = torch.Generator(device="cuda").manual_seed(seed) + g_cpu = torch.Generator(device="cpu").manual_seed(seed) + + device = "cuda" + # req_to_token[req_slot, position] -> kv_index (here we simply use a + # monotonically increasing pattern so correctness is easy to check). + req_to_token = torch.arange( + max_reqs * max_ctx, dtype=torch.int32, device=device + ).reshape(max_reqs, max_ctx) + + # Sample distinct request slots for the batch. + assert batch_size <= max_reqs + req_pool_indices = torch.randperm(max_reqs, generator=g_cuda, device=device)[ + :batch_size + ].to(torch.int32) + + # For each sequence choose a valid (start, length) pair. + page_kernel_lens_list = [] + kv_start_idx_list = [] + for _ in range(batch_size): + # ensure at least 1 token per sequence + L = int(torch.randint(1, max_ctx, (1,), generator=g_cpu).item()) + if use_start_offsets: + start_max = max_ctx - L + start = int(torch.randint(0, max(start_max, 1), (1,), generator=g_cpu).item()) + else: + start = 0 + page_kernel_lens_list.append(L) + kv_start_idx_list.append(start) + + page_kernel_lens = torch.tensor( + page_kernel_lens_list, dtype=torch.int32, device=device + ) + kv_start_idx = torch.tensor(kv_start_idx_list, dtype=torch.int32, device=device) + + # Build kv_indptr prefix sums. 
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is required")
@pytest.mark.parametrize("use_start_offsets", [False, True])
@pytest.mark.parametrize(
    "batch_size,max_reqs,max_ctx",
    [
        (1, 4, 16),  # minimal batch
        (4, 8, 64),  # small batch
        (32, 64, 512),  # medium batch, longer context
        (128, 256, 2048),  # larger batch, stress inner loop
    ],
)
def test_create_kv_indices_matches_reference(
    use_start_offsets: bool,
    batch_size: int,
    max_reqs: int,
    max_ctx: int,
):
    """CUDA kernel output must equal a naive CPU gather over the same inputs.

    The reference walks (request_slot, start, length) explicitly on CPU and
    concatenates the row slices; the kernel must produce an identical flat
    ``kv_indices`` buffer.
    """
    (
        req_to_token,
        req_pool_indices,
        page_kernel_lens,
        kv_indptr,
        kv_start_idx,
        kv_indices,
    ) = _make_batch(
        max_reqs=max_reqs,
        max_ctx=max_ctx,
        batch_size=batch_size,
        use_start_offsets=use_start_offsets,
        seed=2026,
    )

    # Pass None when offsets are disabled so that code path is exercised too.
    create_kv_indices(
        req_to_token,
        req_pool_indices,
        page_kernel_lens,
        kv_indptr,
        kv_start_idx if use_start_offsets else None,
        kv_indices,
    )
    torch.cuda.synchronize()

    # Naive reference computed on CPU.
    table = req_to_token.cpu()
    slots = req_pool_indices.cpu().to(torch.long)
    lens = page_kernel_lens.cpu()
    starts = kv_start_idx.cpu()

    segments = []
    for seq in range(batch_size):
        begin = starts[seq].item() if use_start_offsets else 0
        count = lens[seq].item()
        segments.append(table[slots[seq].item(), begin : begin + count])
    expected = torch.cat(segments, dim=0)

    assert kv_indices.shape == expected.shape
    assert torch.equal(kv_indices.cpu(), expected)
+ expected = req_to_token[:, 0] + assert torch.equal(kv_indices, expected) + + +@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is required") +def test_oversized_output_buffer(): + """kv_indices buffer is larger than needed (prefill path uses +256 padding).""" + device = "cuda" + bs = 4 + max_ctx = 64 + req_to_token = torch.arange(bs * max_ctx, dtype=torch.int32, device=device).reshape(bs, max_ctx) + req_pool_indices = torch.arange(bs, dtype=torch.int32, device=device) + page_kernel_lens = torch.full((bs,), 10, dtype=torch.int32, device=device) + kv_indptr = torch.arange(0, bs * 10 + 1, 10, dtype=torch.int32, device=device) + # Allocate with extra padding, like the prefill path does. + kv_indices = torch.full((bs * 10 + 256,), -1, dtype=torch.int32, device=device) + + create_kv_indices(req_to_token, req_pool_indices, page_kernel_lens, kv_indptr, None, kv_indices) + torch.cuda.synchronize() + + # First bs*10 entries should match; padding should remain -1. + ref_segments = [] + for i in range(bs): + ref_segments.append(req_to_token[i, :10]) + ref = torch.cat(ref_segments, dim=0) + assert torch.equal(kv_indices[:bs * 10], ref) + assert torch.all(kv_indices[bs * 10:] == -1) diff --git a/pymllm/configs/server_config.py b/pymllm/configs/server_config.py index 9e399d62..f6a2090f 100644 --- a/pymllm/configs/server_config.py +++ b/pymllm/configs/server_config.py @@ -79,6 +79,39 @@ class ServerConfig: # Feature switches # --------------------------------------------------------------------- # enable_shared_queue: bool = False # Use shared memory queue for fast IPC + + # CUDA IPC transport for multimodal GPU tensors. + # Requires enable_shared_queue=True to take effect. + # + # Three transport modes (mutually exclusive for GPU tensors): + # + # "default" + # GPU tensors are moved to CPU first (GPU→CPU copy), then placed in + # POSIX shared memory via share_memory_(). Safe but adds a device copy. + # + # "cuda_ipc" + # GPU tensors stay on GPU. 
Each tensor is wrapped in a + # TransportProxyTensor whose __getstate__ calls storage._share_cuda_() + # to obtain an IPC handle; the receiver reconstructs via + # UntypedStorage._new_shared_cuda(*handle). Simple, but the underlying + # GPU allocation is never freed until the sender process exits + # (PyTorch limitation) -- can leak GPU memory in long-running services. + # + # "cuda_ipc_pool" [recommended for production] + # GPU tensors are copied into a pre-allocated fixed-size GPU workspace + # (MmItemMemoryPool). Each outgoing tensor occupies a "chunk" of the + # pool; the chunk's IPC handle is sent via CudaIpcTensorTransportProxy. + # After the receiver finishes copying data it increments a shared-memory + # sync flag; a background recycler thread in the sender watches these + # flags and returns chunks to the available pool. No GPU memory is leaked. + tensor_transport_mode: str = "default" # one of: default, cuda_ipc, cuda_ipc_pool + + # Size of the pre-allocated CUDA IPC memory pool in MB. + # Only used when tensor_transport_mode == "cuda_ipc_pool". + cuda_ipc_pool_size_mb: int = 512 + + # How often (seconds) the pool recycler thread wakes up. + cuda_ipc_recycle_interval: float = 0.1 # enable_lora: bool = False # max_loaded_loras: Optional[int] = None # max_loras_per_batch: int = 8 @@ -102,6 +135,18 @@ def __post_init__(self) -> None: self._validate() def _validate(self) -> None: + valid_modes = {"default", "cuda_ipc", "cuda_ipc_pool"} + if self.tensor_transport_mode not in valid_modes: + raise ValueError( + f"`tensor_transport_mode` must be one of {valid_modes}, " + f"got {self.tensor_transport_mode!r}." + ) + if self.tensor_transport_mode != "default" and not self.enable_shared_queue: + raise ValueError( + "`tensor_transport_mode` != 'default' requires `enable_shared_queue=True`." 
class ForwardMode(IntEnum):
    """Kind of forward pass being performed.

    Covers standard prefill / decode inference; there is no speculative
    decoding, encoder-decoder, or distributed-attention variant here.
    """

    # Prefill / extend: process new tokens. Any shared prefix (e.g. a radix-
    # cached system prompt) already has its KV cache populated.
    EXTEND = auto()

    # Decode: generate exactly one new token per sequence.
    DECODE = auto()

    # Mixed: a chunked-prefill batch containing both extend and decode
    # sequences at once.
    MIXED = auto()

    # Idle: nothing to process (data-parallel ranks with no sequences).
    IDLE = auto()

    # ---- helpers ----

    def is_extend(self) -> bool:
        """True for EXTEND or MIXED (i.e. any prefill-style pass)."""
        return self is ForwardMode.EXTEND or self is ForwardMode.MIXED

    def is_prefill(self) -> bool:
        """Alias for ``is_extend()``."""
        return self.is_extend()

    def is_decode(self) -> bool:
        return self is ForwardMode.DECODE

    def is_mixed(self) -> bool:
        return self is ForwardMode.MIXED

    def is_idle(self) -> bool:
        return self is ForwardMode.IDLE

    def is_decode_or_idle(self) -> bool:
        return self is ForwardMode.DECODE or self is ForwardMode.IDLE


# ---------------------------------------------------------------------------
# ForwardBatch
# ---------------------------------------------------------------------------


@dataclass
class ForwardBatch:
    """Every tensor a single model forward pass needs.

    Required (positional) fields describe the batch itself; the optional
    fields are metadata attached by the model runner / attention backend
    before ``model.forward`` is called.

    Parameters
    ----------
    forward_mode
        EXTEND / DECODE / MIXED / IDLE for this pass.
    batch_size
        Number of sequences in the batch.
    input_ids
        Token ids for every position, shape ``[num_tokens]``. For decode
        ``num_tokens == batch_size``; for extend it equals
        ``extend_num_tokens``.
    req_pool_indices
        Each sequence's index in ``ReqToTokenPool``, shape ``[batch_size]``
        (int32 or int64, on the target device).
    seq_lens
        Total (prefix + new) length per sequence, shape ``[batch_size]``
        (int32).
    out_cache_loc
        KV-pool slot each *output* token is written to, shape
        ``[num_tokens]`` (int64).
    seq_lens_sum
        Python ``int`` equal to ``seq_lens.sum()``; cached so backends can
        avoid repeated device-to-host syncs.
    seq_lens_cpu
        Optional CPU copy of ``seq_lens`` (lets backends plan without a
        device sync).
    positions
        Token position per input token, shape ``[num_tokens]``
        (int32 or int64).
    extend_num_tokens, extend_seq_lens, extend_prefix_lens, extend_start_loc,
    extend_prefix_lens_cpu, extend_seq_lens_cpu
        Extend/prefill-only metadata: total new-token count, per-sequence new
        token counts, cached-prefix lengths, cumulative start offsets in the
        flattened extend stream, and CPU list mirrors of the latter two.
    return_logprob
        Whether to compute per-token log-probabilities.
    top_logprobs_nums
        Number of top log-probs to return per sequence (None or list of ints).
    req_to_token_pool, token_to_kv_pool
        References to the memory pools (set by the model runner).
    attn_backend
        Attention backend to use (set by the model runner before
        ``model.forward``).
    """

    # ---- required fields (positional) ----
    forward_mode: ForwardMode
    batch_size: int
    input_ids: torch.Tensor  # [num_tokens]
    req_pool_indices: torch.Tensor  # [batch_size] int32/int64
    seq_lens: torch.Tensor  # [batch_size] int32
    out_cache_loc: torch.Tensor  # [num_tokens] int64
    seq_lens_sum: int  # python int

    # ---- optional metadata ----

    # CPU mirror of seq_lens
    seq_lens_cpu: Optional[torch.Tensor] = None

    # Position encoding – shape [num_tokens], int32 or int64
    positions: Optional[torch.Tensor] = None

    # ---- extend / prefill specific ----
    extend_num_tokens: Optional[int] = None
    extend_seq_lens: Optional[torch.Tensor] = None  # [batch_size] int32
    extend_prefix_lens: Optional[torch.Tensor] = None  # [batch_size] int32
    extend_start_loc: Optional[torch.Tensor] = None  # [batch_size] int32
    extend_prefix_lens_cpu: Optional[List[int]] = None
    extend_seq_lens_cpu: Optional[List[int]] = None

    # ---- logprob options ----
    return_logprob: bool = False
    top_logprobs_nums: Optional[List[int]] = None

    # ---- memory pools (set by model runner) ----
    req_to_token_pool: Optional["ReqToTokenPool"] = None
    token_to_kv_pool: Optional["KVPool"] = None

    # ---- attention backend (set by model runner) ----
    attn_backend: Optional["AttentionBackend"] = None
cfg = get_global_config() enable_shared_queue = cfg.server.enable_shared_queue - - # Create shared queue if enabled + transport_mode: str = ( + cfg.server.tensor_transport_mode + ) # "default" | "cuda_ipc" | "cuda_ipc_pool" + + # Create shared queue if enabled. + # Note: the MmItemMemoryPool (for "cuda_ipc_pool") is created *inside* + # the tokenizer subprocess after CUDA is initialised. The queue here + # is constructed without a pool; TokenizerProcess._ensure_pool() will + # swap in a pool-aware queue at runtime. shared_queue = None if enable_shared_queue: - # TODO: WCH init CUDA IPC things. - shared_queue = TensorQueue(maxsize=1000) # Configurable max size - logger.info("Shared memory queue enabled for fast IPC") + from pymllm.orchestrator.shared_memory_queue import TensorQueue as _TQ + + # Construct with the configured transport mode. The pool is not + # supplied here; it will be lazily initialised inside the subprocess. + shared_queue = _TQ( + maxsize=1000, + transport_mode=transport_mode, + pool=None, # pool initialised lazily inside TokenizerProcess + ) + logger.info( + "Shared memory queue enabled for fast IPC (transport_mode=%s)", + transport_mode, + ) tokenizer_cfg: Dict[str, Any] = { "tokenizer_path": str(cfg.server.tokenizer_path), @@ -95,6 +111,9 @@ def _launch_processes(self) -> None: "context_length": cfg.server.context_length, "hf_config": cfg.model.hf_config, "enable_shared_queue": enable_shared_queue, + "tensor_transport_mode": transport_mode, + "cuda_ipc_pool_size_mb": cfg.server.cuda_ipc_pool_size_mb, + "cuda_ipc_recycle_interval": cfg.server.cuda_ipc_recycle_interval, } # Tokenizer @@ -124,6 +143,7 @@ def _launch_processes(self) -> None: scheduler_writer, shared_queue, # Pass shared queue enable_shared_queue, # Pass flag + transport_mode, # Pass tensor transport mode ), daemon=True, ) diff --git a/pymllm/layers/attention/__init__.py b/pymllm/layers/attention/__init__.py index e69de29b..5d0dbf07 100644 --- a/pymllm/layers/attention/__init__.py 
+++ b/pymllm/layers/attention/__init__.py @@ -0,0 +1,25 @@ +"""Attention layers and backends for pymllm.""" + +from pymllm.layers.attention.attention_backend import AttentionBackend +from pymllm.layers.attention.flashinfer_backend import ( + DecodeMetadata, + FlashInferAttnBackend, + PrefillMetadata, + WrapperDispatch, + should_use_tensor_core, +) +from pymllm.layers.attention.radix_attention import AttentionType, RadixAttention + +__all__ = [ + # Base + "AttentionBackend", + # RadixAttention + "AttentionType", + "RadixAttention", + # FlashInfer backend + "FlashInferAttnBackend", + "DecodeMetadata", + "PrefillMetadata", + "WrapperDispatch", + "should_use_tensor_core", +] diff --git a/pymllm/layers/attention/attention_backend.py b/pymllm/layers/attention/attention_backend.py index e69de29b..07e2f6a1 100644 --- a/pymllm/layers/attention/attention_backend.py +++ b/pymllm/layers/attention/attention_backend.py @@ -0,0 +1,143 @@ +"""Abstract base class for pymllm attention backends. + +Every concrete backend (FlashInfer, Triton, torch-native, …) must implement +at minimum: + + * ``init_forward_metadata`` – called once per batch before the model forward. + * ``forward_extend`` – prefill / extend attention. + * ``forward_decode`` – single-token decode attention. + +The public ``forward`` method dispatches to the correct variant based on +``forward_batch.forward_mode``. +""" + +from __future__ import annotations + +from abc import ABC, abstractmethod +from typing import TYPE_CHECKING, Optional + +import torch + +if TYPE_CHECKING: + from pymllm.engine.forward_batch import ForwardBatch, ForwardMode + from pymllm.layers.attention.radix_attention import RadixAttention + + +class AttentionBackend(ABC): + """Abstract base class for attention backends. + + All concrete backends inherit from this class and implement the abstract + methods below. 
+ """ + + # ------------------------------------------------------------------ + # Core interface – must be implemented by every backend + # ------------------------------------------------------------------ + + @abstractmethod + def init_forward_metadata(self, forward_batch: "ForwardBatch") -> None: + """Prepare per-batch metadata before the model's attention layers run. + + For FlashInfer this plans the KV-index arrays and calls + ``wrapper.begin_forward``; for Triton / torch-native this is a no-op. + Must be called once per batch *before* ``model.forward``. + """ + raise NotImplementedError + + @abstractmethod + def forward_decode( + self, + q: torch.Tensor, + k: Optional[torch.Tensor], + v: Optional[torch.Tensor], + layer: "RadixAttention", + forward_batch: "ForwardBatch", + save_kv_cache: bool = True, + **kwargs, + ) -> torch.Tensor: + """Run attention for a decode step (one new token per sequence).""" + raise NotImplementedError + + @abstractmethod + def forward_extend( + self, + q: torch.Tensor, + k: Optional[torch.Tensor], + v: Optional[torch.Tensor], + layer: "RadixAttention", + forward_batch: "ForwardBatch", + save_kv_cache: bool = True, + **kwargs, + ) -> torch.Tensor: + """Run attention for a prefill / extend step.""" + raise NotImplementedError + + # ------------------------------------------------------------------ + # Dispatch – shared logic; do not override in normal backends + # ------------------------------------------------------------------ + + def forward( + self, + q: torch.Tensor, + k: Optional[torch.Tensor], + v: Optional[torch.Tensor], + layer: "RadixAttention", + forward_batch: "ForwardBatch", + save_kv_cache: bool = True, + **kwargs, + ) -> torch.Tensor: + """Dispatch to ``forward_decode`` or ``forward_extend`` based on mode. + + For IDLE batches a zero-filled output tensor is returned without any + compute. + """ + if forward_batch.forward_mode.is_idle(): + # Return empty output without computation. 
+ return q.new_empty(q.shape[0], layer.tp_q_head_num * layer.v_head_dim) + elif forward_batch.forward_mode.is_decode(): + return self.forward_decode( + q, k, v, layer, forward_batch, save_kv_cache=save_kv_cache, **kwargs + ) + else: + return self.forward_extend( + q, k, v, layer, forward_batch, save_kv_cache=save_kv_cache, **kwargs + ) + + # ------------------------------------------------------------------ + # Optional CUDA-graph interface + # ------------------------------------------------------------------ + + def get_cuda_graph_seq_len_fill_value(self) -> int: + """Fill value used to pad ``seq_lens`` tensors for CUDA-graph capture. + + Most backends use ``1`` (not ``0``) to avoid division-by-zero in + attention kernels. + """ + raise NotImplementedError + + def init_cuda_graph_state(self, max_bs: int, max_num_tokens: int) -> None: + """Allocate shared CUDA-graph state (buffers reused across captures).""" + raise NotImplementedError + + def init_forward_metadata_capture_cuda_graph( + self, + bs: int, + num_tokens: int, + req_pool_indices: torch.Tensor, + seq_lens: torch.Tensor, + forward_mode: "ForwardMode", + ) -> None: + """Set up per-batch metadata for capturing a CUDA graph.""" + raise NotImplementedError + + def init_forward_metadata_replay_cuda_graph( + self, + bs: int, + req_pool_indices: torch.Tensor, + seq_lens: torch.Tensor, + seq_lens_sum: int, + forward_mode: "ForwardMode", + seq_lens_cpu: Optional[torch.Tensor], + ) -> None: + """Update metadata when replaying a captured CUDA graph.""" + raise NotImplementedError diff --git a/pymllm/layers/attention/flashinfer_backend.py b/pymllm/layers/attention/flashinfer_backend.py index e69de29b..479fb5ce 100644 --- a/pymllm/layers/attention/flashinfer_backend.py +++ b/pymllm/layers/attention/flashinfer_backend.py @@ -0,0 +1,964 @@ +"""FlashInfer attention backend for pymllm. + + * No model-runner object -- constructor takes explicit scalar / tensor params. 
+ * No tensor-parallelism head splitting (handled at the model layer level). + * No speculative decoding support. + * ``KVPool`` API: + - ``get_kv_buffer(layer_id)`` returns ``(k_buf, v_buf)`` each shaped + ``[buf_len, num_heads, head_dim]``. + - ``set_kv_buffer(layer_id, indices, k, v)`` -- no scale arguments. + +Supports: + * Single-wrapper mode (full context, no sliding window) + * Sliding-window mode (two wrappers: window + full) + * CUDA-graph capture / replay for decode and target-verify passes. +""" + +from __future__ import annotations + +import logging +import os +from dataclasses import dataclass +from enum import Enum, auto +from typing import List, Optional, Union + +import torch + +from pymllm.engine.forward_batch import ForwardBatch, ForwardMode +from pymllm.layers.attention.attention_backend import AttentionBackend +from mllm_kernel.cuda.jit.create_kv_indices import create_kv_indices + +logger = logging.getLogger(__name__) + +# --------------------------------------------------------------------------- +# Optional FlashInfer import +# --------------------------------------------------------------------------- + +_flashinfer_available = False +try: + from flashinfer import ( + BatchDecodeWithPagedKVCacheWrapper, + BatchPrefillWithPagedKVCacheWrapper, + BatchPrefillWithRaggedKVCacheWrapper, + ) + + try: + from flashinfer import fast_decode_plan + from functools import partial as _partial + + _has_fast_decode_plan = True + except ImportError: + _has_fast_decode_plan = False + + from flashinfer.cascade import merge_state + + _flashinfer_available = True +except ImportError: + logger.warning( + "flashinfer is not installed; FlashInferAttnBackend will raise " + "NotImplementedError if used." 
+ ) + +# --------------------------------------------------------------------------- +# Global workspace buffer (shared across all FlashInfer wrapper instances) +# --------------------------------------------------------------------------- + +_global_workspace_buffer: Optional[torch.Tensor] = None + +# Default workspace size (128 MB); can be overridden via environment variable. +_DEFAULT_WORKSPACE_BYTES = int( + os.environ.get("PYMLLM_FLASHINFER_WORKSPACE_SIZE", 128 * 1024 * 1024) +) + +# --------------------------------------------------------------------------- +# Enums / dataclasses +# --------------------------------------------------------------------------- + + +class WrapperDispatch(Enum): + """Indicates which wrapper to use for a given attention layer.""" + + SLIDING_WINDOW = auto() + CROSS_ATTENTION = auto() + + +@dataclass +class DecodeMetadata: + """Per-batch metadata for a decode step.""" + + decode_wrappers: "List[BatchDecodeWithPagedKVCacheWrapper]" + + +@dataclass +class PrefillMetadata: + """Per-batch metadata for a prefill / extend step.""" + + prefill_wrappers: "List[BatchPrefillWithPagedKVCacheWrapper]" + use_ragged: bool + extend_no_prefix: bool + + +# --------------------------------------------------------------------------- +# CUDA kernel – build the flat kv_indices array for FlashInfer +# --------------------------------------------------------------------------- + +# --------------------------------------------------------------------------- +# Helper – choose whether to use tensor cores for decode +# --------------------------------------------------------------------------- + + +def should_use_tensor_core( + kv_cache_dtype: torch.dtype, + num_attention_heads: int, + num_kv_heads: int, +) -> bool: + """Return whether FlashInfer decode should use tensor cores. + + For FP8 we always use tensor cores. 
For fp16 / bf16 we use them when + the GQA group size (num_attention_heads / num_kv_heads) is ≥ 4, which + fuses the head group with the token dimension in the MMA instruction. + """ + env_override = os.environ.get("PYMLLM_FLASHINFER_USE_TENSOR_CORE") + if env_override is not None: + return env_override.lower() == "true" + + try: + from flashinfer.decode import _grouped_size_compiled_for_decode_kernels + + return not _grouped_size_compiled_for_decode_kernels( + num_attention_heads, num_kv_heads + ) + except (ImportError, AttributeError): + pass + + gqa_group_size = num_attention_heads // num_kv_heads + if kv_cache_dtype in (torch.float8_e4m3fn, torch.float8_e5m2): + return True + if kv_cache_dtype in (torch.float16, torch.half, torch.bfloat16): + return gqa_group_size >= 4 + return False + + +# --------------------------------------------------------------------------- +# FlashInferAttnBackend +# --------------------------------------------------------------------------- + + +class FlashInferAttnBackend(AttentionBackend): + """FlashInfer-based attention backend for pymllm. + + This class does not depend on a ``ModelRunner`` object. Instead it takes + all required configuration explicitly so that it can be constructed + independently of any particular model runner. + + Parameters + ---------- + num_heads + Number of query heads per device (after any TP sharding). + num_kv_heads + Number of KV heads per device. + head_dim + Per-head dimension for Q and K. + kv_cache_dtype + ``torch.dtype`` of the KV cache (e.g. ``torch.float16``). + q_dtype + ``torch.dtype`` of the query tensor. + max_context_len + Maximum sequence length the model supports. + req_to_token + The ``[max_reqs, max_context_len]`` int32 tensor from + ``ReqToTokenPool.req_to_token``. + device + Target device (e.g. ``torch.device("cuda")``) + max_req_pool_size + Maximum number of concurrent requests (= ``ReqToTokenPool.size``). + Used to pre-allocate ``kv_indptr`` / ``kv_last_page_len`` buffers. 
+ sliding_window_size + When not ``None``, enables sliding-window attention mode which + allocates two wrapper sets (window + full context). + skip_prefill + When ``True``, skip creating prefill wrappers (for backends that only + perform decode, e.g. multi-step draft backends). + kv_indptr_buf + Optional pre-allocated ``kv_indptr`` buffer. Used when sharing + buffers across multiple backend instances (e.g. multi-step draft). + kv_last_page_len_buf + Optional pre-allocated ``kv_last_page_len`` buffer. + init_new_workspace + When ``True`` allocate a fresh workspace buffer instead of reusing the + global one. + """ + + def __init__( + self, + num_heads: int, + num_kv_heads: int, + head_dim: int, + kv_cache_dtype: torch.dtype, + q_dtype: torch.dtype, + max_context_len: int, + req_to_token: torch.Tensor, + device: torch.device, + max_req_pool_size: int, + sliding_window_size: Optional[int] = None, + skip_prefill: bool = False, + kv_indptr_buf: Optional[torch.Tensor] = None, + kv_last_page_len_buf: Optional[torch.Tensor] = None, + init_new_workspace: bool = False, + ): + if not _flashinfer_available: + raise RuntimeError( + "flashinfer is required for FlashInferAttnBackend but is not " + "installed. 
Run: pip install flashinfer-python" + ) + + super().__init__() + + self.num_heads = num_heads + self.num_kv_heads = num_kv_heads + self.head_dim = head_dim + self.kv_cache_dtype = kv_cache_dtype + self.q_dtype = q_dtype + self.max_context_len = max_context_len + self.req_to_token = req_to_token + self.device = device + self.skip_prefill = skip_prefill + + # Tensor-core preference for decode + self.decode_use_tensor_cores = should_use_tensor_core( + kv_cache_dtype, num_heads, num_kv_heads + ) + + # Sliding-window / cross-attention wrapper dispatch + if sliding_window_size is not None: + self.num_wrappers = 2 + self.dispatch_reason: Optional[WrapperDispatch] = ( + WrapperDispatch.SLIDING_WINDOW + ) + self.sliding_window_size: Optional[int] = sliding_window_size + else: + self.num_wrappers = 1 + self.dispatch_reason = None + self.sliding_window_size = None + + # ------------------------------------------------------------------ + # Workspace buffer + # ------------------------------------------------------------------ + global _global_workspace_buffer + if _global_workspace_buffer is None: + _global_workspace_buffer = torch.empty( + _DEFAULT_WORKSPACE_BYTES, + dtype=torch.uint8, + device=device, + ) + if init_new_workspace: + self.workspace_buffer = torch.empty( + _DEFAULT_WORKSPACE_BYTES, + dtype=torch.uint8, + device=device, + ) + else: + self.workspace_buffer = _global_workspace_buffer + + # ------------------------------------------------------------------ + # kv_indptr [num_wrappers × (max_req_pool_size + 1)] + # kv_last_page_len [max_req_pool_size] + # ------------------------------------------------------------------ + if kv_indptr_buf is None: + self.kv_indptr: List[torch.Tensor] = [ + torch.zeros((max_req_pool_size + 1,), dtype=torch.int32, device=device) + for _ in range(self.num_wrappers) + ] + else: + assert self.num_wrappers == 1 + self.kv_indptr = [kv_indptr_buf] + + if kv_last_page_len_buf is None: + self.kv_last_page_len = torch.ones( + 
(max_req_pool_size,), dtype=torch.int32, device=device + ) + else: + assert self.num_wrappers == 1 + self.kv_last_page_len = kv_last_page_len_buf + + # qo_indptr – only needed for prefill + if not skip_prefill: + self.qo_indptr: List[torch.Tensor] = [ + torch.zeros((max_req_pool_size + 1,), dtype=torch.int32, device=device) + for _ in range(self.num_wrappers) + ] + + # ------------------------------------------------------------------ + # Create FlashInfer wrappers + # ------------------------------------------------------------------ + self.prefill_wrapper_ragged: Optional[ + "BatchPrefillWithRaggedKVCacheWrapper" + ] = None + self.prefill_wrappers_paged: List["BatchPrefillWithPagedKVCacheWrapper"] = [] + self.decode_wrappers: List["BatchDecodeWithPagedKVCacheWrapper"] = [] + + if not skip_prefill: + self.prefill_wrapper_ragged = BatchPrefillWithRaggedKVCacheWrapper( + self.workspace_buffer, "NHD" + ) + + for _ in range(self.num_wrappers): + if not skip_prefill: + self.prefill_wrappers_paged.append( + BatchPrefillWithPagedKVCacheWrapper(self.workspace_buffer, "NHD") + ) + self.decode_wrappers.append( + BatchDecodeWithPagedKVCacheWrapper( + self.workspace_buffer, + "NHD", + use_tensor_cores=self.decode_use_tensor_cores, + ) + ) + + # ------------------------------------------------------------------ + # Indices updaters + # ------------------------------------------------------------------ + if not skip_prefill: + self.indices_updater_prefill = _FlashInferIndicesUpdaterPrefill(self) + self.indices_updater_decode = _FlashInferIndicesUpdaterDecode(self) + + # Per-batch metadata set by init_forward_metadata + self.forward_metadata: Optional[Union[DecodeMetadata, PrefillMetadata]] = None + + # CUDA-graph metadata stores + self.decode_cuda_graph_metadata: dict = {} + self.prefill_cuda_graph_metadata: dict = {} + + # ------------------------------------------------------------------ + # init_forward_metadata + # 
------------------------------------------------------------------ + + def init_forward_metadata(self, forward_batch: ForwardBatch) -> None: + """Prepare FlashInfer wrappers for the current batch. + + Must be called once per batch before the model's ``forward`` method. + """ + if forward_batch.forward_mode.is_decode_or_idle(): + self.indices_updater_decode.update( + forward_batch.req_pool_indices, + forward_batch.seq_lens, + forward_batch.seq_lens_cpu, + forward_batch.seq_lens_sum, + decode_wrappers=self.decode_wrappers, + ) + self.forward_metadata = DecodeMetadata(self.decode_wrappers) + else: + # Extend / prefill + prefix_lens = forward_batch.extend_prefix_lens + extend_no_prefix = ( + forward_batch.extend_prefix_lens_cpu is not None + and not any(forward_batch.extend_prefix_lens_cpu) + ) + use_ragged = extend_no_prefix + + self.indices_updater_prefill.update( + forward_batch.req_pool_indices, + forward_batch.seq_lens, + forward_batch.seq_lens_cpu, + forward_batch.seq_lens_sum, + prefix_lens=prefix_lens, + prefill_wrappers=self.prefill_wrappers_paged, + use_ragged=use_ragged, + ) + self.forward_metadata = PrefillMetadata( + self.prefill_wrappers_paged, + use_ragged=use_ragged, + extend_no_prefix=extend_no_prefix, + ) + + # ------------------------------------------------------------------ + # forward_extend + # ------------------------------------------------------------------ + + def forward_extend( + self, + q: torch.Tensor, + k: Optional[torch.Tensor], + v: Optional[torch.Tensor], + layer: "RadixAttention", # noqa: F821 + forward_batch: ForwardBatch, + save_kv_cache: bool = True, + **kwargs, + ) -> torch.Tensor: + from pymllm.layers.attention.radix_attention import RadixAttention + + assert isinstance(layer, RadixAttention) + meta: PrefillMetadata = self.forward_metadata + + prefill_wrapper_paged = meta.prefill_wrappers[self._get_wrapper_idx(layer)] + cache_loc = forward_batch.out_cache_loc + + # Write K/V into the pool + if k is not None: + assert v is not 
None + if save_kv_cache: + forward_batch.token_to_kv_pool.set_kv_buffer( + layer.layer_id, cache_loc, k, v + ) + + q_3d = q.contiguous().view(-1, layer.tp_q_head_num, layer.head_dim) + + if not meta.use_ragged: + # Paged-only path: uses the full KV cache (prefix + extend). + k_cache, v_cache = forward_batch.token_to_kv_pool.get_kv_buffer( + layer.layer_id + ) + # Reshape to [buf_len, page_size=1, num_heads, head_dim] for FlashInfer. + paged_kv = (k_cache.unsqueeze(1), v_cache.unsqueeze(1)) + + o = prefill_wrapper_paged.forward( + q_3d, + paged_kv, + causal=not layer.is_cross_attention, + sm_scale=layer.scaling, + window_left=layer.sliding_window_size, + logits_soft_cap=layer.logit_cap if layer.logit_cap > 0 else None, + ) + else: + # Ragged path: query attends only to the new (ragged) K/V; + # prefix K/V is in the paged pool. + if k is None: + # Fallback: load K/V from the pool. + k_buf, v_buf = forward_batch.token_to_kv_pool.get_kv_buffer( + layer.layer_id + ) + k = k_buf + v = v_buf + + k_3d = k.view(-1, layer.tp_k_head_num, layer.head_dim) + v_3d = v.view(-1, layer.tp_v_head_num, layer.v_head_dim) + + if meta.extend_no_prefix: + # Pure prefill – no prefix at all. + o = self.prefill_wrapper_ragged.forward( + q_3d, + k_3d, + v_3d, + causal=True, + sm_scale=layer.scaling, + logits_soft_cap=(layer.logit_cap if layer.logit_cap > 0 else None), + ) + else: + # Extend with prefix: merge ragged (new) and paged (prefix). 
+ o1, s1 = self.prefill_wrapper_ragged.forward_return_lse( + q_3d, + k_3d, + v_3d, + causal=True, + sm_scale=layer.scaling, + logits_soft_cap=(layer.logit_cap if layer.logit_cap > 0 else None), + ) + + k_cache, v_cache = forward_batch.token_to_kv_pool.get_kv_buffer( + layer.layer_id + ) + paged_kv = (k_cache.unsqueeze(1), v_cache.unsqueeze(1)) + o2, s2 = prefill_wrapper_paged.forward_return_lse( + q_3d, + paged_kv, + causal=False, + sm_scale=layer.scaling, + logits_soft_cap=(layer.logit_cap if layer.logit_cap > 0 else None), + ) + + o, _ = merge_state(o1, s1, o2, s2) + + return o.view(-1, layer.tp_q_head_num * layer.v_head_dim) + + # ------------------------------------------------------------------ + # forward_decode + # ------------------------------------------------------------------ + + def forward_decode( + self, + q: torch.Tensor, + k: Optional[torch.Tensor], + v: Optional[torch.Tensor], + layer: "RadixAttention", # noqa: F821 + forward_batch: ForwardBatch, + save_kv_cache: bool = True, + **kwargs, + ) -> torch.Tensor: + from pymllm.layers.attention.radix_attention import RadixAttention + + assert isinstance(layer, RadixAttention) + meta: DecodeMetadata = self.forward_metadata + + decode_wrapper = meta.decode_wrappers[self._get_wrapper_idx(layer)] + cache_loc = forward_batch.out_cache_loc + + if k is not None: + assert v is not None + if save_kv_cache: + forward_batch.token_to_kv_pool.set_kv_buffer( + layer.layer_id, cache_loc, k, v + ) + + k_cache, v_cache = forward_batch.token_to_kv_pool.get_kv_buffer(layer.layer_id) + paged_kv = (k_cache.unsqueeze(1), v_cache.unsqueeze(1)) + + o = decode_wrapper.forward( + q.contiguous().view(-1, layer.tp_q_head_num, layer.head_dim), + paged_kv, + sm_scale=layer.scaling, + logits_soft_cap=layer.logit_cap if layer.logit_cap > 0 else None, + ) + + return o.view(-1, layer.tp_q_head_num * layer.v_head_dim) + + # ------------------------------------------------------------------ + # CUDA-graph support + # 
------------------------------------------------------------------ + + def get_cuda_graph_seq_len_fill_value(self) -> int: + return 1 + + def init_cuda_graph_state( + self, + max_bs: int, + max_num_tokens: int, + kv_indices_buf: Optional[torch.Tensor] = None, + ) -> None: + """Allocate CUDA-graph shared state buffers.""" + if kv_indices_buf is None: + cuda_graph_kv_indices = torch.zeros( + (max_num_tokens * self.max_context_len,), + dtype=torch.int32, + device=self.device, + ) + else: + cuda_graph_kv_indices = kv_indices_buf + + self.cuda_graph_kv_indices = [cuda_graph_kv_indices] + [ + cuda_graph_kv_indices.clone() for _ in range(self.num_wrappers - 1) + ] + + if not self.skip_prefill: + self.cuda_graph_custom_mask = torch.zeros( + (max_num_tokens * self.max_context_len,), + dtype=torch.uint8, + device=self.device, + ) + self.cuda_graph_qk_indptr = [x.clone() for x in self.kv_indptr] + self.cuda_graph_qo_indptr = [x.clone() for x in self.kv_indptr] + + def init_forward_metadata_capture_cuda_graph( + self, + bs: int, + num_tokens: int, + req_pool_indices: torch.Tensor, + seq_lens: torch.Tensor, + forward_mode: ForwardMode, + ) -> None: + """Set up metadata for CUDA-graph capture of a decode step.""" + if not forward_mode.is_decode_or_idle(): + raise ValueError( + "CUDA-graph capture is only supported for decode / idle modes." 
+ ) + + decode_wrappers = [] + for i in range(self.num_wrappers): + decode_wrappers.append( + BatchDecodeWithPagedKVCacheWrapper( + self.workspace_buffer, + "NHD", + use_cuda_graph=True, + use_tensor_cores=self.decode_use_tensor_cores, + paged_kv_indptr_buffer=self.kv_indptr[i][: num_tokens + 1], + paged_kv_indices_buffer=self.cuda_graph_kv_indices[i], + paged_kv_last_page_len_buffer=self.kv_last_page_len[:num_tokens], + ) + ) + + seq_lens_sum = seq_lens.sum().item() + self.indices_updater_decode.update( + req_pool_indices, + seq_lens, + seq_lens.cpu(), + seq_lens_sum, + decode_wrappers=decode_wrappers, + ) + self.decode_cuda_graph_metadata[bs] = decode_wrappers + self.forward_metadata = DecodeMetadata(decode_wrappers) + + if _has_fast_decode_plan: + for i in range(self.num_wrappers): + decode_wrappers[i].begin_forward = _partial( + fast_decode_plan, decode_wrappers[i] + ) + + def init_forward_metadata_replay_cuda_graph( + self, + bs: int, + req_pool_indices: torch.Tensor, + seq_lens: torch.Tensor, + seq_lens_sum: int, + forward_mode: ForwardMode, + seq_lens_cpu: Optional[torch.Tensor], + ) -> None: + """Update metadata when replaying a CUDA graph for decode.""" + if not forward_mode.is_decode_or_idle(): + raise ValueError( + "CUDA-graph replay is only supported for decode / idle modes." + ) + + self.indices_updater_decode.update( + req_pool_indices[:bs], + seq_lens[:bs], + seq_lens_cpu[:bs] if seq_lens_cpu is not None else None, + seq_lens_sum, + decode_wrappers=self.decode_cuda_graph_metadata[bs], + ) + + # ------------------------------------------------------------------ + # Helpers + # ------------------------------------------------------------------ + + def _get_wrapper_idx(self, layer) -> int: + """Return the wrapper index for the given attention layer.""" + if self.num_wrappers == 1: + return 0 + if self.dispatch_reason == WrapperDispatch.SLIDING_WINDOW: + # Wrapper 0 → sliding window attention. + # Wrapper 1 → full-context attention. 
+ return int(layer.sliding_window_size == -1) + raise ValueError(f"Unknown dispatch reason: {self.dispatch_reason}") + + +# --------------------------------------------------------------------------- +# _FlashInferIndicesUpdaterDecode +# --------------------------------------------------------------------------- + + +class _FlashInferIndicesUpdaterDecode: + """Populates ``kv_indptr`` / ``kv_indices`` and calls + ``wrapper.begin_forward`` before every decode step. + """ + + def __init__(self, backend: FlashInferAttnBackend): + self.num_qo_heads = backend.num_heads + self.num_kv_heads = backend.num_kv_heads + self.head_dim = backend.head_dim + self.data_type = backend.kv_cache_dtype + self.q_data_type = backend.q_dtype + self.sliding_window_size = backend.sliding_window_size + self.backend = backend + + self.kv_indptr = backend.kv_indptr + self.kv_last_page_len = backend.kv_last_page_len + self.req_to_token = backend.req_to_token + + def update( + self, + req_pool_indices: torch.Tensor, + seq_lens: torch.Tensor, + seq_lens_cpu: Optional[torch.Tensor], + seq_lens_sum: int, + decode_wrappers: "List[BatchDecodeWithPagedKVCacheWrapper]", + kv_start_idx: Optional[torch.Tensor] = None, + ) -> None: + if self.backend.dispatch_reason == WrapperDispatch.SLIDING_WINDOW: + self._update_sliding_window( + req_pool_indices, + seq_lens, + seq_lens_cpu, + seq_lens_sum, + decode_wrappers, + ) + else: + # Single-wrapper: full-context decode. Build kv_indptr/kv_indices + # and call FlashInfer's plan function via the CUDA kernel. + bs = len(req_pool_indices) + kv_indptr = self.kv_indptr[0] + + # Fill kv_indptr: prefix sums of paged_kernel_lens. + kv_indptr[1 : bs + 1] = torch.cumsum(seq_lens, dim=0) + kv_indptr_sliced = kv_indptr[: bs + 1] + + if seq_lens_cpu is not None: + seq_lens_sum = int(seq_lens_cpu.sum().item()) + else: + seq_lens_sum = int(seq_lens.sum().item()) + + # Allocate KV indices buffer. 
+            if decode_wrappers and decode_wrappers[0].is_cuda_graph_enabled:
+                kv_indices = decode_wrappers[0]._paged_kv_indices_buf
+            else:
+                kv_indices = torch.empty(
+                    seq_lens_sum, dtype=torch.int32, device=self.req_to_token.device
+                )
+
+            # Use high-performance CUDA kernel to populate kv_indices.
+            create_kv_indices(
+                self.req_to_token,
+                req_pool_indices.to(torch.int32),
+                seq_lens.to(torch.int32),
+                kv_indptr_sliced,
+                None,
+                kv_indices,
+            )
+
+            decode_wrappers = decode_wrappers or self.backend.decode_wrappers
+            decode_wrappers[0].begin_forward(
+                kv_indptr_sliced,
+                kv_indices,
+                self.kv_last_page_len[:bs],
+                self.num_qo_heads,
+                self.num_kv_heads,
+                self.head_dim,
+                1,
+                data_type=self.data_type,
+                q_data_type=self.q_data_type,
+                non_blocking=True,
+            )
+
+    def _update_sliding_window(
+        self,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        seq_lens_cpu: Optional[torch.Tensor],
+        seq_lens_sum: int,
+        decode_wrappers: "List[BatchDecodeWithPagedKVCacheWrapper]",
+    ) -> None:
+        assert self.sliding_window_size is not None
+        for wrapper_id in range(2):
+            if wrapper_id == 0:
+                # Sliding-window attention: clamp to window size + 1
+                paged_kernel_lens = torch.clamp(
+                    seq_lens, max=self.sliding_window_size + 1
+                )
+                paged_kernel_lens_sum = int(paged_kernel_lens.sum().item())
+                kv_start_idx = seq_lens - paged_kernel_lens
+                seq_lens_cpu_tmp = (
+                    torch.clamp(seq_lens_cpu, max=self.sliding_window_size + 1)
+                    if seq_lens_cpu is not None
+                    else None
+                )
+            else:
+                # Full-context attention
+                paged_kernel_lens = seq_lens
+                paged_kernel_lens_sum = seq_lens_sum
+                kv_start_idx = None
+                seq_lens_cpu_tmp = seq_lens_cpu
+
+            bs = len(req_pool_indices)
+            kv_indptr = self.kv_indptr[wrapper_id]
+            kv_indptr[1 : bs + 1] = torch.cumsum(paged_kernel_lens, dim=0)
+            kv_indptr_sliced = kv_indptr[: bs + 1]
+
+            if decode_wrappers and decode_wrappers[wrapper_id].is_cuda_graph_enabled:
+                kv_indices = decode_wrappers[wrapper_id]._paged_kv_indices_buf
+            else:
+                kv_indices = torch.empty(
paged_kernel_lens_sum, + dtype=torch.int32, + device=self.req_to_token.device, + ) + + # High-performance CUDA kernel populates kv_indices from req_to_token. + create_kv_indices( + self.req_to_token, + req_pool_indices.to(torch.int32), + paged_kernel_lens.to(torch.int32), + kv_indptr_sliced, + kv_start_idx.to(torch.int32) if kv_start_idx is not None else None, + kv_indices, + ) + + decode_wrappers[wrapper_id].begin_forward( + kv_indptr_sliced, + kv_indices, + self.kv_last_page_len[:bs], + self.num_qo_heads, + self.num_kv_heads, + self.head_dim, + 1, + data_type=self.data_type, + q_data_type=self.q_data_type, + non_blocking=True, + ) + + +# --------------------------------------------------------------------------- +# _FlashInferIndicesUpdaterPrefill +# --------------------------------------------------------------------------- + + +class _FlashInferIndicesUpdaterPrefill: + """Populates indices and calls ``wrapper.begin_forward`` before extend.""" + + def __init__(self, backend: FlashInferAttnBackend): + self.num_qo_heads = backend.num_heads + self.num_kv_heads = backend.num_kv_heads + self.head_dim = backend.head_dim + self.data_type = backend.kv_cache_dtype + self.q_data_type = backend.q_dtype + self.sliding_window_size = backend.sliding_window_size + self.backend = backend + + self.kv_indptr = backend.kv_indptr + self.kv_last_page_len = backend.kv_last_page_len + self.qo_indptr = backend.qo_indptr + self.req_to_token = backend.req_to_token + self.prefill_wrapper_ragged = backend.prefill_wrapper_ragged + + def update( + self, + req_pool_indices: torch.Tensor, + seq_lens: torch.Tensor, + seq_lens_cpu: Optional[torch.Tensor], + seq_lens_sum: int, + prefix_lens: Optional[torch.Tensor], + prefill_wrappers: "List[BatchPrefillWithPagedKVCacheWrapper]", + use_ragged: bool, + ) -> None: + if self.backend.dispatch_reason == WrapperDispatch.SLIDING_WINDOW: + self._update_sliding_window( + req_pool_indices, + seq_lens, + seq_lens_cpu, + seq_lens_sum, + prefix_lens, + 
prefill_wrappers, + use_ragged, + ) + else: + if use_ragged: + paged_kernel_lens = prefix_lens + paged_kernel_lens_sum = paged_kernel_lens.sum().item() + else: + paged_kernel_lens = seq_lens + paged_kernel_lens_sum = seq_lens_sum + + self._call_begin_forward( + self.prefill_wrapper_ragged, + prefill_wrappers[0], + req_pool_indices, + paged_kernel_lens, + paged_kernel_lens_sum, + seq_lens, + prefix_lens, + kv_start_idx=None, + kv_indptr=self.kv_indptr[0], + qo_indptr=self.qo_indptr[0], + use_ragged=use_ragged, + ) + + def _update_sliding_window( + self, + req_pool_indices: torch.Tensor, + seq_lens: torch.Tensor, + seq_lens_cpu: Optional[torch.Tensor], + seq_lens_sum: int, + prefix_lens: Optional[torch.Tensor], + prefill_wrappers: "List[BatchPrefillWithPagedKVCacheWrapper]", + use_ragged: bool, + ) -> None: + assert self.sliding_window_size is not None + for wrapper_id in range(2): + if wrapper_id == 0: + # Sliding-window portion uses a limited context window. + extend_lens = seq_lens - prefix_lens + paged_kernel_lens = torch.minimum( + seq_lens, + torch.tensor(self.sliding_window_size, device=seq_lens.device) + + extend_lens, + ) + paged_kernel_lens_sum = int(paged_kernel_lens.sum().item()) + kv_start_idx = seq_lens - paged_kernel_lens + else: + # Full-context portion. 
+ paged_kernel_lens = seq_lens + paged_kernel_lens_sum = seq_lens_sum + kv_start_idx = None + + kv_indptr = self.kv_indptr[wrapper_id] + qo_indptr = self.qo_indptr[wrapper_id] + + self._call_begin_forward( + self.prefill_wrapper_ragged, + prefill_wrappers[wrapper_id], + req_pool_indices, + paged_kernel_lens, + paged_kernel_lens_sum, + seq_lens, + prefix_lens, + kv_start_idx=kv_start_idx, + kv_indptr=kv_indptr, + qo_indptr=qo_indptr, + use_ragged=use_ragged, + ) + + def _call_begin_forward( + self, + wrapper_ragged: "BatchPrefillWithRaggedKVCacheWrapper", + wrapper_paged: "BatchPrefillWithPagedKVCacheWrapper", + req_pool_indices: torch.Tensor, + paged_kernel_lens: torch.Tensor, + paged_kernel_lens_sum: int, + seq_lens: torch.Tensor, + prefix_lens: Optional[torch.Tensor], + kv_start_idx: Optional[torch.Tensor], + kv_indptr: torch.Tensor, + qo_indptr: torch.Tensor, + use_ragged: bool, + ) -> None: + bs = len(seq_lens) + + # Build kv_indptr and kv_indices using the CUDA kernel. + kv_indptr_sliced = kv_indptr[: bs + 1] + kv_indptr_sliced[1:] = torch.cumsum(paged_kernel_lens, dim=0) + + kv_indices = torch.empty( + paged_kernel_lens_sum + 256, + dtype=torch.int32, + device=req_pool_indices.device, + ) + + create_kv_indices( + self.req_to_token, + req_pool_indices.to(torch.int32), + paged_kernel_lens.to(torch.int32), + kv_indptr_sliced, + kv_start_idx.to(torch.int32) if kv_start_idx is not None else None, + kv_indices, + ) + + # Build qo_indptr (number of new tokens per sequence). + if prefix_lens is not None: + extend_lens = seq_lens - prefix_lens + else: + extend_lens = seq_lens + qo_indptr_sliced = qo_indptr[: bs + 1] + qo_indptr_sliced[1:] = torch.cumsum(extend_lens, dim=0) + + # Plan the ragged wrapper (new tokens only). + if use_ragged: + wrapper_ragged.begin_forward( + qo_indptr_sliced, + qo_indptr_sliced, + self.num_qo_heads, + self.num_kv_heads, + self.head_dim, + q_data_type=self.q_data_type, + ) + + # Plan the paged wrapper (cached prefix tokens). 
+ wrapper_paged.begin_forward( + qo_indptr_sliced, + kv_indptr_sliced, + kv_indices, + self.kv_last_page_len[:bs], + self.num_qo_heads, + self.num_kv_heads, + self.head_dim, + 1, + q_data_type=self.q_data_type, + kv_data_type=self.data_type, + non_blocking=True, + ) diff --git a/pymllm/layers/attention/radix_attention.py b/pymllm/layers/attention/radix_attention.py index e69de29b..114130db 100644 --- a/pymllm/layers/attention/radix_attention.py +++ b/pymllm/layers/attention/radix_attention.py @@ -0,0 +1,171 @@ +"""RadixAttention -- the attention layer used by pymllm models. + +This module is kept small intentionally: all heavy computation is delegated +to the pluggable ``AttentionBackend`` that is attached to the ``ForwardBatch``. +""" + +from __future__ import annotations + +from enum import Enum +from typing import TYPE_CHECKING, Optional + +import torch +from torch import nn + +if TYPE_CHECKING: + from pymllm.engine.forward_batch import ForwardBatch + + +# --------------------------------------------------------------------------- +# AttentionType +# --------------------------------------------------------------------------- + + +class AttentionType(Enum): + """Attention variant used by a :class:`RadixAttention` layer. + + Uses string values so that ``torch.compile`` can treat them as constants. + """ + + # Standard causal self-attention in a decoder layer. + DECODER = "decoder" + + # Bidirectional self-attention for image tokens inside a decoder + # (e.g. VLM visual encoder embedded in the language model). + DECODER_BIDIRECTIONAL = "decoder_bidirectional" + + # Full bidirectional self-attention in an encoder-only model. + ENCODER_ONLY = "encoder_only" + + +# --------------------------------------------------------------------------- +# RadixAttention +# --------------------------------------------------------------------------- + + +class RadixAttention(nn.Module): + """Attention layer that delegates computation to a pluggable backend. 
+ + Each transformer attention layer in a pymllm model creates exactly one + ``RadixAttention`` with a unique ``layer_id``. During the forward pass + the layer looks up the correct KV buffer via ``layer_id`` and calls the + backend attached to the current :class:`~pymllm.engine.forward_batch.ForwardBatch`. + + Parameters + ---------- + num_heads + Number of query attention heads (after any tensor-parallelism + sharding; pass the full count if not using TP). + head_dim + Per-head dimension for query and key projections. + scaling + Softmax pre-scale, typically ``1 / sqrt(head_dim)``. + num_kv_heads + Number of key / value heads (supports GQA / MQA). + layer_id + Zero-based index of this layer within the model. Used to index into + ``KVPool.k_buffer`` / ``v_buffer``. + logit_cap + If > 0, attention logits are soft-capped to this value via a ``tanh`` + gate (used by Gemma2 / Gemma3 style models). Set to ``0.0`` to + disable. + v_head_dim + Per-head dimension of the value projection. Defaults to ``head_dim`` + (i.e. standard square QKV). + sliding_window_size + Sliding-window attention span. ``-1`` means full context (no window). + is_cross_attention + ``True`` for cross-attention layers in encoder-decoder models. + attn_type + One of :class:`AttentionType`. 
+ """ + + def __init__( + self, + num_heads: int, + head_dim: int, + scaling: float, + num_kv_heads: int, + layer_id: int, + logit_cap: float = 0.0, + v_head_dim: int = -1, + sliding_window_size: int = -1, + is_cross_attention: bool = False, + attn_type: AttentionType = AttentionType.DECODER, + ): + super().__init__() + + self.tp_q_head_num: int = num_heads + self.tp_k_head_num: int = num_kv_heads + self.tp_v_head_num: int = num_kv_heads + + self.head_dim: int = head_dim + self.qk_head_dim: int = head_dim + self.v_head_dim: int = v_head_dim if v_head_dim != -1 else head_dim + + self.scaling: float = scaling + self.layer_id: int = layer_id + self.logit_cap: float = logit_cap + self.sliding_window_size: int = ( + sliding_window_size if sliding_window_size is not None else -1 + ) + self.is_cross_attention: bool = is_cross_attention + self.attn_type: AttentionType = attn_type + + # ------------------------------------------------------------------ + # forward + # ------------------------------------------------------------------ + + def forward( + self, + q: torch.Tensor, + k: Optional[torch.Tensor], + v: Optional[torch.Tensor], + forward_batch: "ForwardBatch", + save_kv_cache: bool = True, + **kwargs, + ) -> torch.Tensor: + """Run attention for one batch. + + Parameters + ---------- + q + Query tensor, shape ``[num_tokens, tp_q_head_num * head_dim]`` + (or already reshaped to ``[num_tokens, tp_q_head_num, head_dim]``). + k + Key tensor, same leading dimension as ``q``, shape + ``[num_tokens, tp_k_head_num * qk_head_dim]``. + Pass ``None`` for cross-layer KV sharing (``v`` must also be + ``None`` in this case). + v + Value tensor, shape + ``[num_tokens, tp_v_head_num * v_head_dim]``. + forward_batch + Batch metadata and references to memory pools / backend. + save_kv_cache + When ``False``, skip writing K/V into the pool (useful for draft + models in speculative decoding). + **kwargs + Passed through to the backend (e.g. ``q_rope``, ``k_rope``). 
+ """ + if k is not None: + assert v is not None, "k and v must both be provided or both be None" + k = k.view(-1, self.tp_k_head_num, self.qk_head_dim) + v = v.view(-1, self.tp_v_head_num, self.v_head_dim) + + return forward_batch.attn_backend.forward( + q, k, v, self, forward_batch, save_kv_cache, **kwargs + ) + + def extra_repr(self) -> str: + return ( + f"layer_id={self.layer_id}, " + f"q_heads={self.tp_q_head_num}, " + f"kv_heads={self.tp_k_head_num}, " + f"head_dim={self.head_dim}, " + f"v_head_dim={self.v_head_dim}, " + f"scaling={self.scaling:.4f}, " + f"logit_cap={self.logit_cap}, " + f"sliding_window={self.sliding_window_size}, " + f"attn_type={self.attn_type.value}" + ) diff --git a/pymllm/layers/sampling.py b/pymllm/layers/sampling.py new file mode 100644 index 00000000..e69de29b diff --git a/pymllm/mem_cache/memory_pool.py b/pymllm/mem_cache/memory_pool.py index 0721fd71..f9c176a9 100644 --- a/pymllm/mem_cache/memory_pool.py +++ b/pymllm/mem_cache/memory_pool.py @@ -6,7 +6,7 @@ TokenToKVPoolAllocator manages a free-list of integer indices KVPool holds the actual GPU K/V tensors -All indices are **int64** tensors on the target device. Slot 0 in the KV +All indices are **int32** tensors on the target device. Slot 0 in the KV buffers is reserved as a padding / dummy-output slot and is never allocated. """ @@ -210,7 +210,7 @@ class TokenToKVPoolAllocator: allocator = TokenToKVPoolAllocator(size=4096, device="cuda") # --- basic alloc / free --- - indices = allocator.alloc(128) # 128 free slot indices (int64) + indices = allocator.alloc(128) # 128 free slot indices (int32) allocator.free(indices[:64]) # return 64 slots # --- batch free (amortised) --- @@ -251,14 +251,14 @@ def clear(self) -> None: """Reset the allocator so that all slots ``[1, size]`` are free. 
The first slot is reserved for padding.""" if self.page_size == 1: self.free_slots = torch.arange( - 1, self.size + 1, dtype=torch.int64, device=self.device + 1, self.size + 1, dtype=torch.int32, device=self.device ) else: num_pages = self.size // self.page_size self.free_slots = torch.arange( - 1, num_pages + 1, dtype=torch.int64, device=self.device + 1, num_pages + 1, dtype=torch.int32, device=self.device ) - self.release_slots = torch.empty((0,), dtype=torch.int64, device=self.device) + self.release_slots = torch.empty((0,), dtype=torch.int32, device=self.device) self._is_not_in_free_group = True self._free_group: List[torch.Tensor] = [] @@ -273,7 +273,7 @@ def merge_and_sort_free(self) -> None: self.free_slots = torch.cat((self.free_slots, self.release_slots)) if self.need_sort: self.free_slots, _ = torch.sort(self.free_slots) - self.release_slots = torch.empty((0,), dtype=torch.int64, device=self.device) + self.release_slots = torch.empty((0,), dtype=torch.int32, device=self.device) def free_group_begin(self) -> None: """Start collecting ``free()`` calls; actual release is deferred to ``free_group_end``.""" @@ -290,7 +290,7 @@ def free_group_end(self) -> None: def alloc(self, need_size: int) -> Optional[torch.Tensor]: """Allocate *need_size* token indices. - Returns a 1-D ``int64`` tensor on success, or ``None`` if the pool is + Returns a 1-D ``int32`` tensor on success, or ``None`` if the pool is exhausted. 
""" if self.page_size == 1: @@ -380,7 +380,7 @@ def __init__( self.device = torch.device(device) self.req_to_token = torch.zeros( - (max_reqs, max_context_len), dtype=torch.int64, device=self.device + (max_reqs, max_context_len), dtype=torch.int32, device=self.device ) self._free_slots: List[int] = list(range(max_reqs)) diff --git a/pymllm/orchestrator/cuda_ipc_transport.py b/pymllm/orchestrator/cuda_ipc_transport.py index 7052f0e8..938132c8 100644 --- a/pymllm/orchestrator/cuda_ipc_transport.py +++ b/pymllm/orchestrator/cuda_ipc_transport.py @@ -1,373 +1,648 @@ """ -CUDA IPC Transport for zero-copy tensor sharing between processes. +CUDA IPC Transport for zero-copy GPU tensor sharing between processes. -This module implements CUDA IPC with workspace buffer management -to avoid PyTorch's memory leak issue when sharing IPC handles. +## Background -1. Create a workspace buffer on GPU (pre-allocated memory pool) -2. Copy tensor data to a chunk in the workspace -3. Get CUDA IPC handle for the chunk -4. Send handle + metadata (shape, dtype, offset) to another process -5. Reconstruct tensor in target process from IPC handle -6. Copy to local tensor and mark chunk as reusable +When sharing CUDA tensors between processes, there are two fundamentally different paths: -Key Problem Solved: - PyTorch never releases tensors whose IPC handles are shared until process ends. - Solution: Use a fixed-size workspace buffer and recycle chunks. +1. **CPU shared memory path** (``enable_shared_queue=True, enable_cuda_ipc=False``): + GPU tensors are moved to CPU / POSIX shared memory via ``tensor.share_memory_()``. + This is safe but incurs a GPU→CPU copy which is expensive for large vision features. + +2. **CUDA IPC path** (``enable_cuda_ipc=True``): + GPU tensors stay on GPU. PyTorch's ``storage._share_cuda_()`` yields a serialisable + IPC handle; the receiver calls ``UntypedStorage._new_shared_cuda(*handle)`` to map + the same physical GPU memory without any copy. 
+ +These two paths are **mutually exclusive for GPU tensors**. ``enable_cuda_ipc`` takes +priority; when active the CPU-copy step in ``TensorQueue._make_tensors_shareable`` is +skipped. + +## CUDA IPC memory-leak problem and its fix + +PyTorch never releases the GPU allocation backing an IPC-exported tensor until the +*sending* process exits. If we export raw model tensors we permanently leak GPU memory. + +**Solution** (pool-based recycling via ``MmItemMemoryPool``): + +* Allocate a single, fixed-size GPU workspace (``MmItemMemoryPool``). +* For each outgoing GPU tensor, copy it into a chunk of the workspace and export the + *chunk* via IPC (the workspace is never freed; its chunks are recycled). +* After the receiving process has finished with the data it writes a sync flag + (``ShmSyncBuffer``) to signal that the chunk may be reused. +* A background recycler thread in the sender walks ``occupied_chunks`` and returns + chunks whose sync flag has been incremented back to ``available_chunks``. + +## Transport modes + +``TensorTransportMode``: +* ``"default"`` – CPU/shared-memory path; no CUDA IPC. +* ``"cuda_ipc"`` – Simple CUDA IPC: wraps GPU tensors in ``TransportProxyTensor`` + (a ``torch.Tensor`` subclass whose ``__getstate__``/``__setstate__`` use + ``_share_cuda_``). Suitable for single-process-group scenarios; incurs the + PyTorch memory-leak noted above. +* ``"cuda_ipc_pool"`` – Pool-based CUDA IPC: copies GPU tensors into a pre-allocated + ``MmItemMemoryPool`` and wraps the slice in ``CudaIpcTensorTransportProxy``. + The pool is recycled, so there is no memory leak. 
""" +from __future__ import annotations + +import fcntl import logging -import struct -import uuid -from dataclasses import dataclass -from multiprocessing import Queue -from multiprocessing.shared_memory import SharedMemory -from typing import Any, Dict, List, Optional, Tuple +import threading +import time +from multiprocessing import shared_memory +from typing import Any, Dict, List, Literal, Optional, Tuple +import numpy as np import torch -import torch.cuda as cuda logger = logging.getLogger(__name__) +# --------------------------------------------------------------------------- +# Type alias for transport mode +# --------------------------------------------------------------------------- + +TensorTransportMode = Literal["default", "cuda_ipc", "cuda_ipc_pool"] -@dataclass -class MemoryChunk: - """Represents a chunk in the workspace buffer.""" - offset: int # Offset in bytes from workspace start - size: int # Size in bytes - in_use: bool # Whether the chunk is currently occupied - sync_shm_name: Optional[str] = None # Shared memory name for sync flag +# --------------------------------------------------------------------------- +# ShmSyncBuffer – a tiny POSIX shared memory float used as a sync counter +# --------------------------------------------------------------------------- -class WorkspaceBuffer: - """GPU memory pool for storing multimodal tensors temporarily. +class ShmSyncBuffer: + """A single float32 in POSIX shared memory used as a sync counter. - This prevents the PyTorch IPC handle memory leak by using a fixed-size - pre-allocated buffer and recycling chunks. + The sender resets it to 0 before exporting a chunk. The receiver + increments it (atomically under a file lock) once it has finished copying + data out of the chunk. When the value reaches the number of consumers + (``tp_size``) the sender recycles the chunk. """ - def __init__(self, size_gb: float = 4.0, device: int = 0): - """Initialize workspace buffer. 
+ def __init__(self, byte_size: int = 4) -> None: + self.buffer = shared_memory.SharedMemory(create=True, size=byte_size) + self._arr = np.ndarray(1, dtype=np.float32, buffer=self.buffer.buf) + self._arr *= 0 # initialise to 0 + self.meta_data: Dict[str, Any] = { + "handle": self.buffer.name, + "shape": self._arr.shape, + "dtype": str(self._arr.dtype), + } + + # ------------------------------------------------------------------ + # Helpers consumed by the *receiver* side + # ------------------------------------------------------------------ + + @staticmethod + def open( + meta_data: Dict[str, Any], + ) -> Tuple[shared_memory.SharedMemory, np.ndarray]: + """Open an existing ShmSyncBuffer from the metadata dict.""" + shm = shared_memory.SharedMemory(name=meta_data["handle"]) + arr = np.ndarray(meta_data["shape"], dtype=meta_data["dtype"], buffer=shm.buf) + return shm, arr + + def __del__(self) -> None: + try: + self.buffer.close() + self.buffer.unlink() + except Exception: + pass - Args: - size_gb: Total size of workspace in GB - device: CUDA device ID - """ - self.device = device - self.total_size = int(size_gb * 1024 * 1024 * 1024) # Convert GB to bytes - # Allocate workspace on GPU - with torch.cuda.device(device): - self.workspace = torch.empty( - self.total_size // 4, # Divide by 4 because we use float32 - dtype=torch.float32, - device=f"cuda:{device}", - ) +# Lock file used to serialise writes to sync flags across processes +_SHM_LOCK_FILE = "/tmp/pymllm_shm_wr_lock.lock" - # Initialize chunk management - self.chunks: List[MemoryChunk] = [ - MemoryChunk(offset=0, size=self.total_size, in_use=False) - ] - # Container for reusable sync buffers - self.sync_buffer_pool: List[str] = [] +def _increment_sync_flag(meta_data: Dict[str, Any]) -> None: + """Increment the sync flag by 1 under a process-level file lock.""" + shm, arr = ShmSyncBuffer.open(meta_data) + try: + open(_SHM_LOCK_FILE, "a").close() # ensure file exists + with open(_SHM_LOCK_FILE, "w+") as f: + 
fcntl.flock(f, fcntl.LOCK_EX) + arr += 1.0 + fcntl.flock(f, fcntl.LOCK_UN) + finally: + shm.close() - logger.info( - f"WorkspaceBuffer initialized: {size_gb}GB on cuda:{device}, " - f"ptr={self.workspace.data_ptr():#x}" - ) - def allocate(self, size_bytes: int) -> Optional[Tuple[int, str]]: - """Allocate a chunk from the workspace. +# --------------------------------------------------------------------------- +# MmItemMemoryChunk +# --------------------------------------------------------------------------- - Args: - size_bytes: Required size in bytes - Returns: - Tuple of (offset, sync_shm_name) if successful, None if no space - """ - # Find a free chunk that's large enough - for i, chunk in enumerate(self.chunks): - if not chunk.in_use and chunk.size >= size_bytes: - # Mark chunk as in use - chunk.in_use = True - - # Get or create sync buffer - if self.sync_buffer_pool: - sync_shm_name = self.sync_buffer_pool.pop() - # Reset sync flag to 0 (not ready) - self._reset_sync_buffer(sync_shm_name) - else: - sync_shm_name = self._create_sync_buffer() - - chunk.sync_shm_name = sync_shm_name - - # If chunk is larger than needed, split it - if chunk.size > size_bytes: - # Create a new free chunk for the remaining space - new_chunk = MemoryChunk( - offset=chunk.offset + size_bytes, - size=chunk.size - size_bytes, - in_use=False, - ) - chunk.size = size_bytes - self.chunks.insert(i + 1, new_chunk) +class MmItemMemoryChunk: + """A contiguous slice of the ``MmItemMemoryPool`` workspace tensor.""" - logger.debug( - f"Allocated chunk: offset={chunk.offset}, size={size_bytes}, " - f"sync_shm={sync_shm_name}" - ) - return chunk.offset, sync_shm_name + def __init__(self, area: Tuple[int, int], sync_flag: ShmSyncBuffer) -> None: + self.area = area + self.sync_flag = sync_flag - logger.warning(f"WorkspaceBuffer: No space for {size_bytes} bytes") - return None + @property + def mem_size(self) -> int: + return self.area[1] - self.area[0] - def release(self, offset: int) -> None: - 
"""Release a chunk back to the pool. + @property + def start(self) -> int: + return self.area[0] - Args: - offset: Offset of the chunk to release - """ - for i, chunk in enumerate(self.chunks): - if chunk.offset == offset and chunk.in_use: - chunk.in_use = False + @property + def end(self) -> int: + return self.area[1] - # Return sync buffer to pool - if chunk.sync_shm_name: - self.sync_buffer_pool.append(chunk.sync_shm_name) - chunk.sync_shm_name = None + def try_to_recycle(self, num_consumers: int = 1) -> bool: + """Return True if all consumers have finished and the chunk can be reused.""" + val = float(self.sync_flag._arr.item()) + logger.debug( + "[try_to_recycle] area=%s flag=%.0f consumers=%d", + self.area, + val, + num_consumers, + ) + if val >= float(num_consumers): + self.sync_flag._arr *= 0.0 # reset for next use + return True + return False - # Try to merge with adjacent free chunks - self._merge_chunks() - logger.debug(f"Released chunk: offset={offset}") - return +# --------------------------------------------------------------------------- +# MmItemMemoryPool – pre-allocated GPU workspace to avoid IPC memory leaks +# --------------------------------------------------------------------------- - logger.warning(f"Attempted to release unknown chunk at offset {offset}") - def _merge_chunks(self) -> None: - """Merge adjacent free chunks to reduce fragmentation.""" - i = 0 - while i < len(self.chunks) - 1: - current = self.chunks[i] - next_chunk = self.chunks[i + 1] +class MmItemMemoryPool: + """Pre-allocated GPU memory pool for CUDA IPC tensor transport. - if not current.in_use and not next_chunk.in_use: - # Merge chunks - current.size += next_chunk.size + Chunks are allocated from a contiguous ``torch.int8`` tensor on GPU. + A background thread periodically recycles chunks whose sync flags show + that all consumers have finished reading. 
- # Keep first chunk's sync buffer, return second to pool - if next_chunk.sync_shm_name: - self.sync_buffer_pool.append(next_chunk.sync_shm_name) + Args: + memory_size: Pool size in **bytes**. + recycle_interval: How often (seconds) the recycler thread runs. + num_consumers: Number of consumer processes (tp_size). Each consumer + must increment the sync flag once before a chunk is recycled. + device: CUDA device index. + """ - self.chunks.pop(i + 1) - else: - i += 1 - - def _create_sync_buffer(self) -> str: - """Create a new shared memory sync buffer (8 bytes, initialized to 0).""" - shm_name = f"pymllm_sync_{uuid.uuid4().hex[:12]}" - shm = SharedMemory(name=shm_name, create=True, size=8) - # Initialize to 0 (not ready) - shm.buf[:8] = struct.pack("Q", 0) - shm.close() - logger.debug(f"Created sync buffer: {shm_name}") - return shm_name + def __init__( + self, + memory_size: int, + recycle_interval: float = 0.1, + num_consumers: int = 1, + device: int = 0, + ) -> None: + self.num_consumers = num_consumers + self._recycle_interval = recycle_interval + self._lock = threading.Lock() + self._stop = False - def _reset_sync_buffer(self, shm_name: str) -> None: - """Reset sync buffer to 0 (not ready).""" - try: - shm = SharedMemory(name=shm_name, create=False) - shm.buf[:8] = struct.pack("Q", 0) - shm.close() - except Exception as e: - logger.warning(f"Failed to reset sync buffer {shm_name}: {e}") - - def copy_tensor_to_workspace(self, tensor: torch.Tensor, offset: int) -> None: - """Copy tensor data to workspace at given offset. 
- - Args: - tensor: Source tensor (must be on same CUDA device) - offset: Byte offset in workspace - """ - if not tensor.is_cuda or tensor.device.index != self.device: - raise ValueError(f"Tensor must be on cuda:{self.device}") + with torch.cuda.device(device): + self.memory_pool: torch.Tensor = torch.empty( + memory_size, dtype=torch.int8, device=f"cuda:{device}" + ).contiguous() + + init_chunk = MmItemMemoryChunk((0, memory_size), self._new_sync_buffer()) + self.available_chunks: List[MmItemMemoryChunk] = [init_chunk] + self.occupied_chunks: List[MmItemMemoryChunk] = [] + # Pool of reusable ShmSyncBuffer objects (returned from recycled chunks) + self._sync_pool: List[ShmSyncBuffer] = [] + + self._recycler = threading.Thread( + target=self._recycle_loop, + name="MmItemMemoryPoolRecycler", + daemon=True, + ) + self._recycler.start() - size_bytes = tensor.numel() * tensor.element_size() + logger.info( + "MmItemMemoryPool: %d MB on cuda:%d, recycle_interval=%.2fs", + memory_size // (1024 * 1024), + device, + recycle_interval, + ) - # Get view of workspace at offset - offset_elements = offset // 4 # Workspace is float32 - num_elements = (size_bytes + 3) // 4 # Round up + # ------------------------------------------------------------------ + # Sync buffer management + # ------------------------------------------------------------------ + + def _new_sync_buffer(self) -> ShmSyncBuffer: + if self._sync_pool: + return self._sync_pool.pop() + return ShmSyncBuffer() + + def _return_sync_buffer(self, buf: ShmSyncBuffer) -> None: + buf._arr *= 0.0 # reset counter + self._sync_pool.append(buf) + + # ------------------------------------------------------------------ + # Allocation + # ------------------------------------------------------------------ + + def _get_available_chunk(self, src: torch.Tensor) -> Optional[MmItemMemoryChunk]: + """Best-fit allocation: find the smallest available chunk >= src size.""" + needed = src.numel() * src.element_size() + best: 
Optional[MmItemMemoryChunk] = None + for chunk in self.available_chunks: + if chunk.mem_size >= needed: + if best is None or chunk.mem_size < best.mem_size: + best = chunk + if best is None: + return None - workspace_view = self.workspace[ - offset_elements : offset_elements + num_elements - ] + # Split the selected chunk + occupied_area = (best.start, best.start + needed) + occupied = MmItemMemoryChunk(occupied_area, best.sync_flag) + self.occupied_chunks.append(occupied) + self.available_chunks.remove(best) - # Copy tensor data (flatten and cast to float32 view) - tensor_flat = tensor.flatten().view(torch.uint8) - workspace_flat = workspace_view.view(torch.uint8)[: tensor_flat.numel()] - workspace_flat.copy_(tensor_flat) + remainder = (occupied.end, best.end) + if remainder[0] < remainder[1]: + split = MmItemMemoryChunk(remainder, self._new_sync_buffer()) + self.available_chunks.append(split) - logger.debug(f"Copied tensor {tensor.shape} to workspace offset {offset}") + return occupied - def get_ipc_handle(self) -> bytes: - """Get CUDA IPC handle for the workspace buffer. + def get_slice_with_flag( + self, src: torch.Tensor + ) -> Tuple[Optional[Dict[str, Any]], Optional[torch.Tensor]]: + """Allocate a pool slice for *src* and return ``(sync_flag_meta, slice_tensor)``. - Returns: - CUDA IPC handle as bytes + Thread-safe. Returns ``(None, None)`` if the pool is full. 
""" - # Get IPC handle using torch.cuda API - # Note: This requires CUDA-capable device with IPC support - handle = cuda.cudart().cudaIpcGetMemHandle(self.workspace.data_ptr()) - return bytes(handle) - - def cleanup(self) -> None: - """Cleanup all sync buffers.""" - all_shm_names = set() - for chunk in self.chunks: - if chunk.sync_shm_name: - all_shm_names.add(chunk.sync_shm_name) - all_shm_names.update(self.sync_buffer_pool) - - for shm_name in all_shm_names: + with self._lock: + chunk = self._get_available_chunk(src) + if chunk is None: + logger.warning( + "MmItemMemoryPool full (%d occupied, %d available); " + "falling back to CPU transport", + len(self.occupied_chunks), + len(self.available_chunks), + ) + return None, None + pool_slice = self.memory_pool[chunk.start : chunk.end] + return chunk.sync_flag.meta_data, pool_slice + + # ------------------------------------------------------------------ + # Recycling + # ------------------------------------------------------------------ + + def _recycle_loop(self) -> None: + while not self._stop: try: - shm = SharedMemory(name=shm_name, create=False) - shm.close() - shm.unlink() - except FileNotFoundError: - pass - except Exception as e: - logger.warning(f"Failed to cleanup sync buffer {shm_name}: {e}") + with self._lock: + self._recycle_chunks() + self._merge_chunks() + except Exception as exc: + logger.warning( + "MmItemMemoryPool recycler error: %s", exc, exc_info=True + ) + time.sleep(self._recycle_interval) + + def _recycle_chunks(self) -> None: + new_occupied: List[MmItemMemoryChunk] = [] + for chunk in self.occupied_chunks: + if chunk.try_to_recycle(self.num_consumers): + self._return_sync_buffer(chunk.sync_flag) + chunk.sync_flag = self._new_sync_buffer() + self.available_chunks.append(chunk) + else: + new_occupied.append(chunk) + self.occupied_chunks = new_occupied + + def _merge_chunks(self) -> None: + """Coalesce adjacent free chunks to reduce fragmentation.""" + merged: List[MmItemMemoryChunk] = [] + for 
chunk in sorted(self.available_chunks, key=lambda c: c.start): + if merged and merged[-1].end == chunk.start: + prev = merged.pop() + self._return_sync_buffer(chunk.sync_flag) + merged.append( + MmItemMemoryChunk((prev.start, chunk.end), prev.sync_flag) + ) + else: + merged.append(chunk) + self.available_chunks = merged + + def shutdown(self) -> None: + self._stop = True + if self._recycler.is_alive(): + self._recycler.join(timeout=2.0) + - logger.info("WorkspaceBuffer cleaned up") +# --------------------------------------------------------------------------- +# CudaIpcTensorTransportProxy – pool-based CUDA IPC proxy object +# --------------------------------------------------------------------------- -@dataclass -class TensorMetadata: - """Metadata for reconstructing a tensor from CUDA IPC handle.""" +class CudaIpcTensorTransportProxy: + """Proxy that carries a CUDA IPC handle for a pool-slice tensor. - shape: Tuple[int, ...] - dtype: torch.dtype - offset: int # Byte offset in workspace - size_bytes: int - sync_shm_name: str # Shared memory name for sync flag + The *sender* process: + 1. Copies the source tensor into a ``MmItemMemoryPool`` slice (int8 view). + 2. Wraps the slice in this proxy, which captures the CUDA IPC handle via + ``storage._share_cuda_()``. + 3. Sends the proxy through ``multiprocessing.Queue`` (pickle). + The *receiver* process: + 1. Calls :meth:`reconstruct_on_device` to map the IPC memory and copy it + into a fresh local tensor. + 2. The copy increments the sync flag, allowing the sender's recycler to + reclaim the pool slice. -class CudaIPCTransport: - """Transport for sharing CUDA tensors via IPC handles.""" + Fallback: if ``_share_cuda_()`` fails (e.g. TP ranks), ``tensor_data`` holds + the raw tensor (which will be pickled the normal way, incurring serialization cost). + """ def __init__( self, - workspace_size_gb: float = 4.0, - device: int = 0, - ): - """Initialize CUDA IPC transport. 
+ data: torch.Tensor, + info_data: torch.Tensor, + sync_buffer_meta: Dict[str, Any], + ) -> None: + if not isinstance(data, torch.Tensor) or not isinstance( + info_data, torch.Tensor + ): + raise TypeError( + f"data and info_data must be torch.Tensors, got {type(data)}, {type(info_data)}" + ) - Args: - workspace_size_gb: Size of workspace buffer in GB - device: CUDA device ID - """ - self.device = device - self.workspace = WorkspaceBuffer(workspace_size_gb, device) - self.ipc_handle = self.workspace.get_ipc_handle() - self.queue: Queue = Queue() + self.sync_data_meta = sync_buffer_meta + self._state = self._build_state(data, info_data) + self._reconstructed: Optional[torch.Tensor] = None + self._shm: Optional[shared_memory.SharedMemory] = None - def send_tensor(self, rid: str, tensor: torch.Tensor) -> bool: - """Send a tensor via CUDA IPC. + def _build_state( + self, data: torch.Tensor, info_data: torch.Tensor + ) -> Dict[str, Any]: + try: + storage = data.untyped_storage() + handle = storage._share_cuda_() + return { + "ipc_handle": { + "handle": handle, + "shape": data.shape, + "dtype": data.dtype, + "stride": data.stride(), + "device_index": data.device.index, + "storage_offset": data.storage_offset(), + "target_shape": info_data.shape, + "target_dtype": info_data.dtype, + }, + "tensor_data": None, + } + except Exception as exc: + logger.warning( + "CudaIpcTensorTransportProxy: _share_cuda_() failed (%s); " + "falling back to direct tensor.", + exc, + ) + return {"ipc_handle": None, "tensor_data": data} - Args: - rid: Request ID - tensor: Tensor to send (must be on CUDA) + def reconstruct_on_device(self, device_index: Optional[int] = None) -> torch.Tensor: + """Map IPC memory and copy into a new local tensor. - Returns: - True if sent via CUDA IPC, False if fallback needed + This **must** be called from the *receiver* process. After the copy + the sync flag is incremented so the sender can recycle the pool chunk. 
""" - if not tensor.is_cuda: - logger.debug(f"Tensor for {rid} not on CUDA, skipping IPC") - return False - - size_bytes = tensor.numel() * tensor.element_size() - - # Try to allocate from workspace - result = self.workspace.allocate(size_bytes) - if result is None: - logger.warning( - f"WorkspaceBuffer full, falling back to shared queue for {rid}" + if self._reconstructed is not None: + return self._reconstructed + + state = self._state + if state["ipc_handle"] is not None: + h = state["ipc_handle"] + source_device = torch.device(f"cuda:{h['device_index']}") + target_device = ( + source_device + if device_index is None + else torch.device(f"cuda:{device_index}") ) - return False + with torch.cuda.device(source_device): + storage = torch.UntypedStorage._new_shared_cuda(*h["handle"]) + slice_tensor = torch.empty( + 0, dtype=h["dtype"], device=source_device + ).set_( + storage, + storage_offset=h["storage_offset"], + size=h["shape"], + stride=h["stride"], + ) - offset, sync_shm_name = result + result = torch.empty( + h["target_shape"], dtype=h["target_dtype"], device=target_device + ).contiguous() + result.view(torch.int8).view(-1).copy_(slice_tensor) - # Copy tensor to workspace - self.workspace.copy_tensor_to_workspace(tensor, offset) + # Signal sender that the chunk can be recycled + _increment_sync_flag(self.sync_data_meta) + elif state["tensor_data"] is not None: + result = state["tensor_data"] + if device_index is not None: + result = result.to(f"cuda:{device_index}", non_blocking=True) + else: + raise RuntimeError("CudaIpcTensorTransportProxy: invalid state") - # Create metadata - metadata = TensorMetadata( - shape=tuple(tensor.shape), - dtype=tensor.dtype, - offset=offset, - size_bytes=size_bytes, - sync_shm_name=sync_shm_name, - ) + self._reconstructed = result + return result - # Send metadata through queue - self.queue.put((rid, metadata, self.ipc_handle)) - logger.debug(f"Sent tensor {tensor.shape} for {rid} via CUDA IPC") - return True +# 
--------------------------------------------------------------------------- +# TransportProxyTensor – simple CUDA IPC via torch.Tensor subclass + pickle +# --------------------------------------------------------------------------- - def receive_tensor( - self, timeout: float = 0.0001 - ) -> Optional[Tuple[str, torch.Tensor]]: - """Receive a tensor via CUDA IPC. - Args: - timeout: Timeout for queue.get +class TransportProxyTensor(torch.Tensor): + """A ``torch.Tensor`` subclass whose pickle uses CUDA IPC handles. - Returns: - Tuple of (rid, tensor) or None if queue empty - """ - try: - rid, metadata, ipc_handle = self.queue.get(timeout=timeout) - except Exception: - return None + When ``transport_mode == "cuda_ipc"`` and the tensor is on CUDA, + ``__getstate__`` exports the tensor via ``storage._share_cuda_()`` instead + of serialising the raw data. ``__setstate__`` reconstructs it in the + receiving process via ``UntypedStorage._new_shared_cuda``. - # Open IPC memory handle - # Note: This creates a tensor view into the remote process's workspace - with torch.cuda.device(self.device): - # Reconstruct tensor from IPC handle - # This is a view into remote memory, we need to copy it locally + Caveat: The underlying GPU allocation is never freed until the *sender* + process exits (PyTorch limitation). Prefer ``"cuda_ipc_pool"`` mode for + long-running services to avoid GPU memory leaks. - # For now, use a simpler approach: signal to copy later - # In production, you'd use cuda.cudart().cudaIpcOpenMemHandle + When the tensor is on CPU or ``transport_mode == "default"``, the tensor + is serialised normally (pickle of raw data). 
+ """ + @staticmethod + def __new__( + cls, + data: torch.Tensor, + transport_mode: TensorTransportMode = "default", + ) -> "TransportProxyTensor": + if not isinstance(data, torch.Tensor): + raise TypeError(f"data must be a torch.Tensor, got {type(data)}") + instance = data.as_subclass(cls) + instance._transport_mode = transport_mode + return instance + + def __getstate__(self) -> Dict[str, Any]: + state: Dict[str, Any] = { + "transport_mode": self._transport_mode, + "tensor_data": None, + "ipc_extra": None, + } + if self._transport_mode == "cuda_ipc" and self.is_cuda: + try: + storage = self.untyped_storage() + handle = storage._share_cuda_() + state["ipc_extra"] = { + "handle": handle, + "shape": self.shape, + "dtype": self.dtype, + "stride": self.stride(), + "device_index": self.device.index, + "storage_offset": self.storage_offset(), + } + except Exception as exc: + logger.warning( + "TransportProxyTensor: _share_cuda_() failed (%s); falling back.", + exc, + ) + state["transport_mode"] = "default" + state["tensor_data"] = self.as_subclass(torch.Tensor) + else: + state["transport_mode"] = "default" + state["tensor_data"] = self.as_subclass(torch.Tensor) + return state + + def __setstate__(self, state: Dict[str, Any]) -> None: + self._transport_mode = state["transport_mode"] + if state["transport_mode"] == "cuda_ipc" and state["ipc_extra"] is not None: + h = state["ipc_extra"] + target = torch.device(f"cuda:{h['device_index']}") + try: + with torch.cuda.device(target): + storage = torch.UntypedStorage._new_shared_cuda(*h["handle"]) + reconstructed = torch.empty( + 0, dtype=h["dtype"], device=target + ).set_( + storage, + storage_offset=h["storage_offset"], + size=h["shape"], + stride=h["stride"], + ) + self.set_(reconstructed) + except Exception as exc: + logger.error("TransportProxyTensor: failed to open IPC handle: %s", exc) + raise + elif state["tensor_data"] is not None: + self.set_(state["tensor_data"]) + else: + raise RuntimeError("TransportProxyTensor: 
invalid state – no tensor data") + + @property + def transport_mode(self) -> TensorTransportMode: + return getattr(self, "_transport_mode", "default") + + +# --------------------------------------------------------------------------- +# Helpers: wrap / unwrap mm_inputs dicts +# --------------------------------------------------------------------------- + + +def wrap_mm_inputs_for_ipc( + mm_inputs: Optional[Dict[str, Any]], + transport_mode: TensorTransportMode, + pool: Optional["MmItemMemoryPool"] = None, +) -> Optional[Dict[str, Any]]: + """Recursively wrap CUDA tensors in *mm_inputs* for IPC transport. + + Args: + mm_inputs: Nested dict/list of tensors and other data. + transport_mode: One of ``"default"``, ``"cuda_ipc"``, ``"cuda_ipc_pool"``. + pool: Required when ``transport_mode == "cuda_ipc_pool"``. + + Returns: + A new data structure with CUDA tensors replaced by IPC proxies. + CPU tensors are left unchanged (they will be shared via ``share_memory_()`` + or normal pickling downstream). 
+ """ + if mm_inputs is None: + return None + return _wrap_recursive(mm_inputs, transport_mode, pool) + + +def _wrap_recursive( + data: Any, + transport_mode: TensorTransportMode, + pool: Optional["MmItemMemoryPool"], +) -> Any: + if isinstance(data, torch.Tensor) and data.is_cuda: + return _wrap_cuda_tensor(data, transport_mode, pool) + elif isinstance(data, dict): + return {k: _wrap_recursive(v, transport_mode, pool) for k, v in data.items()} + elif isinstance(data, (list, tuple)): + wrapped = [_wrap_recursive(item, transport_mode, pool) for item in data] + return type(data)(wrapped) + else: + return data + + +def _wrap_cuda_tensor( + tensor: torch.Tensor, + transport_mode: TensorTransportMode, + pool: Optional["MmItemMemoryPool"], +) -> Any: + if transport_mode == "cuda_ipc": + return TransportProxyTensor(tensor, transport_mode="cuda_ipc") + + if transport_mode == "cuda_ipc_pool": + if pool is None: + raise ValueError("pool must be provided for transport_mode='cuda_ipc_pool'") + sync_meta, pool_slice = pool.get_slice_with_flag(tensor) + if pool_slice is not None: + # Copy tensor bytes into the pool slice + pool_slice.copy_(tensor.view(torch.int8).view(-1), non_blocking=True) + return CudaIpcTensorTransportProxy( + data=pool_slice, + info_data=tensor, + sync_buffer_meta=sync_meta, + ) + else: + # Pool full – fall back to simple IPC (with potential memory leak) logger.warning( - "CUDA IPC receive not fully implemented - requires cudaIpcOpenMemHandle" + "Pool full; falling back to simple CUDA IPC (potential memory leak)" ) - # TODO: Implement actual IPC handle opening + return TransportProxyTensor(tensor, transport_mode="cuda_ipc") - # Create local tensor and signal copy completion - tensor = torch.empty( - metadata.shape, dtype=metadata.dtype, device=f"cuda:{self.device}" - ) + # "default" – move to CPU shared memory (handled by share_memory_() downstream) + return tensor - # Mark chunk as ready for reuse by setting sync flag - 
self._mark_chunk_reusable(metadata.sync_shm_name) - return rid, tensor +def unwrap_mm_inputs_from_ipc( + mm_inputs: Optional[Dict[str, Any]], + device_index: Optional[int] = None, +) -> Optional[Dict[str, Any]]: + """Recursively reconstruct tensors from IPC proxy objects. - def _mark_chunk_reusable(self, sync_shm_name: str) -> None: - """Mark a chunk as reusable by setting sync flag to 1.""" - try: - shm = SharedMemory(name=sync_shm_name, create=False) - shm.buf[:8] = struct.pack("Q", 1) # Set to 1 (ready for reuse) - shm.close() - logger.debug(f"Marked chunk reusable: {sync_shm_name}") - except Exception as e: - logger.error(f"Failed to mark chunk reusable {sync_shm_name}: {e}") - - def cleanup(self) -> None: - """Cleanup resources.""" - self.workspace.cleanup() - self.queue.close() + Call this in the *receiver* process after getting data from the queue. + + Args: + mm_inputs: Data structure possibly containing IPC proxy objects. + device_index: If not None, move reconstructed tensors to this device. + """ + if mm_inputs is None: + return None + return _unwrap_recursive(mm_inputs, device_index) + + +def _unwrap_recursive(data: Any, device_index: Optional[int]) -> Any: + if isinstance(data, CudaIpcTensorTransportProxy): + return data.reconstruct_on_device(device_index) + elif isinstance(data, TransportProxyTensor): + # Already reconstructed during unpickling; just return as plain tensor + return data.as_subclass(torch.Tensor) + elif isinstance(data, dict): + return {k: _unwrap_recursive(v, device_index) for k, v in data.items()} + elif isinstance(data, (list, tuple)): + result = [_unwrap_recursive(item, device_index) for item in data] + return type(data)(result) + else: + return data diff --git a/pymllm/orchestrator/scheduler_process.py b/pymllm/orchestrator/scheduler_process.py index 64ea55b0..8f2d9a95 100644 --- a/pymllm/orchestrator/scheduler_process.py +++ b/pymllm/orchestrator/scheduler_process.py @@ -9,6 +9,10 @@ 1. 
Legacy ZMQ path: Receive TokenizedGenerateReqInput via ZMQ recv_pyobj 2. Shared queue fast path: Read rid from shared queue and metadata from shared memory +When the shared queue fast path is active the scheduler also handles CUDA IPC +tensor reconstruction via +:func:`~pymllm.orchestrator.cuda_ipc_transport.unwrap_mm_inputs_from_ipc`. + The main ``event_loop`` scheduler flow:: while True: @@ -31,6 +35,10 @@ import zmq from pymllm.engine.io_struct import TokenizedGenerateReqInput +from pymllm.orchestrator.cuda_ipc_transport import ( + TensorTransportMode, + unwrap_mm_inputs_from_ipc, +) from pymllm.orchestrator.ipc_utils import create_zmq_socket from pymllm.orchestrator.shared_memory_queue import SharedMemoryManager, TensorQueue @@ -48,6 +56,7 @@ def __init__( send_to_detokenizer_addr: str, shared_queue: Optional[TensorQueue] = None, enable_shared_queue: bool = False, + tensor_transport_mode: TensorTransportMode = "default", ): # ZMQ addresses self._recv_from_tokenizer_addr = recv_from_tokenizer_addr @@ -58,6 +67,7 @@ def __init__( # Shared queue configuration self._shared_queue = shared_queue self._enable_shared_queue = enable_shared_queue + self._tensor_transport_mode = tensor_transport_mode # ZMQ runtime objects (initialised in init_sockets) self._zmq_ctx: Optional[zmq.Context] = None @@ -111,8 +121,9 @@ def init_sockets(self) -> None: def event_loop(self) -> None: """Infinite scheduling loop.""" logger.info( - "SchedulerProcess event loop started (shared_queue=%s)", + "SchedulerProcess event loop started (shared_queue=%s, transport=%s)", self._enable_shared_queue, + self._tensor_transport_mode, ) while True: self.recv_requests() @@ -163,10 +174,21 @@ def _recv_from_zmq(self) -> None: self._waiting_queue.append(msg) def _recv_from_shared_queue(self) -> None: - """Receive requests via shared memory + shared queue fast path.""" + """Receive requests via shared memory + shared queue fast path. 
+ + After reading a ``(rid, shm_name, mm_inputs)`` tuple from the queue: + 1. The tokenized metadata is read from the POSIX shared memory segment. + 2. If CUDA IPC is enabled, ``mm_inputs`` may contain + :class:`~pymllm.orchestrator.cuda_ipc_transport.CudaIpcTensorTransportProxy` + or :class:`~pymllm.orchestrator.cuda_ipc_transport.TransportProxyTensor` + objects that are reconstructed by calling + :func:`~pymllm.orchestrator.cuda_ipc_transport.unwrap_mm_inputs_from_ipc`. + This step also increments sync flags so the sender can recycle pool chunks. + 3. A full ``TokenizedGenerateReqInput`` is assembled and appended to + ``_waiting_queue``. + """ while True: try: - # Non-blocking get from shared queue rid, shm_name, mm_inputs = self._shared_queue.get(timeout=0.0001) # Read metadata from shared memory (and unlink immediately) @@ -174,12 +196,16 @@ def _recv_from_shared_queue(self) -> None: shm_name, unlink=True ) - # Reconstruct the full TokenizedGenerateReqInput with mm_inputs + # Reconstruct GPU tensors from CUDA IPC handles (if any) + if self._tensor_transport_mode in ("cuda_ipc", "cuda_ipc_pool"): + mm_inputs = unwrap_mm_inputs_from_ipc(mm_inputs) + + # Reassemble the full request full_request = TokenizedGenerateReqInput( rid=metadata.rid, input_text=metadata.input_text, input_ids=metadata.input_ids, - mm_inputs=mm_inputs, # Restored from shared queue + mm_inputs=mm_inputs, sampling_params=metadata.sampling_params, stream=metadata.stream, return_logprob=metadata.return_logprob, @@ -190,18 +216,18 @@ def _recv_from_shared_queue(self) -> None: ) self._waiting_queue.append(full_request) - logger.debug(f"Received request {rid} from shared queue") + logger.debug("Received request %s from shared queue", rid) except stdlib_queue.Empty: - # No more requests available break - except Exception as e: - logger.error(f"Error receiving from shared queue: {e}", exc_info=True) - # Try to cleanup shared memory if possible + except Exception as exc: + logger.error( + "Error 
receiving from shared queue: %s", exc, exc_info=True + ) try: if "shm_name" in locals(): SharedMemoryManager.cleanup(shm_name) - except: + except Exception: pass break @@ -310,6 +336,7 @@ def run_scheduler_process( pipe_writer: Connection, shared_queue: Optional[TensorQueue] = None, enable_shared_queue: bool = False, + tensor_transport_mode: TensorTransportMode = "default", ) -> None: """Entry point for ``torch.multiprocessing.Process(target=...)``.""" proc = SchedulerProcess( @@ -319,6 +346,7 @@ def run_scheduler_process( send_to_detokenizer_addr, shared_queue=shared_queue, enable_shared_queue=enable_shared_queue, + tensor_transport_mode=tensor_transport_mode, ) proc.init_sockets() diff --git a/pymllm/orchestrator/shared_memory_queue.py b/pymllm/orchestrator/shared_memory_queue.py index 3d26ebf1..2f006bdc 100644 --- a/pymllm/orchestrator/shared_memory_queue.py +++ b/pymllm/orchestrator/shared_memory_queue.py @@ -1,32 +1,75 @@ """ Shared memory and queue utilities for fast IPC between tokenizer and scheduler. -This module implements shared-queue fast path to avoid expensive -ZMQ serialization of large multimodal tensors. +This module implements the shared-queue fast path to avoid expensive ZMQ +serialization of large multimodal tensors. -Design: - - Metadata lane: Small tokenized objects stored in shared memory keyed by rid - - Tensor lane: Large tensors made shareable via share_memory_() and passed by handle +## Design + +- **Metadata lane**: Small tokenized objects are written to a POSIX shared memory + segment keyed by the request ID (``rid``). The scheduler reads and immediately + unlinks the segment. + +- **Tensor lane**: Large tensors can be transported in one of three modes, + controlled by ``TensorTransportMode`` (passed at queue construction time): + + * ``"default"`` – CPU tensors only. GPU tensors are moved to POSIX shared + memory via ``tensor.share_memory_()`` (or left on CPU if already there). 
+ This is the original behaviour and requires no CUDA support. + + * ``"cuda_ipc"`` – GPU tensors stay on GPU and are wrapped in + :class:`~pymllm.orchestrator.cuda_ipc_transport.TransportProxyTensor`. On the + receiver side the proxy's ``__setstate__`` automatically reconstructs the + tensor from the CUDA IPC handle during unpickling. CPU tensors are handled as + in ``"default"`` mode. **Caveat**: GPU memory is not freed until the sender + process exits (PyTorch limitation). Prefer ``"cuda_ipc_pool"`` for services. + + * ``"cuda_ipc_pool"`` – GPU tensors are copied into a pre-allocated + :class:`~pymllm.orchestrator.cuda_ipc_transport.MmItemMemoryPool` workspace and + wrapped in :class:`~pymllm.orchestrator.cuda_ipc_transport.CudaIpcTensorTransportProxy`. + After the receiver copies the data it increments a sync flag and the sender's + recycler thread returns the chunk to the pool. This avoids GPU memory leaks. + CPU tensors are handled as in ``"default"`` mode. + +## Key relationship with CUDA IPC + +``"default"`` and ``"cuda_ipc*"`` modes are **mutually exclusive for GPU tensors**: + +- In ``"default"`` mode, GPU tensors that need to cross process boundaries must + first be moved to CPU (``share_memory_()``). This incurs a GPU→CPU copy. +- In ``"cuda_ipc*"`` modes, GPU tensors are shared as-is via CUDA IPC handles; + no copy to CPU is needed. + +CPU tensors are always handled via ``share_memory_()`` regardless of the mode. 
""" +from __future__ import annotations + import logging import pickle import uuid from multiprocessing import Queue from multiprocessing.shared_memory import SharedMemory -from typing import Any, Dict, Optional +from typing import Any, Dict, Literal, Optional import torch +from pymllm.orchestrator.cuda_ipc_transport import ( + MmItemMemoryPool, + TensorTransportMode, + unwrap_mm_inputs_from_ipc, + wrap_mm_inputs_for_ipc, +) + logger = logging.getLogger(__name__) class SharedMemoryManager: """Manages shared memory segments for passing metadata between processes. - Each tokenized request's metadata is written to a unique shared memory segment - keyed by its request ID (rid). The scheduler reads and immediately unlinks the - segment to prevent memory leaks. + Each tokenized request's metadata is written to a unique shared memory + segment keyed by its request ID (rid). The scheduler reads and immediately + unlinks the segment to prevent memory leaks. """ @staticmethod @@ -40,23 +83,17 @@ def write_metadata(rid: str, metadata: Any) -> str: Returns: str: The shared memory segment name """ - # Serialize the metadata data = pickle.dumps(metadata) size = len(data) - - # Create unique shared memory segment name shm_name = f"pymllm_meta_{rid}_{uuid.uuid4().hex[:8]}" - try: - # Create shared memory segment shm = SharedMemory(name=shm_name, create=True, size=size) - # Write data shm.buf[:size] = data shm.close() - logger.debug(f"Wrote {size} bytes to shared memory {shm_name}") + logger.debug("Wrote %d bytes to shared memory %s", size, shm_name) return shm_name - except Exception as e: - logger.error(f"Failed to write metadata to shared memory: {e}") + except Exception as exc: + logger.error("Failed to write metadata to shared memory: %s", exc) raise @staticmethod @@ -71,25 +108,21 @@ def read_metadata(shm_name: str, unlink: bool = True) -> Any: The deserialized metadata object """ try: - # Open existing shared memory segment shm = SharedMemory(name=shm_name, create=False) - # 
Read and deserialize data data = bytes(shm.buf[:]) metadata = pickle.loads(data) shm.close() - - # Unlink to free memory immediately if unlink: try: shm.unlink() - logger.debug(f"Read and unlinked shared memory {shm_name}") + logger.debug("Read and unlinked shared memory %s", shm_name) except FileNotFoundError: - # Already unlinked, ignore pass - return metadata - except Exception as e: - logger.error(f"Failed to read metadata from shared memory {shm_name}: {e}") + except Exception as exc: + logger.error( + "Failed to read metadata from shared memory %s: %s", shm_name, exc + ) raise @staticmethod @@ -99,85 +132,137 @@ def cleanup(shm_name: str) -> None: shm = SharedMemory(name=shm_name, create=False) shm.close() shm.unlink() - logger.debug(f"Cleaned up shared memory {shm_name}") + logger.debug("Cleaned up shared memory %s", shm_name) except FileNotFoundError: - pass # Already cleaned up - except Exception as e: - logger.warning(f"Failed to cleanup shared memory {shm_name}: {e}") + pass + except Exception as exc: + logger.warning("Failed to cleanup shared memory %s: %s", shm_name, exc) class TensorQueue: - """Queue for passing large tensors between processes using shared memory. + """Queue for passing large tensors between processes. - Tensors are made shareable via .share_memory_() and passed through a - multiprocessing.Queue by handle (metadata only, not the actual data). - """ + Depending on ``transport_mode``, GPU tensors are either moved to CPU shared + memory (``"default"``) or kept on GPU and shared via CUDA IPC handles + (``"cuda_ipc"`` / ``"cuda_ipc_pool"``). - def __init__(self, maxsize: int = 0): - """Initialize the tensor queue. + Args: + maxsize: Maximum queue size (0 for unlimited). + transport_mode: Controls how GPU tensors are transported. + pool: Required when ``transport_mode == "cuda_ipc_pool"``. 
+ """ - Args: - maxsize: Maximum queue size (0 for unlimited) - """ + def __init__( + self, + maxsize: int = 0, + transport_mode: TensorTransportMode = "default", + pool: Optional[MmItemMemoryPool] = None, + ) -> None: + # pool is allowed to be None at construction time for "cuda_ipc_pool" mode + # because the pool is initialised lazily inside the sender subprocess. + # The pool reference is injected later via _pool attribute assignment. self._queue: Queue = Queue(maxsize=maxsize) + self._transport_mode = transport_mode + self._pool = pool + + # ------------------------------------------------------------------ + # Producer side + # ------------------------------------------------------------------ - def put(self, rid: str, shm_name: str, mm_inputs: Optional[Dict[str, Any]]) -> None: - """Put a request with multimodal inputs into the queue. + def put( + self, + rid: str, + shm_name: str, + mm_inputs: Optional[Dict[str, Any]], + ) -> None: + """Put a request into the queue. + + GPU tensors inside *mm_inputs* are wrapped according to + ``transport_mode`` before being placed into the underlying + ``multiprocessing.Queue``. Args: - rid: Request ID - shm_name: Shared memory segment name for metadata - mm_inputs: Multimodal inputs dict (can contain torch tensors) + rid: Request ID. + shm_name: Shared memory segment name for the tokenized metadata. + mm_inputs: Multimodal inputs dict (may contain CUDA tensors). """ - # Make tensors shareable if present if mm_inputs is not None: - mm_inputs = self._make_tensors_shareable(mm_inputs) + if self._transport_mode in ("cuda_ipc", "cuda_ipc_pool"): + if self._transport_mode == "cuda_ipc_pool" and self._pool is None: + # Pool not yet initialised (race condition or CUDA unavailable); + # fall back to simple CUDA IPC for this message. 
+ effective_mode = "cuda_ipc" + else: + effective_mode = self._transport_mode + # Wrap CUDA tensors in IPC proxies (stays on GPU, no copy to CPU) + mm_inputs = wrap_mm_inputs_for_ipc( + mm_inputs, + transport_mode=effective_mode, + pool=self._pool, + ) + # CPU tensors within mm_inputs are still shared via share_memory_() + mm_inputs = self._share_cpu_tensors(mm_inputs) + else: + # "default": move all tensors to CPU shared memory + mm_inputs = self._make_tensors_shareable(mm_inputs) self._queue.put((rid, shm_name, mm_inputs)) - logger.debug(f"Put request {rid} into tensor queue (shm={shm_name})") + logger.debug("Put request %s into tensor queue (shm=%s)", rid, shm_name) + + # ------------------------------------------------------------------ + # Consumer side + # ------------------------------------------------------------------ def get( self, timeout: Optional[float] = None ) -> tuple[str, str, Optional[Dict[str, Any]]]: """Get a request from the queue. + GPU tensors wrapped as IPC proxies are **not** automatically + reconstructed here – the caller (scheduler) must call + :func:`~pymllm.orchestrator.cuda_ipc_transport.unwrap_mm_inputs_from_ipc` + after retrieval. + Args: - timeout: Timeout in seconds (None for blocking indefinitely) + timeout: Timeout in seconds (None for blocking). Returns: - Tuple of (rid, shm_name, mm_inputs) + Tuple of ``(rid, shm_name, mm_inputs)``. 
""" rid, shm_name, mm_inputs = self._queue.get(timeout=timeout) - logger.debug(f"Got request {rid} from tensor queue (shm={shm_name})") + logger.debug("Got request %s from tensor queue (shm=%s)", rid, shm_name) return rid, shm_name, mm_inputs + # ------------------------------------------------------------------ + # Queue introspection + # ------------------------------------------------------------------ + def empty(self) -> bool: - """Check if the queue is empty.""" return self._queue.empty() def qsize(self) -> int: - """Return the approximate size of the queue.""" try: return self._queue.qsize() except NotImplementedError: - return 0 # Some platforms don't support qsize + return 0 def close(self) -> None: - """Close the queue.""" self._queue.close() + # ------------------------------------------------------------------ + # Internal helpers + # ------------------------------------------------------------------ + @staticmethod def _make_tensors_shareable(data: Any) -> Any: - """Recursively make all torch tensors in a data structure shareable. + """Recursively move all tensors (CPU and CUDA) to POSIX shared memory. - Args: - data: Nested dict/list/tensor structure - - Returns: - The same structure with tensors made shareable via share_memory_() + GPU tensors are first moved to CPU (incurring a device copy), then + placed in shared memory. This is the ``"default"`` path. """ if isinstance(data, torch.Tensor): - # Make tensor shareable across processes + if data.is_cuda: + data = data.cpu() if not data.is_shared(): data = data.share_memory_() return data @@ -188,3 +273,20 @@ def _make_tensors_shareable(data: Any) -> Any: return type(data)(result) else: return data + + @staticmethod + def _share_cpu_tensors(data: Any) -> Any: + """Recursively place CPU tensors in shared memory (GPU tensors are already + wrapped as IPC proxies and must not be touched here). 
+ """ + if isinstance(data, torch.Tensor) and not data.is_cuda: + if not data.is_shared(): + data = data.share_memory_() + return data + elif isinstance(data, dict): + return {k: TensorQueue._share_cpu_tensors(v) for k, v in data.items()} + elif isinstance(data, (list, tuple)): + result = [TensorQueue._share_cpu_tensors(item) for item in data] + return type(data)(result) + else: + return data diff --git a/pymllm/orchestrator/tokenizer_process.py b/pymllm/orchestrator/tokenizer_process.py index 43db5ba0..587a7c1e 100644 --- a/pymllm/orchestrator/tokenizer_process.py +++ b/pymllm/orchestrator/tokenizer_process.py @@ -4,9 +4,26 @@ Receives raw requests from RequestResponseProcess via ZMQ, tokenizes them, and forwards the tokenized payloads to the SchedulerProcess. -Supports two modes: - 1. Legacy ZMQ path: Send TokenizedGenerateReqInput via ZMQ send_pyobj - 2. Shared queue fast path: Write metadata to shared memory and put rid in shared queue +Supports two transport modes (controlled by ``enable_shared_queue`` and +``tensor_transport_mode`` in the tokenizer config): + +1. **Legacy ZMQ path** (``enable_shared_queue=False``): + Tokenized objects are sent directly via ``ZMQ send_pyobj`` (pickle). This + is simple but slow for large multimodal tensors. + +2. **Shared queue fast path** (``enable_shared_queue=True``): + Metadata is written to POSIX shared memory and the queue carries a + lightweight ``(rid, shm_name, mm_inputs)`` tuple. The GPU tensors inside + ``mm_inputs`` are transported differently depending on ``tensor_transport_mode``: + + * ``"default"`` – GPU tensors are moved to CPU first (GPU→CPU copy), + then placed in POSIX shared memory. + * ``"cuda_ipc"`` – GPU tensors stay on GPU; they are wrapped in a + :class:`TransportProxyTensor` whose pickle uses CUDA IPC handles. + Simple but may leak GPU memory. + * ``"cuda_ipc_pool"`` – GPU tensors are copied into a pre-allocated + :class:`MmItemMemoryPool` workspace and shared via pool-chunk IPC + handles. 
Chunks are recycled; no GPU memory is leaked. """ import logging @@ -17,6 +34,7 @@ from transformers import AutoProcessor, AutoTokenizer from pymllm.engine.io_struct import TokenizedGenerateReqInput +from pymllm.orchestrator.cuda_ipc_transport import MmItemMemoryPool, TensorTransportMode from pymllm.orchestrator.ipc_utils import create_zmq_socket from pymllm.orchestrator.shared_memory_queue import SharedMemoryManager, TensorQueue @@ -40,16 +58,22 @@ def __init__( Serialisable dict built by the parent process (``Engine``) before spawning. Required keys: - * ``tokenizer_path`` – str, path to the tokenizer directory. - * ``tokenizer_mode`` – ``"auto" | "slow" | "fast"``. - * ``trust_remote_code`` – bool. - * ``context_length`` – Optional[int], explicit cap; inferred from - ``hf_config`` when ``None``. - * ``hf_config`` – Optional HuggingFace PretrainedConfig - (pickled by multiprocessing); used only to infer ``context_length``. - * ``enable_shared_queue`` – bool, whether to use shared memory fast path. + * ``tokenizer_path`` – str, path to the tokenizer directory. + * ``tokenizer_mode`` – ``"auto" | "slow" | "fast"``. + * ``trust_remote_code`` – bool. + * ``context_length`` – Optional[int], explicit cap; inferred + from ``hf_config`` when ``None``. + * ``hf_config`` – Optional HuggingFace PretrainedConfig. + * ``enable_shared_queue`` – bool, whether to use shared memory fast path. + * ``tensor_transport_mode`` – ``"default" | "cuda_ipc" | "cuda_ipc_pool"``. + * ``cuda_ipc_pool_size_mb`` – int, pool size in MB (cuda_ipc_pool only). + * ``cuda_ipc_recycle_interval`` – float, recycler sleep interval (s). + shared_queue: - Optional TensorQueue for shared memory fast path communication. + Optional :class:`TensorQueue` for the shared memory fast path. + When *transport_mode* is ``"cuda_ipc_pool"`` this queue should have + been constructed with a ``MmItemMemoryPool``; the ``TokenizerProcess`` + initialises its own pool in that case. 
""" self._recv_from_rr_addr = recv_from_rr_addr self._send_to_scheduler_addr = send_to_scheduler_addr @@ -57,6 +81,21 @@ def __init__( self._enable_shared_queue = tokenizer_cfg.get("enable_shared_queue", False) self._shared_queue = shared_queue + # Tensor transport configuration + self._transport_mode: TensorTransportMode = tokenizer_cfg.get( + "tensor_transport_mode", "default" + ) + # Pool for cuda_ipc_pool mode – will be initialised lazily when the + # process first encounters a CUDA tensor. + self._ipc_pool: Optional[MmItemMemoryPool] = None + if self._transport_mode == "cuda_ipc_pool": + # The pool must be created inside the subprocess (after fork/spawn) + # because it allocates CUDA memory. We defer to _ensure_pool(). + pool_mb: int = int(tokenizer_cfg.get("cuda_ipc_pool_size_mb", 512)) + recycle: float = float(tokenizer_cfg.get("cuda_ipc_recycle_interval", 0.1)) + self._ipc_pool_size_mb = pool_mb + self._ipc_recycle_interval = recycle + self._zmq_ctx: Optional[zmq.Context] = None self._recv_from_rr: Optional[zmq.Socket] = None self._send_to_scheduler: Optional[zmq.Socket] = None @@ -89,8 +128,9 @@ def init_sockets(self) -> None: def event_loop(self) -> None: """Infinite loop: recv raw request -> tokenize -> send to scheduler.""" logger.info( - "TokenizerProcess event loop started (shared_queue=%s)", + "TokenizerProcess event loop started (shared_queue=%s, transport=%s)", self._enable_shared_queue, + self._transport_mode, ) while True: raw_request: Dict[str, Any] = self._recv_from_rr.recv_pyobj() @@ -108,12 +148,19 @@ def _send_via_shared_queue( ) -> None: """Send tokenized request via shared memory + shared queue fast path. - Args: - tokenized: Either TokenizedGenerateReqInput dataclass or abort dict + GPU tensors inside ``mm_inputs`` are handled according to + ``self._transport_mode``: + + * ``"default"`` – moved to CPU via ``share_memory_()`` by ``TensorQueue``. + * ``"cuda_ipc"`` – wrapped in :class:`TransportProxyTensor` (stays on GPU). 
+ * ``"cuda_ipc_pool"`` – copied into the :class:`MmItemMemoryPool` workspace and + wrapped in :class:`CudaIpcTensorTransportProxy`. + + Abort sentinel messages are forwarded via ZMQ (they are lightweight dicts). """ # Handle abort sentinel if isinstance(tokenized, dict) and tokenized.get("abort"): - # Fallback to ZMQ for abort messages + # Fallback to ZMQ for abort messages (no tensor payload) self._send_to_scheduler.send_pyobj(tokenized) return @@ -121,10 +168,14 @@ def _send_via_shared_queue( f"Expected TokenizedGenerateReqInput, got {type(tokenized)}" ) + # Lazily initialise the CUDA IPC pool (must happen inside the subprocess) + if self._transport_mode == "cuda_ipc_pool": + self._ensure_pool() + rid = tokenized.rid mm_inputs = tokenized.mm_inputs - # Create a lightweight metadata object (without mm_inputs) + # Create lightweight metadata object (mm_inputs sent separately via queue) metadata = TokenizedGenerateReqInput( rid=tokenized.rid, input_text=tokenized.input_text, @@ -143,9 +194,73 @@ def _send_via_shared_queue( shm_name = SharedMemoryManager.write_metadata(rid, metadata) # Put (rid, shm_name, mm_inputs) into shared queue + # TensorQueue.put() handles wrapping mm_inputs based on transport_mode self._shared_queue.put(rid, shm_name, mm_inputs) - logger.debug(f"Sent request {rid} via shared queue (shm={shm_name})") + logger.debug( + "Sent request %s via shared queue (shm=%s, transport=%s)", + rid, + shm_name, + self._transport_mode, + ) + + # ------------------------------------------------------------------ + # CUDA IPC pool initialisation (deferred to subprocess) + # ------------------------------------------------------------------ + + def _ensure_pool(self) -> None: + """Lazily create the MmItemMemoryPool inside the subprocess. + + This is deferred because CUDA context creation must happen after + ``torch.multiprocessing.Process`` has started (post-fork/spawn). 
+ Once the pool is created we update the shared queue's transport config + in-place so the same underlying ``multiprocessing.Queue`` object is reused + (both processes already hold a reference to it). + """ + if self._ipc_pool is not None: + return + try: + import torch + + if not torch.cuda.is_available(): + logger.warning( + "CUDA not available; falling back to transport_mode='default'" + ) + self._transport_mode = "default" + if self._shared_queue is not None: + self._shared_queue._transport_mode = "default" + return + + pool_bytes = self._ipc_pool_size_mb * 1024 * 1024 + device = torch.cuda.current_device() + self._ipc_pool = MmItemMemoryPool( + memory_size=pool_bytes, + recycle_interval=self._ipc_recycle_interval, + device=device, + ) + # Update the shared queue's config in-place. + # Both processes share the same multiprocessing.Queue object, so we + # just update the wrapper's transport metadata; the underlying queue + # pipe is unchanged. + if self._shared_queue is not None: + self._shared_queue._transport_mode = self._transport_mode + self._shared_queue._pool = self._ipc_pool + + logger.info( + "MmItemMemoryPool initialised: %d MB on cuda:%d", + self._ipc_pool_size_mb, + device, + ) + except Exception as exc: + logger.error( + "Failed to initialise MmItemMemoryPool: %s; " + "falling back to transport_mode='default'", + exc, + exc_info=True, + ) + self._transport_mode = "default" + if self._shared_queue is not None: + self._shared_queue._transport_mode = "default" # ------------------------------------------------------------------ # Tokenization and multimodal preprocessing @@ -352,6 +467,8 @@ def _collect_mm_inputs( return mm def shutdown(self) -> None: + if self._ipc_pool is not None: + self._ipc_pool.shutdown() if self._recv_from_rr is not None: self._recv_from_rr.close() if self._send_to_scheduler is not None: From 9bc959fcd90bded9eb4a2fdeb3ad8208b1c6f790 Mon Sep 17 00:00:00 2001 From: chenghuaWang <2923277184@qq.com> Date: Mon, 2 Mar 2026 06:55:36 
+0000 Subject: [PATCH 12/13] feat(sampling): add sampling module with FlashInfer acceleration and PyTorch fallback - Introduce pymllm.layers.sampling with diverse sampling functions - Implement softmax with temperature scaling and FlashInfer support - Add category sampling from probabilities and logits with optional determinism - Support top-p (nucleus), top-k, and min-p sampling methods - Provide combined top-k + top-p sampling from logits and probabilities - Add probability renormalization for top-p and top-k thresholds - Implement top-k masking for logits to filter out lower probabilities - Include chain speculative sampling for accelerated sequence generation - Provide pure-PyTorch fallback implementations for all methods - Update pymllm.layers.__init__.py to export new sampling functions - Rename pymllm.executor.eager_runner.py to model_runner.py for clarity --- .../{eager_runner.py => model_runner.py} | 0 pymllm/layers/__init__.py | 26 + pymllm/layers/sampling.py | 767 ++++++++++++++++++ 3 files changed, 793 insertions(+) rename pymllm/executor/{eager_runner.py => model_runner.py} (100%) diff --git a/pymllm/executor/eager_runner.py b/pymllm/executor/model_runner.py similarity index 100% rename from pymllm/executor/eager_runner.py rename to pymllm/executor/model_runner.py diff --git a/pymllm/layers/__init__.py b/pymllm/layers/__init__.py index fd9a070e..97cfb921 100644 --- a/pymllm/layers/__init__.py +++ b/pymllm/layers/__init__.py @@ -13,6 +13,20 @@ apply_rope_pos_ids, apply_rope_with_cos_sin_cache, ) +from pymllm.layers.sampling import ( + chain_speculative_sampling, + min_p_sampling_from_probs, + sampling_from_logits, + sampling_from_probs, + softmax, + top_k_mask_logits, + top_k_renorm_probs, + top_k_sampling_from_probs, + top_k_top_p_sampling_from_logits, + top_k_top_p_sampling_from_probs, + top_p_renorm_probs, + top_p_sampling_from_probs, +) from pymllm.layers.utils import set_weight_attrs __all__ = [ @@ -32,4 +46,16 @@ "apply_rope_pos_ids", 
"apply_llama31_rope_pos_ids", "apply_rope_with_cos_sin_cache", + "softmax", + "sampling_from_probs", + "sampling_from_logits", + "top_p_sampling_from_probs", + "top_k_sampling_from_probs", + "min_p_sampling_from_probs", + "top_k_top_p_sampling_from_logits", + "top_k_top_p_sampling_from_probs", + "top_p_renorm_probs", + "top_k_renorm_probs", + "top_k_mask_logits", + "chain_speculative_sampling", ] diff --git a/pymllm/layers/sampling.py b/pymllm/layers/sampling.py index e69de29b..ff84879c 100644 --- a/pymllm/layers/sampling.py +++ b/pymllm/layers/sampling.py @@ -0,0 +1,767 @@ +"""Sampling operations with FlashInfer acceleration and PyTorch fallback. + +This module wraps all flashinfer.sampling APIs and provides pure-PyTorch +fallback implementations so that the rest of the codebase can import from +here without worrying about whether FlashInfer is installed. +""" + +from __future__ import annotations + +import logging +from typing import Optional, Tuple, Union + +import torch + +logger = logging.getLogger(__name__) + +try: + import flashinfer.sampling as _fi_sampling + + _HAS_FLASHINFER = True +except ImportError: + _HAS_FLASHINFER = False + logger.warning("flashinfer not found, falling back to PyTorch sampling kernels") + + +# --------------------------------------------------------------------------- +# Helper utilities (torch fallback) +# --------------------------------------------------------------------------- + + +def _resolve_indices( + data: torch.Tensor, indices: Optional[torch.Tensor] +) -> torch.Tensor: + """If *indices* is given, gather rows from *data* accordingly.""" + if indices is None: + return data + return data[indices.long()] + + +def _to_scalar_or_tensor( + value: Union[torch.Tensor, float, int], + batch_size: int, + device: torch.device, +) -> torch.Tensor: + """Broadcast a scalar or per-batch tensor to shape ``(batch_size,)``.""" + if isinstance(value, (int, float)): + return torch.full((batch_size,), value, device=device, dtype=torch.float32) 
+ return value.to(device=device, dtype=torch.float32) + + +# --------------------------------------------------------------------------- +# softmax +# --------------------------------------------------------------------------- + + +def softmax( + logits: torch.Tensor, + temperature: Optional[Union[torch.Tensor, float]] = None, + enable_pdl: Optional[bool] = None, +) -> torch.Tensor: + """Safe softmax with optional temperature scaling. + + Parameters + ---------- + logits : torch.Tensor + Shape ``(batch_size, num_classes)``. + temperature : Optional[Union[torch.Tensor, float]] + Scalar or per-request ``(batch_size,)`` temperature. + enable_pdl : Optional[bool] + FlashInfer PDL flag (ignored in fallback). + + Returns + ------- + torch.Tensor + Probabilities with the same shape as *logits*. + """ + if _HAS_FLASHINFER: + return _fi_sampling.softmax( + logits, temperature=temperature, enable_pdl=enable_pdl + ) + + if temperature is not None: + if isinstance(temperature, (int, float)): + logits = logits / temperature + else: + logits = logits / temperature.unsqueeze(-1) + return torch.softmax(logits, dim=-1) + + +# --------------------------------------------------------------------------- +# sampling_from_probs +# --------------------------------------------------------------------------- + + +def sampling_from_probs( + probs: torch.Tensor, + indices: Optional[torch.Tensor] = None, + deterministic: bool = True, + generator: Optional[torch.Generator] = None, + check_nan: bool = False, + seed: Optional[int] = None, + offset: Optional[int] = None, +) -> torch.Tensor: + """Category sampling from probabilities. + + Parameters + ---------- + probs : torch.Tensor + ``(batch_size, num_classes)`` or ``(unique_batch_size, num_classes)`` + when *indices* is provided. + indices : Optional[torch.Tensor] + Maps each output to a row in *probs*. + deterministic, generator, check_nan, seed, offset + See FlashInfer docs. 
+ + Returns + ------- + torch.Tensor + Sampled token ids, shape ``(batch_size,)``. + """ + if _HAS_FLASHINFER: + return _fi_sampling.sampling_from_probs( + probs, + indices=indices, + deterministic=deterministic, + generator=generator, + check_nan=check_nan, + seed=seed, + offset=offset, + ) + + p = _resolve_indices(probs, indices) + samples = torch.multinomial(p.float(), num_samples=1, generator=generator).squeeze( + -1 + ) + return samples.to(torch.int32) + + +# --------------------------------------------------------------------------- +# sampling_from_logits +# --------------------------------------------------------------------------- + + +def sampling_from_logits( + logits: torch.Tensor, + indices: Optional[torch.Tensor] = None, + deterministic: bool = True, + generator: Optional[torch.Generator] = None, + check_nan: bool = False, + seed: Optional[int] = None, + offset: Optional[int] = None, +) -> torch.Tensor: + """Category sampling from logits (applies softmax internally). + + Parameters + ---------- + logits : torch.Tensor + ``(batch_size, num_classes)``. + indices, deterministic, generator, check_nan, seed, offset + See FlashInfer docs. + + Returns + ------- + torch.Tensor + Sampled token ids, shape ``(batch_size,)``. 
+ """ + if _HAS_FLASHINFER: + return _fi_sampling.sampling_from_logits( + logits, + indices=indices, + deterministic=deterministic, + generator=generator, + check_nan=check_nan, + seed=seed, + offset=offset, + ) + + probs = torch.softmax(logits.float(), dim=-1) + return sampling_from_probs( + probs, + indices=indices, + deterministic=deterministic, + generator=generator, + check_nan=check_nan, + ) + + +# --------------------------------------------------------------------------- +# top_p_sampling_from_probs +# --------------------------------------------------------------------------- + + +def top_p_sampling_from_probs( + probs: torch.Tensor, + top_p: Union[torch.Tensor, float], + indices: Optional[torch.Tensor] = None, + deterministic: bool = True, + generator: Optional[torch.Generator] = None, + check_nan: bool = False, + seed: Optional[int] = None, + offset: Optional[int] = None, +) -> torch.Tensor: + """Top-p (nucleus) sampling from probabilities. + + Parameters + ---------- + probs : torch.Tensor + ``(batch_size, num_classes)``. + top_p : Union[torch.Tensor, float] + Top-p threshold. + indices, deterministic, generator, check_nan, seed, offset + See FlashInfer docs. + + Returns + ------- + torch.Tensor + Sampled token ids, shape ``(batch_size,)``. 
+ """ + if _HAS_FLASHINFER: + return _fi_sampling.top_p_sampling_from_probs( + probs, + top_p, + indices=indices, + deterministic=deterministic, + generator=generator, + check_nan=check_nan, + seed=seed, + offset=offset, + ) + + p = _resolve_indices(probs, indices).float() + renormed = _torch_top_p_renorm_probs(p, top_p) + samples = torch.multinomial(renormed, num_samples=1, generator=generator).squeeze( + -1 + ) + return samples.to(torch.int32) + + +# --------------------------------------------------------------------------- +# top_k_sampling_from_probs +# --------------------------------------------------------------------------- + + +def top_k_sampling_from_probs( + probs: torch.Tensor, + top_k: Union[torch.Tensor, int], + indices: Optional[torch.Tensor] = None, + deterministic: bool = True, + generator: Optional[torch.Generator] = None, + check_nan: bool = False, + seed: Optional[int] = None, + offset: Optional[int] = None, +) -> torch.Tensor: + """Top-k sampling from probabilities. + + Parameters + ---------- + probs : torch.Tensor + ``(batch_size, num_classes)``. + top_k : Union[torch.Tensor, int] + Top-k threshold. + indices, deterministic, generator, check_nan, seed, offset + See FlashInfer docs. + + Returns + ------- + torch.Tensor + Sampled token ids, shape ``(batch_size,)``. 
+ """ + if _HAS_FLASHINFER: + return _fi_sampling.top_k_sampling_from_probs( + probs, + top_k, + indices=indices, + deterministic=deterministic, + generator=generator, + check_nan=check_nan, + seed=seed, + offset=offset, + ) + + p = _resolve_indices(probs, indices).float() + renormed = _torch_top_k_renorm_probs(p, top_k) + samples = torch.multinomial(renormed, num_samples=1, generator=generator).squeeze( + -1 + ) + return samples.to(torch.int32) + + +# --------------------------------------------------------------------------- +# min_p_sampling_from_probs +# --------------------------------------------------------------------------- + + +def min_p_sampling_from_probs( + probs: torch.Tensor, + min_p: Union[torch.Tensor, float], + indices: Optional[torch.Tensor] = None, + deterministic: bool = True, + generator: Optional[torch.Generator] = None, + check_nan: bool = False, + seed: Optional[int] = None, + offset: Optional[int] = None, +) -> torch.Tensor: + """Min-p sampling from probabilities. + + Parameters + ---------- + probs : torch.Tensor + ``(batch_size, num_classes)``. + min_p : Union[torch.Tensor, float] + Min-p threshold. + indices, deterministic, generator, check_nan, seed, offset + See FlashInfer docs. + + Returns + ------- + torch.Tensor + Sampled token ids, shape ``(batch_size,)``. 
+ """ + if _HAS_FLASHINFER: + return _fi_sampling.min_p_sampling_from_probs( + probs, + min_p, + indices=indices, + deterministic=deterministic, + generator=generator, + check_nan=check_nan, + seed=seed, + offset=offset, + ) + + p = _resolve_indices(probs, indices).float() + batch_size = p.shape[0] + min_p_t = _to_scalar_or_tensor(min_p, batch_size, p.device) + # min-p: keep tokens whose probability >= min_p * max_prob + max_probs = p.max(dim=-1, keepdim=True).values # (B,1) + threshold = min_p_t.unsqueeze(-1) * max_probs # (B,1) + mask = p < threshold + filtered = p.clone() + filtered[mask] = 0.0 + # renormalize + sums = filtered.sum(dim=-1, keepdim=True) + sums = sums.clamp(min=1e-8) + filtered = filtered / sums + samples = torch.multinomial(filtered, num_samples=1, generator=generator).squeeze( + -1 + ) + return samples.to(torch.int32) + + +# --------------------------------------------------------------------------- +# top_k_top_p_sampling_from_logits +# --------------------------------------------------------------------------- + + +def top_k_top_p_sampling_from_logits( + logits: torch.Tensor, + top_k: Union[torch.Tensor, int], + top_p: Union[torch.Tensor, float], + indices: Optional[torch.Tensor] = None, + filter_apply_order: str = "top_k_first", + deterministic: bool = True, + generator: Optional[torch.Generator] = None, + check_nan: bool = False, + seed: Optional[int] = None, + offset: Optional[int] = None, +) -> torch.Tensor: + """Top-k + top-p sampling from pre-softmax logits. + + Parameters + ---------- + logits : torch.Tensor + ``(batch_size, num_classes)``. + top_k : Union[torch.Tensor, int] + top_p : Union[torch.Tensor, float] + filter_apply_order : str + ``"top_k_first"`` or ``"joint"``. + indices, deterministic, generator, check_nan, seed, offset + See FlashInfer docs. + + Returns + ------- + torch.Tensor + Sampled token ids, shape ``(batch_size,)``. 
+ """ + if _HAS_FLASHINFER: + return _fi_sampling.top_k_top_p_sampling_from_logits( + logits, + top_k, + top_p, + indices=indices, + filter_apply_order=filter_apply_order, + deterministic=deterministic, + generator=generator, + check_nan=check_nan, + seed=seed, + offset=offset, + ) + + probs = torch.softmax(logits.float(), dim=-1) + return top_k_top_p_sampling_from_probs( + probs, + top_k, + top_p, + indices=indices, + filter_apply_order=filter_apply_order, + deterministic=deterministic, + generator=generator, + check_nan=check_nan, + ) + + +# --------------------------------------------------------------------------- +# top_k_top_p_sampling_from_probs +# --------------------------------------------------------------------------- + + +def top_k_top_p_sampling_from_probs( + probs: torch.Tensor, + top_k: Union[torch.Tensor, int], + top_p: Union[torch.Tensor, float], + indices: Optional[torch.Tensor] = None, + filter_apply_order: str = "top_k_first", + deterministic: bool = True, + generator: Optional[torch.Generator] = None, + check_nan: bool = False, + seed: Optional[int] = None, + offset: Optional[int] = None, +) -> torch.Tensor: + """Top-k + top-p sampling from probabilities. + + Parameters + ---------- + probs : torch.Tensor + ``(batch_size, num_classes)``. + top_k : Union[torch.Tensor, int] + top_p : Union[torch.Tensor, float] + filter_apply_order : str + ``"top_k_first"`` or ``"joint"``. + indices, deterministic, generator, check_nan, seed, offset + See FlashInfer docs. + + Returns + ------- + torch.Tensor + Sampled token ids, shape ``(batch_size,)``. 
+ """ + if _HAS_FLASHINFER: + return _fi_sampling.top_k_top_p_sampling_from_probs( + probs, + top_k, + top_p, + indices=indices, + filter_apply_order=filter_apply_order, + deterministic=deterministic, + generator=generator, + check_nan=check_nan, + seed=seed, + offset=offset, + ) + + p = _resolve_indices(probs, indices).float() + if filter_apply_order == "top_k_first": + p = _torch_top_k_renorm_probs(p, top_k) + p = _torch_top_p_renorm_probs(p, top_p) + else: + # joint: apply both filters simultaneously + p = _torch_top_k_renorm_probs(p, top_k) + p = _torch_top_p_renorm_probs(p, top_p) + samples = torch.multinomial(p, num_samples=1, generator=generator).squeeze(-1) + return samples.to(torch.int32) + + +# --------------------------------------------------------------------------- +# top_p_renorm_probs +# --------------------------------------------------------------------------- + + +def top_p_renorm_probs( + probs: torch.Tensor, + top_p: Union[torch.Tensor, float], +) -> torch.Tensor: + """Renormalize probabilities by top-p thresholding. + + Parameters + ---------- + probs : torch.Tensor + ``(batch_size, num_classes)``. + top_p : Union[torch.Tensor, float] + Top-p threshold in ``(0, 1)``. + + Returns + ------- + torch.Tensor + Renormalized probabilities. 
+ """ + if _HAS_FLASHINFER: + return _fi_sampling.top_p_renorm_probs(probs, top_p) + + return _torch_top_p_renorm_probs(probs.float(), top_p).to(probs.dtype) + + +def _torch_top_p_renorm_probs( + probs: torch.Tensor, + top_p: Union[torch.Tensor, float], +) -> torch.Tensor: + """Pure-torch top-p renormalization (operates on float32).""" + sorted_probs, sorted_indices = torch.sort(probs, dim=-1, descending=True) + cumsum = torch.cumsum(sorted_probs, dim=-1) + + if isinstance(top_p, (int, float)): + mask = cumsum - sorted_probs > top_p + else: + top_p_t = top_p.unsqueeze(-1) + mask = cumsum - sorted_probs > top_p_t + + sorted_probs[mask] = 0.0 + # scatter back + result = torch.zeros_like(probs) + result.scatter_(1, sorted_indices, sorted_probs) + # renormalize + sums = result.sum(dim=-1, keepdim=True).clamp(min=1e-8) + return result / sums + + +# --------------------------------------------------------------------------- +# top_k_renorm_probs +# --------------------------------------------------------------------------- + + +def top_k_renorm_probs( + probs: torch.Tensor, + top_k: Union[torch.Tensor, int], +) -> torch.Tensor: + """Renormalize probabilities by top-k thresholding. + + Parameters + ---------- + probs : torch.Tensor + ``(batch_size, num_classes)``. + top_k : Union[torch.Tensor, int] + Top-k threshold. + + Returns + ------- + torch.Tensor + Renormalized probabilities. 
+ """ + if _HAS_FLASHINFER: + return _fi_sampling.top_k_renorm_probs(probs, top_k) + + return _torch_top_k_renorm_probs(probs.float(), top_k).to(probs.dtype) + + +def _torch_top_k_renorm_probs( + probs: torch.Tensor, + top_k: Union[torch.Tensor, int], +) -> torch.Tensor: + """Pure-torch top-k renormalization (operates on float32).""" + if isinstance(top_k, int): + # uniform top_k across batch + topk_vals, _ = torch.topk(probs, top_k, dim=-1) + threshold = topk_vals[:, -1:] # (B, 1) + else: + # per-request top_k: use sorting + sorted_probs, _ = torch.sort(probs, dim=-1, descending=True) + # gather the k-th value for each row + k_indices = (top_k.long() - 1).unsqueeze(-1) # (B, 1) + threshold = sorted_probs.gather(1, k_indices) # (B, 1) + + mask = probs < threshold + filtered = probs.clone() + filtered[mask] = 0.0 + sums = filtered.sum(dim=-1, keepdim=True).clamp(min=1e-8) + return filtered / sums + + +# --------------------------------------------------------------------------- +# top_k_mask_logits +# --------------------------------------------------------------------------- + + +def top_k_mask_logits( + logits: torch.Tensor, + top_k: Union[torch.Tensor, int], +) -> torch.Tensor: + """Mask logits by top-k thresholding (set non-top-k to -inf). + + Parameters + ---------- + logits : torch.Tensor + ``(batch_size, num_classes)``. + top_k : Union[torch.Tensor, int] + Top-k threshold. + + Returns + ------- + torch.Tensor + Masked logits with the same shape and dtype. 
+ """ + if _HAS_FLASHINFER: + return _fi_sampling.top_k_mask_logits(logits, top_k) + + if isinstance(top_k, int): + topk_vals, _ = torch.topk(logits, top_k, dim=-1) + threshold = topk_vals[:, -1:] + else: + sorted_logits, _ = torch.sort(logits, dim=-1, descending=True) + k_indices = (top_k.long() - 1).unsqueeze(-1) + threshold = sorted_logits.gather(1, k_indices) + + mask = logits < threshold + result = logits.clone() + result[mask] = float("-inf") + return result + + +# --------------------------------------------------------------------------- +# chain_speculative_sampling +# --------------------------------------------------------------------------- + + +def chain_speculative_sampling( + draft_probs: torch.Tensor, + draft_token_ids: torch.Tensor, + target_probs: torch.Tensor, + maybe_output_accepted_token_num: Optional[torch.Tensor] = None, + maybe_output_emitted_draft_token_num: Optional[torch.Tensor] = None, + deterministic: bool = True, + generator: Optional[torch.Generator] = None, + seed: Optional[int] = None, + offset: Optional[int] = None, +) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """Speculative sampling for sequence generation. + + Parameters + ---------- + draft_probs : torch.Tensor + ``(batch_size, num_speculate_tokens, vocab_size)``. + draft_token_ids : torch.Tensor + ``(batch_size, num_speculate_tokens)``. + target_probs : torch.Tensor + ``(batch_size, num_speculate_tokens + 1, vocab_size)``. + maybe_output_accepted_token_num : Optional[torch.Tensor] + If provided, accepted counts are added in-place. + maybe_output_emitted_draft_token_num : Optional[torch.Tensor] + If provided, emitted counts are added in-place. + deterministic, generator, seed, offset + See FlashInfer docs. + + Returns + ------- + output_token_ids : torch.Tensor + ``(batch_size, num_speculate_tokens + 1)``, rejected slots padded with -1. + output_accepted_token_num : torch.Tensor + ``(batch_size,)``. + output_emitted_draft_token_num : torch.Tensor + ``(batch_size,)``. 
+ """ + if _HAS_FLASHINFER: + return _fi_sampling.chain_speculative_sampling( + draft_probs, + draft_token_ids, + target_probs, + maybe_output_accepted_token_num=maybe_output_accepted_token_num, + maybe_output_emitted_draft_token_num=maybe_output_emitted_draft_token_num, + deterministic=deterministic, + generator=generator, + seed=seed, + offset=offset, + ) + + return _torch_chain_speculative_sampling( + draft_probs, + draft_token_ids, + target_probs, + maybe_output_accepted_token_num, + maybe_output_emitted_draft_token_num, + generator, + ) + + +def _torch_chain_speculative_sampling( + draft_probs: torch.Tensor, + draft_token_ids: torch.Tensor, + target_probs: torch.Tensor, + maybe_output_accepted_token_num: Optional[torch.Tensor], + maybe_output_emitted_draft_token_num: Optional[torch.Tensor], + generator: Optional[torch.Generator], +) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """Pure-torch chain speculative sampling. + + Implements the rejection-sampling algorithm from + "Accelerating Large Language Model Decoding with Speculative Sampling" + (Leviathan et al., 2023). 
+ """ + batch_size, num_spec, vocab_size = draft_probs.shape + device = draft_probs.device + + output_ids = torch.full( + (batch_size, num_spec + 1), -1, dtype=torch.int32, device=device + ) + accepted_count = torch.zeros(batch_size, dtype=torch.int32, device=device) + emitted_count = torch.zeros(batch_size, dtype=torch.int32, device=device) + + for b in range(batch_size): + all_accepted = True + for t in range(num_spec): + draft_tok = draft_token_ids[b, t].item() + p_draft = draft_probs[b, t, draft_tok].item() + p_target = target_probs[b, t, draft_tok].item() + + # independent acceptance check (for the metric) + if p_target >= p_draft: + accepted_count[b] += 1 + else: + r = torch.rand(1, generator=generator, device=device).item() + if r < p_target / max(p_draft, 1e-10): + accepted_count[b] += 1 + + # sequential chain: accept / reject + if all_accepted: + r = torch.rand(1, generator=generator, device=device).item() + if r < min(1.0, p_target / max(p_draft, 1e-10)): + output_ids[b, t] = draft_tok + emitted_count[b] += 1 + else: + # reject: sample from max(0, p_target - p_draft) + diff = target_probs[b, t].float() - draft_probs[b, t].float() + diff = torch.clamp(diff, min=0.0) + dsum = diff.sum() + if dsum > 1e-8: + diff = diff / dsum + else: + diff = target_probs[b, t].float() + diff = diff / diff.sum().clamp(min=1e-8) + resampled = torch.multinomial( + diff.unsqueeze(0), num_samples=1, generator=generator + ).item() + output_ids[b, t] = resampled + emitted_count[b] += 1 + all_accepted = False + + # bonus token (sampled from target at position after last emitted) + if all_accepted: + pos = num_spec + bonus_probs = target_probs[b, pos].float() + bonus_probs = bonus_probs / bonus_probs.sum().clamp(min=1e-8) + bonus = torch.multinomial( + bonus_probs.unsqueeze(0), num_samples=1, generator=generator + ).item() + output_ids[b, num_spec] = bonus + + if maybe_output_accepted_token_num is not None: + maybe_output_accepted_token_num.add_(accepted_count) + if 
maybe_output_emitted_draft_token_num is not None: + maybe_output_emitted_draft_token_num.add_(emitted_count) + + return output_ids, accepted_count, emitted_count + + +# --------------------------------------------------------------------------- +# Aliases (FlashInfer also exposes these) +# --------------------------------------------------------------------------- +top_p_renorm_prob = top_p_renorm_probs +top_k_renorm_prob = top_k_renorm_probs From 2cf50f40c760137ad439ea45afc73f61bac18c5a Mon Sep 17 00:00:00 2001 From: chenghuaWang <2923277184@qq.com> Date: Mon, 9 Mar 2026 07:48:45 +0000 Subject: [PATCH 13/13] feat(cuda): add fused GDN decode and RMSNorm+SiLU gating kernels for attention - Implement fused GDN decode kernel performing gating, L2 normalization, delta update, and output computation in a single CUDA kernel optimized for SM80+ architectures - Add fused RMSNorm with optional SiLU gating kernel for Qwen3.5 GDN attention - Provide JIT Python wrappers for both kernels to enable easy integration - Extend CUDA JIT module imports to include new gdn_decode kernel - Update global and server configs with new backend options and default values - Enhance argument parsing in global_config to support literal choice constraints - Add fields for multimodal M-RoPE and vision inputs in ForwardBatch for decoding - Implement EOS token ID extraction and max output tokens normalization in launcher module --- .gitignore | 1 + .../mllm_kernel/cuda/csrc/gdn_decode.cuh | 432 ++++++ .../mllm_kernel/cuda/csrc/rms_norm_gated.cuh | 212 +++ mllm-kernel/mllm_kernel/cuda/jit/__init__.py | 3 +- .../mllm_kernel/cuda/jit/gdn_decode.py | 114 ++ .../mllm_kernel/cuda/jit/rms_norm_gated.py | 87 ++ pymllm/configs/global_config.py | 31 +- pymllm/configs/server_config.py | 23 +- pymllm/engine/forward_batch.py | 9 + pymllm/engine/launch.py | 206 ++- pymllm/executor/__init__.py | 10 + pymllm/executor/cuda_graph_runner.py | 590 ++++++++ pymllm/executor/model_runner.py | 1198 +++++++++++++++ 
pymllm/layers/__init__.py | 4 + pymllm/layers/attention/__init__.py | 8 + pymllm/layers/attention/attention_backend.py | 22 + pymllm/layers/attention/gdn_backend.py | 660 ++++++++ pymllm/layers/attention/hybrid_backend.py | 184 +++ .../attention/radix_linear_attention.py | 116 ++ pymllm/layers/gated_delta_net.py | 168 +++ pymllm/layers/rms_norm.py | 24 +- pymllm/layers/rms_norm_gated.py | 154 ++ pymllm/layers/rope.py | 147 +- pymllm/layers/sampling.py | 9 + pymllm/mem_cache/memory_pool.py | 159 ++ pymllm/mem_cache/radix_cache.py | 40 +- pymllm/models/__init__.py | 62 + pymllm/models/qwen3_5.py | 530 +++++++ pymllm/models/qwen3_vl.py | 1329 +++++++++++++++++ pymllm/orchestrator/async_disk_io_process.py | 84 -- pymllm/orchestrator/detokenizer_process.py | 116 +- pymllm/orchestrator/ipc_utils.py | 22 + pymllm/orchestrator/model_runner_process.py | 1007 +++++++++++-- .../orchestrator/request_response_process.py | 23 +- pymllm/orchestrator/scheduler_process.py | 820 +++++++++- pymllm/orchestrator/tokenizer_process.py | 4 +- pymllm/parsers/__init__.py | 10 + pymllm/parsers/reasoning_parser.py | 212 +++ pymllm/parsers/tool_call_parser.py | 433 ++++++ pymllm/server/launch.py | 923 +++++++++++- 40 files changed, 9810 insertions(+), 376 deletions(-) create mode 100644 mllm-kernel/mllm_kernel/cuda/csrc/gdn_decode.cuh create mode 100644 mllm-kernel/mllm_kernel/cuda/csrc/rms_norm_gated.cuh create mode 100644 mllm-kernel/mllm_kernel/cuda/jit/gdn_decode.py create mode 100644 mllm-kernel/mllm_kernel/cuda/jit/rms_norm_gated.py create mode 100644 pymllm/layers/attention/gdn_backend.py create mode 100644 pymllm/layers/attention/hybrid_backend.py create mode 100644 pymllm/layers/attention/radix_linear_attention.py create mode 100644 pymllm/layers/gated_delta_net.py create mode 100644 pymllm/layers/rms_norm_gated.py create mode 100644 pymllm/models/qwen3_5.py create mode 100644 pymllm/models/qwen3_vl.py delete mode 100644 pymllm/orchestrator/async_disk_io_process.py create mode 100644 
pymllm/parsers/__init__.py create mode 100644 pymllm/parsers/reasoning_parser.py create mode 100644 pymllm/parsers/tool_call_parser.py diff --git a/.gitignore b/.gitignore index cdafc270..7f14b37e 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,7 @@ .cache/ .tmp/ compile_commands.json +settings.local.json # MLLM Team Specific tasks/mllmteam* diff --git a/mllm-kernel/mllm_kernel/cuda/csrc/gdn_decode.cuh b/mllm-kernel/mllm_kernel/cuda/csrc/gdn_decode.cuh new file mode 100644 index 00000000..4c2833c0 --- /dev/null +++ b/mllm-kernel/mllm_kernel/cuda/csrc/gdn_decode.cuh @@ -0,0 +1,432 @@ +// Fused GDN (Gated Delta Net) decode kernel for linear attention. +// +// Performs a single-token recurrent update per request: +// g = -exp(A_log) * softplus(a + dt_bias) +// beta = sigmoid(b) +// q = L2norm(q) * scale +// k = L2norm(k) +// state *= exp(g) (decay) +// v_delta = v - state @ k (delta rule) +// v_delta *= beta (gated update) +// state += v_delta outer k (state update) +// output = state @ q (readout) +// +// Works on SM80+ (Ampere, Jetson Orin, Hopper, ...). +// Matches the algorithm of sglang's fused_sigmoid_gating_delta_rule_update. +// +// Grid : (NV, bs * HV) where NV = ceil(V / BV) +// Block: BLOCK_K threads (one thread per K-dimension element) +// +// Each thread owns BV state elements at its K position. +// Two cross-thread reductions (over K) compute delta and output dot products. 
+ +#pragma once + +#include +#include + +#include +#include + +#include +#include +#include + +#include + +namespace GDNDecodeKernel { + +// --------------------------------------------------------------------------- +// Constants +// --------------------------------------------------------------------------- + +inline constexpr int BV = 32; // V-dimension tile size + +// --------------------------------------------------------------------------- +// Warp-level reduction +// --------------------------------------------------------------------------- + +__device__ __forceinline__ float warp_reduce_sum(float val) { + #pragma unroll + for (int offset = 16; offset > 0; offset >>= 1) { + val += __shfl_xor_sync(0xffffffff, val, offset); + } + return val; +} + +// --------------------------------------------------------------------------- +// Type conversion helpers +// --------------------------------------------------------------------------- + +template +__device__ __forceinline__ float to_float(T val); + +template <> +__device__ __forceinline__ float to_float<__half>(__half val) { + return __half2float(val); +} + +template <> +__device__ __forceinline__ float to_float<__nv_bfloat16>(__nv_bfloat16 val) { + return __bfloat162float(val); +} + +template <> +__device__ __forceinline__ float to_float(float val) { + return val; +} + +template +__device__ __forceinline__ T from_float(float val); + +template <> +__device__ __forceinline__ __half from_float<__half>(float val) { + return __float2half(val); +} + +template <> +__device__ __forceinline__ __nv_bfloat16 from_float<__nv_bfloat16>(float val) { + return __float2bfloat16(val); +} + +template <> +__device__ __forceinline__ float from_float(float val) { + return val; +} + +// --------------------------------------------------------------------------- +// Block-level scalar reduction (sum across all threads → broadcast result) +// --------------------------------------------------------------------------- + +// Reduces a 
scalar across all threads in the block. +// Returns the sum in ALL threads (via shared memory broadcast). +// smem must have at least (blockDim.x / 32) floats. +__device__ __forceinline__ float block_reduce_sum(float val, float* smem) { + const int warp_id = threadIdx.x / 32; + const int lane_id = threadIdx.x % 32; + const int num_warps = blockDim.x / 32; + + val = warp_reduce_sum(val); + if (lane_id == 0) smem[warp_id] = val; + __syncthreads(); + + // First warp reduces across warps + if (warp_id == 0) { + float v = (lane_id < num_warps) ? smem[lane_id] : 0.0f; + v = warp_reduce_sum(v); + if (lane_id == 0) smem[0] = v; + } + __syncthreads(); + return smem[0]; +} + +// --------------------------------------------------------------------------- +// Block-level vector reduction: BV independent sums across all K threads +// --------------------------------------------------------------------------- + +// Each thread contributes partial[0..BV-1]. After this call, the results +// are written to out[0..BV-1] and are valid in all threads. +// reduce_buf must have at least BV * num_warps floats. +// broadcast_buf must have at least BV floats. 
+__device__ __forceinline__ void block_reduce_bv( + float partial[BV], + float* reduce_buf, // [num_warps * BV] + float* broadcast_buf, // [BV] + float out[BV] +) { + const int warp_id = threadIdx.x / 32; + const int lane_id = threadIdx.x % 32; + const int num_warps = blockDim.x / 32; + + // Intra-warp reduction for each bv + #pragma unroll + for (int bv = 0; bv < BV; bv++) { + float val = warp_reduce_sum(partial[bv]); + if (lane_id == 0) { + reduce_buf[warp_id * BV + bv] = val; + } + } + __syncthreads(); + + // Inter-warp reduction: threads 0..BV-1 each reduce one bv + if (threadIdx.x < BV) { + float sum = 0.0f; + #pragma unroll 8 + for (int w = 0; w < num_warps; w++) { + sum += reduce_buf[w * BV + threadIdx.x]; + } + broadcast_buf[threadIdx.x] = sum; + } + __syncthreads(); + + // Broadcast to all threads + #pragma unroll + for (int bv = 0; bv < BV; bv++) { + out[bv] = broadcast_buf[bv]; + } +} + +// --------------------------------------------------------------------------- +// Main GDN decode kernel +// --------------------------------------------------------------------------- + +template +__global__ void gdn_decode_kernel( + const T* __restrict__ q_ptr, // [bs, H, K] + const T* __restrict__ k_ptr, // [bs, H, K] + const T* __restrict__ v_ptr, // [bs, HV, V] + const T* __restrict__ a_ptr, // [bs, HV] + const T* __restrict__ b_ptr, // [bs, HV] + const float* __restrict__ A_log_ptr, // [HV] + const float* __restrict__ dt_bias_ptr, // [HV] + float* __restrict__ state_pool, // [pool_size, HV, V, K] + const int64_t* __restrict__ cache_indices, // [bs] + T* __restrict__ output_ptr, // [bs, HV, V] + const int bs, + const int H, // num_k_heads + const int HV, // num_v_heads + const int K, // head_k_dim + const int V, // head_v_dim + const float scale // K^-0.5 +) { + // Block indices + const int bv_block = blockIdx.x; // V-tile index + const int batch_head = blockIdx.y; // batch * HV + const int i_n = batch_head / HV; // batch index + const int i_hv = batch_head % HV; 
// value head index + const int i_h = i_hv * H / HV; // key head index (GQA mapping) + const int k_idx = threadIdx.x; // K-dimension index + const int v_start = bv_block * BV; // V-dimension start + + if (i_n >= bs) return; + + // Shared memory layout (declared dynamically) + extern __shared__ float smem[]; + const int num_warps = BLOCK_K / 32; + float* sq = smem; // [BLOCK_K] + float* sk = smem + BLOCK_K; // [BLOCK_K] + float* sv_broadcast = smem + 2 * BLOCK_K; // [BV] + float* warp_buf = smem + 2 * BLOCK_K + BV; // [num_warps] + float* reduce_buf = smem + 2 * BLOCK_K + BV + num_warps; // [BV * num_warps] + + // ===== 1. Load gating parameters and compute decay + beta ===== + // All threads load the same scalars (cheap, avoids shared memory) + const float A_log_val = A_log_ptr[i_hv]; + const float dt_bias_val = dt_bias_ptr[i_hv]; + const float a_val = to_float(a_ptr[i_n * HV + i_hv]); + const float b_val = to_float(b_ptr[i_n * HV + i_hv]); + + const float x = a_val + dt_bias_val; + // softplus with numerical stability: softplus(x) = log(1+exp(x)), or x for x>20 + const float softplus_x = (x <= 20.0f) ? logf(1.0f + expf(x)) : x; + const float g = -expf(A_log_val) * softplus_x; + const float decay = expf(g); + const float beta = 1.0f / (1.0f + expf(-b_val)); + + // ===== 2. 
Load q, k and compute L2 norms ===== + float q_val = 0.0f, k_val = 0.0f; + if (k_idx < K) { + q_val = to_float(q_ptr[i_n * H * K + i_h * K + k_idx]); + k_val = to_float(k_ptr[i_n * H * K + i_h * K + k_idx]); + } + + // L2 norm: reduce q*q and k*k across block + float q_sq_sum = block_reduce_sum(q_val * q_val, warp_buf); + float k_sq_sum = block_reduce_sum(k_val * k_val, warp_buf); + + float q_norm = rsqrtf(q_sq_sum + 1e-6f); + float k_norm = rsqrtf(k_sq_sum + 1e-6f); + + // Store normalized q (scaled) and k in shared memory + if (k_idx < K) { + sq[k_idx] = q_val * q_norm * scale; + sk[k_idx] = k_val * k_norm; + } else { + sq[k_idx] = 0.0f; + sk[k_idx] = 0.0f; + } + __syncthreads(); + + // ===== 3. Load state elements for this thread ===== + const int64_t pool_idx = cache_indices[i_n]; + // state_pool layout: [pool_size, HV, V, K] + const int64_t state_base = pool_idx * HV * V * K + i_hv * V * K; + + float state[BV]; + #pragma unroll + for (int bv = 0; bv < BV; bv++) { + const int v_idx = v_start + bv; + if (v_idx < V && k_idx < K) { + state[bv] = state_pool[state_base + (int64_t)v_idx * K + k_idx]; + } else { + state[bv] = 0.0f; + } + } + + // ===== 4. Decay: state *= exp(g) ===== + #pragma unroll + for (int bv = 0; bv < BV; bv++) { + state[bv] *= decay; + } + + // ===== 5. Delta: v_delta[bv] = v[bv] - sum_k(state[bv,k] * k_norm[k]) ===== + float partial_delta[BV]; + const float my_k = sk[k_idx]; + #pragma unroll + for (int bv = 0; bv < BV; bv++) { + partial_delta[bv] = state[bv] * my_k; + } + + float delta[BV]; + block_reduce_bv(partial_delta, reduce_buf, sv_broadcast, delta); + + // Compute v_delta = (v - delta) * beta and broadcast to all threads. + // Threads 0..BV-1 each load one v element, compute v_delta, write to smem. + if (k_idx < BV) { + const int my_v_idx = v_start + k_idx; + float my_v = (my_v_idx < V) + ? 
to_float(v_ptr[i_n * HV * V + i_hv * V + my_v_idx]) + : 0.0f; + sv_broadcast[k_idx] = (my_v - delta[k_idx]) * beta; + } + __syncthreads(); + + float v_delta[BV]; + #pragma unroll + for (int bv = 0; bv < BV; bv++) { + v_delta[bv] = sv_broadcast[bv]; + } + + // ===== 6. State update: state[bv,k] += v_delta[bv] * k_norm[k] ===== + #pragma unroll + for (int bv = 0; bv < BV; bv++) { + state[bv] += v_delta[bv] * my_k; + } + + // ===== 7. Output: o[bv] = sum_k(state[bv,k] * q_norm_scaled[k]) ===== + float partial_out[BV]; + const float my_q = sq[k_idx]; + #pragma unroll + for (int bv = 0; bv < BV; bv++) { + partial_out[bv] = state[bv] * my_q; + } + + float out_vals[BV]; + block_reduce_bv(partial_out, reduce_buf, sv_broadcast, out_vals); + + // ===== 8. Store output ===== + // output layout: [bs, HV, V] + if (k_idx < BV) { + const int v_idx = v_start + k_idx; + if (v_idx < V) { + output_ptr[i_n * HV * V + i_hv * V + v_idx] = from_float(out_vals[k_idx]); + } + } + + // ===== 9. Store state back to pool ===== + #pragma unroll + for (int bv = 0; bv < BV; bv++) { + const int v_idx = v_start + bv; + if (v_idx < V && k_idx < K) { + state_pool[state_base + (int64_t)v_idx * K + k_idx] = state[bv]; + } + } +} + +// --------------------------------------------------------------------------- +// Launch wrapper (called via TVM FFI) +// --------------------------------------------------------------------------- + +void run( + tvm::ffi::TensorView q, // [bs, H, K] + tvm::ffi::TensorView k, // [bs, H, K] + tvm::ffi::TensorView v, // [bs, HV, V] + tvm::ffi::TensorView a, // [bs, HV] + tvm::ffi::TensorView b, // [bs, HV] + tvm::ffi::TensorView A_log, // [HV] + tvm::ffi::TensorView dt_bias, // [HV] + tvm::ffi::TensorView state_pool, // [pool_size, HV, V, K] + tvm::ffi::TensorView cache_indices, // [bs] + tvm::ffi::TensorView output // [bs, HV, V] +) { + using namespace mllm_kernel::host; + + // --- Extract dimensions --- + auto BS = SymbolicSize{"bs"}; + auto H_ = SymbolicSize{"H"}; + auto 
HV_ = SymbolicSize{"HV"}; + auto K_ = SymbolicSize{"K"}; + auto V_ = SymbolicSize{"V"}; + auto PS = SymbolicSize{"pool_size"}; + auto dtype = SymbolicDType{}; + auto device = SymbolicDevice{}; + device.set_options(); + dtype.set_options(); + + (void)TensorMatcher({BS, H_, K_}).with_dtype(dtype).with_device(device).verify(q); + (void)TensorMatcher({BS, H_, K_}).with_dtype(dtype).with_device(device).verify(k); + (void)TensorMatcher({BS, HV_, V_}).with_dtype(dtype).with_device(device).verify(v); + (void)TensorMatcher({BS, HV_}).with_dtype(dtype).with_device(device).verify(a); + (void)TensorMatcher({BS, HV_}).with_dtype(dtype).with_device(device).verify(b); + (void)TensorMatcher({HV_}).with_dtype().with_device(device).verify(A_log); + (void)TensorMatcher({HV_}).with_dtype().with_device(device).verify(dt_bias); + (void)TensorMatcher({PS, HV_, V_, K_}).with_dtype().with_device(device).verify(state_pool); + (void)TensorMatcher({BS}).with_device(device).verify(cache_indices); + (void)TensorMatcher({BS, HV_, V_}).with_dtype(dtype).with_device(device).verify(output); + + const int bs = static_cast(BS.unwrap()); + const int H = static_cast(H_.unwrap()); + const int HV = static_cast(HV_.unwrap()); + const int K = static_cast(K_.unwrap()); + const int V = static_cast(V_.unwrap()); + const float scale = 1.0f / sqrtf(static_cast(K)); + + // Block size = K (rounded up to warp multiple, max 1024) + int block_k = ((K + 31) / 32) * 32; + if (block_k > 1024) block_k = 1024; + const int num_warps = block_k / 32; + + // Grid + const int NV = (V + BV - 1) / BV; + dim3 grid(NV, bs * HV); + dim3 block(block_k); + + // Dynamic shared memory: sq[block_k] + sk[block_k] + sv[BV] + warp_buf[nw] + reduce[BV*nw] + const size_t smem_bytes = (2 * block_k + BV + num_warps + BV * num_warps) * sizeof(float); + + const DLDevice dl_device = device.unwrap(); + + // Typed launch helper + #define LAUNCH_GDN_DECODE(CType, BKVAL) \ + LaunchKernel(grid, block, dl_device, smem_bytes)( \ + gdn_decode_kernel, \ 
+ static_cast(q.data_ptr()), \ + static_cast(k.data_ptr()), \ + static_cast(v.data_ptr()), \ + static_cast(a.data_ptr()), \ + static_cast(b.data_ptr()), \ + static_cast(A_log.data_ptr()), \ + static_cast(dt_bias.data_ptr()), \ + static_cast(state_pool.data_ptr()), \ + static_cast(cache_indices.data_ptr()), \ + static_cast(output.data_ptr()), \ + bs, H, HV, K, V, scale \ + ) + + // Dispatch based on dtype and block size + if (dtype.is_type()) { + if (block_k == 64) { LAUNCH_GDN_DECODE(__nv_bfloat16, 64); } + else if (block_k == 128) { LAUNCH_GDN_DECODE(__nv_bfloat16, 128); } + else if (block_k == 256) { LAUNCH_GDN_DECODE(__nv_bfloat16, 256); } + else { LAUNCH_GDN_DECODE(__nv_bfloat16, 256); } + } else { + if (block_k == 64) { LAUNCH_GDN_DECODE(__half, 64); } + else if (block_k == 128) { LAUNCH_GDN_DECODE(__half, 128); } + else if (block_k == 256) { LAUNCH_GDN_DECODE(__half, 256); } + else { LAUNCH_GDN_DECODE(__half, 256); } + } + + #undef LAUNCH_GDN_DECODE +} + +} // namespace GDNDecodeKernel diff --git a/mllm-kernel/mllm_kernel/cuda/csrc/rms_norm_gated.cuh b/mllm-kernel/mllm_kernel/cuda/csrc/rms_norm_gated.cuh new file mode 100644 index 00000000..b6124602 --- /dev/null +++ b/mllm-kernel/mllm_kernel/cuda/csrc/rms_norm_gated.cuh @@ -0,0 +1,212 @@ +// Fused RMSNorm with optional SiLU gating for Qwen3.5 GDN attention. +// +// Computes: output = rmsnorm(x, weight, eps) * silu(z) (if z provided) +// output = rmsnorm(x, weight, eps) (if z is null) +// +// Where: rmsnorm(x) = x / sqrt(mean(x^2) + eps) * weight +// silu(z) = z * sigmoid(z) +// +// This kernel fuses both operations into a single pass over the data, +// maximizing memory bandwidth utilization. Each block processes one row +// (one token position). +// +// Supported dtypes: float16, bfloat16 (accumulation in float32). 
+ +#pragma once + +#include +#include +#include + +#include +#include + +#include +#include +#include + +namespace RMSNormGatedKernel { + +// --------------------------------------------------------------------------- +// Warp-level reduction +// --------------------------------------------------------------------------- + +__device__ __forceinline__ float warp_reduce_sum(float val) { + #pragma unroll + for (int offset = 16; offset > 0; offset >>= 1) { + val += __shfl_xor_sync(0xffffffff, val, offset); + } + return val; +} + +// --------------------------------------------------------------------------- +// Type conversion helpers +// --------------------------------------------------------------------------- + +template +__device__ __forceinline__ float to_float(T val); + +template <> +__device__ __forceinline__ float to_float(half val) { + return __half2float(val); +} + +template <> +__device__ __forceinline__ float to_float<__nv_bfloat16>(__nv_bfloat16 val) { + return __bfloat162float(val); +} + +template <> +__device__ __forceinline__ float to_float(float val) { + return val; +} + +template +__device__ __forceinline__ T from_float(float val); + +template <> +__device__ __forceinline__ half from_float(float val) { + return __float2half(val); +} + +template <> +__device__ __forceinline__ __nv_bfloat16 from_float<__nv_bfloat16>(float val) { + return __float2bfloat16(val); +} + +template <> +__device__ __forceinline__ float from_float(float val) { + return val; +} + +// --------------------------------------------------------------------------- +// Main kernel +// --------------------------------------------------------------------------- + +template +__global__ void rms_norm_gated_kernel( + T* __restrict__ output, // [M, N] + const T* __restrict__ input, // [M, N] + const T* __restrict__ weight, // [N] + const T* __restrict__ gate, // [M, N] or nullptr + const int M, // number of rows + const int N, // number of columns (hidden_size) + const float eps +) { + const 
int row = blockIdx.x; + if (row >= M) return; + + const int tid = threadIdx.x; + const T* x_row = input + row * N; + T* out_row = output + row * N; + const T* z_row = (gate != nullptr) ? gate + row * N : nullptr; + + // --- Pass 1: compute sum of squares --- + float sum_sq = 0.0f; + for (int col = tid; col < N; col += BLOCK_SIZE) { + float val = to_float(x_row[col]); + sum_sq += val * val; + } + + // Block-level reduction + __shared__ float shared_sum[32]; // one per warp + int warp_id = tid / 32; + int lane_id = tid % 32; + + sum_sq = warp_reduce_sum(sum_sq); + if (lane_id == 0) { + shared_sum[warp_id] = sum_sq; + } + __syncthreads(); + + // Final reduction in first warp + if (warp_id == 0) { + float val = (lane_id < (BLOCK_SIZE / 32)) ? shared_sum[lane_id] : 0.0f; + val = warp_reduce_sum(val); + if (lane_id == 0) { + shared_sum[0] = val; + } + } + __syncthreads(); + + float rms = rsqrtf(shared_sum[0] / (float)N + eps); + + // --- Pass 2: normalize, scale by weight, optionally gate with silu(z) --- + for (int col = tid; col < N; col += BLOCK_SIZE) { + float val = to_float(x_row[col]); + float w = to_float(weight[col]); + + float normed = val * rms * w; + + if (z_row != nullptr) { + float z = to_float(z_row[col]); + // silu(z) = z * sigmoid(z) + float silu_z = z / (1.0f + expf(-z)); + normed *= silu_z; + } + + out_row[col] = from_float(normed); + } +} + +// --------------------------------------------------------------------------- +// Launch wrapper (called via TVM FFI) +// --------------------------------------------------------------------------- + +void run( + tvm::ffi::TensorView output, + tvm::ffi::TensorView input, + tvm::ffi::TensorView weight, + tvm::ffi::TensorView gate, // empty tensor (numel==0) means no gate + double eps +) { + using namespace mllm_kernel::host; + + auto M = SymbolicSize{"M"}; + auto N = SymbolicSize{"N"}; + auto dtype = SymbolicDType{}; + auto device = SymbolicDevice{}; + device.set_options(); + dtype.set_options(); + + 
(void)TensorMatcher({M, N}).with_dtype(dtype).with_device(device).verify(input); + (void)TensorMatcher({M, N}).with_dtype(dtype).with_device(device).verify(output); + (void)TensorMatcher({N}).with_dtype(dtype).with_device(device).verify(weight); + + const int rows = static_cast(M.unwrap()); + const int cols = static_cast(N.unwrap()); + const bool has_gate = (gate.numel() > 0); + + constexpr int BLOCK_SIZE = 256; + + if (dtype.is_type()) { + LaunchKernel(rows, BLOCK_SIZE, device.unwrap())( + rms_norm_gated_kernel, + static_cast(output.data_ptr()), + static_cast(input.data_ptr()), + static_cast(weight.data_ptr()), + has_gate ? static_cast(gate.data_ptr()) : nullptr, + rows, cols, static_cast(eps) + ); + } else if (dtype.is_type()) { + LaunchKernel(rows, BLOCK_SIZE, device.unwrap())( + rms_norm_gated_kernel<__nv_bfloat16, BLOCK_SIZE>, + static_cast<__nv_bfloat16*>(output.data_ptr()), + static_cast(input.data_ptr()), + static_cast(weight.data_ptr()), + has_gate ? static_cast(gate.data_ptr()) : nullptr, + rows, cols, static_cast(eps) + ); + } else { + LaunchKernel(rows, BLOCK_SIZE, device.unwrap())( + rms_norm_gated_kernel, + static_cast(output.data_ptr()), + static_cast(input.data_ptr()), + static_cast(weight.data_ptr()), + has_gate ? 
static_cast(gate.data_ptr()) : nullptr, + rows, cols, static_cast(eps) + ); + } +} + +} // namespace RMSNormGatedKernel diff --git a/mllm-kernel/mllm_kernel/cuda/jit/__init__.py b/mllm-kernel/mllm_kernel/cuda/jit/__init__.py index 202ff3b3..cc4ab667 100644 --- a/mllm-kernel/mllm_kernel/cuda/jit/__init__.py +++ b/mllm-kernel/mllm_kernel/cuda/jit/__init__.py @@ -1,4 +1,5 @@ from .add_constant import add_constant +from .gdn_decode import gdn_decode from .store_cache import can_use_store_cache, store_cache -__all__ = ["add_constant", "can_use_store_cache", "store_cache"] +__all__ = ["add_constant", "can_use_store_cache", "gdn_decode", "store_cache"] diff --git a/mllm-kernel/mllm_kernel/cuda/jit/gdn_decode.py b/mllm-kernel/mllm_kernel/cuda/jit/gdn_decode.py new file mode 100644 index 00000000..53aaeaab --- /dev/null +++ b/mllm-kernel/mllm_kernel/cuda/jit/gdn_decode.py @@ -0,0 +1,114 @@ +"""Fused GDN decode CUDA JIT kernel. + +Performs a single-token GDN (Gated Delta Net) recurrent update per request, +fusing gating + L2 normalization + delta rule + output computation into +one kernel. Works on SM80+ (Ampere, Jetson Orin, Hopper, ...). 
+ +Usage:: + + from mllm_kernel.cuda.jit.gdn_decode import gdn_decode + + output = gdn_decode(q, k, v, a, b, A_log, dt_bias, state_pool, cache_indices) +""" + +from __future__ import annotations + +import torch + +from mllm_kernel.jit_utils import cache_once, jit + + +@cache_once +def _make_gdn_decode_kernel(): + """JIT-compile the fused GDN decode CUDA kernel.""" + + @jit( + args=[], + device="cuda", + cuda_files=["gdn_decode.cuh"], + cpp_wrappers=[], + cuda_wrappers=[ + ("gdn_decode", "GDNDecodeKernel::run"), + ], + func_name="gdn_decode", + ) + def _kernel( + compiled_module, + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + a: torch.Tensor, + b: torch.Tensor, + A_log: torch.Tensor, + dt_bias: torch.Tensor, + state_pool: torch.Tensor, + cache_indices: torch.Tensor, + output: torch.Tensor, + ) -> None: + compiled_module.gdn_decode( + q, k, v, a, b, A_log, dt_bias, state_pool, cache_indices, output + ) + + return _kernel + + +def gdn_decode( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + a: torch.Tensor, + b: torch.Tensor, + A_log: torch.Tensor, + dt_bias: torch.Tensor, + state_pool: torch.Tensor, + cache_indices: torch.Tensor, +) -> torch.Tensor: + """Fused GDN decode: gating + L2 norm + delta rule + output. + + Parameters + ---------- + q : torch.Tensor + Query tensor, shape ``(bs, num_k_heads, head_k_dim)``, bf16/fp16. + k : torch.Tensor + Key tensor, shape ``(bs, num_k_heads, head_k_dim)``, bf16/fp16. + v : torch.Tensor + Value tensor, shape ``(bs, num_v_heads, head_v_dim)``, bf16/fp16. + a : torch.Tensor + Decay gate input, shape ``(bs, num_v_heads)``, bf16/fp16. + b : torch.Tensor + Update gate input, shape ``(bs, num_v_heads)``, bf16/fp16. + A_log : torch.Tensor + Log-space decay parameter, shape ``(num_v_heads,)``, float32. + dt_bias : torch.Tensor + Bias for decay gate, shape ``(num_v_heads,)``, float32. + state_pool : torch.Tensor + Pooled recurrent state, shape ``(pool_size, num_v_heads, head_v_dim, head_k_dim)``, + float32. 
Modified in-place. + cache_indices : torch.Tensor + Pool indices per request, shape ``(bs,)``, int64. + + Returns + ------- + torch.Tensor + Output tensor, shape ``(bs, num_v_heads, head_v_dim)``, same dtype as v. + """ + bs = q.shape[0] + num_v_heads = v.shape[1] + head_v_dim = v.shape[2] + + output = torch.empty(bs, num_v_heads, head_v_dim, dtype=v.dtype, device=v.device) + + kernel = _make_gdn_decode_kernel() + kernel( + q.contiguous(), + k.contiguous(), + v.contiguous(), + a.contiguous(), + b.contiguous(), + A_log.contiguous(), + dt_bias.contiguous(), + state_pool, + cache_indices.to(torch.int64).contiguous(), + output, + ) + return output diff --git a/mllm-kernel/mllm_kernel/cuda/jit/rms_norm_gated.py b/mllm-kernel/mllm_kernel/cuda/jit/rms_norm_gated.py new file mode 100644 index 00000000..d7906a38 --- /dev/null +++ b/mllm-kernel/mllm_kernel/cuda/jit/rms_norm_gated.py @@ -0,0 +1,87 @@ +"""Fused RMSNorm + SiLU gating CUDA JIT kernel for Qwen3.5 GDN attention. + +Computes ``rmsnorm(x, weight, eps) * silu(z)`` in a single fused pass. 
+ +Usage:: + + from mllm_kernel.cuda.jit.rms_norm_gated import rms_norm_gated + + output = rms_norm_gated(x, weight, z=gate, eps=1e-6) +""" + +from __future__ import annotations + +import torch + +from mllm_kernel.jit_utils import cache_once, jit + + +@cache_once +def _make_rms_norm_gated_kernel(): + """JIT-compile the fused RMSNorm+gating CUDA kernel.""" + + @jit( + args=[], + device="cuda", + cuda_files=["rms_norm_gated.cuh"], + cpp_wrappers=[], + cuda_wrappers=[ + ("rms_norm_gated", "RMSNormGatedKernel::run"), + ], + func_name="rms_norm_gated", + ) + def _kernel( + compiled_module, + output: torch.Tensor, + input: torch.Tensor, + weight: torch.Tensor, + gate: torch.Tensor, + eps: float, + ) -> None: + compiled_module.rms_norm_gated(output, input, weight, gate, eps) + + return _kernel + + +def rms_norm_gated( + x: torch.Tensor, + weight: torch.Tensor, + z: torch.Tensor | None = None, + eps: float = 1e-6, +) -> torch.Tensor: + """Fused RMSNorm with optional SiLU gating. + + Parameters + ---------- + x : torch.Tensor + Input tensor, shape ``(M, N)`` or ``(..., N)``. + weight : torch.Tensor + Normalization weight, shape ``(N,)``. + z : torch.Tensor or None + Optional gating tensor, same shape as ``x``. + If provided: ``output = rmsnorm(x) * silu(z)`` + eps : float + Epsilon for numerical stability. + + Returns + ------- + torch.Tensor + Output with same shape and dtype as ``x``. 
+ """ + x_shape = x.shape + x_2d = x.reshape(-1, x.shape[-1]) + + if z is not None: + z_2d = z.reshape(-1, z.shape[-1]) + if z_2d.stride(-1) != 1: + z_2d = z_2d.contiguous() + else: + z_2d = x.new_empty(0) # empty tensor signals "no gate" to the kernel + + if x_2d.stride(-1) != 1: + x_2d = x_2d.contiguous() + + output = torch.empty_like(x_2d) + kernel = _make_rms_norm_gated_kernel() + kernel(output, x_2d, weight.contiguous(), z_2d, eps) + return output.reshape(x_shape) diff --git a/pymllm/configs/global_config.py b/pymllm/configs/global_config.py index 1761697b..711de3cd 100644 --- a/pymllm/configs/global_config.py +++ b/pymllm/configs/global_config.py @@ -127,6 +127,16 @@ def _converter_for_annotation(annotation: Any) -> Optional[Callable[[str], Any]] return None +def _choices_for_annotation(annotation: Any) -> Optional[list]: + """Extract allowed values from a ``Literal`` annotation, if applicable.""" + + inner, _ = _unwrap_optional(annotation) + origin = get_origin(inner) + if origin is Literal: + return list(get_args(inner)) + return None + + def _is_bool_annotation(annotation: Any) -> bool: """Return ``True`` if annotation represents a bool/Optional[bool] field.""" @@ -225,16 +235,27 @@ def make_args( # Skip non-scalar or runtime-only fields (e.g. arbitrary objects). continue - section_group.add_argument( - option, + choices = _choices_for_annotation(annotation) + kwargs: dict[str, Any] = dict( dest=dest, type=converter, default=argparse.SUPPRESS, - help=( + ) + if choices is not None: + kwargs["choices"] = choices + choices_str = ", ".join(str(c) for c in choices) + kwargs["help"] = ( + f"{section_name}.{dc_field.name} " + f"{{choices: {choices_str}}} " + f"(default: {_format_default_for_help(default_value)})." + ) + else: + kwargs["help"] = ( f"{section_name}.{dc_field.name} (default: " f"{_format_default_for_help(default_value)})." 
- ), - ) + ) + + section_group.add_argument(option, **kwargs) return parser diff --git a/pymllm/configs/server_config.py b/pymllm/configs/server_config.py index f6a2090f..8727f7c1 100644 --- a/pymllm/configs/server_config.py +++ b/pymllm/configs/server_config.py @@ -40,19 +40,13 @@ class ServerConfig: max_queued_requests: Optional[int] = None max_total_tokens: Optional[int] = None chunked_prefill_size: Optional[int] = None - max_prefill_tokens: int = None + max_prefill_tokens: Optional[int] = None schedule_policy: Literal["auto", "fcfs"] = "fcfs" schedule_conservativeness: float = 1.0 sleep_on_idle: bool = False stream_interval: int = 1 stream_output: bool = True - # --------------------------------------------------------------------- # - # Threads - # --------------------------------------------------------------------- # - enable_disk_io_async: bool = False - disk_io_async_thread_count: int = 1 - # --------------------------------------------------------------------- # # Device # --------------------------------------------------------------------- # @@ -62,23 +56,34 @@ class ServerConfig: # Backend / acceleration # --------------------------------------------------------------------- # attention_backend: Literal["auto", "flashinfer"] = "auto" + gdn_decode_backend: Literal["auto", "flashinfer", "mllm_kernel", "pytorch"] = "auto" sampling_backend: Optional[str] = None disable_cuda_graph: bool = False - enable_torch_compile: bool = True + enable_torch_compile: bool = False torch_compile_max_bs: int = 32 random_seed: Optional[int] = 42 + # --------------------------------------------------------------------- # + # Output parsers (reasoning / tool calls) + # --------------------------------------------------------------------- # + reasoning_parser: Optional[str] = None # e.g. "deepseek-r1", "qwen3" + tool_call_parser: Optional[str] = None # e.g. 
"qwen25", "llama3", "hermes" + # --------------------------------------------------------------------- # # Logging and observability # --------------------------------------------------------------------- # log_level: Literal["debug", "info", "warning", "error", "critical"] = "info" enable_metrics: bool = False show_time_cost: bool = False + # Log prefill/decode throughput stats every N decode batches (0 = disabled) + decode_log_interval: int = 40 # --------------------------------------------------------------------- # # Feature switches # --------------------------------------------------------------------- # enable_shared_queue: bool = False # Use shared memory queue for fast IPC + disable_radix_cache: bool = False # Disable radix-tree prefix caching + radix_cache_page_size: int = 1 # Number of tokens per KV-pool page in RadixCache # CUDA IPC transport for multimodal GPU tensors. # Requires enable_shared_queue=True to take effect. @@ -161,5 +166,7 @@ def _validate(self) -> None: raise ValueError("`max_running_requests` must be > 0 when set.") if self.max_queued_requests is not None and self.max_queued_requests < 0: raise ValueError("`max_queued_requests` must be >= 0 when set.") + if self.radix_cache_page_size < 1: + raise ValueError("`radix_cache_page_size` must be >= 1.") if self.schedule_conservativeness <= 0: raise ValueError("`schedule_conservativeness` must be > 0.") diff --git a/pymllm/engine/forward_batch.py b/pymllm/engine/forward_batch.py index ebb715ff..428da7b6 100644 --- a/pymllm/engine/forward_batch.py +++ b/pymllm/engine/forward_batch.py @@ -180,3 +180,12 @@ class ForwardBatch: # ---- attention backend (set by model runner) ---- attn_backend: Optional["AttentionBackend"] = None + + # ---- multimodal M-RoPE ---- + # Per-request position delta for M-RoPE decode steps. + # Set by the model during prefill; consumed during decode to offset positions. 
+ mrope_position_deltas: Optional[torch.Tensor] = None # [batch_size] int64 + + # ---- multimodal vision inputs (extend / prefill only) ---- + pixel_values: Optional[torch.Tensor] = None + image_grid_thw: Optional[torch.Tensor] = None diff --git a/pymllm/engine/launch.py b/pymllm/engine/launch.py index 2ba04e1c..e5214511 100644 --- a/pymllm/engine/launch.py +++ b/pymllm/engine/launch.py @@ -28,12 +28,102 @@ ) from pymllm.orchestrator.tokenizer_process import run_tokenizer_process from pymllm.orchestrator.scheduler_process import run_scheduler_process -from pymllm.orchestrator.model_runner_process import run_model_runner_process from pymllm.orchestrator.detokenizer_process import run_detokenizer_process -from pymllm.orchestrator.async_disk_io_process import run_async_disk_io_process logger = logging.getLogger(__name__) +# Standard HuggingFace config fields that indicate max output tokens, +# checked in priority order. +_MAX_NEW_TOKENS_FIELDS = ( + "max_new_tokens", + "max_tokens", + "max_completion_tokens", +) + + +def _normalize_eos_raw(raw) -> List[int]: + """Normalize a raw eos_token_id value (int, list, or None) to a list.""" + if raw is None: + return [] + if isinstance(raw, int): + return [raw] + if isinstance(raw, (list, tuple)): + return [x for x in raw if isinstance(x, int)] + return [] + + +def _get_eos_token_ids(hf_config, model_path=None) -> List[int]: + """Extract EOS token ID(s) from a HuggingFace model config. + + Searches in priority order: + 1. ``hf_config.eos_token_id`` (top-level, standard models) + 2. ``hf_config.text_config.eos_token_id`` (VL / multimodal models) + 3. ``generation_config.json`` (many models store EOS here) + 4. ``tokenizer_config.json`` via AutoTokenizer (last resort) + """ + if hf_config is None: + return [] + + # 1. Top-level config + ids = _normalize_eos_raw(getattr(hf_config, "eos_token_id", None)) + if ids: + return ids + + # 2. 
Nested text_config (VL / multimodal models like Qwen3-VL) + text_config = getattr(hf_config, "text_config", None) + if text_config is not None: + ids = _normalize_eos_raw(getattr(text_config, "eos_token_id", None)) + if ids: + return ids + + # 3. generation_config.json (lightweight, just reads a JSON file) + if model_path is not None: + try: + from transformers import GenerationConfig + + gen_cfg = GenerationConfig.from_pretrained(str(model_path)) + ids = _normalize_eos_raw(getattr(gen_cfg, "eos_token_id", None)) + if ids: + logger.info("EOS token IDs from generation_config.json: %s", ids) + return ids + except Exception: + pass + + # 4. Tokenizer (last resort) + if model_path is not None: + try: + from transformers import AutoTokenizer + + tok = AutoTokenizer.from_pretrained(str(model_path), trust_remote_code=True) + if tok.eos_token_id is not None: + ids = [tok.eos_token_id] + logger.info("EOS token ID from tokenizer: %s", ids) + return ids + except Exception: + pass + + return [] + + +def _get_model_default_max_new_tokens(hf_config) -> Optional[int]: + """Extract max output token limit from a HuggingFace model config. + + Checks standard fields in priority order. Returns ``None`` when the + config does not specify any recognised output-length field. 
+ """ + if hf_config is None: + return None + for field_name in _MAX_NEW_TOKENS_FIELDS: + value = getattr(hf_config, field_name, None) + if value is not None and isinstance(value, int) and value > 0: + logger.info( + "Using model config %s=%d as default max_new_tokens", + field_name, + value, + ) + return value + return None + class Engine: def __init__(self): @@ -59,20 +149,12 @@ def _launch_processes(self) -> None: addr_tokenizer_to_scheduler: str = make_ipc_address( "tokenizer_to_scheduler", uid ) - addr_scheduler_to_model_runner: str = make_ipc_address( - "scheduler_to_model_runner", uid - ) - addr_model_runner_to_scheduler: str = make_ipc_address( - "model_runner_to_scheduler", uid - ) addr_scheduler_to_detokenizer: str = make_ipc_address( "scheduler_to_detokenizer", uid ) addr_detokenizer_to_request_response: str = make_ipc_address( "detokenizer_to_request_response", uid ) - addr_scheduler_to_disk_io: str = make_ipc_address("scheduler_to_disk_io", uid) - # Record all subprocesses procs_and_readers: List[tuple] = [] @@ -114,6 +196,7 @@ def _launch_processes(self) -> None: "tensor_transport_mode": transport_mode, "cuda_ipc_pool_size_mb": cfg.server.cuda_ipc_pool_size_mb, "cuda_ipc_recycle_interval": cfg.server.cuda_ipc_recycle_interval, + "log_level": cfg.server.log_level, } # Tokenizer @@ -131,39 +214,44 @@ def _launch_processes(self) -> None: ) procs_and_readers.append((tokenizer_proc, tokenizer_reader, "tokenizer")) - # Scheduler + # Determine default max_new_tokens from model config (if available) + model_max_new_tokens = _get_model_default_max_new_tokens( + cfg.model.hf_config + ) + scheduler_kwargs = {} + if model_max_new_tokens is not None: + scheduler_kwargs["default_max_new_tokens"] = model_max_new_tokens + + # Extract EOS token ID(s) from model config + eos_token_ids = _get_eos_token_ids(cfg.model.hf_config, model_path=cfg.server.model_path) + if eos_token_ids: + scheduler_kwargs["eos_token_ids"] = eos_token_ids + logger.info("EOS token IDs for 
scheduler: %s", eos_token_ids) + + # Model runner config — passed to the scheduler process which now + # owns the model runner in-process (sglang-style architecture). + scheduler_kwargs["server_config"] = cfg.server + scheduler_kwargs["model_config"] = cfg.model + scheduler_kwargs["gpu_id"] = cfg.server.base_gpu_id + + # Scheduler (+ in-process model runner) scheduler_reader, scheduler_writer = mp.Pipe(duplex=False) scheduler_proc = mp.Process( target=run_scheduler_process, args=( addr_tokenizer_to_scheduler, - addr_scheduler_to_model_runner, - addr_model_runner_to_scheduler, addr_scheduler_to_detokenizer, scheduler_writer, shared_queue, # Pass shared queue enable_shared_queue, # Pass flag transport_mode, # Pass tensor transport mode + cfg.server.log_level, # Pass log level ), + kwargs=scheduler_kwargs, daemon=True, ) procs_and_readers.append((scheduler_proc, scheduler_reader, "scheduler")) - # Model Runner - model_runner_reader, model_runner_writer = mp.Pipe(duplex=False) - model_runner_proc = mp.Process( - target=run_model_runner_process, - args=( - addr_scheduler_to_model_runner, - addr_model_runner_to_scheduler, - model_runner_writer, - ), - daemon=True, - ) - procs_and_readers.append( - (model_runner_proc, model_runner_reader, "model_runner") - ) - # Detokenizer detokenizer_reader, detokenizer_writer = mp.Pipe(duplex=False) detokenizer_proc = mp.Process( @@ -172,21 +260,12 @@ def _launch_processes(self) -> None: addr_scheduler_to_detokenizer, addr_detokenizer_to_request_response, detokenizer_writer, + tokenizer_cfg, ), daemon=True, ) procs_and_readers.append((detokenizer_proc, detokenizer_reader, "detokenizer")) - # Async Disk I/O - if get_global_config().server.enable_disk_io_async: - disk_io_reader, disk_io_writer = mp.Pipe(duplex=False) - disk_io_proc = mp.Process( - target=run_async_disk_io_process, - args=(addr_scheduler_to_disk_io, disk_io_writer), - daemon=True, - ) - procs_and_readers.append((disk_io_proc, disk_io_reader, "async_disk_io")) - # Start 
all subprocesses for proc, _, name in procs_and_readers: proc.start() @@ -203,20 +282,15 @@ def _launch_processes(self) -> None: raise RuntimeError(f"{name} process failed to initialise: {msg}") logger.info("%s process ready", name) - # RR Process is current main process + # RR Process is current main process — only bind ZMQ sockets here. + # Background tasks are started lazily by listen() on the first + # add_request(), so they always run on the correct event loop. self._rr_process = RequestResponseProcess( send_to_tokenizer_addr=addr_request_response_to_tokenizer, recv_from_detokenizer_addr=addr_detokenizer_to_request_response, ) - - try: - self._loop = asyncio.get_running_loop() - except RuntimeError: - self._loop = asyncio.new_event_loop() - asyncio.set_event_loop(self._loop) - - self._rr_process.start(self._loop) - logger.info("RequestResponseProcess started in main process") + self._rr_process.start() + logger.info("RequestResponseProcess sockets bound") # Print colorful gradient ASCII art banner if HAS_BANNER_LIBS: @@ -296,7 +370,12 @@ async def _run() -> Union[Dict[str, Any], List[Dict[str, Any]]]: ) return list(outputs) - return self._loop.run_until_complete(_run()) + try: + loop = asyncio.get_event_loop() + except RuntimeError: + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + return loop.run_until_complete(_run()) async def generate_async( self, @@ -354,10 +433,15 @@ async def generate_async( else: yield await self._wait_for_final_result(single_rid, state) finally: - self._rr_process.remove_state(single_rid) + if not state.finished: + logger.info("Aborting request %s (client disconnected)", single_rid) + await self._rr_process.abort_request(single_rid) + else: + self._rr_process.remove_state(single_rid) else: rids_list: List[str] = rid if isinstance(rid, list) else [rid] # type: ignore[assignment] states: List[ReqState] = result # type: ignore[assignment] + _bg_tasks: List[asyncio.Task] = [] try: if stream: # Merge streams from all 
sub-requests using an asyncio queue. @@ -368,18 +452,18 @@ async def _forward(r: str, s: ReqState) -> None: await queue.put(chunk) await queue.put(None) # sentinel - tasks = [ + _bg_tasks = [ asyncio.create_task(_forward(r, s)) for r, s in zip(rids_list, states) ] done_count = 0 - while done_count < len(tasks): + while done_count < len(_bg_tasks): item = await queue.get() if item is None: done_count += 1 else: yield item - await asyncio.gather(*tasks) + await asyncio.gather(*_bg_tasks) else: for coro in asyncio.as_completed( [ @@ -389,8 +473,14 @@ async def _forward(r: str, s: ReqState) -> None: ): yield await coro finally: - for r in rids_list: - self._rr_process.remove_state(r) + for t in _bg_tasks: + t.cancel() + for r, s in zip(rids_list, states): + if not s.finished: + logger.info("Aborting request %s (client disconnected)", r) + await self._rr_process.abort_request(r) + else: + self._rr_process.remove_state(r) @staticmethod async def _wait_for_final_result(rid: str, state: ReqState) -> Dict[str, Any]: @@ -443,7 +533,11 @@ def shutdown(self) -> None: """Terminate all subprocesses.""" if self._rr_process is not None: try: - self._loop.run_until_complete(self._rr_process.shutdown()) + loop = asyncio.get_event_loop() + if loop.is_running(): + loop.create_task(self._rr_process.shutdown()) + else: + loop.run_until_complete(self._rr_process.shutdown()) except Exception: pass for proc in self._subprocesses: diff --git a/pymllm/executor/__init__.py b/pymllm/executor/__init__.py index e69de29b..b513b870 100644 --- a/pymllm/executor/__init__.py +++ b/pymllm/executor/__init__.py @@ -0,0 +1,10 @@ +"""Executor module: model loading, forward pass, and sampling.""" + +from pymllm.executor.cuda_graph_runner import CudaGraphRunner +from pymllm.executor.model_runner import LogitsProcessorOutput, ModelRunner + +__all__ = [ + "CudaGraphRunner", + "LogitsProcessorOutput", + "ModelRunner", +] diff --git a/pymllm/executor/cuda_graph_runner.py b/pymllm/executor/cuda_graph_runner.py 
index e69de29b..fe4fb0e9 100644 --- a/pymllm/executor/cuda_graph_runner.py +++ b/pymllm/executor/cuda_graph_runner.py @@ -0,0 +1,590 @@ +"""CUDA-graph accelerated forward pass for decode steps. + +Captures CUDA graphs for a set of discrete batch sizes so that the decode +forward pass can be replayed without CPU-side kernel-launch overhead. + +Simplified from sglang's ``CudaGraphRunner`` for pymllm's single-GPU +architecture. Handles: + +* Pre-allocated input buffers (avoids per-step allocations) +* CUDA-graph capture for each batch size +* Optional ``torch.compile`` integration +* Graph replay with padding to the nearest captured batch size + +Typical lifecycle:: + + runner = CudaGraphRunner(model_runner) # captures all batch sizes + + # --- inside the inference loop --- + if runner.can_run(forward_batch): + logits_output = runner.replay(forward_batch) + else: + logits_output = model_runner.forward(forward_batch) + +Integration with :class:`~pymllm.executor.model_runner.ModelRunner` +------------------------------------------------------------------- +The ``ModelRunner`` owns the ``CudaGraphRunner`` and delegates decode +batches to it when the batch size is within the captured range. The +``CudaGraphRunner`` calls ``attn_backend.init_forward_metadata_*_cuda_graph`` +directly (bypassing the normal ``init_forward_metadata`` path) so that +FlashInfer's per-batch planning is recorded inside the graph. 
+""" + +from __future__ import annotations + +import bisect +import gc +import logging +import time +from contextlib import contextmanager +from dataclasses import dataclass +from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Union + +import torch + +from pymllm.engine.forward_batch import ForwardBatch, ForwardMode +from pymllm.executor.model_runner import LogitsProcessorOutput + +if TYPE_CHECKING: + from pymllm.executor.model_runner import ModelRunner + from pymllm.layers.attention.attention_backend import AttentionBackend + +logger = logging.getLogger(__name__) + +# --------------------------------------------------------------------------- +# Global CUDA-graph memory pool (shared across all CudaGraphRunner instances) +# --------------------------------------------------------------------------- + +_global_graph_memory_pool: Optional[tuple] = None + + +def get_global_graph_memory_pool() -> Optional[tuple]: + """Return the shared CUDA graph memory pool handle.""" + return _global_graph_memory_pool + + +def set_global_graph_memory_pool(pool: tuple) -> None: + """Set the shared CUDA graph memory pool handle.""" + global _global_graph_memory_pool + _global_graph_memory_pool = pool + + +# --------------------------------------------------------------------------- +# Context managers +# --------------------------------------------------------------------------- + +# Flag indicating whether we are currently capturing a CUDA graph. +_is_capture_mode: bool = False + + +def is_capture_mode() -> bool: + """Return ``True`` if a CUDA-graph capture is in progress.""" + return _is_capture_mode + + +@contextmanager +def model_capture_mode(): + """Context manager that sets the global capture-mode flag.""" + global _is_capture_mode + _is_capture_mode = True + try: + yield + finally: + _is_capture_mode = False + + +@contextmanager +def freeze_gc(): + """Freeze the garbage collector during CUDA-graph capture. 
+ + GC activity during capture can interfere with the recorded stream + ordering. This context manager collects garbage before capture, + freezes all surviving objects, and unfreezes + re-collects afterwards. + """ + gc.collect() + gc.freeze() + try: + yield + finally: + gc.unfreeze() + gc.collect() + + +# --------------------------------------------------------------------------- +# Pre-allocated input buffers +# --------------------------------------------------------------------------- + + +@dataclass +class _InputBuffers: + """Pre-allocated GPU tensors used as CUDA-graph inputs. + + During graph capture these buffers are used as-is. During replay the + real batch data is copied into the first ``batch_size`` rows while the + remaining padding rows retain their fill values. + """ + + input_ids: torch.Tensor # [max_bs] int64 + req_pool_indices: torch.Tensor # [max_bs] int32 + seq_lens: torch.Tensor # [max_bs] int32 + seq_lens_cpu: torch.Tensor # [max_bs] int32 (CPU) + out_cache_loc: torch.Tensor # [max_bs] int64 + positions: torch.Tensor # [max_bs] int64 + mrope_position_deltas: torch.Tensor # [max_bs] int64 + + @classmethod + def create( + cls, + *, + device: torch.device, + max_bs: int, + seq_len_fill_value: int, + ) -> "_InputBuffers": + """Allocate all buffers for the given maximum batch size.""" + with torch.device(device): + input_ids = torch.zeros((max_bs,), dtype=torch.int64) + req_pool_indices = torch.zeros((max_bs,), dtype=torch.int32) + seq_lens = torch.full((max_bs,), seq_len_fill_value, dtype=torch.int32) + out_cache_loc = torch.zeros((max_bs,), dtype=torch.int64) + positions = torch.zeros((max_bs,), dtype=torch.int64) + mrope_position_deltas = torch.zeros((max_bs,), dtype=torch.int64) + + # seq_lens_cpu must be a real CPU tensor. 
+ seq_lens_cpu = torch.full( + (max_bs,), + seq_len_fill_value, + dtype=torch.int32, + device="cpu", + ) + + return cls( + input_ids=input_ids, + req_pool_indices=req_pool_indices, + seq_lens=seq_lens, + seq_lens_cpu=seq_lens_cpu, + out_cache_loc=out_cache_loc, + positions=positions, + mrope_position_deltas=mrope_position_deltas, + ) + + def populate( + self, + forward_batch: ForwardBatch, + padded_bs: int, + seq_len_fill_value: int, + ) -> None: + """Copy real batch data into the pre-allocated buffers. + + Any padding slots (``[real_bs : padded_bs]``) are filled with safe + defaults so that the captured graph does not access invalid memory. + """ + real_bs = forward_batch.batch_size + + # Reset padding slots when the padded size exceeds the real size. + if padded_bs != real_bs: + self.seq_lens.fill_(seq_len_fill_value) + self.out_cache_loc.zero_() + self.mrope_position_deltas.zero_() + + self.input_ids[:real_bs].copy_(forward_batch.input_ids) + self.req_pool_indices[:real_bs].copy_(forward_batch.req_pool_indices) + self.seq_lens[:real_bs].copy_(forward_batch.seq_lens) + self.out_cache_loc[:real_bs].copy_(forward_batch.out_cache_loc) + self.positions[:real_bs].copy_(forward_batch.positions) + + # Copy M-RoPE position deltas (used by Qwen3-VL for multimodal). + if forward_batch.mrope_position_deltas is not None: + self.mrope_position_deltas[:real_bs].copy_( + forward_batch.mrope_position_deltas + ) + else: + self.mrope_position_deltas[:real_bs].zero_() + + if forward_batch.seq_lens_cpu is not None: + if padded_bs != real_bs: + self.seq_lens_cpu.fill_(seq_len_fill_value) + self.seq_lens_cpu[:real_bs].copy_(forward_batch.seq_lens_cpu) + + +# --------------------------------------------------------------------------- +# Batch-size schedule +# --------------------------------------------------------------------------- + + +def _default_capture_batch_sizes(max_bs: int) -> List[int]: + """Return a list of batch sizes to capture. 
+ + Uses the same schedule as sglang (non-speculative):: + + [1, 2, 4, 8, 12, 16, 24, 32, 40, …, 256, 272, 288, …, 512, 544, …] + + Capped at *max_bs*. + """ + bs_list = ( + [1, 2, 4, 8, 12] + + list(range(16, 257, 8)) + + list(range(272, 512, 16)) + + list(range(512, max_bs + 1, 32)) + ) + bs_list = sorted(set(bs for bs in bs_list if bs <= max_bs)) + if not bs_list: + bs_list = [1] + return bs_list + + +# --------------------------------------------------------------------------- +# CudaGraphRunner +# --------------------------------------------------------------------------- + + +class CudaGraphRunner: + """Captures and replays CUDA graphs for decode-step forward passes. + + This class is the pymllm equivalent of sglang's ``CudaGraphRunner``, + stripped of distributed, speculative-decoding, LoRA, mamba, TBO, and + piecewise-graph complexities. + + Parameters + ---------- + model_runner + The owning :class:`~pymllm.executor.model_runner.ModelRunner`. + Must have been fully initialised before the ``CudaGraphRunner`` + is constructed. 
+ """ + + def __init__(self, model_runner: "ModelRunner"): + self.model_runner = model_runner + self.device = model_runner.device + + self.graphs: Dict[int, torch.cuda.CUDAGraph] = {} + self.output_buffers: Dict[int, LogitsProcessorOutput] = {} + + self.enable_torch_compile: bool = ( + model_runner.server_config.enable_torch_compile + ) + self.torch_compile_max_bs: int = model_runner.server_config.torch_compile_max_bs + + # ----------------------------------------------------------- + # Batch-size schedule + # ----------------------------------------------------------- + max_bs = model_runner.max_running_requests + self.capture_bs: List[int] = _default_capture_batch_sizes(max_bs) + self.compile_bs: List[int] = ( + [bs for bs in self.capture_bs if bs <= self.torch_compile_max_bs] + if self.enable_torch_compile + else [] + ) + self.max_bs: int = max(self.capture_bs) + + logger.info("CUDA graph capture batch sizes: %s", self.capture_bs) + + # ----------------------------------------------------------- + # Attention-backend CUDA-graph state + # ----------------------------------------------------------- + self.model_runner.attn_backend.init_cuda_graph_state(self.max_bs, self.max_bs) + + # Fill value for padded seq_lens so attention kernels don't div-by-0. 
+ self.seq_len_fill_value: int = ( + self.model_runner.attn_backend.get_cuda_graph_seq_len_fill_value() + ) + + # ----------------------------------------------------------- + # Pre-allocated input buffers + # ----------------------------------------------------------- + self.buffers: _InputBuffers = _InputBuffers.create( + device=torch.device(self.device), + max_bs=self.max_bs, + seq_len_fill_value=self.seq_len_fill_value, + ) + + # ----------------------------------------------------------- + # Optional torch.compile config + # ----------------------------------------------------------- + if self.enable_torch_compile: + _set_torch_compile_config() + + # ----------------------------------------------------------- + # Capture all batch sizes + # ----------------------------------------------------------- + try: + with model_capture_mode(): + self.capture() + except RuntimeError as exc: + raise RuntimeError( + f"CUDA graph capture failed: {exc}\n" + "Possible fixes:\n" + " 1. Reduce --server.mem_fraction_static (e.g. 0.7)\n" + " 2. Reduce --server.max_running_requests\n" + " 3. Disable CUDA graph with --server.disable_cuda_graph\n" + ) from exc + + # ------------------------------------------------------------------ + # Capability check + # ------------------------------------------------------------------ + + def can_run(self, forward_batch: ForwardBatch) -> bool: + """Return ``True`` if the batch can be run via CUDA graph replay. + + The batch must be a decode (or idle) batch whose size does not + exceed the largest captured batch size. + """ + return ( + forward_batch.forward_mode.is_decode_or_idle() + and forward_batch.batch_size <= self.max_bs + ) + + # ------------------------------------------------------------------ + # Capture + # ------------------------------------------------------------------ + + def capture(self) -> None: + """Capture CUDA graphs for every batch size in ``capture_bs``. 
+ + Iterates in reverse order (largest first) so that the GPU memory + pool allocated for the largest graph is reused by smaller ones. + """ + tic = time.perf_counter() + before_mem = _get_avail_mem(self.device) + logger.info("CUDA graph capture begin. avail mem=%.2f GB", before_mem) + + with freeze_gc(): + stream = torch.cuda.Stream() + with torch.cuda.stream(stream): + for bs in reversed(self.capture_bs): + forward_fn = self._get_forward_fn(bs) + graph, output = self._capture_one_batch_size(bs, forward_fn, stream) + self.graphs[bs] = graph + self.output_buffers[bs] = output + + after_mem = _get_avail_mem(self.device) + logger.info( + "CUDA graph capture end. elapsed=%.2f s, mem usage=%.2f GB, " + "avail mem=%.2f GB", + time.perf_counter() - tic, + before_mem - after_mem, + after_mem, + ) + + def _get_forward_fn(self, bs: int) -> Callable: + """Return the forward callable for the given batch size. + + When ``torch.compile`` is enabled and *bs* is within the compile + threshold, the model's forward method is wrapped with + ``torch.compile``. + """ + model_forward = self.model_runner.model.forward + if self.enable_torch_compile and bs in self.compile_bs: + return torch.compile( + torch.no_grad()(model_forward), + mode="max-autotune-no-cudagraphs", + ) + return model_forward + + def _capture_one_batch_size( + self, + bs: int, + forward: Callable, + stream: torch.cuda.Stream, + ) -> tuple: + """Capture a single CUDA graph for batch size *bs*. + + Steps: + 1. Build a ``ForwardBatch`` from the pre-allocated buffers. + 2. Tell the attention backend to plan for CUDA-graph capture. + 3. Run the forward pass twice for warmup. + 4. Capture the third run into a ``CUDAGraph``. + + Returns ``(graph, output_buffers)``. + """ + buffers = self.buffers + + # Slice pre-allocated buffers to the capture size. 
+ input_ids = buffers.input_ids[:bs] + req_pool_indices = buffers.req_pool_indices[:bs] + seq_lens = buffers.seq_lens[:bs] + seq_lens_cpu = buffers.seq_lens_cpu[:bs] + out_cache_loc = buffers.out_cache_loc[:bs] + positions = buffers.positions[:bs] + mrope_position_deltas = buffers.mrope_position_deltas[:bs] + + # Build ForwardBatch (DECODE mode). + # mrope_position_deltas is set to the static buffer (initially zeros) + # so that the graph captures the ``positions + deltas`` path. During + # replay the buffer is updated with real delta values. + forward_batch = ForwardBatch( + forward_mode=ForwardMode.DECODE, + batch_size=bs, + input_ids=input_ids, + req_pool_indices=req_pool_indices, + seq_lens=seq_lens, + out_cache_loc=out_cache_loc, + seq_lens_sum=int(seq_lens.sum().item()), + seq_lens_cpu=seq_lens_cpu, + positions=positions, + return_logprob=False, + req_to_token_pool=self.model_runner.req_to_token_pool, + token_to_kv_pool=self.model_runner.token_to_kv_pool, + attn_backend=self.model_runner.attn_backend, + mrope_position_deltas=mrope_position_deltas, + ) + + # Tell the attention backend to set up CUDA-graph-aware metadata. + self.model_runner.attn_backend.init_forward_metadata_capture_cuda_graph( + bs=bs, + num_tokens=bs, + req_pool_indices=req_pool_indices, + seq_lens=seq_lens, + forward_mode=ForwardMode.DECODE, + ) + + # The single forward-pass function to be captured. + def run_once(): + return forward( + input_ids, + forward_batch.positions, + forward_batch, + ) + + # Warmup (2 eager runs to stabilise cudnn / autotuner / etc.). 
+ for _ in range(2): + torch.cuda.synchronize() + run_once() + + # ----- Capture ----- + global _global_graph_memory_pool + if _global_graph_memory_pool is None: + _global_graph_memory_pool = torch.cuda.graph_pool_handle() + + graph = torch.cuda.CUDAGraph() + with torch.cuda.graph( + graph, + pool=_global_graph_memory_pool, + stream=stream, + ): + output = run_once() + + return graph, output + + # ------------------------------------------------------------------ + # Replay + # ------------------------------------------------------------------ + + def replay( + self, + forward_batch: ForwardBatch, + ) -> LogitsProcessorOutput: + """Replay a captured CUDA graph for the given decode batch. + + The batch is padded to the nearest captured size, inputs are copied + into the pre-allocated buffers, the graph is replayed, and the + output is sliced back to the real batch size. + + Parameters + ---------- + forward_batch + The decode batch from the scheduler. + + Returns + ------- + LogitsProcessorOutput + The logits for the real (un-padded) sequences. + """ + real_bs = forward_batch.batch_size + + # Find the smallest captured bs >= real_bs. + idx = bisect.bisect_left(self.capture_bs, real_bs) + padded_bs = self.capture_bs[idx] + + # Copy real data into the static buffers. + self.buffers.populate( + forward_batch, + padded_bs=padded_bs, + seq_len_fill_value=self.seq_len_fill_value, + ) + + # Update the attention backend for replay. + seq_lens_sum = ( + forward_batch.seq_lens_sum + (padded_bs - real_bs) * self.seq_len_fill_value + ) + self.model_runner.attn_backend.init_forward_metadata_replay_cuda_graph( + bs=padded_bs, + req_pool_indices=self.buffers.req_pool_indices[:padded_bs], + seq_lens=self.buffers.seq_lens[:padded_bs], + seq_lens_sum=seq_lens_sum, + forward_mode=ForwardMode.DECODE, + seq_lens_cpu=self.buffers.seq_lens_cpu[:padded_bs], + ) + + # Replay the graph. + self.graphs[padded_bs].replay() + + # Retrieve output and slice to real batch size. 
+ output = self.output_buffers[padded_bs] + + if isinstance(output, LogitsProcessorOutput): + return LogitsProcessorOutput( + next_token_logits=output.next_token_logits[:real_bs], + hidden_states=( + output.hidden_states[:real_bs] + if output.hidden_states is not None + else None + ), + ) + elif isinstance(output, torch.Tensor): + # Raw tensor output: assume [padded_bs, vocab_size]. + return LogitsProcessorOutput( + next_token_logits=output[:real_bs], + ) + else: + # HuggingFace-style output with .logits attribute. + if hasattr(output, "logits"): + logits = output.logits + if logits.dim() == 3: + return LogitsProcessorOutput( + next_token_logits=logits[:real_bs, -1, :], + ) + return LogitsProcessorOutput( + next_token_logits=logits[:real_bs], + ) + raise TypeError(f"Unexpected CUDA graph output type: {type(output)}") + + # ------------------------------------------------------------------ + # Cleanup + # ------------------------------------------------------------------ + + def shutdown(self) -> None: + """Release all captured CUDA graphs and associated buffers.""" + for graph in self.graphs.values(): + del graph + self.graphs.clear() + self.output_buffers.clear() + logger.info("CudaGraphRunner shutdown complete.") + + +# --------------------------------------------------------------------------- +# Utility helpers +# --------------------------------------------------------------------------- + + +def _get_avail_mem(device: str) -> float: + """Return available GPU memory in GB.""" + if device != "cuda" or not torch.cuda.is_available(): + return 0.0 + free, _ = torch.cuda.mem_get_info() + return free / (1 << 30) + + +def _set_torch_compile_config() -> None: + """Set dynamo / inductor configs for optimal CUDA-graph + compile.""" + try: + import torch._dynamo.config + import torch._inductor.config + + torch._inductor.config.coordinate_descent_tuning = True + torch._inductor.config.triton.unique_kernel_names = True + torch._inductor.config.fx_graph_cache = True + 
torch._dynamo.config.accumulated_cache_size_limit = 1024 + if hasattr(torch._dynamo.config, "cache_size_limit"): + torch._dynamo.config.cache_size_limit = 1024 + except ImportError: + logger.warning("torch._dynamo / torch._inductor not available.") diff --git a/pymllm/executor/model_runner.py b/pymllm/executor/model_runner.py index e69de29b..6d6f33fe 100644 --- a/pymllm/executor/model_runner.py +++ b/pymllm/executor/model_runner.py @@ -0,0 +1,1198 @@ +"""ModelRunner runs the forward passes of the models. + +Simplified from sglang's ``ModelRunner`` for pymllm's single-GPU inference +architecture. Handles: + +* Model loading (HuggingFace checkpoint via ``transformers``) +* KV-cache memory pool initialisation +* Attention backend setup (FlashInfer) +* Forward pass dispatch (extend / decode / idle) +* Token sampling from logits + +Typical lifecycle:: + + runner = ModelRunner(server_config, model_config) + runner.initialize() + + # --- inside the inference loop --- + forward_batch = runner.prepare_forward_batch_decode(...) 
+ logits_output = runner.forward(forward_batch) + next_token_ids = runner.sample(logits_output, forward_batch) + +Typical data flow +----------------- + SchedulerProcess builds a batch dict + ↓ + ModelRunnerProcess calls ModelRunner.forward(forward_batch) + ↓ + attn_backend.init_forward_metadata(forward_batch) + ↓ + model.forward(input_ids, positions, forward_batch) + ↓ + ModelRunner.sample(logits_output, forward_batch) + ↓ + next_token_ids returned to scheduler +""" + +from __future__ import annotations + +import gc +import logging +import time +from dataclasses import dataclass +from typing import TYPE_CHECKING, Any, List, Optional, Tuple, Union + +import torch +from torch import nn + +from pymllm.configs import get_global_config +from pymllm.engine.forward_batch import ForwardBatch, ForwardMode +from pymllm.mem_cache.memory_pool import ( + GDNPool, + KVPool, + ReqToTokenPool, + TokenToKVPoolAllocator, + make_full_attention_net_mem_pool, + make_req_to_token_pool, +) + +if TYPE_CHECKING: + from pymllm.configs.model_config import ModelConfig + from pymllm.configs.server_config import ServerConfig + from pymllm.executor.cuda_graph_runner import CudaGraphRunner + from pymllm.layers.attention.attention_backend import AttentionBackend + +logger = logging.getLogger(__name__) + + +# --------------------------------------------------------------------------- +# Utility: GPU memory query +# --------------------------------------------------------------------------- + + +def get_available_gpu_memory(device: str = "cuda", gpu_id: int = 0) -> float: + """Return available GPU memory in GB.""" + if device != "cuda" or not torch.cuda.is_available(): + return 0.0 + torch.cuda.set_device(gpu_id) + free, _ = torch.cuda.mem_get_info(gpu_id) + return free / (1 << 30) + + +def get_total_gpu_memory(device: str = "cuda", gpu_id: int = 0) -> float: + """Return total GPU memory in GB.""" + if device != "cuda" or not torch.cuda.is_available(): + return 0.0 + torch.cuda.set_device(gpu_id) + 
_, total = torch.cuda.mem_get_info(gpu_id) + return total / (1 << 30) + + +# --------------------------------------------------------------------------- +# LogitsProcessorOutput +# --------------------------------------------------------------------------- + + +@dataclass +class LogitsProcessorOutput: + """Container for output logits produced by the model's forward pass. + + Attributes + ---------- + next_token_logits + Raw logits for the last token of each sequence in the batch, + shape ``[batch_size, vocab_size]``. + hidden_states + Optional hidden states from the model (e.g. for speculative decoding + or auxiliary loss computation). + """ + + next_token_logits: torch.Tensor # [batch_size, vocab_size] + hidden_states: Optional[torch.Tensor] = None + + +# --------------------------------------------------------------------------- +# ModelRunner +# --------------------------------------------------------------------------- + + +class ModelRunner: + """Runs the forward passes of the models. + + This is the core execution component that owns the model, memory pools, + and attention backend. It is used by + :class:`~pymllm.orchestrator.model_runner_process.ModelRunnerProcess` to + execute batches dispatched by the scheduler. + + Parameters + ---------- + server_config + Server runtime configuration. Falls back to the global singleton + when ``None``. + model_config + Model configuration (wraps a HuggingFace ``PretrainedConfig``). + Falls back to the global singleton when ``None``. + gpu_id + GPU device index to use. 
+ """ + + def __init__( + self, + server_config: Optional["ServerConfig"] = None, + model_config: Optional["ModelConfig"] = None, + gpu_id: int = 0, + ): + cfg = get_global_config() + self.server_config = server_config or cfg.server + self.model_config = model_config or cfg.model + + self.gpu_id = gpu_id + self.device: str = "cuda" if torch.cuda.is_available() else "cpu" + self.dtype: torch.dtype = self._resolve_dtype() + + # Set by initialize() + self.model: Optional[nn.Module] = None + self.req_to_token_pool: Optional[ReqToTokenPool] = None + self.token_to_kv_pool: Optional[KVPool] = None + self.token_to_kv_pool_allocator: Optional[TokenToKVPoolAllocator] = None + self.gdn_pool: Optional[GDNPool] = None + self.attn_backend: Optional["AttentionBackend"] = None + self.graph_runner: Optional["CudaGraphRunner"] = None + + # Memory configuration + self.max_total_num_tokens: int = 0 + self.max_running_requests: int = 0 + + # Model metadata (populated after loading) + self.num_hidden_layers: int = 0 + self.num_attention_heads: int = 0 + self.num_kv_heads: int = 0 + self.head_dim: int = 0 + self.hidden_size: int = 0 + self.vocab_size: int = 0 + self.context_len: int = 0 + + # KV cache dtype -- same as model dtype by default; may differ for + # quantised KV caches in the future. + self.kv_cache_dtype: torch.dtype = self.dtype + + # Forward pass counter (monotonically increasing). + self.forward_pass_id: int = 0 + + # ------------------------------------------------------------------ + # Initialisation + # ------------------------------------------------------------------ + + def initialize(self) -> None: + """Full initialisation: set device, load model, init memory + backend. + + Call this once before any forward pass. 
+ """ + tic = time.perf_counter() + logger.info("ModelRunner initialisation begin.") + + # Set device + if self.device == "cuda": + torch.cuda.set_device(self.gpu_id) + + # Set default dtype + torch.set_default_dtype(self.dtype) + + # Load the model + self.load_model() + + # Extract model metadata from hf_config + self._extract_model_metadata() + + # Resolve KV-cache dtype + self._configure_kv_cache_dtype() + + # Initialise memory pools + self.init_memory_pool() + + # Initialise attention backend + self.init_attention_backend() + + # Warm up cuBLAS + if self.device == "cuda": + self._init_cublas() + + # Capture CUDA graphs (must be after model + pools + backend) + self.init_cuda_graphs() + + elapsed = time.perf_counter() - tic + logger.info( + "ModelRunner initialisation complete. elapsed=%.2f s, " + "device=%s, dtype=%s, kv_dtype=%s, max_tokens=%d, max_reqs=%d", + elapsed, + self.device, + self.dtype, + self.kv_cache_dtype, + self.max_total_num_tokens, + self.max_running_requests, + ) + + # ------------------------------------------------------------------ + # Dtype resolution + # ------------------------------------------------------------------ + + def _resolve_dtype(self) -> torch.dtype: + """Resolve the model dtype from configuration.""" + dtype_str = self.server_config.dtype + if dtype_str == "auto": + if torch.cuda.is_available(): + if torch.cuda.get_device_capability()[0] >= 8: + return torch.bfloat16 + return torch.float16 + return torch.float32 + dtype_map = { + "float16": torch.float16, + "bfloat16": torch.bfloat16, + "float32": torch.float32, + } + result = dtype_map.get(dtype_str) + if result is None: + raise ValueError(f"Unsupported dtype: {dtype_str!r}") + return result + + def _configure_kv_cache_dtype(self) -> None: + """Determine the dtype used for KV-cache storage. + + The global ``QuantizationConfig.kv_cache_dtype`` can override the + model dtype (e.g. ``fp8_e4m3`` for quantised KV caches). When set + to ``"auto"`` the model dtype is used as-is. 
+ """ + cfg = get_global_config() + kv_dtype_str = cfg.quantization.kv_cache_dtype + + if kv_dtype_str == "auto": + self.kv_cache_dtype = self.dtype + return + + kv_dtype_map = { + "float16": torch.float16, + "bfloat16": torch.bfloat16, + "fp8_e4m3": torch.float8_e4m3fn, + "fp8_e5m2": torch.float8_e5m2, + } + resolved = kv_dtype_map.get(kv_dtype_str) + if resolved is None: + logger.warning( + "Unrecognised kv_cache_dtype %r, falling back to model dtype.", + kv_dtype_str, + ) + self.kv_cache_dtype = self.dtype + else: + self.kv_cache_dtype = resolved + + logger.info("KV-cache dtype: %s", self.kv_cache_dtype) + + # ------------------------------------------------------------------ + # Model metadata + # ------------------------------------------------------------------ + + def _extract_model_metadata(self) -> None: + """Extract key model parameters from the HuggingFace config.""" + hf_config = self.model_config.hf_config + if hf_config is None: + raise RuntimeError( + "HuggingFace config not loaded. " + "Make sure model_config.hf_config is set before calling " + "initialize()." + ) + + # Handle text_config for multimodal models + text_config = getattr(hf_config, "text_config", hf_config) + + self.num_hidden_layers = getattr(text_config, "num_hidden_layers", 0) + self.num_attention_heads = getattr(text_config, "num_attention_heads", 0) + self.num_kv_heads = getattr( + text_config, + "num_key_value_heads", + self.num_attention_heads, + ) + self.head_dim = getattr( + text_config, + "head_dim", + getattr(text_config, "hidden_size", 0) // max(self.num_attention_heads, 1), + ) + self.hidden_size = getattr(text_config, "hidden_size", 0) + self.vocab_size = getattr(text_config, "vocab_size", 0) + + # V-head dim may differ from K-head dim (e.g. 
MLA) + self.v_head_dim: int = getattr(text_config, "v_head_dim", self.head_dim) + + # Context length + self.context_len = self.server_config.context_length or getattr( + text_config, "max_position_embeddings", 4096 + ) + + # Hybrid model metadata (GDN layers) + self.num_gdn_layers: int = getattr(self.model, "num_gdn_layers", 0) + self.full_attn_layer_ids: set = getattr(self.model, "full_attn_layer_ids", set()) + + logger.info( + "Model metadata: layers=%d, q_heads=%d, kv_heads=%d, " + "head_dim=%d, v_head_dim=%d, hidden=%d, vocab=%d, ctx_len=%d" + + (", gdn_layers=%d" if self.num_gdn_layers > 0 else ""), + self.num_hidden_layers, + self.num_attention_heads, + self.num_kv_heads, + self.head_dim, + self.v_head_dim, + self.hidden_size, + self.vocab_size, + self.context_len, + *([self.num_gdn_layers] if self.num_gdn_layers > 0 else []), + ) + + # ------------------------------------------------------------------ + # Model loading + # ------------------------------------------------------------------ + + def load_model(self) -> None: + """Load the model from a HuggingFace checkpoint. + + First checks the pymllm model registry for a custom implementation + that uses ``RadixAttention``. If found, instantiates it with the + HuggingFace config and loads weights via ``load_weights()``. + Otherwise falls back to ``AutoModelForCausalLM.from_pretrained``. + """ + tic = time.perf_counter() + model_path = self.server_config.model_path + + if model_path is None: + raise RuntimeError("server_config.model_path is not set.") + + before_mem = get_available_gpu_memory(self.device, self.gpu_id) + logger.info( + "Load model begin. 
path=%s, avail mem=%.2f GB", + model_path, + before_mem, + ) + + # Look up the architecture in the pymllm model registry + from pymllm.models import _MODEL_REGISTRY, get_model_class + + hf_config = self.model_config.hf_config + architectures = [] + if hf_config is not None: + architectures = getattr(hf_config, "architectures", None) or [] + + if not architectures: + supported = ", ".join(sorted(_MODEL_REGISTRY.keys())) + raise RuntimeError( + f"Cannot determine model architecture from config. " + f"Supported architectures: {supported}" + ) + + architecture = architectures[0] + model_cls = get_model_class(architecture) + if model_cls is None: + supported = ", ".join(sorted(_MODEL_REGISTRY.keys())) + raise RuntimeError( + f"Architecture {architecture!r} is not supported by pymllm. " + f"Supported architectures: {supported}" + ) + + logger.info("Using pymllm model class: %s", model_cls.__name__) + device_str = f"cuda:{self.gpu_id}" if self.device == "cuda" else self.device + # Use set_default_dtype so parameters created without explicit dtype + # get the target dtype, while parameters with explicit dtype=torch.float32 + # (e.g. A_log, dt_bias in GDN layers) stay in float32. + old_dtype = torch.get_default_dtype() + torch.set_default_dtype(self.dtype) + try: + with torch.device(device_str): + self.model = model_cls(hf_config) + finally: + torch.set_default_dtype(old_dtype) + self.model.load_weights(self._iter_weights(model_path)) + self.model.eval() + + after_mem = get_available_gpu_memory(self.device, self.gpu_id) + weight_mem = before_mem - after_mem + logger.info( + "Load model end. elapsed=%.2f s, type=%s, " + "weight_mem=%.2f GB, avail mem=%.2f GB", + time.perf_counter() - tic, + type(self.model).__name__, + weight_mem, + after_mem, + ) + + @staticmethod + def _iter_weights(model_path) -> "Generator[Tuple[str, torch.Tensor], None, None]": + """Yield ``(name, tensor)`` pairs from safetensors or ``.bin`` files. 
+ + Prefers safetensors when available; falls back to PyTorch ``.bin`` + files otherwise. + """ + import glob as _glob + from pathlib import Path + + model_path = Path(model_path) + + # Prefer safetensors + st_files = sorted(_glob.glob(str(model_path / "*.safetensors"))) + if st_files: + from safetensors.torch import load_file + + for fpath in st_files: + state_dict = load_file(fpath) + yield from state_dict.items() + del state_dict + return + + # Fallback: PyTorch .bin files + bin_files = sorted(_glob.glob(str(model_path / "*.bin"))) + for fpath in bin_files: + state_dict = torch.load(fpath, map_location="cpu", weights_only=True) + yield from state_dict.items() + del state_dict + + # ------------------------------------------------------------------ + # Memory pool initialisation + # ------------------------------------------------------------------ + + def init_memory_pool(self) -> None: + """Initialise KV-cache memory pools and request-to-token mapping. + + 1. Profiles available GPU memory to determine the maximum number of + KV-cache token slots (``max_total_num_tokens``). + 2. Derives ``max_running_requests`` from config or heuristic. + 3. Creates :class:`~pymllm.mem_cache.memory_pool.ReqToTokenPool`, + :class:`~pymllm.mem_cache.memory_pool.KVPool`, and + :class:`~pymllm.mem_cache.memory_pool.TokenToKVPoolAllocator`. + """ + logger.info("Initialising memory pools...") + + # Determine max number of tokens in KV cache + self.max_total_num_tokens = self._profile_max_num_tokens() + + # Determine max running requests + max_reqs = self.server_config.max_running_requests + if max_reqs is None: + max_reqs = min( + max( + int(self.max_total_num_tokens / self.context_len * 512), + 2048, + ), + 4096, + ) + self.max_running_requests = max_reqs + + if self.max_total_num_tokens <= 0: + raise RuntimeError( + "Not enough memory for KV cache. " + "Try reducing context_length or using a smaller model." 
+ ) + + # Create ReqToTokenPool + self.req_to_token_pool = make_req_to_token_pool( + max_reqs=self.max_running_requests, + max_context_len=self.context_len + 4, # small padding + device=self.device, + ) + + # Create KVPool + TokenToKVPoolAllocator + # Note: layer_num uses num_hidden_layers even for hybrid models + # because the KV pool is indexed by global layer_id. GDN layers' + # KV slots are allocated but unused (they use GDNPool instead). + self.token_to_kv_pool, self.token_to_kv_pool_allocator = ( + make_full_attention_net_mem_pool( + size=self.max_total_num_tokens, + layer_num=self.num_hidden_layers, + k_head_num=self.num_kv_heads, + k_head_dim=self.head_dim, + v_head_num=self.num_kv_heads, + v_head_dim=self.v_head_dim, + device=self.device, + dtype=self.kv_cache_dtype, + ) + ) + + # Create GDNPool if hybrid model with GDN layers + if self.num_gdn_layers > 0: + hf_config = self.model_config.hf_config + text_config = getattr(hf_config, "text_config", hf_config) + gdn_num_k_heads = getattr(text_config, "linear_num_key_heads", 16) + gdn_num_v_heads = getattr(text_config, "linear_num_value_heads", 32) + gdn_head_k_dim = getattr(text_config, "linear_key_head_dim", 128) + gdn_head_v_dim = getattr(text_config, "linear_value_head_dim", 128) + gdn_conv_kernel = getattr(text_config, "linear_conv_kernel_dim", 4) + gdn_conv_dim = gdn_num_k_heads * gdn_head_k_dim * 2 + gdn_num_v_heads * gdn_head_v_dim + + self.gdn_pool = GDNPool( + max_reqs=self.max_running_requests, + num_gdn_layers=self.num_gdn_layers, + num_v_heads=gdn_num_v_heads, + head_k_dim=gdn_head_k_dim, + head_v_dim=gdn_head_v_dim, + conv_dim=gdn_conv_dim, + conv_kernel_size=gdn_conv_kernel, + device=self.device, + dtype=self.dtype, + max_track_slots=self.max_running_requests, + ) + + logger.info( + "Memory pool initialised: max_tokens=%d, max_reqs=%d, kv_pool=%.2f GB" + + (", gdn_pool=%.2f GB" if self.gdn_pool is not None else ""), + self.max_total_num_tokens, + self.max_running_requests, + 
self.token_to_kv_pool._mem_bytes() / (1 << 30), + *([self.gdn_pool.mem_bytes() / (1 << 30)] if self.gdn_pool is not None else []), + ) + + def _profile_max_num_tokens(self) -> int: + """Profile available memory to determine maximum KV-cache tokens. + + If ``server_config.max_total_tokens`` is explicitly set that value + is used directly. Otherwise a memory-fraction-based heuristic + similar to sglang's ``profile_max_num_token`` is applied. + """ + # If user explicitly set max_total_tokens, use that. + if self.server_config.max_total_tokens is not None: + return self.server_config.max_total_tokens + + if self.device != "cuda": + # For CPU, use a conservative default. + return 4096 + + available_gb = get_available_gpu_memory(self.device, self.gpu_id) + + # Determine memory fraction for static allocation (KV cache). + mem_fraction = self.server_config.mem_fraction_static + if mem_fraction is None: + mem_fraction = 0.85 # default: use 85% of remaining memory + + # Calculate per-token KV cache size in bytes. 
+ kv_element_size = torch.tensor([], dtype=self.kv_cache_dtype).element_size() + cell_size = ( + self.num_kv_heads + * (self.head_dim + self.v_head_dim) # K + V + * self.num_hidden_layers + * kv_element_size + ) + + if cell_size == 0: + logger.warning( + "cell_size is 0 (model metadata may be incomplete); " + "using default max_total_num_tokens=4096" + ) + return 4096 + + rest_memory_bytes = int(available_gb * mem_fraction * (1 << 30)) + + # Reserve memory for GDN pool if hybrid model + if self.num_gdn_layers > 0: + hf_config = self.model_config.hf_config + text_config = getattr(hf_config, "text_config", hf_config) + gdn_num_k_heads = getattr(text_config, "linear_num_key_heads", 16) + gdn_num_v_heads = getattr(text_config, "linear_num_value_heads", 32) + gdn_head_k_dim = getattr(text_config, "linear_key_head_dim", 128) + gdn_head_v_dim = getattr(text_config, "linear_value_head_dim", 128) + gdn_conv_kernel = getattr(text_config, "linear_conv_kernel_dim", 4) + gdn_conv_dim = gdn_num_k_heads * gdn_head_k_dim * 2 + gdn_num_v_heads * gdn_head_v_dim + + # Estimate GDN pool memory for max_running_requests + # Track slots add max_reqs_est extra slots for prefix cache snapshots + max_reqs_est = min( + max(int(rest_memory_bytes / cell_size / self.context_len * 512), 2048), + 4096, + ) if self.server_config.max_running_requests is None else self.server_config.max_running_requests + pool_size = max_reqs_est + 1 + max_reqs_est # +track_slots + recurrent_bytes = ( + self.num_gdn_layers * pool_size * gdn_num_v_heads + * gdn_head_v_dim * gdn_head_k_dim * 4 # float32 + ) + dtype_size = torch.tensor([], dtype=self.dtype).element_size() + conv_bytes = ( + self.num_gdn_layers * pool_size * gdn_conv_dim + * (gdn_conv_kernel - 1) * dtype_size + ) + gdn_pool_bytes = recurrent_bytes + conv_bytes + rest_memory_bytes -= gdn_pool_bytes + logger.info( + "GDN pool memory reservation: %.2f GB", + gdn_pool_bytes / (1 << 30), + ) + + max_num_tokens = rest_memory_bytes // cell_size + + 
logger.info( + "Memory profiling: avail=%.2f GB, fraction=%.2f, " + "cell_size=%d bytes, max_tokens=%d", + available_gb, + mem_fraction, + cell_size, + max_num_tokens, + ) + + return max(max_num_tokens, 1) # at least 1 + + # ------------------------------------------------------------------ + # Attention backend + # ------------------------------------------------------------------ + + def init_attention_backend(self) -> None: + """Initialise the attention backend. + + Creates a :class:`FlashInferAttnBackend` for standard models, or a + :class:`HybridAttnBackend` (FlashInfer + GDN) for hybrid models. + """ + from pymllm.layers.attention.flashinfer_backend import FlashInferAttnBackend + + logger.info("Initialising attention backend...") + + flash_backend = FlashInferAttnBackend( + num_heads=self.num_attention_heads, + num_kv_heads=self.num_kv_heads, + head_dim=self.head_dim, + kv_cache_dtype=self.kv_cache_dtype, + q_dtype=self.dtype, + max_context_len=self.context_len, + req_to_token=self.req_to_token_pool.req_to_token, + device=torch.device(self.device), + max_req_pool_size=self.req_to_token_pool.size, + ) + + if self.gdn_pool is not None: + from pymllm.layers.attention.gdn_backend import GDNAttnBackend + from pymllm.layers.attention.hybrid_backend import HybridAttnBackend + + gdn_backend = GDNAttnBackend( + gdn_pool=self.gdn_pool, + device=torch.device(self.device), + ) + self.attn_backend = HybridAttnBackend( + full_attn_backend=flash_backend, + gdn_backend=gdn_backend, + full_attn_layer_ids=self.full_attn_layer_ids, + ) + else: + self.attn_backend = flash_backend + + logger.info( + "Attention backend: %s", + type(self.attn_backend).__name__, + ) + + # ------------------------------------------------------------------ + # Warmup + # ------------------------------------------------------------------ + + def _init_cublas(self) -> None: + """Run a small matmul to initialise cuBLAS. 
+ + Without this, the first real matmul may incur a significant + initialisation overhead. + """ + dtype = torch.float16 + device = "cuda" + a = torch.ones((16, 16), dtype=dtype, device=device) + b = torch.ones((16, 16), dtype=dtype, device=device) + _ = a @ b + + # ------------------------------------------------------------------ + # CUDA graph capture + # ------------------------------------------------------------------ + + def init_cuda_graphs(self) -> None: + """Capture CUDA graphs for decode-step acceleration. + + Skipped when: + * The device is not CUDA. + * ``server_config.disable_cuda_graph`` is ``True``. + * The model is not a generation model. + """ + self.graph_runner = None + + if self.device != "cuda": + return + if self.server_config.disable_cuda_graph: + logger.info("CUDA graphs disabled by config.") + return + if not self.is_generation: + return + + from pymllm.executor.cuda_graph_runner import CudaGraphRunner + + tic = time.perf_counter() + before_mem = get_available_gpu_memory(self.device, self.gpu_id) + logger.info("Capturing CUDA graphs... avail mem=%.2f GB", before_mem) + + self.graph_runner = CudaGraphRunner(self) + + after_mem = get_available_gpu_memory(self.device, self.gpu_id) + logger.info( + "CUDA graph capture complete. elapsed=%.2f s, " + "mem usage=%.2f GB, avail mem=%.2f GB", + time.perf_counter() - tic, + before_mem - after_mem, + after_mem, + ) + + # ------------------------------------------------------------------ + # ForwardBatch construction + # ------------------------------------------------------------------ + + def prepare_forward_batch_extend( + self, + input_ids: torch.Tensor, + req_pool_indices: torch.Tensor, + seq_lens: torch.Tensor, + extend_seq_lens: torch.Tensor, + extend_prefix_lens: torch.Tensor, + out_cache_loc: torch.Tensor, + return_logprob: bool = False, + top_logprobs_nums: Optional[List[int]] = None, + ) -> ForwardBatch: + """Build a :class:`ForwardBatch` for an extend (prefill) pass. 
+ + Parameters + ---------- + input_ids + Token IDs for all new tokens, shape ``[total_new_tokens]``. + req_pool_indices + Index of each request in ``ReqToTokenPool``, + shape ``[batch_size]``. + seq_lens + Total (prefix + new) length of each sequence, + shape ``[batch_size]``. + extend_seq_lens + Number of new tokens per sequence, shape ``[batch_size]``. + extend_prefix_lens + Cached prefix length per sequence, shape ``[batch_size]``. + out_cache_loc + KV-pool slot indices for each new token, + shape ``[total_new_tokens]``. + return_logprob + Whether to return per-token log-probabilities. + top_logprobs_nums + Number of top log-probs per sequence. + """ + batch_size = req_pool_indices.shape[0] + seq_lens_sum = int(seq_lens.sum().item()) + extend_num_tokens = int(extend_seq_lens.sum().item()) + + # Compute positions for each token + positions = _compute_positions(extend_seq_lens, extend_prefix_lens) + + # Compute extend_start_loc (exclusive cumsum of extend_seq_lens) + extend_start_loc = torch.zeros( + batch_size, dtype=torch.int32, device=self.device + ) + if batch_size > 1: + extend_start_loc[1:] = torch.cumsum(extend_seq_lens[:-1], dim=0).to( + torch.int32 + ) + + return ForwardBatch( + forward_mode=ForwardMode.EXTEND, + batch_size=batch_size, + input_ids=input_ids, + req_pool_indices=req_pool_indices, + seq_lens=seq_lens, + out_cache_loc=out_cache_loc, + seq_lens_sum=seq_lens_sum, + seq_lens_cpu=seq_lens.cpu(), + positions=positions, + extend_num_tokens=extend_num_tokens, + extend_seq_lens=extend_seq_lens, + extend_prefix_lens=extend_prefix_lens, + extend_start_loc=extend_start_loc, + extend_prefix_lens_cpu=extend_prefix_lens.tolist(), + extend_seq_lens_cpu=extend_seq_lens.tolist(), + return_logprob=return_logprob, + top_logprobs_nums=top_logprobs_nums, + req_to_token_pool=self.req_to_token_pool, + token_to_kv_pool=self.token_to_kv_pool, + attn_backend=self.attn_backend, + ) + + def prepare_forward_batch_decode( + self, + input_ids: torch.Tensor, + 
req_pool_indices: torch.Tensor, + seq_lens: torch.Tensor, + out_cache_loc: torch.Tensor, + return_logprob: bool = False, + top_logprobs_nums: Optional[List[int]] = None, + mrope_position_deltas: Optional[torch.Tensor] = None, + ) -> ForwardBatch: + """Build a :class:`ForwardBatch` for a decode step. + + Parameters + ---------- + input_ids + Token IDs (one per sequence), shape ``[batch_size]``. + req_pool_indices + Index of each request in ``ReqToTokenPool``, + shape ``[batch_size]``. + seq_lens + Total sequence length of each request, shape ``[batch_size]``. + out_cache_loc + KV-pool slot for each sequence's new token, + shape ``[batch_size]``. + return_logprob + Whether to return per-token log-probabilities. + top_logprobs_nums + Number of top log-probs per sequence. + mrope_position_deltas + Per-request M-RoPE position deltas, shape ``[batch_size]`` (int64). + Used by multimodal models (e.g. Qwen3-VL) to offset decode-step + positions by the spatial extent of prefill images. + """ + batch_size = req_pool_indices.shape[0] + seq_lens_sum = int(seq_lens.sum().item()) + + # For decode, positions = seq_lens - 1 (the new token position) + positions = (seq_lens - 1).to(torch.int64) + + return ForwardBatch( + forward_mode=ForwardMode.DECODE, + batch_size=batch_size, + input_ids=input_ids, + req_pool_indices=req_pool_indices, + seq_lens=seq_lens, + out_cache_loc=out_cache_loc, + seq_lens_sum=seq_lens_sum, + seq_lens_cpu=seq_lens.cpu(), + positions=positions, + return_logprob=return_logprob, + top_logprobs_nums=top_logprobs_nums, + req_to_token_pool=self.req_to_token_pool, + token_to_kv_pool=self.token_to_kv_pool, + attn_backend=self.attn_backend, + mrope_position_deltas=mrope_position_deltas, + ) + + # ------------------------------------------------------------------ + # Forward pass + # ------------------------------------------------------------------ + + def forward( + self, + forward_batch: ForwardBatch, + ) -> LogitsProcessorOutput: + """Run a forward pass through 
the model. + + Dispatches to the appropriate method based on the batch's + :attr:`~pymllm.engine.forward_batch.ForwardMode`. For decode + batches, automatically uses CUDA-graph replay when a captured + graph is available. + + Parameters + ---------- + forward_batch + The prepared batch (from ``prepare_forward_batch_*``). + + Returns + ------- + LogitsProcessorOutput + Contains ``next_token_logits`` of shape + ``[batch_size, vocab_size]``. + """ + self.forward_pass_id += 1 + + if forward_batch.forward_mode.is_idle(): + return self._forward_idle(forward_batch) + + # Try CUDA graph replay for decode batches. + if ( + forward_batch.forward_mode.is_decode() + and self.graph_runner is not None + and self.graph_runner.can_run(forward_batch) + ): + return self.graph_runner.replay(forward_batch) + + if forward_batch.forward_mode.is_decode(): + return self.forward_decode(forward_batch) + elif forward_batch.forward_mode.is_extend(): + return self.forward_extend(forward_batch) + else: + raise ValueError(f"Unsupported forward mode: {forward_batch.forward_mode}") + + def forward_decode( + self, + forward_batch: ForwardBatch, + ) -> LogitsProcessorOutput: + """Run a decode forward pass (one new token per sequence). + + Calls ``attn_backend.init_forward_metadata`` followed by + ``model.forward``. + """ + self.attn_backend.init_forward_metadata(forward_batch) + model_output = self.model.forward( + forward_batch.input_ids, + forward_batch.positions, + forward_batch, + ) + return self._process_logits(model_output, forward_batch) + + def forward_extend( + self, + forward_batch: ForwardBatch, + ) -> LogitsProcessorOutput: + """Run an extend (prefill) forward pass. + + Calls ``attn_backend.init_forward_metadata`` followed by + ``model.forward``. 
+ """ + self.attn_backend.init_forward_metadata(forward_batch) + model_output = self.model.forward( + forward_batch.input_ids, + forward_batch.positions, + forward_batch, + ) + return self._process_logits(model_output, forward_batch) + + def _forward_idle( + self, + forward_batch: ForwardBatch, + ) -> LogitsProcessorOutput: + """Return empty logits for an idle batch (no sequences to process).""" + return LogitsProcessorOutput( + next_token_logits=torch.empty( + (0, self.vocab_size), + dtype=self.dtype, + device=self.device, + ), + ) + + # ------------------------------------------------------------------ + # Logits post-processing + # ------------------------------------------------------------------ + + def _process_logits( + self, + model_output: Any, + forward_batch: ForwardBatch, + ) -> LogitsProcessorOutput: + """Extract last-token logits from model output. + + Handles: + * A :class:`LogitsProcessorOutput` returned by custom model + implementations. + * A ``CausalLMOutput`` (from HuggingFace ``transformers``) with a + ``.logits`` attribute. + * A raw ``torch.Tensor`` of logits. + """ + if isinstance(model_output, LogitsProcessorOutput): + return model_output + + # Standard HuggingFace output + if hasattr(model_output, "logits"): + logits = model_output.logits + elif isinstance(model_output, torch.Tensor): + logits = model_output + else: + raise TypeError( + f"Unexpected model output type: {type(model_output)}. " + "Expected torch.Tensor or an object with .logits attribute." 
+ ) + + # --- Decode: logits is [bs, 1, vocab] or [bs, vocab] --- + if forward_batch.forward_mode.is_decode(): + if logits.dim() == 3: + next_token_logits = logits[:, -1, :] + else: + next_token_logits = logits + else: + # --- Extend: pick the last token of each sequence --- + next_token_logits = self._gather_last_token_logits(logits, forward_batch) + + return LogitsProcessorOutput(next_token_logits=next_token_logits) + + def _gather_last_token_logits( + self, + logits: torch.Tensor, + forward_batch: ForwardBatch, + ) -> torch.Tensor: + """Gather the logits of the last token in each sequence for extend. + + During extend, the model processes all tokens but we only need the + logits at the last position of each sequence for next-token sampling. + """ + if logits.dim() == 3: + # [batch_size, seq_len, vocab_size] from standard HF model + return logits[:, -1, :] + + # Flat layout [total_tokens, vocab_size] + if ( + forward_batch.extend_start_loc is not None + and forward_batch.extend_seq_lens is not None + ): + last_indices = ( + forward_batch.extend_start_loc + forward_batch.extend_seq_lens - 1 + ).long() + return logits[last_indices] + + # Fallback: last row + return logits[-1:, :] + + # ------------------------------------------------------------------ + # Sampling + # ------------------------------------------------------------------ + + def sample( + self, + logits_output: LogitsProcessorOutput, + forward_batch: ForwardBatch, + temperatures: Optional[torch.Tensor] = None, + top_ps: Optional[torch.Tensor] = None, + top_ks: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + """Sample next-token IDs from logits. + + Supports per-request temperature, top-p, and top-k. + + Parameters + ---------- + logits_output + The logits from :meth:`forward`. + forward_batch + The current forward batch. + temperatures + Per-request temperature, shape ``[batch_size]``. + top_ps + Per-request top-p, shape ``[batch_size]``. + top_ks + Per-request top-k, shape ``[batch_size]``. 
+ + Returns + ------- + torch.Tensor + Next-token IDs, shape ``[batch_size]``, dtype ``int32``. + """ + from pymllm.layers.sampling import ( + sampling_from_probs, + softmax, + top_k_top_p_sampling_from_probs, + ) + + logits = logits_output.next_token_logits + + if logits.numel() == 0: + return torch.empty(0, dtype=torch.int32, device=self.device) + + # Greedy path: temperature=0 (or all zeros) → argmax, no sampling. + if temperatures is not None: + all_greedy = bool((temperatures < 1e-6).all()) + else: + all_greedy = False + + if all_greedy: + return logits.argmax(dim=-1).to(torch.int32) + + # Stochastic path: apply temperature then sample. + if temperatures is not None: + probs = softmax(logits, temperature=temperatures) + else: + probs = torch.softmax(logits.float(), dim=-1) + + # Apply top-k / top-p sampling if specified + has_top_k = top_ks is not None + has_top_p = top_ps is not None + + if has_top_k or has_top_p: + k = top_ks if has_top_k else logits.shape[-1] + p = top_ps if has_top_p else 1.0 + next_token_ids = top_k_top_p_sampling_from_probs(probs, k, p) + else: + next_token_ids = sampling_from_probs(probs) + + return next_token_ids + + # ------------------------------------------------------------------ + # Cleanup + # ------------------------------------------------------------------ + + def shutdown(self) -> None: + """Release model and memory resources.""" + logger.info("ModelRunner shutting down...") + + if self.graph_runner is not None: + self.graph_runner.shutdown() + self.graph_runner = None + if self.model is not None: + del self.model + self.model = None + if self.token_to_kv_pool is not None: + del self.token_to_kv_pool + self.token_to_kv_pool = None + if self.token_to_kv_pool_allocator is not None: + del self.token_to_kv_pool_allocator + self.token_to_kv_pool_allocator = None + if self.gdn_pool is not None: + del self.gdn_pool + self.gdn_pool = None + if self.req_to_token_pool is not None: + del self.req_to_token_pool + self.req_to_token_pool 
= None + self.attn_backend = None + + if self.device == "cuda": + torch.cuda.empty_cache() + gc.collect() + + logger.info("ModelRunner shutdown complete.") + + # ------------------------------------------------------------------ + # Properties + # ------------------------------------------------------------------ + + @property + def is_generation(self) -> bool: + """True if the model is a generation (causal-LM) model.""" + return True + + @property + def sliding_window_size(self) -> Optional[int]: + """Sliding-window attention span, or ``None`` for full context.""" + hf_config = self.model_config.hf_config + if hf_config is None: + return None + text_config = getattr(hf_config, "text_config", hf_config) + return getattr(text_config, "sliding_window", None) + + +# --------------------------------------------------------------------------- +# Utility functions +# --------------------------------------------------------------------------- + + +def _compute_positions( + extend_seq_lens: torch.Tensor, + extend_prefix_lens: torch.Tensor, +) -> torch.Tensor: + """Compute per-token positions for an extend batch. + + For each sequence, positions are + ``[prefix_len, prefix_len+1, ..., prefix_len+seq_len-1]``. + The result is a flat 1-D tensor of shape ``[sum(extend_seq_lens)]``. 
+ """ + device = extend_seq_lens.device + batch_size = extend_seq_lens.shape[0] + total_tokens = int(extend_seq_lens.sum().item()) + + if total_tokens == 0: + return torch.empty(0, dtype=torch.int64, device=device) + + positions = torch.empty(total_tokens, dtype=torch.int64, device=device) + offset = 0 + for i in range(batch_size): + seq_len = int(extend_seq_lens[i].item()) + prefix_len = int(extend_prefix_lens[i].item()) + if seq_len > 0: + positions[offset : offset + seq_len] = torch.arange( + prefix_len, + prefix_len + seq_len, + dtype=torch.int64, + device=device, + ) + offset += seq_len + + return positions diff --git a/pymllm/layers/__init__.py b/pymllm/layers/__init__.py index 97cfb921..2ecb1396 100644 --- a/pymllm/layers/__init__.py +++ b/pymllm/layers/__init__.py @@ -6,9 +6,12 @@ from pymllm.layers.linear import ColumnParallelLinear, Linear, RowParallelLinear from pymllm.layers.mlp import MLP, ParallelMLP from pymllm.layers.rms_norm import GemmaRMSNorm, RMSNorm +from pymllm.layers.rms_norm_gated import RMSNormGated +from pymllm.layers.gated_delta_net import GatedDeltaNet from pymllm.layers.rope import ( apply_llama31_rope, apply_llama31_rope_pos_ids, + apply_mrope, apply_rope, apply_rope_pos_ids, apply_rope_with_cos_sin_cache, @@ -41,6 +44,7 @@ "LayerNorm", "RMSNorm", "GemmaRMSNorm", + "apply_mrope", "apply_rope", "apply_llama31_rope", "apply_rope_pos_ids", diff --git a/pymllm/layers/attention/__init__.py b/pymllm/layers/attention/__init__.py index 5d0dbf07..ae187975 100644 --- a/pymllm/layers/attention/__init__.py +++ b/pymllm/layers/attention/__init__.py @@ -8,7 +8,10 @@ WrapperDispatch, should_use_tensor_core, ) +from pymllm.layers.attention.gdn_backend import GDNAttnBackend +from pymllm.layers.attention.hybrid_backend import HybridAttnBackend from pymllm.layers.attention.radix_attention import AttentionType, RadixAttention +from pymllm.layers.attention.radix_linear_attention import RadixLinearAttention __all__ = [ # Base @@ -16,10 +19,15 @@ # 
RadixAttention "AttentionType", "RadixAttention", + # RadixLinearAttention (GDN) + "RadixLinearAttention", # FlashInfer backend "FlashInferAttnBackend", "DecodeMetadata", "PrefillMetadata", "WrapperDispatch", "should_use_tensor_core", + # GDN + Hybrid backends + "GDNAttnBackend", + "HybridAttnBackend", ] diff --git a/pymllm/layers/attention/attention_backend.py b/pymllm/layers/attention/attention_backend.py index 07e2f6a1..fe168c2d 100644 --- a/pymllm/layers/attention/attention_backend.py +++ b/pymllm/layers/attention/attention_backend.py @@ -103,6 +103,28 @@ def forward( q, k, v, layer, forward_batch, save_kv_cache=save_kv_cache, **kwargs ) + # ------------------------------------------------------------------ + # GDN linear-attention interface (used by HybridAttnBackend) + # ------------------------------------------------------------------ + + def forward_gdn( + self, + layer: "RadixLinearAttention", + forward_batch: "ForwardBatch", + mixed_qkv: torch.Tensor, + a: torch.Tensor, + b: torch.Tensor, + ) -> torch.Tensor: + """Run GDN linear-attention for one layer. + + Only implemented by backends that support hybrid (full + GDN) + architectures. The default raises ``NotImplementedError``. + """ + raise NotImplementedError( + f"{type(self).__name__} does not support GDN linear attention. " + "Use HybridAttnBackend for hybrid full+GDN models." + ) + # ------------------------------------------------------------------ # Optional CUDA-graph interface # ------------------------------------------------------------------ diff --git a/pymllm/layers/attention/gdn_backend.py b/pymllm/layers/attention/gdn_backend.py new file mode 100644 index 00000000..2b6e27b4 --- /dev/null +++ b/pymllm/layers/attention/gdn_backend.py @@ -0,0 +1,660 @@ +"""GDN attention backend -- pooled-state GDN computation for hybrid models. + +Performs GDN (Gated Delta Net) linear-attention using externalized state +stored in a :class:`~pymllm.mem_cache.memory_pool.GDNPool`. 
Supports +both extend (prefill) and decode paths with FlashInfer kernels. + +This backend is not used directly; it is wrapped by +:class:`~pymllm.layers.attention.hybrid_backend.HybridAttnBackend`. +""" + +from __future__ import annotations + +import logging +import os +from dataclasses import dataclass, field +from typing import TYPE_CHECKING, Optional, Tuple + +import torch +import torch.nn.functional as F + +if TYPE_CHECKING: + from pymllm.engine.forward_batch import ForwardBatch + from pymllm.layers.attention.radix_linear_attention import RadixLinearAttention + from pymllm.mem_cache.memory_pool import GDNPool + +logger = logging.getLogger(__name__) + +# --------------------------------------------------------------------------- +# Server config: gdn_decode_backend override +# --------------------------------------------------------------------------- + + +def _get_gdn_decode_backend_override() -> str: + """Read ``server.gdn_decode_backend`` from GlobalConfig. + + Returns one of: ``"auto"``, ``"flashinfer"``, ``"mllm_kernel"``, ``"pytorch"``. 
+ """ + try: + from pymllm.configs import get_global_config + return get_global_config().server.gdn_decode_backend + except Exception: + return "auto" + + +# --------------------------------------------------------------------------- +# mllm-kernel GDN decode (lazy import, SM80+) +# --------------------------------------------------------------------------- + +_mllm_gdn_decode = None + + +def _get_mllm_gdn_decode(): + """Lazy import for mllm-kernel fused GDN decode CUDA kernel.""" + global _mllm_gdn_decode + if _mllm_gdn_decode is None: + try: + from mllm_kernel.cuda.jit.gdn_decode import gdn_decode + + _mllm_gdn_decode = gdn_decode + logger.info("GDNAttnBackend: [probe] mllm-kernel GDN decode available (SM80+)") + except (ImportError, RuntimeError) as e: + logger.info("GDNAttnBackend: [probe] mllm-kernel GDN decode not available: %s", e) + _mllm_gdn_decode = False + return _mllm_gdn_decode if _mllm_gdn_decode is not False else None + + +# --------------------------------------------------------------------------- +# FlashInfer GDN kernel (lazy import) +# --------------------------------------------------------------------------- + +_flashinfer_available: Optional[bool] = None +_fi_chunk_gated_delta_rule = None +_fi_gated_delta_rule_decode = None + + +def _get_flashinfer_gdn(): + """Lazy import for FlashInfer GDN kernels (prefill + decode).""" + global _flashinfer_available, _fi_chunk_gated_delta_rule, _fi_gated_delta_rule_decode + if _flashinfer_available is None: + try: + os.environ.setdefault("FLASHINFER_DISABLE_VERSION_CHECK", "1") + _flashinfer_available = ( + torch.cuda.is_available() + and torch.cuda.get_device_capability()[0] >= 9 + ) + if not _flashinfer_available: + logger.info( + "GDNAttnBackend: [probe] FlashInfer GDN not available (requires SM90+, " + "current SM%d%d)", *torch.cuda.get_device_capability() + ) + return _flashinfer_available, None, None + + from flashinfer.gdn_prefill import chunk_gated_delta_rule + _fi_chunk_gated_delta_rule = 
chunk_gated_delta_rule + + try: + from flashinfer.gdn_decode import gated_delta_rule_decode_pretranspose + _fi_gated_delta_rule_decode = gated_delta_rule_decode_pretranspose + logger.info("GDNAttnBackend: [probe] FlashInfer GDN available (prefill + decode)") + except ImportError: + logger.info( + "GDNAttnBackend: [probe] FlashInfer GDN partially available " + "(prefill only, decode not found)" + ) + except (ImportError, RuntimeError) as e: + logger.info( + "GDNAttnBackend: [probe] FlashInfer GDN not available: %s", e + ) + _flashinfer_available = False + return _flashinfer_available, _fi_chunk_gated_delta_rule, _fi_gated_delta_rule_decode + + +# --------------------------------------------------------------------------- +# GDN gating computation +# --------------------------------------------------------------------------- + + +def _gdn_gating( + a: torch.Tensor, + b: torch.Tensor, + A_log: torch.Tensor, + dt_bias: torch.Tensor, +) -> Tuple[torch.Tensor, torch.Tensor]: + """Compute GDN gating factors. + + Returns + ------- + g : log-space decay factor: -exp(A_log) * softplus(a + dt_bias) + beta : update gate: sigmoid(b) + """ + g = -torch.exp(A_log) * F.softplus(a + dt_bias) + beta = torch.sigmoid(b) + return g, beta + + +# --------------------------------------------------------------------------- +# Forward metadata +# --------------------------------------------------------------------------- + + +@dataclass +class GDNForwardMetadata: + """Per-batch metadata for GDN backend.""" + + cache_indices: torch.Tensor # [batch_size] = req_pool_indices + cu_seqlens: Optional[torch.Tensor] = None # extend only + + +# --------------------------------------------------------------------------- +# GDNAttnBackend +# --------------------------------------------------------------------------- + + +class GDNAttnBackend: + """GDN linear-attention backend using pooled states. + + Handles both extend (prefill) and decode paths for GDN layers. 
+ Uses FlashInfer kernels when available (SM90+), with PyTorch fallback. + + Parameters + ---------- + gdn_pool + Pre-allocated :class:`~pymllm.mem_cache.memory_pool.GDNPool`. + device + Target device. + """ + + def __init__(self, gdn_pool: "GDNPool", device: torch.device): + self.gdn_pool = gdn_pool + self.device = device + self.forward_metadata: Optional[GDNForwardMetadata] = None + + # Pre-check FlashInfer availability + self._use_flashinfer, _, _ = _get_flashinfer_gdn() + + # One-shot flags to log the selected backend on first actual forward call + self._decode_backend_logged = False + self._extend_backend_logged = False + + def init_forward_metadata(self, forward_batch: "ForwardBatch") -> None: + """Prepare GDN metadata from the current forward batch.""" + cache_indices = forward_batch.req_pool_indices.to(torch.int64) + + cu_seqlens = None + if forward_batch.forward_mode.is_extend(): + # Build cu_seqlens from extend_seq_lens + if forward_batch.extend_seq_lens is not None: + seq_lens = forward_batch.extend_seq_lens.to(torch.int64) + cu_seqlens = torch.zeros( + len(seq_lens) + 1, + dtype=torch.int64, + device=self.device, + ) + torch.cumsum(seq_lens, dim=0, out=cu_seqlens[1:]) + + self.forward_metadata = GDNForwardMetadata( + cache_indices=cache_indices, + cu_seqlens=cu_seqlens, + ) + + # ------------------------------------------------------------------ + # CUDA-graph interface + # ------------------------------------------------------------------ + + def init_cuda_graph_state(self, max_bs: int, max_num_tokens: int) -> None: + """Allocate CUDA-graph state for GDN backend. + + The GDN pool buffers are already pre-allocated at fixed addresses, + so we only need to allocate the metadata tensor. 
+ """ + self._cuda_graph_cache_indices = torch.zeros( + (max_bs,), dtype=torch.int64, device=self.device + ) + + def init_forward_metadata_capture_cuda_graph( + self, + bs: int, + req_pool_indices: torch.Tensor, + seq_lens: torch.Tensor, + ) -> None: + """Set up GDN metadata for CUDA-graph capture (decode only).""" + self._cuda_graph_cache_indices[:bs].copy_( + req_pool_indices[:bs].to(torch.int64) + ) + self.forward_metadata = GDNForwardMetadata( + cache_indices=self._cuda_graph_cache_indices[:bs], + ) + + def init_forward_metadata_replay_cuda_graph( + self, + bs: int, + req_pool_indices: torch.Tensor, + seq_lens: torch.Tensor, + ) -> None: + """Update GDN metadata for CUDA-graph replay (decode only).""" + self._cuda_graph_cache_indices[:bs].copy_( + req_pool_indices[:bs].to(torch.int64) + ) + self.forward_metadata = GDNForwardMetadata( + cache_indices=self._cuda_graph_cache_indices[:bs], + ) + + # ------------------------------------------------------------------ + # Forward: decode + # ------------------------------------------------------------------ + + def forward_decode( + self, + layer: "RadixLinearAttention", + forward_batch: "ForwardBatch", + mixed_qkv: torch.Tensor, + a: torch.Tensor, + b: torch.Tensor, + ) -> torch.Tensor: + """GDN decode: one new token per request. + + Steps: + 1. Gather conv_state from pool → [bs, conv_dim, K-1] + 2. Conv1d update: shift + weighted sum for 1 new token + 3. Scatter updated conv_state back to pool + 4. SiLU → split q,k,v + 5. 
FlashInfer gated_delta_rule_decode (or PyTorch fallback) + """ + metadata = self.forward_metadata + cache_indices = metadata.cache_indices + gdn_idx = layer.gdn_layer_idx + bs = mixed_qkv.shape[0] + + recurrent_buf, conv_buf = self.gdn_pool.get_layer_state(gdn_idx) + conv_weight = layer.conv_weight # [conv_dim, kernel_size] + K = conv_weight.shape[1] + + # --- Conv1d decode: single-token update --- + conv_state = conv_buf[cache_indices] # [bs, conv_dim, K-1] + x = mixed_qkv.unsqueeze(-1) # [bs, conv_dim, 1] + + new_conv_state = torch.cat([conv_state[:, :, 1:], x], dim=-1) + full_window = torch.cat([conv_state, x], dim=-1) # [bs, conv_dim, K] + conv_out = (full_window * conv_weight.unsqueeze(0)).sum(dim=-1) + + conv_buf[cache_indices] = new_conv_state + + # --- SiLU activation --- + conv_out = F.silu(conv_out) + + # --- Split q, k, v --- + key_dim = layer.num_k_heads * layer.head_k_dim + value_dim = layer.num_v_heads * layer.head_v_dim + q, k, v = conv_out.split([key_dim, key_dim, value_dim], dim=-1) + q = q.view(bs, layer.num_k_heads, layer.head_k_dim) + k = k.view(bs, layer.num_k_heads, layer.head_k_dim) + v = v.view(bs, layer.num_v_heads, layer.head_v_dim) + + # --- Recurrent update --- + # Priority (when "auto"): FlashInfer SM90+ > mllm-kernel SM80+ > PyTorch + # Can be overridden via --server.gdn_decode_backend + backend = _get_gdn_decode_backend_override() + use_fi, _, fi_decode = _get_flashinfer_gdn() + mllm_gdn = _get_mllm_gdn_decode() + + use_flashinfer = ( + (backend in ("auto", "flashinfer")) + and use_fi and fi_decode is not None + and mixed_qkv.is_cuda + ) + use_mllm = ( + (backend in ("auto", "mllm_kernel")) + and not (backend == "auto" and use_flashinfer) + and mllm_gdn is not None + and mixed_qkv.is_cuda + ) + + if backend == "flashinfer" and not use_flashinfer: + logger.warning("GDNAttnBackend: gdn_decode_backend='flashinfer' requested but unavailable, falling back") + if backend == "mllm_kernel" and mllm_gdn is None: + 
logger.warning("GDNAttnBackend: gdn_decode_backend='mllm_kernel' requested but unavailable, falling back") + + if not self._decode_backend_logged: + if use_flashinfer: + selected = "flashinfer" + elif use_mllm: + selected = "mllm_kernel" + else: + selected = "pytorch" + logger.info( + "GDNAttnBackend: [decode] using backend=%s (config=%s)", selected, backend + ) + self._decode_backend_logged = True + + if use_flashinfer: + # FlashInfer decode (SM90+) + query_fi = q.unsqueeze(1) + key_fi = k.unsqueeze(1) + value_fi = v.unsqueeze(1) + a_fi = a.unsqueeze(1) + b_fi = b.unsqueeze(1) + + state_batch = recurrent_buf[cache_indices] + + output_fi, new_state = fi_decode( + q=query_fi, k=key_fi, v=value_fi, + state=state_batch, + A_log=layer.A_log.detach(), + a=a_fi, dt_bias=layer.dt_bias.detach(), b=b_fi, + scale=None, output=None, use_qk_l2norm=True, + ) + + recurrent_buf[cache_indices] = new_state + output = output_fi.squeeze(1) + + elif use_mllm: + # mllm-kernel fused CUDA decode (SM80+) + output = mllm_gdn( + q, k, v, a, b, + layer.A_log, layer.dt_bias, + recurrent_buf, cache_indices, + ) + + else: + # PyTorch fallback + g, beta = _gdn_gating(a, b, layer.A_log, layer.dt_bias) + output = self._decode_pytorch_fallback( + q, k, v, g, beta, recurrent_buf, cache_indices, layer + ) + + return output.reshape(bs, value_dim) + + def _decode_pytorch_fallback( + self, + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + g: torch.Tensor, + beta: torch.Tensor, + recurrent_buf: torch.Tensor, + cache_indices: torch.Tensor, + layer: "RadixLinearAttention", + ) -> torch.Tensor: + """Pure PyTorch decode fallback for GDN with delta rule and L2 norm. 
+ + Matches the sglang Triton kernel (fused_sigmoid_gating_delta_rule_update): + state *= exp(g) # decay + v_delta = v - state @ k # delta rule + v_delta *= beta # gating + state += v_delta outer k # state update + output = state @ q # readout + """ + bs = q.shape[0] + num_v_heads = layer.num_v_heads + num_k_heads = layer.num_k_heads + + # GQA: expand k/q heads to match v heads + if num_k_heads != num_v_heads: + repeats = num_v_heads // num_k_heads + q = q.repeat_interleave(repeats, dim=1) + k = k.repeat_interleave(repeats, dim=1) + + # All computation in float32 (state is float32, avoids dtype mismatch) + orig_dtype = q.dtype + q = q.float() + k = k.float() + v = v.float() + + # L2 normalize q and k per-head (matching use_qk_l2norm_in_kernel=True) + q = q / (q.norm(dim=-1, keepdim=True) + 1e-6) + k = k / (k.norm(dim=-1, keepdim=True) + 1e-6) + + decay = torch.exp(g.float()) # [bs, num_v_heads] + beta_f = beta.float() # [bs, num_v_heads] + + outputs = [] + for i in range(bs): + idx = cache_indices[i] + state = recurrent_buf[idx] # [H, V, K] float32 + + # Decay + state = state * decay[i].unsqueeze(-1).unsqueeze(-1) + + k_i = k[i] # [H, K] + v_i = v[i] # [H, V] + b_i = beta_f[i] # [H] + q_i = q[i] # [H, K] + + # Delta rule: v_delta = v - state @ k + v_delta = v_i - torch.bmm(state, k_i.unsqueeze(-1)).squeeze(-1) + v_delta = v_delta * b_i.unsqueeze(-1) # gating + + # State update: state += v_delta ⊗ k (outer product in [V, K] layout) + state = state + v_delta.unsqueeze(-1) * k_i.unsqueeze(-2) + recurrent_buf[idx] = state + + # Output: o = state @ q + o_t = torch.bmm(state, q_i.unsqueeze(-1)).squeeze(-1) # [H, V] + outputs.append(o_t) + + return torch.stack(outputs, dim=0).to(orig_dtype) # [bs, H, V] + + # ------------------------------------------------------------------ + # Forward: extend (prefill) + # ------------------------------------------------------------------ + + def forward_extend( + self, + layer: "RadixLinearAttention", + forward_batch: "ForwardBatch", + 
mixed_qkv: torch.Tensor, + a: torch.Tensor, + b: torch.Tensor, + ) -> torch.Tensor: + """GDN extend (prefill): multi-token per request. + + Steps: + 1. Gather conv_state from pool for each request + 2. Per-request causal conv1d + 3. Scatter new conv_state back to pool + 4. SiLU → split q,k,v → gating + 5. FlashInfer chunk_gated_delta_rule (or PyTorch fallback) + 6. Scatter final recurrent state back to pool + """ + metadata = self.forward_metadata + cache_indices = metadata.cache_indices + cu_seqlens = metadata.cu_seqlens + gdn_idx = layer.gdn_layer_idx + total_tokens = mixed_qkv.shape[0] + + recurrent_buf, conv_buf = self.gdn_pool.get_layer_state(gdn_idx) + conv_weight = layer.conv_weight # [conv_dim, kernel_size] + K = conv_weight.shape[1] + batch_size = cache_indices.shape[0] + + key_dim = layer.num_k_heads * layer.head_k_dim + value_dim = layer.num_v_heads * layer.head_v_dim + + # --- Per-request causal conv1d --- + conv_out = torch.empty_like(mixed_qkv) # [total_tokens, conv_dim] + + for i in range(batch_size): + start = int(cu_seqlens[i].item()) + end = int(cu_seqlens[i + 1].item()) + seq_len = end - start + if seq_len == 0: + continue + + idx = cache_indices[i] + x = mixed_qkv[start:end] # [seq_len, conv_dim] + prev_state = conv_buf[idx] # [conv_dim, K-1] + + # Pad with previous conv state + x_padded = torch.cat([prev_state.T, x], dim=0) # [K-1+seq_len, conv_dim] + + # Save new conv state (last K-1 tokens) + conv_buf[idx] = x_padded[-(K - 1):].T.clone() + + # Causal conv1d + out = torch.zeros(seq_len, x.shape[1], device=x.device, dtype=x.dtype) + for kk in range(K): + out += x_padded[kk: kk + seq_len] * conv_weight[:, kk] + conv_out[start:end] = out + + # --- SiLU activation --- + conv_out = F.silu(conv_out) + + # --- Split q, k, v --- + q, k, v = conv_out.split([key_dim, key_dim, value_dim], dim=-1) + q = q.view(total_tokens, layer.num_k_heads, layer.head_k_dim) + k = k.view(total_tokens, layer.num_k_heads, layer.head_k_dim) + v = v.view(total_tokens, 
layer.num_v_heads, layer.head_v_dim) + + # --- GDN gating --- + g, beta = _gdn_gating(a, b, layer.A_log, layer.dt_bias) + + # --- Recurrent computation --- + use_fi, fi_prefill, _ = _get_flashinfer_gdn() + use_fi_extend = use_fi and fi_prefill is not None and mixed_qkv.is_cuda + + if not self._extend_backend_logged: + logger.info( + "GDNAttnBackend: [extend] using backend=%s", + "flashinfer" if use_fi_extend else "pytorch", + ) + self._extend_backend_logged = True + + if use_fi_extend: + # Gather initial states for this batch + init_state = recurrent_buf[cache_indices].to(torch.float32) + # [batch_size, num_v_heads, head_v_dim, head_k_dim] + + alpha = torch.exp(g.to(torch.float32)) + beta_f32 = beta.to(torch.float32) + + # FlashInfer's use_qk_l2norm_in_kernel is silently ignored — + # the flag is declared in the Python wrapper but never forwarded + # to the CUDA kernel. Pre-normalize q and k here, matching + # sglang's approach (l2norm_fwd before calling with False). + q_fi = q / (q.norm(dim=-1, keepdim=True) + 1e-6) + k_fi = k / (k.norm(dim=-1, keepdim=True) + 1e-6) + + output, final_state = fi_prefill( + q=q_fi.contiguous(), + k=k_fi.contiguous(), + v=v.contiguous(), + g=alpha, + beta=beta_f32, + initial_state=init_state, + output_final_state=True, + cu_seqlens=cu_seqlens, + use_qk_l2norm_in_kernel=False, + ) + + # Scatter final states back to pool + recurrent_buf[cache_indices] = final_state.to(recurrent_buf.dtype) + else: + # PyTorch fallback: per-request sequential scan + output = self._extend_pytorch_fallback( + q, k, v, g, beta, recurrent_buf, cache_indices, cu_seqlens, layer + ) + + return output.reshape(total_tokens, value_dim) + + def _extend_pytorch_fallback( + self, + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + g: torch.Tensor, + beta: torch.Tensor, + recurrent_buf: torch.Tensor, + cache_indices: torch.Tensor, + cu_seqlens: torch.Tensor, + layer: "RadixLinearAttention", + ) -> torch.Tensor: + """Pure PyTorch extend fallback for GDN with 
delta rule and L2 norm.""" + total_tokens = q.shape[0] + num_v_heads = layer.num_v_heads + num_k_heads = layer.num_k_heads + head_v_dim = layer.head_v_dim + batch_size = cache_indices.shape[0] + + # All computation in float32 + orig_dtype = q.dtype + q = q.float() + k = k.float() + v = v.float() + + # L2 normalize q and k per-head + q = q / (q.norm(dim=-1, keepdim=True) + 1e-6) + k = k / (k.norm(dim=-1, keepdim=True) + 1e-6) + + # GQA expansion + if num_k_heads != num_v_heads: + repeats = num_v_heads // num_k_heads + q = q.repeat_interleave(repeats, dim=1) + k = k.repeat_interleave(repeats, dim=1) + + output = torch.zeros( + total_tokens, num_v_heads, head_v_dim, + device=q.device, dtype=torch.float32, + ) + + for i in range(batch_size): + start = int(cu_seqlens[i].item()) + end = int(cu_seqlens[i + 1].item()) + seq_len = end - start + if seq_len == 0: + continue + + idx = cache_indices[i] + q_seq = q[start:end] + k_seq = k[start:end] + v_seq = v[start:end] + g_seq = g[start:end] + beta_seq = beta[start:end] + + decay = torch.exp(g_seq.float()) # [seq_len, H] + beta_f = beta_seq.float() # [seq_len, H] + state = recurrent_buf[idx].clone() # [H, V, K] float32 + + seq_outputs = [] + for t in range(seq_len): + # Decay + state = state * decay[t].unsqueeze(-1).unsqueeze(-1) + + k_t = k_seq[t] # [H, K] + v_t = v_seq[t] # [H, V] + b_t = beta_f[t] # [H] + q_t = q_seq[t] # [H, K] + + # Delta rule: v_delta = v - state @ k + v_delta = v_t - torch.bmm(state, k_t.unsqueeze(-1)).squeeze(-1) + v_delta = v_delta * b_t.unsqueeze(-1) + + # State update + state = state + v_delta.unsqueeze(-1) * k_t.unsqueeze(-2) + + # Output + o_t = torch.bmm(state, q_t.unsqueeze(-1)).squeeze(-1) + seq_outputs.append(o_t) + + recurrent_buf[idx] = state + output[start:end] = torch.stack(seq_outputs, dim=0) + + return output.to(orig_dtype) + + # ------------------------------------------------------------------ + # Dispatch entry point + # 
------------------------------------------------------------------ + + def forward_gdn( + self, + layer: "RadixLinearAttention", + forward_batch: "ForwardBatch", + mixed_qkv: torch.Tensor, + a: torch.Tensor, + b: torch.Tensor, + ) -> torch.Tensor: + """Route to decode or extend based on forward mode.""" + if forward_batch.forward_mode.is_decode(): + return self.forward_decode(layer, forward_batch, mixed_qkv, a, b) + else: + return self.forward_extend(layer, forward_batch, mixed_qkv, a, b) diff --git a/pymllm/layers/attention/hybrid_backend.py b/pymllm/layers/attention/hybrid_backend.py new file mode 100644 index 00000000..a5628259 --- /dev/null +++ b/pymllm/layers/attention/hybrid_backend.py @@ -0,0 +1,184 @@ +"""Hybrid attention backend -- FlashInfer + GDN for hybrid architectures. + +Wraps a :class:`FlashInferAttnBackend` (for full-attention layers) and a +:class:`GDNAttnBackend` (for GDN linear-attention layers). Dispatches +based on layer type: + +* ``RadixAttention`` calls → delegated to ``full_attn_backend`` +* ``RadixLinearAttention`` calls (via ``forward_gdn``) → delegated to ``gdn_backend`` + +CUDA-graph compatible: delegates all graph lifecycle methods to both +sub-backends. +""" + +from __future__ import annotations + +import logging +from typing import TYPE_CHECKING, Optional, Set + +import torch + +from pymllm.layers.attention.attention_backend import AttentionBackend + +if TYPE_CHECKING: + from pymllm.engine.forward_batch import ForwardBatch, ForwardMode + from pymllm.layers.attention.flashinfer_backend import FlashInferAttnBackend + from pymllm.layers.attention.gdn_backend import GDNAttnBackend + from pymllm.layers.attention.radix_attention import RadixAttention + from pymllm.layers.attention.radix_linear_attention import RadixLinearAttention + +logger = logging.getLogger(__name__) + + +class HybridAttnBackend(AttentionBackend): + """Composite attention backend for hybrid full-attention + GDN models. 
+ + Parameters + ---------- + full_attn_backend + FlashInfer backend for standard transformer attention layers. + gdn_backend + GDN backend for linear-attention layers. + full_attn_layer_ids + Set of global layer IDs that use full attention (for logging). + """ + + def __init__( + self, + full_attn_backend: "FlashInferAttnBackend", + gdn_backend: "GDNAttnBackend", + full_attn_layer_ids: Set[int], + ): + self.full_attn_backend = full_attn_backend + self.gdn_backend = gdn_backend + self.full_attn_layer_ids = full_attn_layer_ids + + logger.info( + "HybridAttnBackend created: %d full-attn layers, " + "%d GDN layers", + len(full_attn_layer_ids), + gdn_backend.gdn_pool.num_gdn_layers, + ) + + # ------------------------------------------------------------------ + # Core interface: init_forward_metadata + # ------------------------------------------------------------------ + + def init_forward_metadata(self, forward_batch: "ForwardBatch") -> None: + """Initialize metadata for both sub-backends.""" + self.full_attn_backend.init_forward_metadata(forward_batch) + self.gdn_backend.init_forward_metadata(forward_batch) + + # ------------------------------------------------------------------ + # Full attention: forward_decode / forward_extend + # ------------------------------------------------------------------ + + def forward_decode( + self, + q: torch.Tensor, + k: Optional[torch.Tensor], + v: Optional[torch.Tensor], + layer: "RadixAttention", + forward_batch: "ForwardBatch", + save_kv_cache: bool = True, + **kwargs, + ) -> torch.Tensor: + """Delegate full-attention decode to FlashInfer backend.""" + return self.full_attn_backend.forward_decode( + q, k, v, layer, forward_batch, save_kv_cache=save_kv_cache, **kwargs + ) + + def forward_extend( + self, + q: torch.Tensor, + k: Optional[torch.Tensor], + v: Optional[torch.Tensor], + layer: "RadixAttention", + forward_batch: "ForwardBatch", + save_kv_cache: bool = True, + **kwargs, + ) -> torch.Tensor: + """Delegate full-attention 
extend to FlashInfer backend.""" + return self.full_attn_backend.forward_extend( + q, k, v, layer, forward_batch, save_kv_cache=save_kv_cache, **kwargs + ) + + # ------------------------------------------------------------------ + # GDN linear attention: forward_gdn + # ------------------------------------------------------------------ + + def forward_gdn( + self, + layer: "RadixLinearAttention", + forward_batch: "ForwardBatch", + mixed_qkv: torch.Tensor, + a: torch.Tensor, + b: torch.Tensor, + ) -> torch.Tensor: + """Delegate GDN computation to the GDN backend.""" + return self.gdn_backend.forward_gdn( + layer=layer, + forward_batch=forward_batch, + mixed_qkv=mixed_qkv, + a=a, + b=b, + ) + + # ------------------------------------------------------------------ + # CUDA-graph interface: delegate to both sub-backends + # ------------------------------------------------------------------ + + def get_cuda_graph_seq_len_fill_value(self) -> int: + """Delegate to the full-attention backend.""" + return self.full_attn_backend.get_cuda_graph_seq_len_fill_value() + + def init_cuda_graph_state(self, max_bs: int, max_num_tokens: int) -> None: + """Allocate CUDA-graph state for both sub-backends.""" + self.full_attn_backend.init_cuda_graph_state(max_bs, max_num_tokens) + self.gdn_backend.init_cuda_graph_state(max_bs, max_num_tokens) + + def init_forward_metadata_capture_cuda_graph( + self, + bs: int, + num_tokens: int, + req_pool_indices: torch.Tensor, + seq_lens: torch.Tensor, + forward_mode: "ForwardMode", + ) -> None: + """Set up metadata for CUDA-graph capture in both sub-backends.""" + self.full_attn_backend.init_forward_metadata_capture_cuda_graph( + bs=bs, + num_tokens=num_tokens, + req_pool_indices=req_pool_indices, + seq_lens=seq_lens, + forward_mode=forward_mode, + ) + self.gdn_backend.init_forward_metadata_capture_cuda_graph( + bs=bs, + req_pool_indices=req_pool_indices, + seq_lens=seq_lens, + ) + + def init_forward_metadata_replay_cuda_graph( + self, + bs: int, + 
req_pool_indices: torch.Tensor, + seq_lens: torch.Tensor, + seq_lens_sum: int, + forward_mode: "ForwardMode", + seq_lens_cpu: Optional[torch.Tensor], + ) -> None: + """Update metadata for CUDA-graph replay in both sub-backends.""" + self.full_attn_backend.init_forward_metadata_replay_cuda_graph( + bs=bs, + req_pool_indices=req_pool_indices, + seq_lens=seq_lens, + seq_lens_sum=seq_lens_sum, + forward_mode=forward_mode, + seq_lens_cpu=seq_lens_cpu, + ) + self.gdn_backend.init_forward_metadata_replay_cuda_graph( + bs=bs, + req_pool_indices=req_pool_indices, + seq_lens=seq_lens, + ) diff --git a/pymllm/layers/attention/radix_linear_attention.py b/pymllm/layers/attention/radix_linear_attention.py new file mode 100644 index 00000000..01993163 --- /dev/null +++ b/pymllm/layers/attention/radix_linear_attention.py @@ -0,0 +1,116 @@ +"""RadixLinearAttention -- GDN linear-attention layer for hybrid models. + +Analogous to :class:`RadixAttention` but for GDN (Gated Delta Net) layers. +Stores per-layer GDN parameters and delegates computation to the +:meth:`AttentionBackend.forward_gdn` method on the current +:class:`~pymllm.engine.forward_batch.ForwardBatch`. +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import torch +from torch import nn + +if TYPE_CHECKING: + from pymllm.engine.forward_batch import ForwardBatch + + +class RadixLinearAttention(nn.Module): + """GDN linear-attention layer that delegates to the attention backend. + + Each GDN layer in a pymllm model creates one ``RadixLinearAttention`` + with a unique ``layer_id`` and ``gdn_layer_idx``. During forward, it + calls ``forward_batch.attn_backend.forward_gdn(...)`` which routes to + the appropriate GDN backend implementation. + + Parameters + ---------- + layer_id : int + Global zero-based layer index within the model. + gdn_layer_idx : int + Sequential zero-based index among GDN layers only (not global). + Used to index into :class:`~pymllm.mem_cache.memory_pool.GDNPool`. 
+ num_k_heads : int + Number of key heads. + num_v_heads : int + Number of value heads. + head_k_dim : int + Per-head key dimension. + head_v_dim : int + Per-head value dimension. + conv_weight : nn.Parameter + Reference to the GDNConv1d weight parameter. + A_log : nn.Parameter + Log-space decay parameter. + dt_bias : nn.Parameter + Bias for the decay gate. + """ + + def __init__( + self, + layer_id: int, + gdn_layer_idx: int, + num_k_heads: int, + num_v_heads: int, + head_k_dim: int, + head_v_dim: int, + conv_weight: nn.Parameter, + A_log: nn.Parameter, + dt_bias: nn.Parameter, + ): + super().__init__() + self.layer_id = layer_id + self.gdn_layer_idx = gdn_layer_idx + self.num_k_heads = num_k_heads + self.num_v_heads = num_v_heads + self.head_k_dim = head_k_dim + self.head_v_dim = head_v_dim + # Store references to model parameters (not copies) + self.conv_weight = conv_weight + self.A_log = A_log + self.dt_bias = dt_bias + + def forward( + self, + forward_batch: "ForwardBatch", + mixed_qkv: torch.Tensor, + a: torch.Tensor, + b: torch.Tensor, + ) -> torch.Tensor: + """Delegate GDN computation to the attention backend. + + Parameters + ---------- + forward_batch + Batch metadata with ``attn_backend`` attached. + mixed_qkv + Concatenated Q/K/V projection output before conv1d. + a + Decay gate input, shape ``[num_tokens, num_v_heads]``. + b + Update gate input, shape ``[num_tokens, num_v_heads]``. + + Returns + ------- + torch.Tensor + GDN attention output, shape ``[num_tokens, num_v_heads * head_v_dim]``. 
+ """ + return forward_batch.attn_backend.forward_gdn( + layer=self, + forward_batch=forward_batch, + mixed_qkv=mixed_qkv, + a=a, + b=b, + ) + + def extra_repr(self) -> str: + return ( + f"layer_id={self.layer_id}, " + f"gdn_layer_idx={self.gdn_layer_idx}, " + f"k_heads={self.num_k_heads}, " + f"v_heads={self.num_v_heads}, " + f"k_dim={self.head_k_dim}, " + f"v_dim={self.head_v_dim}" + ) diff --git a/pymllm/layers/gated_delta_net.py b/pymllm/layers/gated_delta_net.py new file mode 100644 index 00000000..3753734d --- /dev/null +++ b/pymllm/layers/gated_delta_net.py @@ -0,0 +1,168 @@ +"""Gated Delta Network (GDN) linear attention for Qwen3.5. + +This implements the linear attention mechanism used in Qwen3.5's hybrid +architecture. GDN alternates with standard full-attention layers. + +Core formulation (decode, per-head): + g_t = -exp(A_log) * softplus(a_t + dt_bias) + beta_t = sigmoid(b_t) + state_t = exp(g_t) * state_{t-1} + beta_t * (k_t outer v_t) + output_t = (q_t @ state_t) + +State is externalized into a :class:`~pymllm.mem_cache.memory_pool.GDNPool` +and computation is delegated to the attention backend via +:class:`~pymllm.layers.attention.radix_linear_attention.RadixLinearAttention`. +""" + +from __future__ import annotations + +import logging +from typing import Any, Optional + +import torch +import torch.nn as nn + +from pymllm.layers.base import MllmBaseLayer +from pymllm.layers.linear import Linear +from pymllm.layers.utils import set_weight_attrs + +logger = logging.getLogger(__name__) + + +# --------------------------------------------------------------------------- +# Conv1d weight holder +# --------------------------------------------------------------------------- + + +class GDNConv1d(nn.Module): + """Causal 1D convolution weight holder for GDN sequence mixing. + + The actual convolution computation is performed by the GDN backend + using pooled conv states. This module only holds the learnable weight. 
+ """ + + def __init__(self, channels: int, kernel_size: int): + super().__init__() + self.channels = channels + self.kernel_size = kernel_size + self.weight = nn.Parameter(torch.empty(channels, kernel_size)) + + +# --------------------------------------------------------------------------- +# GatedDeltaNet — main GDN layer +# --------------------------------------------------------------------------- + + +class GatedDeltaNet(MllmBaseLayer): + """Gated Delta Network linear attention layer for Qwen3.5. + + State is externalized into a GDNPool and computation is delegated to + the attention backend via RadixLinearAttention. + + Parameters + ---------- + hidden_size : int + Model hidden dimension. + num_k_heads : int + Number of key heads. + num_v_heads : int + Number of value heads. + head_k_dim : int + Per-head key dimension. + head_v_dim : int + Per-head value dimension. + conv_kernel_size : int + Causal conv1d kernel width. + layer_id : int + Global layer index. + gdn_layer_idx : int + Sequential index among GDN layers (0-based). + rms_norm_eps : float + Epsilon for gated RMS normalization. 
+ """ + + def __init__( + self, + hidden_size: int, + num_k_heads: int = 16, + num_v_heads: int = 32, + head_k_dim: int = 128, + head_v_dim: int = 128, + conv_kernel_size: int = 4, + layer_id: int = 0, + gdn_layer_idx: int = 0, + rms_norm_eps: float = 1e-6, + ): + super().__init__() + self.hidden_size = hidden_size + self.num_k_heads = num_k_heads + self.num_v_heads = num_v_heads + self.head_k_dim = head_k_dim + self.head_v_dim = head_v_dim + self.key_dim = head_k_dim * num_k_heads + self.value_dim = head_v_dim * num_v_heads + self.conv_kernel_size = conv_kernel_size + self.layer_id = layer_id + self.gdn_layer_idx = gdn_layer_idx + + # Input projections + self.in_proj_qkv = Linear(hidden_size, self.key_dim * 2 + self.value_dim, bias=False) + self.in_proj_z = Linear(hidden_size, self.value_dim, bias=False) + self.in_proj_a = Linear(hidden_size, num_v_heads, bias=False) + self.in_proj_b = Linear(hidden_size, num_v_heads, bias=False) + + # Causal convolution (weight only — computation is in the backend) + self.conv1d = GDNConv1d(self.key_dim * 2 + self.value_dim, conv_kernel_size) + + # State parameters (must stay float32 for numerical stability) + self.A_log = nn.Parameter(torch.empty(num_v_heads, dtype=torch.float32)) + self.dt_bias = nn.Parameter(torch.ones(num_v_heads, dtype=torch.float32)) + set_weight_attrs(self.A_log, {"weight_loader": self.weight_loader}) + set_weight_attrs(self.dt_bias, {"weight_loader": self.weight_loader}) + + # Gated RMSNorm (mllm-kernel accelerated) + from pymllm.layers.rms_norm_gated import RMSNormGated + self.norm = RMSNormGated(head_v_dim, eps=rms_norm_eps, norm_before_gate=True) + + # Output projection + self.out_proj = Linear(self.value_dim, hidden_size, bias=False) + + # RadixLinearAttention — delegates to the attention backend + from pymllm.layers.attention.radix_linear_attention import RadixLinearAttention + self.attn = RadixLinearAttention( + layer_id=layer_id, + gdn_layer_idx=gdn_layer_idx, + num_k_heads=num_k_heads, + 
num_v_heads=num_v_heads, + head_k_dim=head_k_dim, + head_v_dim=head_v_dim, + conv_weight=self.conv1d.weight, + A_log=self.A_log, + dt_bias=self.dt_bias, + ) + + def forward( + self, hidden_states: torch.Tensor, forward_batch: Any = None, + ) -> torch.Tensor: + seq_len, _ = hidden_states.shape + + # Input projections + mixed_qkv = self.in_proj_qkv(hidden_states) + z = self.in_proj_z(hidden_states) + a = self.in_proj_a(hidden_states) + b = self.in_proj_b(hidden_states) + + # Delegate to backend via RadixLinearAttention + # The backend handles: conv1d, SiLU, split, gating, recurrent update + attn_out = self.attn(forward_batch, mixed_qkv, a, b) + + # Gated norm + output projection + attn_out = attn_out.view(seq_len, self.num_v_heads, self.head_v_dim) + z = z.view(seq_len, self.num_v_heads, self.head_v_dim) + + attn_flat = attn_out.reshape(-1, self.head_v_dim) + z_flat = z.reshape(-1, self.head_v_dim) + normed = self.norm(attn_flat, z_flat) + normed = normed.view(seq_len, self.num_v_heads, self.head_v_dim) + normed = normed.reshape(seq_len, self.value_dim) + return self.out_proj(normed) diff --git a/pymllm/layers/rms_norm.py b/pymllm/layers/rms_norm.py index b55a0ea6..b20b36f3 100644 --- a/pymllm/layers/rms_norm.py +++ b/pymllm/layers/rms_norm.py @@ -1,5 +1,7 @@ from __future__ import annotations +from typing import Optional, Tuple, Union + import torch import flashinfer from torch.nn import Parameter @@ -19,7 +21,15 @@ def __init__(self, hidden_size: int, eps: float = 1e-6): self.weight = Parameter(torch.empty(hidden_size)) set_weight_attrs(self.weight, {"weight_loader": self.weight_loader}) - def forward(self, x: torch.Tensor) -> torch.Tensor: + def forward( + self, + x: torch.Tensor, + residual: Optional[torch.Tensor] = None, + ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: + if residual is not None: + flashinfer.norm.fused_add_rmsnorm(x, residual, self.weight.data, self.eps) + return x, residual + if x.shape[-1] != self.hidden_size: raise ValueError( 
f"Expected last dim == hidden_size ({self.hidden_size}), " @@ -47,7 +57,17 @@ def __init__(self, hidden_size: int, eps: float = 1e-6): self.weight = Parameter(torch.empty(hidden_size)) set_weight_attrs(self.weight, {"weight_loader": self.weight_loader}) - def forward(self, x: torch.Tensor) -> torch.Tensor: + def forward( + self, + x: torch.Tensor, + residual: Optional[torch.Tensor] = None, + ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: + if residual is not None: + flashinfer.norm.gemma_fused_add_rmsnorm( + x, residual, self.weight.data, self.eps + ) + return x, residual + if x.shape[-1] != self.hidden_size: raise ValueError( f"Expected last dim == hidden_size ({self.hidden_size}), " diff --git a/pymllm/layers/rms_norm_gated.py b/pymllm/layers/rms_norm_gated.py new file mode 100644 index 00000000..caec9b88 --- /dev/null +++ b/pymllm/layers/rms_norm_gated.py @@ -0,0 +1,154 @@ +"""Gated RMSNorm layer for Qwen3.5 GDN attention. + +Computes ``rmsnorm(x, weight, eps) * silu(z)`` using a fused CUDA kernel +from mllm-kernel. Falls back to PyTorch when the kernel is unavailable. 
+""" + +from __future__ import annotations + +import logging +from typing import Optional + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn import Parameter + +from pymllm.layers.base import MllmBaseLayer +from pymllm.layers.utils import set_weight_attrs + +logger = logging.getLogger(__name__) + +# --------------------------------------------------------------------------- +# Try to load the mllm-kernel fused CUDA implementation +# --------------------------------------------------------------------------- +_HAS_MLLM_KERNEL_CUDA = False +try: + from mllm_kernel.cuda.jit.rms_norm_gated import ( + rms_norm_gated as _mllm_rms_norm_gated, + ) + + _HAS_MLLM_KERNEL_CUDA = True +except Exception: + _mllm_rms_norm_gated = None + + +# --------------------------------------------------------------------------- +# Pure-PyTorch fallback +# --------------------------------------------------------------------------- + + +def _rms_norm_gated_pytorch( + x: torch.Tensor, + weight: torch.Tensor, + z: Optional[torch.Tensor] = None, + eps: float = 1e-6, + norm_before_gate: bool = True, +) -> torch.Tensor: + """Pure-PyTorch reference implementation.""" + dtype = x.dtype + x_fp32 = x.float() + w_fp32 = weight.float() + z_fp32 = z.float() if z is not None else None + + if z_fp32 is not None and not norm_before_gate: + x_fp32 = x_fp32 * F.silu(z_fp32) + + variance = x_fp32.pow(2).mean(dim=-1, keepdim=True) + rstd = torch.rsqrt(variance + eps) + out = x_fp32 * rstd * w_fp32 + + if z_fp32 is not None and norm_before_gate: + out = out * F.silu(z_fp32) + + return out.to(dtype) + + +# --------------------------------------------------------------------------- +# Unified dispatch +# --------------------------------------------------------------------------- + + +def rms_norm_gated( + x: torch.Tensor, + weight: torch.Tensor, + z: Optional[torch.Tensor] = None, + eps: float = 1e-6, + norm_before_gate: bool = True, +) -> torch.Tensor: + """Compute (optionally 
gated) RMS normalization. + + Uses the fused mllm-kernel CUDA implementation when available, + otherwise falls back to a pure-PyTorch implementation. + """ + if _HAS_MLLM_KERNEL_CUDA and x.is_cuda: + return _mllm_rms_norm_gated(x, weight, z=z, eps=eps) + return _rms_norm_gated_pytorch( + x, weight, z=z, eps=eps, norm_before_gate=norm_before_gate, + ) + + +# --------------------------------------------------------------------------- +# nn.Module wrapper +# --------------------------------------------------------------------------- + + +class RMSNormGated(MllmBaseLayer): + """Gated RMS Normalization layer for Qwen3.5 GDN attention. + + Computes:: + + output = rmsnorm(x, weight) * silu(z) # z is not None + output = rmsnorm(x, weight) # z is None + + Uses a fused CUDA kernel from mllm-kernel for maximum throughput. + + Parameters + ---------- + hidden_size : int + Dimensionality of the input (and weight vector). + eps : float + Small constant for numerical stability. + norm_before_gate : bool + If ``True`` (default): ``rmsnorm(x) * silu(z)``. + If ``False``: ``rmsnorm(x * silu(z))``. 
+ """ + + def __init__( + self, + hidden_size: int, + eps: float = 1e-6, + group_size: Optional[int] = None, + norm_before_gate: bool = True, + device: Optional[torch.device] = None, + dtype: Optional[torch.dtype] = None, + ): + super().__init__() + self.hidden_size = hidden_size + self.eps = eps + self.norm_before_gate = norm_before_gate + + factory_kwargs = {} + if device is not None: + factory_kwargs["device"] = device + if dtype is not None: + factory_kwargs["dtype"] = dtype + + self.weight = Parameter(torch.ones(hidden_size, **factory_kwargs)) + set_weight_attrs(self.weight, {"weight_loader": self.weight_loader}) + + def forward( + self, + x: torch.Tensor, + z: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + return rms_norm_gated( + x, self.weight, z=z, eps=self.eps, + norm_before_gate=self.norm_before_gate, + ) + + def extra_repr(self) -> str: + return ( + f"hidden_size={self.hidden_size}, eps={self.eps}, " + f"norm_before_gate={self.norm_before_gate}" + ) diff --git a/pymllm/layers/rope.py b/pymllm/layers/rope.py index 045774e9..94f89b20 100644 --- a/pymllm/layers/rope.py +++ b/pymllm/layers/rope.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import Optional, Tuple +from typing import List, Optional, Tuple import torch import flashinfer @@ -44,7 +44,10 @@ def apply_rope( """ if inplace: flashinfer.rope.apply_rope_inplace( - q, k, indptr, offsets, + q, + k, + indptr, + offsets, rotary_dim=rotary_dim, interleave=interleave, rope_scale=rope_scale, @@ -53,7 +56,10 @@ def apply_rope( return None return flashinfer.rope.apply_rope( - q, k, indptr, offsets, + q, + k, + indptr, + offsets, rotary_dim=rotary_dim, interleave=interleave, rope_scale=rope_scale, @@ -102,7 +108,10 @@ def apply_llama31_rope( """ if inplace: flashinfer.rope.apply_llama31_rope_inplace( - q, k, indptr, offsets, + q, + k, + indptr, + offsets, rotary_dim=rotary_dim, interleave=interleave, rope_scale=rope_scale, @@ -114,7 +123,10 @@ def apply_llama31_rope( return None 
return flashinfer.rope.apply_llama31_rope( - q, k, indptr, offsets, + q, + k, + indptr, + offsets, rotary_dim=rotary_dim, interleave=interleave, rope_scale=rope_scale, @@ -156,7 +168,9 @@ def apply_rope_pos_ids( """ if inplace: flashinfer.rope.apply_rope_pos_ids_inplace( - q, k, pos_ids, + q, + k, + pos_ids, rotary_dim=rotary_dim, interleave=interleave, rope_scale=rope_scale, @@ -165,7 +179,9 @@ def apply_rope_pos_ids( return None return flashinfer.rope.apply_rope_pos_ids( - q, k, pos_ids, + q, + k, + pos_ids, rotary_dim=rotary_dim, interleave=interleave, rope_scale=rope_scale, @@ -208,7 +224,9 @@ def apply_llama31_rope_pos_ids( """ if inplace: flashinfer.rope.apply_llama31_rope_pos_ids_inplace( - q, k, pos_ids, + q, + k, + pos_ids, rotary_dim=rotary_dim, interleave=interleave, rope_scale=rope_scale, @@ -220,7 +238,9 @@ def apply_llama31_rope_pos_ids( return None return flashinfer.rope.apply_llama31_rope_pos_ids( - q, k, pos_ids, + q, + k, + pos_ids, rotary_dim=rotary_dim, interleave=interleave, rope_scale=rope_scale, @@ -265,12 +285,117 @@ def apply_rope_with_cos_sin_cache( """ if inplace: flashinfer.rope.apply_rope_with_cos_sin_cache_inplace( - positions, query, key, head_size, cos_sin_cache, + positions, + query, + key, + head_size, + cos_sin_cache, is_neox=is_neox, ) return None return flashinfer.rope.apply_rope_with_cos_sin_cache( - positions, query, key, head_size, cos_sin_cache, + positions, + query, + key, + head_size, + cos_sin_cache, is_neox=is_neox, ) + + +def _rotate_half(x: torch.Tensor) -> torch.Tensor: + """Rotate the second half of the last dimension into the first half (neox-style).""" + half = x.shape[-1] // 2 + return torch.cat((-x[..., half:], x[..., :half]), dim=-1) + + +def apply_mrope( + q: torch.Tensor, + k: torch.Tensor, + positions: torch.Tensor, + cos_sin_cache: torch.Tensor, + mrope_section: List[int], + mrope_interleaved: bool = True, +) -> Tuple[torch.Tensor, torch.Tensor]: + """Apply multi-dimensional rotary position embedding 
(M-RoPE). + + Used by Qwen3-VL which assigns independent (t, h, w) position indices to + each token. For text tokens all three indices are the same sequential + value; for image tokens they follow the spatial grid layout. + + Args: + q: Query tensor, shape ``(T, num_q_heads, head_dim)``. + k: Key tensor, shape ``(T, num_kv_heads, head_dim)``. + positions: 3-D position IDs, shape ``(3, T)`` — rows are + ``(temporal, height, width)`` position indices. + cos_sin_cache: Precomputed cache, shape ``(max_pos, head_dim)``. + The first ``head_dim // 2`` columns are cosine values and the + remaining columns are sine values, each for frequencies + ``0, 1, ..., head_dim // 2 - 1``. + mrope_section: Three integers ``[s_t, s_h, s_w]`` that partition + the ``head_dim // 2`` rotary frequency dimensions among the + temporal, height, and width components. + ``sum(mrope_section)`` must equal ``head_dim // 2``. + mrope_interleaved: When ``True`` (Qwen3-VL default), uses the + interleaved layout where frequency dimensions are cycled + ``(t, h, w, t, h, w, ...)`` rather than grouped consecutively. + + Returns: + ``(q_rope, k_rope)`` with the same shapes as the inputs. + """ + rotary_dim = cos_sin_cache.shape[-1] # = head_dim + half_dim = rotary_dim // 2 + + # Look up cos/sin for each of the 3 position dimensions. + # positions: [3, T] => cos_sin: [3, T, rotary_dim] + cos_sin = cos_sin_cache[positions] + cos = cos_sin[..., :half_dim] # [3, T, half_dim] + sin = cos_sin[..., half_dim:] # [3, T, half_dim] + + if mrope_interleaved: + # Interleaved layout (Qwen3-VL): within the first + # mrope_section[1]*3 frequency dims, indices cycle (t, h, w). + # Remaining dims (indices >= span) all use the temporal position. + # Matches SGLang's apply_interleaved_rope. 
+ cos_merged = cos[0].clone() # start with temporal; shape [T, half_dim] + sin_merged = sin[0].clone() + span_h = mrope_section[1] * 3 + span_w = mrope_section[2] * 3 + cos_merged[..., 1:span_h:3] = cos[1, ..., 1:span_h:3] + cos_merged[..., 2:span_w:3] = cos[2, ..., 2:span_w:3] + sin_merged[..., 1:span_h:3] = sin[1, ..., 1:span_h:3] + sin_merged[..., 2:span_w:3] = sin[2, ..., 2:span_w:3] + else: + # Non-interleaved (Qwen2-VL style): consecutive frequency sections. + cos_sects = cos.split(mrope_section, dim=-1) # list of [T, s_i] + sin_sects = sin.split(mrope_section, dim=-1) + # Section i picks its cos/sin from positions[i] + cos_merged = torch.cat( + [cos_sects[i][i] for i in range(3)], dim=-1 + ) # [T, half_dim] + sin_merged = torch.cat( + [sin_sects[i][i] for i in range(3)], dim=-1 + ) # [T, half_dim] + + # Expand to full rotary_dim for the neox-style rotation formula: + # q_rot = q * cos_full + rotate_half(q) * sin_full + cos_full = cos_merged.repeat(1, 2) # [T, rotary_dim] + sin_full = sin_merged.repeat(1, 2) # [T, rotary_dim] + cos_4d = cos_full.unsqueeze(1) # [T, 1, rotary_dim] -- broadcasts over heads + sin_4d = sin_full.unsqueeze(1) + + q_rot = q[..., :rotary_dim] * cos_4d + _rotate_half(q[..., :rotary_dim]) * sin_4d + k_rot = k[..., :rotary_dim] * cos_4d + _rotate_half(k[..., :rotary_dim]) * sin_4d + + q_out = ( + torch.cat([q_rot, q[..., rotary_dim:]], dim=-1) + if rotary_dim < q.shape[-1] + else q_rot + ) + k_out = ( + torch.cat([k_rot, k[..., rotary_dim:]], dim=-1) + if rotary_dim < k.shape[-1] + else k_rot + ) + return q_out, k_out diff --git a/pymllm/layers/sampling.py b/pymllm/layers/sampling.py index ff84879c..26c769ff 100644 --- a/pymllm/layers/sampling.py +++ b/pymllm/layers/sampling.py @@ -74,6 +74,15 @@ def softmax( torch.Tensor Probabilities with the same shape as *logits*. """ + # Clamp temperature to avoid division by zero (temperature=0 → greedy). 
+ # Replace 0 with 1 here; the caller (ModelRunner.sample) handles + # temperature=0 via argmax before reaching this path. + if temperature is not None: + if isinstance(temperature, torch.Tensor): + temperature = temperature.clamp(min=1e-6) + elif temperature < 1e-6: + temperature = 1.0 # effectively no scaling; caller uses argmax + if _HAS_FLASHINFER: return _fi_sampling.softmax( logits, temperature=temperature, enable_pdl=enable_pdl diff --git a/pymllm/mem_cache/memory_pool.py b/pymllm/mem_cache/memory_pool.py index f9c176a9..9c8ab2a9 100644 --- a/pymllm/mem_cache/memory_pool.py +++ b/pymllm/mem_cache/memory_pool.py @@ -83,6 +83,10 @@ def __init__( self.device = torch.device(device) self.dtype = dtype + # pin_memory only applies to CPU tensors + if self.device.type != "cpu": + pin_memory = False + buf_len = size + 1 # slot 0 is padding if buf_len % 8 != 0: @@ -472,6 +476,161 @@ def make_full_attention_net_mem_pool( return pool, allocator +class GDNPool: + """Pre-allocated memory pool for GDN recurrent and conv states. + + Indexed by ``req_pool_idx`` (same index space as :class:`ReqToTokenPool`). + Slot 0 is reserved as a padding / dummy slot and is never allocated. + + Layout:: + + recurrent_state[gdn_layer_idx, slot, num_v_heads, head_k_dim, head_v_dim] + float32 (FlashInfer requirement) + conv_state[gdn_layer_idx, slot, conv_dim, kernel_size - 1] + model dtype (bfloat16 / float16) + + Parameters + ---------- + max_reqs : int + Maximum number of concurrent requests (matches ``ReqToTokenPool.size``). + num_gdn_layers : int + Number of GDN (linear attention) layers in the model. + num_v_heads : int + Number of value heads per GDN layer. + head_k_dim : int + Per-head key dimension. + head_v_dim : int + Per-head value dimension. + conv_dim : int + Total convolution input dimension (``key_dim * 2 + value_dim``). + conv_kernel_size : int + Causal conv1d kernel width (state stores ``kernel_size - 1`` columns). + device : str | torch.device + Target device. 
+ dtype : torch.dtype + Storage dtype for conv_state (recurrent_state is always float32). + """ + + def __init__( + self, + max_reqs: int, + num_gdn_layers: int, + num_v_heads: int, + head_k_dim: int, + head_v_dim: int, + conv_dim: int, + conv_kernel_size: int, + device: Union[str, torch.device] = "cuda", + dtype: torch.dtype = torch.bfloat16, + max_track_slots: int = 0, + ): + self.max_reqs = max_reqs + self.num_gdn_layers = num_gdn_layers + self.num_v_heads = num_v_heads + self.head_k_dim = head_k_dim + self.head_v_dim = head_v_dim + self.conv_dim = conv_dim + self.conv_kernel_size = conv_kernel_size + self.device = torch.device(device) + self.dtype = dtype + self.max_track_slots = max_track_slots + + # Track slots live after the working slots: indices + # [max_reqs + 1, max_reqs + 1 + max_track_slots) + pool_size = max_reqs + 1 + max_track_slots # slot 0 is padding + + # Recurrent state: always float32 (FlashInfer requirement) + # Shape: [num_gdn_layers, pool_size, num_v_heads, head_v_dim, head_k_dim] + # Note: FlashInfer uses (V, K) layout for the state matrix + self.recurrent_state = torch.zeros( + (num_gdn_layers, pool_size, num_v_heads, head_v_dim, head_k_dim), + dtype=torch.float32, + device=self.device, + ) + + # Conv state: model dtype + # Shape: [num_gdn_layers, pool_size, conv_dim, kernel_size - 1] + self.conv_state = torch.zeros( + (num_gdn_layers, pool_size, conv_dim, conv_kernel_size - 1), + dtype=dtype, + device=self.device, + ) + + # Track-slot free list (indices into the pool starting after working slots) + self._track_slot_base = max_reqs + 1 + self._free_track_slots: List[int] = list( + range(self._track_slot_base, self._track_slot_base + max_track_slots) + ) + + logger.info( + "GDNPool allocated: %d GDN layers, %d working + %d track slots, " + "v_heads=%d, k_dim=%d, v_dim=%d, conv_dim=%d, kernel=%d, %.2f GB", + num_gdn_layers, + max_reqs, + max_track_slots, + num_v_heads, + head_k_dim, + head_v_dim, + conv_dim, + conv_kernel_size, + 
self.mem_bytes() / (1 << 30), + ) + + def get_layer_state( + self, gdn_layer_idx: int + ) -> Tuple[torch.Tensor, torch.Tensor]: + """Return ``(recurrent_state, conv_state)`` for a specific GDN layer. + + Both are views into the pool tensors with shape: + - recurrent: ``[pool_size, num_v_heads, head_v_dim, head_k_dim]`` + - conv: ``[pool_size, conv_dim, kernel_size - 1]`` + """ + return ( + self.recurrent_state[gdn_layer_idx], + self.conv_state[gdn_layer_idx], + ) + + def reset_states(self, req_pool_indices: torch.Tensor) -> None: + """Zero-init GDN states for the given request pool indices. + + Called when new requests are allocated to ensure clean state. + """ + if req_pool_indices.numel() == 0: + return + # Zero both recurrent and conv states for all GDN layers + self.recurrent_state[:, req_pool_indices] = 0 + self.conv_state[:, req_pool_indices] = 0 + + # ------------------------------------------------------------------ + # Track-slot management (for prefix cache GDN state snapshots) + # ------------------------------------------------------------------ + + def alloc_track_slot(self) -> Optional[int]: + """Allocate a single track slot index. Returns ``None`` if exhausted.""" + if not self._free_track_slots: + return None + return self._free_track_slots.pop() + + def free_track_slot(self, slot: int) -> None: + """Return a track slot to the free list.""" + self._free_track_slots.append(slot) + + def copy_states(self, src_index: int, dst_index: int) -> None: + """Copy recurrent and conv states from *src_index* to *dst_index*. + + Works for any pool indices (working or track slots). 
+ """ + self.recurrent_state[:, dst_index] = self.recurrent_state[:, src_index] + self.conv_state[:, dst_index] = self.conv_state[:, src_index] + + def mem_bytes(self) -> int: + """Total memory consumption in bytes.""" + return ( + self.recurrent_state.nelement() * self.recurrent_state.element_size() + + self.conv_state.nelement() * self.conv_state.element_size() + ) + + def make_req_to_token_pool( max_reqs: int, max_context_len: int, diff --git a/pymllm/mem_cache/radix_cache.py b/pymllm/mem_cache/radix_cache.py index 997790ea..441a8c09 100644 --- a/pymllm/mem_cache/radix_cache.py +++ b/pymllm/mem_cache/radix_cache.py @@ -18,7 +18,7 @@ import time from collections import defaultdict from dataclasses import dataclass -from typing import Any, Dict, Iterator, List, Optional, Tuple, Union +from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple, Union import torch @@ -189,6 +189,7 @@ class InsertResult: """Returned by :meth:`RadixCache.insert`.""" prefix_len: int = 0 + last_node: Optional[TreeNode] = None @dataclass @@ -224,11 +225,13 @@ def __init__( sliding_window_size: Optional[int] = None, disable: bool = False, token_to_kv_pool_allocator: Any = None, + on_node_evict: Optional[Callable[[int], None]] = None, ): self.page_size = page_size self.sliding_window_size = sliding_window_size self.disable = disable self.pool = token_to_kv_pool_allocator + self.on_node_evict = on_node_evict if self.pool is not None and hasattr(self.pool, "device"): self.device = self.pool.device @@ -332,9 +335,10 @@ def insert( plen = self._insert_swa( self.root_node, key, value, prev_prefix_len, swa_evicted_seqlen ) + return InsertResult(prefix_len=plen) else: - plen = self._insert_normal(self.root_node, key, value) - return InsertResult(prefix_len=plen) + plen, last_node = self._insert_normal(self.root_node, key, value) + return InsertResult(prefix_len=plen, last_node=last_node) def evict(self, num_tokens: int, swa_num_tokens: int = 0) -> EvictResult: """Evict up to 
*num_tokens* (full) and *swa_num_tokens* (SWA) tokens. @@ -589,30 +593,38 @@ def _match_swa(self, key: RadixKey) -> Tuple[List[torch.Tensor], TreeNode, int]: return values, best_node, best_count - def _insert_normal(self, node: TreeNode, key: RadixKey, value: torch.Tensor) -> int: + def _insert_normal( + self, node: TreeNode, key: RadixKey, value: torch.Tensor + ) -> Tuple[int, TreeNode]: + """Insert into non-SWA tree. Returns ``(prefix_len, last_node)``.""" now = time.monotonic() node.last_access_time = now if len(key) == 0: - return 0 + return 0, node total_prefix = 0 - while len(key) > 0: - ck = _child_key(key, self.page_size) - if ck not in node.children: - break + ck = _child_key(key, self.page_size) + while len(key) > 0 and ck in node.children: node = node.children[ck] node.last_access_time = now plen = _key_match(node.key, key, self.page_size) - if plen < len(node.key): - self._split_node(node.key, node, plen) total_prefix += plen key = key[plen:] value = value[plen:] + if plen < len(node.key): + # Partial match: split the node. ``node`` must advance to + # the NEW parent so that any remaining key is added as a + # sibling of the tail, not a child of it. 
+ node = self._split_node(node.key, node, plen) + if len(key) > 0: + ck = _child_key(key, self.page_size) + if len(key) > 0: - self._add_leaf(node, key, value) + new_leaf = self._add_leaf(node, key, value) + node = new_leaf - return total_prefix + return total_prefix, node def _insert_swa( self, @@ -730,6 +742,8 @@ def _delete_leaf(self, node: TreeNode) -> None: self._evictable_size -= len(node.key) if self.supports_swa and not node.swa_tombstone: self._swa_evictable_size -= len(node.key) + if self.on_node_evict is not None: + self.on_node_evict(node.id) def _tombstone_node(self, node: TreeNode) -> None: node.swa_tombstone = True diff --git a/pymllm/models/__init__.py b/pymllm/models/__init__.py index e69de29b..7751b309 100644 --- a/pymllm/models/__init__.py +++ b/pymllm/models/__init__.py @@ -0,0 +1,62 @@ +"""Model registry for pymllm. + +Maps HuggingFace ``config.architectures[0]`` strings to pymllm model classes. +Models are imported lazily via ``importlib`` so that heavy dependencies (torch, +numpy, etc.) are only loaded when a model is actually requested. +""" + +from __future__ import annotations + +import importlib +import logging +from typing import Dict, Optional, Tuple, Type + +import torch.nn as nn + +logger = logging.getLogger(__name__) + +# (module_path, class_name) +_MODEL_REGISTRY: Dict[str, Tuple[str, str]] = { + "Qwen3VLForConditionalGeneration": ( + "pymllm.models.qwen3_vl", + "Qwen3VLForConditionalGeneration", + ), + # Qwen3.5 (hybrid attention: full + GDN linear) + "Qwen3_5ForCausalLM": ( + "pymllm.models.qwen3_5", + "Qwen3_5ForCausalLM", + ), + "Qwen3_5ForConditionalGeneration": ( + "pymllm.models.qwen3_5", + "Qwen3_5ForConditionalGeneration", + ), +} + + +def get_model_class(architecture: str) -> Optional[Type[nn.Module]]: + """Look up a pymllm model class by HuggingFace architecture string. + + Returns ``None`` if the architecture is not registered or cannot be + imported. The caller is responsible for raising an appropriate error. 
+ """ + entry = _MODEL_REGISTRY.get(architecture) + if entry is None: + return None + + module_path, class_name = entry + try: + module = importlib.import_module(module_path) + cls = getattr(module, class_name) + logger.info( + "Resolved architecture %r -> %s.%s", architecture, module_path, class_name + ) + return cls + except (ImportError, AttributeError) as exc: + logger.warning( + "Failed to import %s.%s for architecture %r: %s", + module_path, + class_name, + architecture, + exc, + ) + return None diff --git a/pymllm/models/qwen3_5.py b/pymllm/models/qwen3_5.py new file mode 100644 index 00000000..ca4dbe2e --- /dev/null +++ b/pymllm/models/qwen3_5.py @@ -0,0 +1,530 @@ +"""Inference-only Qwen3.5 model for pymllm. + +Implements the hybrid attention architecture: +- **Full attention layers** (standard transformer with RoPE + output gate) +- **GDN linear attention layers** (Gated Delta Network, O(n) complexity) + +Layers alternate: linear, attention, linear, attention, ... based on +``full_attention_interval`` in the config. + +Supports: +- Dense (non-MoE) variant +- Vision-Language (multimodal) via inheritance from Qwen3VL + +Adapted from sglang's ``qwen3_5.py``. 
"""

from __future__ import annotations

import logging
import math
from typing import Any, Dict, Iterable, List, Optional, Set, Tuple

import torch
import torch.nn as nn
import torch.nn.functional as F

from pymllm.layers.attention.radix_attention import RadixAttention
from pymllm.layers.embedding import VocabParallelEmbedding
from pymllm.layers.gated_delta_net import GatedDeltaNet
from pymllm.layers.linear import Linear
from pymllm.layers.mlp import MLP
from pymllm.layers.rms_norm import GemmaRMSNorm, RMSNorm
from pymllm.layers.rope import apply_rope_pos_ids
from pymllm.layers.utils import set_weight_attrs

logger = logging.getLogger(__name__)


# ---------------------------------------------------------------------------
# Config helpers
# ---------------------------------------------------------------------------


def _get_text_config(config):
    """Extract the text sub-config from a multimodal config, or return as-is.

    Multimodal configs nest the language-model settings under ``text_config``;
    text-only configs carry them at the top level.
    """
    return getattr(config, "text_config", config)


def _get_layer_types(config) -> List[str]:
    """Return per-layer type list: 'attention' or 'linear_attention'.

    An explicit ``layers_block_type`` list on the config wins; otherwise the
    pattern is derived from ``full_attention_interval``: every
    ``interval``-th layer (1-based) is full attention, the rest are GDN
    linear attention.
    """
    if hasattr(config, "layers_block_type"):
        return config.layers_block_type
    # Compute from full_attention_interval (default: 1 full-attention layer
    # out of every 2).
    interval = getattr(config, "full_attention_interval", 2)
    n_layers = config.num_hidden_layers
    types = []
    for i in range(n_layers):
        # (i + 1) is the 1-based layer position.
        if (i + 1) % interval == 0:
            types.append("attention")
        else:
            types.append("linear_attention")
    return types


# ---------------------------------------------------------------------------
# Full Attention Layer (with output gate + QK norm)
# ---------------------------------------------------------------------------


class Qwen3_5FullAttention(nn.Module):
    """Standard multi-head attention with RoPE, QK-norm, and optional output gate.

    The Q projection is doubled when ``attn_output_gate`` is enabled: the
    first half of each head is the query, the second half is a per-channel
    sigmoid gate applied to the attention output.
    """

    def __init__(self, config, layer_id: int):
        super().__init__()
        tc = _get_text_config(config)
        self.hidden_size = tc.hidden_size
        self.num_heads = tc.num_attention_heads
        self.num_kv_heads = tc.num_key_value_heads
        self.head_dim = getattr(tc, "head_dim", self.hidden_size // self.num_heads)
        self.q_size = self.num_heads * self.head_dim
        self.kv_size = self.num_kv_heads * self.head_dim
        self.scaling = self.head_dim ** -0.5
        self.layer_id = layer_id

        # Output gate: Qwen3.5 doubles the Q projection and uses half as a
        # sigmoid gate on the attention output.
        self.attn_output_gate = getattr(tc, "attn_output_gate", True)

        if self.attn_output_gate:
            q_proj_size = self.q_size * 2  # Q + gate
        else:
            q_proj_size = self.q_size

        self.q_proj = Linear(self.hidden_size, q_proj_size, bias=False)
        self.k_proj = Linear(self.hidden_size, self.kv_size, bias=False)
        self.v_proj = Linear(self.hidden_size, self.kv_size, bias=False)
        # o_proj consumes only the (ungated) attention output of size q_size.
        self.o_proj = Linear(self.q_size, self.hidden_size, bias=False)

        # QK normalization — applied per head (operates on head_dim).
        self.q_norm = GemmaRMSNorm(self.head_dim, eps=tc.rms_norm_eps)
        self.k_norm = GemmaRMSNorm(self.head_dim, eps=tc.rms_norm_eps)

        # RoPE config. `rope_parameters` (newer HF configs) takes precedence
        # over `rope_scaling`; theta falls back to a flat attribute.
        self.partial_rotary_factor = getattr(tc, "partial_rotary_factor", 1.0)
        rope_config = getattr(tc, "rope_parameters", None) or getattr(tc, "rope_scaling", None) or {}
        self.rope_theta = rope_config.get("rope_theta", getattr(tc, "rope_theta", 10000.0))
        self.rotary_dim = int(self.head_dim * self.partial_rotary_factor)

        # RadixAttention layer — delegates to the pluggable attention backend
        self.attn = RadixAttention(
            num_heads=self.num_heads,
            head_dim=self.head_dim,
            scaling=self.scaling,
            num_kv_heads=self.num_kv_heads,
            layer_id=layer_id,
        )

    def forward(
        self,
        positions: torch.Tensor,
        hidden_states: torch.Tensor,
        forward_batch: Any,
    ) -> torch.Tensor:
        """Run gated full attention over a flattened token batch.

        Args:
            positions: Position IDs for RoPE, one per token.
            hidden_states: Flattened tokens, shape ``[num_tokens, hidden]``
                (inferred from ``shape[0]`` usage below).
            forward_batch: Backend metadata passed through to RadixAttention.
        """
        seq_len = hidden_states.shape[0]

        q = self.q_proj(hidden_states)
        k = self.k_proj(hidden_states)
        v = self.v_proj(hidden_states)

        if self.attn_output_gate:
            # Split Q into actual Q and gate. Per head, the first head_dim
            # channels are Q and the second head_dim channels are the gate.
            q_gate = q.view(seq_len, self.num_heads, self.head_dim * 2)
            q, gate = q_gate.chunk(2, dim=-1)
            q = q.reshape(seq_len, -1)
            gate = gate.reshape(seq_len, -1)

        # QK norm — reshape so GemmaRMSNorm sees one head per row.
        q = self.q_norm(q.reshape(-1, self.head_dim)).view(seq_len, -1)
        k = self.k_norm(k.reshape(-1, self.head_dim)).view(seq_len, -1)

        # RoPE (inplace; rotary_dim handles partial rotation)
        q = q.view(seq_len, self.num_heads, self.head_dim)
        k = k.view(seq_len, self.num_kv_heads, self.head_dim)
        apply_rope_pos_ids(
            q, k, positions, inplace=True,
            rotary_dim=self.rotary_dim, rope_theta=self.rope_theta,
        )
        q = q.reshape(seq_len, -1)
        k = k.reshape(seq_len, -1)

        # Standard attention via RadixAttention → attn_backend
        attn_output = self.attn(q, k, v, forward_batch)

        # Output gate — elementwise sigmoid gating of the attention output.
        if self.attn_output_gate:
            attn_output = attn_output * torch.sigmoid(gate)

        return self.o_proj(attn_output)


# ---------------------------------------------------------------------------
# Full Attention Decoder Layer
# ---------------------------------------------------------------------------


class Qwen3_5AttentionDecoderLayer(nn.Module):
    """Decoder layer with full attention + MLP (pre-norm, fused residual)."""

    def __init__(self, config, layer_id: int):
        super().__init__()
        tc = _get_text_config(config)
        self.self_attn = Qwen3_5FullAttention(config, layer_id)
        self.mlp = MLP(
            hidden_size=tc.hidden_size,
            intermediate_size=tc.intermediate_size,
            activation=tc.hidden_act,
        )
        self.input_layernorm = GemmaRMSNorm(tc.hidden_size, eps=tc.rms_norm_eps)
        self.post_attention_layernorm = GemmaRMSNorm(tc.hidden_size, eps=tc.rms_norm_eps)

    def forward(
        self,
        positions: torch.Tensor,
        hidden_states: torch.Tensor,
        residual: Optional[torch.Tensor],
        forward_batch: Any,
    ):
        """Return ``(hidden_states, residual)`` for the next layer.

        NOTE(review): relies on GemmaRMSNorm's two-argument form returning a
        (normed, new_residual) pair — confirm against pymllm's RMSNorm API.
        """
        # Pre-norm + residual. First layer has no incoming residual.
        if residual is None:
            residual = hidden_states
            hidden_states = self.input_layernorm(hidden_states)
        else:
            hidden_states, residual = self.input_layernorm(hidden_states, residual)

        hidden_states = self.self_attn(positions, hidden_states, forward_batch)

        # Post-attention norm + residual
        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
        hidden_states = self.mlp(hidden_states)

        return hidden_states, residual


# ---------------------------------------------------------------------------
# Linear Attention (GDN) Decoder Layer
# ---------------------------------------------------------------------------


class Qwen3_5LinearDecoderLayer(nn.Module):
    """Decoder layer with GDN linear attention + MLP.

    ``gdn_layer_idx`` is the sequential index among GDN layers only (used by
    the GDN state cache), distinct from the absolute ``layer_id``.
    """

    def __init__(self, config, layer_id: int, gdn_layer_idx: int = 0):
        super().__init__()
        tc = _get_text_config(config)
        self.linear_attn = GatedDeltaNet(
            hidden_size=tc.hidden_size,
            num_k_heads=getattr(tc, "linear_num_key_heads", 16),
            num_v_heads=getattr(tc, "linear_num_value_heads", 32),
            head_k_dim=getattr(tc, "linear_key_head_dim", 128),
            head_v_dim=getattr(tc, "linear_value_head_dim", 128),
            conv_kernel_size=getattr(tc, "linear_conv_kernel_dim", 4),
            layer_id=layer_id,
            gdn_layer_idx=gdn_layer_idx,
            rms_norm_eps=tc.rms_norm_eps,
        )
        self.mlp = MLP(
            hidden_size=tc.hidden_size,
            intermediate_size=tc.intermediate_size,
            activation=tc.hidden_act,
        )
        self.input_layernorm = GemmaRMSNorm(tc.hidden_size, eps=tc.rms_norm_eps)
        self.post_attention_layernorm = GemmaRMSNorm(tc.hidden_size, eps=tc.rms_norm_eps)

    def forward(
        self,
        positions: torch.Tensor,
        hidden_states: torch.Tensor,
        residual: Optional[torch.Tensor],
        forward_batch: Any,
    ):
        """Return ``(hidden_states, residual)``; positions are unused by GDN."""
        if residual is None:
            residual = hidden_states
            hidden_states = self.input_layernorm(hidden_states)
        else:
            hidden_states, residual = self.input_layernorm(hidden_states, residual)

        hidden_states = self.linear_attn(hidden_states, forward_batch)

        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
        hidden_states = self.mlp(hidden_states)

        return hidden_states, residual


# ---------------------------------------------------------------------------
# Layer type registry
# ---------------------------------------------------------------------------
# Layer type registry
# ---------------------------------------------------------------------------

# Maps a config layer-type string to its decoder-layer class.
# NOTE(review): the builder in Qwen3_5ForCausalLM.__init__ branches on the
# string directly and does not consult this dict — kept for external lookup?
_DECODER_LAYER_TYPES = {
    "attention": Qwen3_5AttentionDecoderLayer,
    "linear_attention": Qwen3_5LinearDecoderLayer,
}


# ---------------------------------------------------------------------------
# Qwen3.5 Language Model (dense variant)
# ---------------------------------------------------------------------------


class Qwen3_5ForCausalLM(nn.Module):
    """Qwen3.5 causal language model with hybrid attention.

    Alternates between full attention and GDN linear attention layers.
    Dense (non-MoE) variant.
    """

    def __init__(self, config):
        super().__init__()
        tc = _get_text_config(config)
        self.config = tc
        self.hidden_size = tc.hidden_size
        self.vocab_size = tc.vocab_size

        # Embedding
        self.embed_tokens = VocabParallelEmbedding(tc.vocab_size, tc.hidden_size)

        # Build hybrid decoder layers with sequential GDN indexing:
        # gdn_layer_idx counts only GDN layers so their state caches pack
        # densely, independent of the absolute layer index.
        layer_types = _get_layer_types(tc)
        self.layer_types = layer_types
        self.layers = nn.ModuleList()
        gdn_count = 0
        self.full_attn_layer_ids = set()
        for idx in range(tc.num_hidden_layers):
            layer_type = layer_types[idx]
            if layer_type == "linear_attention":
                self.layers.append(
                    Qwen3_5LinearDecoderLayer(config, idx, gdn_layer_idx=gdn_count)
                )
                gdn_count += 1
            else:
                self.layers.append(
                    Qwen3_5AttentionDecoderLayer(config, idx)
                )
                self.full_attn_layer_ids.add(idx)
        self.num_gdn_layers = gdn_count

        # Final norm
        self.norm = GemmaRMSNorm(tc.hidden_size, eps=tc.rms_norm_eps)

        logger.info(
            "Qwen3_5ForCausalLM: %d layers (%d attention + %d GDN)",
            tc.num_hidden_layers,
            len(self.full_attn_layer_ids),
            self.num_gdn_layers,
        )

    @torch.no_grad()
    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        forward_batch: Any,
        input_embeds: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """Return final hidden states for a flattened token batch.

        When ``input_embeds`` is given (e.g. multimodal embeddings already
        merged), the token embedding lookup is skipped.
        """
        if input_embeds is None:
            hidden_states = self.embed_tokens(input_ids)
        else:
            hidden_states = input_embeds

        residual = None
        for layer in self.layers:
            hidden_states, residual = layer(
                positions=positions,
                hidden_states=hidden_states,
                residual=residual,
                forward_batch=forward_batch,
            )

        # Final normalization — fold in the last residual if present.
        if residual is not None:
            hidden_states, _ = self.norm(hidden_states, residual)
        else:
            hidden_states = self.norm(hidden_states)

        return hidden_states

    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
        """Load HuggingFace checkpoint weights with name remapping.

        Skips rotary caches, MTP heads and vision weights; strips the
        ``model.`` / ``model.language_model.`` prefixes; fuses
        ``gate_proj``/``up_proj`` into the stacked ``gate_up_proj``.
        Returns the set of (remapped) names actually loaded.
        """
        # (target stacked name, checkpoint shard name, shard index)
        stacked_params_mapping = [
            ("gate_up_proj", "gate_proj", 0),
            ("gate_up_proj", "up_proj", 1),
        ]

        params_dict = dict(self.named_parameters())
        loaded: Set[str] = set()

        for name, weight in weights:
            if "rotary_emb.inv_freq" in name:
                continue
            if "mtp" in name:
                continue
            if "visual" in name:
                continue
            if "language_model" in name:
                name = name.replace("model.language_model.", "")
            if name.startswith("model."):
                name = name[len("model."):]
            # NOTE: do NOT strip .self_attn — pymllm keeps it as a submodule

            # Handle stacked params (gate_up_proj = gate_proj + up_proj)
            matched = False
            for param_name, weight_name, shard_id in stacked_params_mapping:
                if weight_name not in name:
                    continue
                if "mlp.experts" in name:
                    continue
                name = name.replace(weight_name, param_name)
                if name not in params_dict:
                    continue
                param = params_dict[name]
                # gate_up_proj is a plain Linear — manually place each shard
                # into its half of dim 0 (gate first, then up).
                output_dim = param.shape[0] // 2
                param.data[shard_id * output_dim : (shard_id + 1) * output_dim].copy_(
                    weight
                )
                matched = True
                break

            if not matched:
                if name not in params_dict:
                    continue
                param = params_dict[name]
                loader = getattr(param, "weight_loader", None)
                if loader is not None:
                    loader(param, weight)
                else:
                    # Squeeze conv1d weight from [C, 1, K] to [C, K]
                    if weight.dim() != param.dim():
                        weight = weight.squeeze()
                    param.data.copy_(weight)

            loaded.add(name)

        logger.info("Loaded %d parameter tensors for Qwen3_5ForCausalLM", len(loaded))
        return loaded


# ---------------------------------------------------------------------------
# Qwen3.5 Vision-Language Model
# ---------------------------------------------------------------------------


class Qwen3_5ForConditionalGeneration(nn.Module):
    """Qwen3.5 multimodal model (text + vision).

    Inherits vision encoder from Qwen3VL and uses Qwen3.5's hybrid
    language model.
    """

    def __init__(self, config):
        super().__init__()
        # Imported lazily to avoid a hard dependency when text-only.
        from pymllm.models.qwen3_vl import (
            Qwen3VLVisionModel,
        )

        self.config = config
        tc = _get_text_config(config)

        # Vision encoder (reuse Qwen3VL's vision model). Defaults below
        # mirror the Qwen3-VL vision config; overridden by vision_config.
        vision_config = getattr(config, "vision_config", None)
        if vision_config is not None:
            self.visual = Qwen3VLVisionModel(
                depth=getattr(vision_config, "depth", 27),
                hidden_size=getattr(vision_config, "hidden_size", 1152),
                hidden_act=getattr(vision_config, "hidden_act", "gelu_pytorch_tanh"),
                intermediate_size=getattr(vision_config, "intermediate_size", 4304),
                num_heads=getattr(vision_config, "num_heads", 16),
                in_channels=getattr(vision_config, "in_channels", 3),
                patch_size=getattr(vision_config, "patch_size", 16),
                spatial_merge_size=getattr(vision_config, "spatial_merge_size", 2),
                temporal_patch_size=getattr(vision_config, "temporal_patch_size", 2),
                out_hidden_size=getattr(vision_config, "out_hidden_size", 3584),
                num_position_embeddings=getattr(
                    vision_config, "num_position_embeddings", 2304
                ),
                deepstack_visual_indexes=getattr(
                    vision_config, "deepstack_visual_indexes", [8, 16, 24]
                ),
                norm_eps=getattr(tc, "rms_norm_eps", 1e-6),
            )
        else:
            self.visual = None

        # Language model
        self.model = Qwen3_5ForCausalLM(config)

        # Expose hybrid model metadata for ModelRunner
        self.num_gdn_layers = self.model.num_gdn_layers
        self.full_attn_layer_ids = self.model.full_attn_layer_ids

        # LM head (tied to embedding when tie_word_embeddings=True)
        self.lm_head = Linear(tc.hidden_size, tc.vocab_size, bias=False)
        if getattr(tc, "tie_word_embeddings", False):
            self.lm_head.weight = self.model.embed_tokens.weight

        # Vision token IDs (defaults are the standard Qwen vision IDs).
        self.image_token_id = getattr(config, "image_token_id", 151655)
        self.video_token_id = getattr(config, "video_token_id", 151656)

    @torch.no_grad()
    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        forward_batch: Any,
        input_embeds: Optional[torch.Tensor] = None,
        pixel_values: Optional[torch.Tensor] = None,
        image_grid_thw: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """Return LM logits; merges vision embeddings into the token stream.

        NOTE(review): assumes the number of image/video placeholder tokens in
        ``input_ids`` equals the number of merged visual tokens produced by
        the encoder — confirm against the processor.
        """
        # Process vision inputs if provided
        if input_embeds is None and pixel_values is not None and self.visual is not None:
            input_embeds = self.model.embed_tokens(input_ids)
            # Run vision encoder
            visual_embeds = self.visual(pixel_values, grid_thw=image_grid_thw)
            # Replace image/video token positions with visual embeddings
            mask = (input_ids == self.image_token_id) | (input_ids == self.video_token_id)
            if mask.any():
                input_embeds[mask] = visual_embeds.reshape(-1, visual_embeds.shape[-1])

        hidden_states = self.model(
            input_ids=input_ids,
            positions=positions,
            forward_batch=forward_batch,
            input_embeds=input_embeds,
        )

        # LM head
        logits = self.lm_head(hidden_states)
        return logits

    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
        """Load weights, dispatching visual vs language params."""
        visual_weights = []
        language_weights = []

        for name, weight in weights:
            # NOTE(review): the second condition is subsumed by the first
            # ("model.visual" always contains "visual").
            if "visual" in name or "model.visual" in name:
                # Normalize visual weight names
                name = name.replace("model.visual.", "visual.")
                name = name.replace("attn.qkv.", "attn.qkv_proj.")
                visual_weights.append((name, weight))
            else:
                language_weights.append((name, weight))

        # Load language model weights
        self.model.load_weights(language_weights)

        # Load visual weights — names not present in this module are
        # silently skipped (best-effort load).
        if self.visual is not None and visual_weights:
            params_dict = dict(self.named_parameters())
            for name, weight in visual_weights:
                if name in params_dict:
                    param = params_dict[name]
                    loader = getattr(param, "weight_loader", None)
                    if loader is not None:
                        loader(param, weight)
                    else:
                        param.data.copy_(weight)

        logger.info("Qwen3_5ForConditionalGeneration weights loaded")
diff --git a/pymllm/models/qwen3_vl.py b/pymllm/models/qwen3_vl.py
new file mode 100644
index 00000000..3bee27c8
--- /dev/null
+++ b/pymllm/models/qwen3_vl.py
@@ -0,0 +1,1329 @@
# Copyright 2025 Qwen Team
# Copyright 2025 SGLang Team
# Adapted for pymllm
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Inference-only Qwen3-VL model for pymllm.

Adapted from sglang's Qwen3-VL implementation for pymllm's single-GPU
inference architecture. Uses pymllm layers (RadixAttention, RMSNorm, MLP)
and conforms to the pymllm forward interface::

    model.forward(input_ids, positions, forward_batch)

Designed for a single accelerator card — no tensor / pipeline parallelism.
"""

from __future__ import annotations

import logging
from typing import TYPE_CHECKING, Iterable, List, Optional, Tuple

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

from pymllm.layers import RMSNorm, apply_mrope
from pymllm.layers.attention.radix_attention import RadixAttention
from pymllm.layers.mlp import MLP

if TYPE_CHECKING:
    from pymllm.engine.forward_batch import ForwardBatch

logger = logging.getLogger(__name__)


# ---------------------------------------------------------------------------
# Vision Encoder
# ---------------------------------------------------------------------------


class Qwen3VisionMLP(nn.Module):
    """MLP block for the vision encoder.

    Two dense layers with a configurable activation in between
    (GELU-tanh / GELU / SiLU fallback).
    """

    def __init__(
        self,
        in_features: int,
        hidden_features: int,
        hidden_act: str = "silu",
        bias: bool = True,
    ):
        super().__init__()
        self.linear_fc1 = nn.Linear(in_features, hidden_features, bias=bias)
        self.linear_fc2 = nn.Linear(hidden_features, in_features, bias=bias)
        if hidden_act == "gelu_pytorch_tanh":
            self.act = nn.GELU(approximate="tanh")
        elif hidden_act == "gelu":
            self.act = nn.GELU()
        else:
            # Any other string falls back to SiLU.
            self.act = nn.SiLU()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.linear_fc2(self.act(self.linear_fc1(x)))


class Qwen3VLVisionPatchEmbed(nn.Module):
    """3D convolution patch embedding for video/image patchification."""

    def __init__(
        self,
        patch_size: int = 16,
        temporal_patch_size: int = 2,
        in_channels: int = 3,
        embed_dim: int = 1152,
    ):
        super().__init__()
        self.patch_size = patch_size
        self.temporal_patch_size = temporal_patch_size
        self.in_channels = in_channels
        self.embed_dim = embed_dim

        # Stride == kernel: non-overlapping (T, H, W) patches.
        kernel_size = [temporal_patch_size, patch_size, patch_size]
        self.proj = nn.Conv3d(
            in_channels,
            embed_dim,
            kernel_size=kernel_size,
            stride=kernel_size,
            bias=True,
        )

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Map flattened pixel patches to ``[num_patches, embed_dim]``.

        Input is reshaped to ``[-1, C, T_patch, P, P]`` — i.e. one conv
        window per patch — so each conv application emits one embedding.
        """
        target_dtype = self.proj.weight.dtype
        hidden_states = hidden_states.view(
            -1,
            self.in_channels,
            self.temporal_patch_size,
            self.patch_size,
            self.patch_size,
        )
        # Cast pixels to the conv weight dtype before the conv.
        hidden_states = self.proj(hidden_states.to(dtype=target_dtype)).view(
            -1, self.embed_dim
        )
        return hidden_states


def _rotate_half(x: torch.Tensor) -> torch.Tensor:
    """Rotate half the hidden dims of the input for RoPE."""
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)


class Qwen3VisionAttention(nn.Module):
    """Multi-head self-attention for the vision encoder (no KV cache)."""

    def __init__(self, embed_dim: int, num_heads: int):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads

        self.qkv_proj = nn.Linear(embed_dim, 3 * embed_dim, bias=True)
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=True)

    def forward(
        self,
        x: torch.Tensor,
        cu_seqlens: torch.Tensor,
        rotary_pos_emb_cos: torch.Tensor,
        rotary_pos_emb_sin: torch.Tensor,
    ) -> torch.Tensor:
        """Forward pass with variable-length sequences via cu_seqlens.

        Args:
            x: [total_tokens, embed_dim]
            cu_seqlens: [num_seqs + 1] cumulative sequence lengths
            rotary_pos_emb_cos: [total_tokens, rotary_dim]
            rotary_pos_emb_sin: [total_tokens, rotary_dim]
        """
        seq_len = x.shape[0]
        qkv = self.qkv_proj(x)
        q, k, v = qkv.reshape(seq_len, 3, self.num_heads, self.head_dim).unbind(dim=1)

        # Apply rotary position embedding.
        # cos/sin are [total_tokens, head_dim // 2]. Following sglang's
        # VisionAttention: double them to full head_dim and apply RoPE to
        # all head dimensions (the rotation pairs (q[i], q[i + head_dim//2])).
        cos = rotary_pos_emb_cos
        sin = rotary_pos_emb_sin
        if cos.shape[-1] * 2 == self.head_dim:
            cos = torch.cat([cos, cos], dim=-1)
            sin = torch.cat([sin, sin], dim=-1)

        cos = cos.unsqueeze(1)  # [seq, 1, head_dim]
        sin = sin.unsqueeze(1)  # [seq, 1, head_dim]

        q = q * cos + _rotate_half(q) * sin
        k = k * cos + _rotate_half(k) * sin

        # Scaled dot-product attention per variable-length sequence.
        # NOTE(review): Python loop over images — O(num_seqs) SDPA calls;
        # acceptable for typical batch sizes, revisit if it shows in profiles.
        output = torch.empty_like(q)
        num_seqs = cu_seqlens.shape[0] - 1
        for i in range(num_seqs):
            start = cu_seqlens[i].item()
            end = cu_seqlens[i + 1].item()
            qi = q[start:end].transpose(0, 1).unsqueeze(0)  # [1, heads, seq, dim]
            ki = k[start:end].transpose(0, 1).unsqueeze(0)
            vi = v[start:end].transpose(0, 1).unsqueeze(0)
            oi = F.scaled_dot_product_attention(qi, ki, vi)
            output[start:end] = oi.squeeze(0).transpose(0, 1)

        output = output.reshape(seq_len, self.embed_dim)
        return self.out_proj(output)


class Qwen3VisionBlock(nn.Module):
    """Single vision transformer block (pre-LayerNorm, attention + MLP)."""

    def __init__(
        self,
        dim: int,
        num_heads: int,
        intermediate_dim: int,
        hidden_act: str = "silu",
        norm_eps: float = 1e-6,
    ):
        super().__init__()
        self.norm1 = nn.LayerNorm(dim, eps=norm_eps)
        self.norm2 = nn.LayerNorm(dim, eps=norm_eps)
        self.attn = Qwen3VisionAttention(embed_dim=dim, num_heads=num_heads)
        self.mlp = Qwen3VisionMLP(
            dim, intermediate_dim, hidden_act=hidden_act, bias=True
        )

    def forward(
        self,
        x: torch.Tensor,
        cu_seqlens: torch.Tensor,
        rotary_pos_emb_cos: torch.Tensor,
        rotary_pos_emb_sin: torch.Tensor,
    ) -> torch.Tensor:
        x = x + self.attn(
            self.norm1(x),
            cu_seqlens=cu_seqlens,
            rotary_pos_emb_cos=rotary_pos_emb_cos,
            rotary_pos_emb_sin=rotary_pos_emb_sin,
        )
        x = x + self.mlp(self.norm2(x))
        return x


class Qwen3VLVisionPatchMerger(nn.Module):
    """Merges spatial patches to reduce sequence length.

    Groups ``spatial_merge_size ** 2`` consecutive patch tokens and projects
    them to the language model hidden dimension.
    """

    def __init__(
        self,
        dim: int,
        context_dim: int,
        spatial_merge_size: int = 2,
        use_postshuffle_norm: bool = False,
        norm_eps: float = 1e-6,
    ):
        super().__init__()
        self.hidden_size = context_dim * (spatial_merge_size**2)
        self.use_postshuffle_norm = use_postshuffle_norm
        # Postshuffle: normalize after grouping (over hidden_size);
        # otherwise normalize each patch token (over context_dim) first.
        self.norm = nn.LayerNorm(
            self.hidden_size if use_postshuffle_norm else context_dim, eps=norm_eps
        )
        self.linear_fc1 = nn.Linear(self.hidden_size, self.hidden_size, bias=True)
        self.act_fn = nn.GELU()
        self.linear_fc2 = nn.Linear(self.hidden_size, dim, bias=True)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if self.use_postshuffle_norm:
            x = self.norm(x.view(-1, self.hidden_size))
        else:
            x = self.norm(x).view(-1, self.hidden_size)
        x = self.act_fn(self.linear_fc1(x))
        return self.linear_fc2(x)


class Qwen3VLVisionModel(nn.Module):
    """Complete vision encoder for Qwen3-VL.

    Produces patch embeddings from raw pixel values, applies a stack of
    vision transformer blocks with 3D rotary embeddings, then merges
    spatial patches. Supports "deep stack" where intermediate layer
    outputs are captured and concatenated to the final output.
    """

    def __init__(
        self,
        depth: int = 27,
        hidden_size: int = 1152,
        hidden_act: str = "gelu_pytorch_tanh",
        intermediate_size: int = 4304,
        num_heads: int = 16,
        in_channels: int = 3,
        patch_size: int = 16,
        spatial_merge_size: int = 2,
        temporal_patch_size: int = 2,
        out_hidden_size: int = 3584,
        num_position_embeddings: int = 2304,
        deepstack_visual_indexes: Optional[List[int]] = None,
        norm_eps: float = 1e-6,
    ):
        super().__init__()
        if deepstack_visual_indexes is None:
            deepstack_visual_indexes = [8, 16, 24]

        self.hidden_size = hidden_size
        self.num_heads = num_heads
        self.num_position_embeddings = num_position_embeddings
        # Learned pos-embed table is a square grid of side sqrt(N).
        self.num_grid_per_side = int(num_position_embeddings**0.5)
        self.patch_size = patch_size
        self.spatial_merge_size = spatial_merge_size
        self.temporal_patch_size = temporal_patch_size
        self.deepstack_visual_indexes = deepstack_visual_indexes
        # Total output dim = out_hidden_size * (1 main + N deepstack mergers)
        self.out_hidden_size = out_hidden_size * (1 + len(deepstack_visual_indexes))

        self.patch_embed = Qwen3VLVisionPatchEmbed(
            patch_size=patch_size,
            temporal_patch_size=temporal_patch_size,
            in_channels=in_channels,
            embed_dim=hidden_size,
        )

        self.pos_embed = nn.Embedding(num_position_embeddings, hidden_size)

        head_dim = hidden_size // num_heads
        self._init_rope_cache(head_dim)

        self.blocks = nn.ModuleList(
            [
                Qwen3VisionBlock(
                    dim=hidden_size,
                    num_heads=num_heads,
                    intermediate_dim=intermediate_size,
                    hidden_act=hidden_act,
                    norm_eps=norm_eps,
                )
                for _ in range(depth)
            ]
        )

        self.merger = Qwen3VLVisionPatchMerger(
            dim=out_hidden_size,
            context_dim=hidden_size,
            spatial_merge_size=spatial_merge_size,
            norm_eps=norm_eps,
        )

        # One extra merger per deepstack tap; postshuffle-norm variant.
        self.deepstack_merger_list = nn.ModuleList(
            [
                Qwen3VLVisionPatchMerger(
                    dim=out_hidden_size,
                    context_dim=hidden_size,
                    spatial_merge_size=spatial_merge_size,
                    use_postshuffle_norm=True,
                    norm_eps=norm_eps,
                )
                for _ in range(len(deepstack_visual_indexes))
            ]
        )

    def _init_rope_cache(self, head_dim: int, max_grid_size: int = 8192):
        """Precompute cos/sin cache for 2D rotary embeddings.

        Caches [max_grid_size, head_dim // 4] tables; each spatial axis
        (h, w) indexes one copy, which are concatenated in rot_pos_emb.
        """
        rotary_dim = head_dim // 2
        inv_freq = 1.0 / (
            10000.0
            ** (torch.arange(0, rotary_dim, 2, dtype=torch.float32) / rotary_dim)
        )
        t = torch.arange(max_grid_size, dtype=torch.float32)
        freqs = torch.outer(t, inv_freq)
        # Non-persistent: recomputed on construction, never checkpointed.
        self.register_buffer("cos_cache", torch.cos(freqs), persistent=False)
        self.register_buffer("sin_cache", torch.sin(freqs), persistent=False)

    @property
    def dtype(self) -> torch.dtype:
        return self.patch_embed.proj.weight.dtype

    @property
    def device(self) -> torch.device:
        return self.patch_embed.proj.weight.device

    # -- Rotary position embedding helpers --

    @staticmethod
    def _rot_pos_ids(h: int, w: int, spatial_merge_size: int) -> torch.Tensor:
        """Compute 2D rotary position IDs for a grid of *h* x *w* patches.

        The patches are re-ordered to group ``spatial_merge_size ** 2``
        neighbours together (matching the merger's token order).

        Returns tensor of shape ``[h*w, 2]`` with ``(height_pos, width_pos)``.
        """
        merge = spatial_merge_size
        h_ids = torch.arange(h).unsqueeze(1).expand(-1, w)
        w_ids = torch.arange(w).unsqueeze(0).expand(h, -1)

        # Reshape then permute so each merge-window's tokens are contiguous.
        h_ids = h_ids.reshape(h // merge, merge, w // merge, merge)
        w_ids = w_ids.reshape(h // merge, merge, w // merge, merge)

        h_ids = h_ids.permute(0, 2, 1, 3).flatten()
        w_ids = w_ids.permute(0, 2, 1, 3).flatten()

        return torch.stack([h_ids, w_ids], dim=-1)

    def rot_pos_emb(
        self, grid_thw: List[List[int]]
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Compute rotary pos-emb cos/sin for all images/videos in the batch."""
        pos_ids = []
        for t, h, w in grid_thw:
            base = self._rot_pos_ids(h, w, self.spatial_merge_size)
            # Video frames repeat the same spatial grid t times.
            pos_ids.append(base if t == 1 else base.repeat(t, 1))

        pos_ids = torch.cat(pos_ids, dim=0).to(self.device, non_blocking=True)
        # Index (h, w) tables and flatten -> [total_tokens, head_dim // 2].
        cos_combined = self.cos_cache[pos_ids].flatten(1)
        sin_combined = self.sin_cache[pos_ids].flatten(1)
        return cos_combined, sin_combined

    # -- Position embedding interpolation --

    def _get_interpolation_indices(self, dim_size: int) -> np.ndarray:
        """Fractional source indices for resampling one axis of the pos grid."""
        indices = (np.arange(dim_size, dtype=np.float32) + 0.5) * (
            self.num_grid_per_side / dim_size
        ) - 0.5
        return np.clip(indices, 0, self.num_grid_per_side - 1)

    def _calculate_indices_and_weights(
        self, h_idxs: np.ndarray, w_idxs: np.ndarray
    ) -> Tuple[List[np.ndarray], List[np.ndarray]]:
        """Compute bilinear interpolation indices and weights.

        Returns four flat index arrays (the four surrounding grid corners)
        and their matching weights; weights for each output cell sum to 1.
        """
        side = self.num_grid_per_side
        h_f = np.floor(h_idxs).astype(np.int64)
        h_c = np.clip(h_f + 1, 0, side - 1)
        dh = h_idxs - h_f
        w_f = np.floor(w_idxs).astype(np.int64)
        w_c = np.clip(w_f + 1, 0, side - 1)
        dw = w_idxs - w_f

        indices = [
            (h_f[:, None] * side + w_f).flatten(),
            (h_f[:, None] * side + w_c).flatten(),
            (h_c[:, None] * side + w_f).flatten(),
            (h_c[:, None] * side + w_c).flatten(),
        ]
        weights = [
            ((1 - dh)[:, None] * (1 - dw)).flatten(),
            ((1 - dh)[:, None] * dw).flatten(),
            (dh[:, None] * (1 - dw)).flatten(),
            (dh[:, None] * dw).flatten(),
        ]
        return indices, weights

    def _get_position_embedding(
        self,
        patch_pos_embeds: List[torch.Tensor],
        grid_ts: List[int],
        grid_hs: List[int],
        grid_ws: List[int],
    ) -> torch.Tensor:
        """Tile and reorganize position embeddings to align with the merged token order."""
        result_parts = []
        merge = self.spatial_merge_size
        for pos_embed, t, h, w in zip(patch_pos_embeds, grid_ts, grid_hs, grid_ws):
            pos_embed = pos_embed.repeat(t, 1)
            # Same merge-window reordering as _rot_pos_ids.
            pos_embed = (
                pos_embed.view(t, h // merge, merge, w // merge, merge, -1)
                .permute(0, 1, 3, 2, 4, 5)
                .flatten(0, 4)
            )
            result_parts.append(pos_embed)
        return torch.cat(result_parts, dim=0)

    def fast_pos_embed_interpolate(self, grid_thw: torch.Tensor) -> torch.Tensor:
        """Interpolate position embeddings via bilinear interpolation.

        Resamples the fixed square pos-embed grid to each image's (h, w)
        grid; indices/weights are built on CPU with numpy, the weighted sum
        runs on the embedding's device.
        """
        grid_thw_cpu = grid_thw.cpu().numpy()
        temporal_dims = grid_thw_cpu[:, 0].tolist()
        height_dims = grid_thw_cpu[:, 1].tolist()
        width_dims = grid_thw_cpu[:, 2].tolist()

        device = self.pos_embed.weight.device
        dtype = self.pos_embed.weight.dtype

        patches_size = [h * w for h, w in zip(height_dims, width_dims)]
        total_patches = sum(patches_size)
        all_indices_np = np.zeros((4, total_patches), dtype=np.int64)
        all_weights_np = np.zeros((4, total_patches), dtype=np.float32)

        current_idx = 0
        for _t, h, w in zip(temporal_dims, height_dims, width_dims):
            h_idxs = self._get_interpolation_indices(h)
            w_idxs = self._get_interpolation_indices(w)
            indices, weights = self._calculate_indices_and_weights(h_idxs, w_idxs)
            end_idx = current_idx + h * w
            for i in range(4):
                all_indices_np[i, current_idx:end_idx] = indices[i]
                all_weights_np[i, current_idx:end_idx] = weights[i]
            current_idx = end_idx

        idx_tensor = torch.from_numpy(all_indices_np).to(device)
        weight_tensor = torch.from_numpy(all_weights_np).to(dtype=dtype, device=device)

        pos_embeds = self.pos_embed(idx_tensor.view(-1))
        pos_embeds = pos_embeds.view(4, total_patches, -1)
        # Weighted sum of the four corner embeddings per patch.
        patch_pos_embeds = (pos_embeds * weight_tensor.unsqueeze(-1)).sum(dim=0)
        patch_pos_embeds = patch_pos_embeds.split(patches_size)
        return self._get_position_embedding(
            list(patch_pos_embeds), temporal_dims, height_dims, width_dims
        )

    # -- Forward --

    def forward(
        self,
        x: torch.Tensor,
        grid_thw: torch.Tensor,
    ) -> torch.Tensor:
        """Run the vision encoder.

        Args:
            x: Pixel values, shape ``[total_patches, patch_dim]``.
            grid_thw: Grid dimensions ``[num_images, 3]`` with ``(T, H, W)``.

        Returns:
            Vision features of shape
            ``[num_merged_tokens, out_hidden_size * (1 + num_deepstack)]``.
        """
        x = x.to(device=self.device, dtype=self.dtype)
        x = self.patch_embed(x)

        if isinstance(grid_thw, list):
            grid_thw_list = grid_thw
            grid_thw = torch.tensor(grid_thw, dtype=torch.int32)
        else:
            grid_thw_list = grid_thw.tolist()

        pos_embeds = self.fast_pos_embed_interpolate(grid_thw)
        x += pos_embeds

        rotary_pos_emb_cos, rotary_pos_emb_sin = self.rot_pos_emb(grid_thw_list)

        cu_seqlens = _compute_cu_seqlens_from_grid(grid_thw)
        cu_seqlens = cu_seqlens.to(self.device, non_blocking=True)

        deepstack_features = []
        ds_idx = 0

        for layer_num, blk in enumerate(self.blocks):
            x = blk(x, cu_seqlens, rotary_pos_emb_cos, rotary_pos_emb_sin)

            if layer_num in self.deepstack_visual_indexes:
                # x is [total_tokens, hidden]. The merger expects the last
                # dim to be context_dim so it can group spatial_merge_size^2
                # tokens; reshape to [total_tokens, 1, hidden] so that the
                # `.view(-1, hidden_size)` inside the merger collapses the
                # spatial merge correctly.
                ds_feat = self.deepstack_merger_list[ds_idx](x.unsqueeze(1))
                deepstack_features.append(ds_feat)
                ds_idx += 1

        x = self.merger(x.unsqueeze(1))

        # Concatenate main + deepstack features along the feature dimension.
        # Result: [num_merged_tokens, out_hidden_size * (1 + num_deepstack)]
        hidden_states = torch.cat([x] + deepstack_features, dim=-1)
        return hidden_states


def _compute_cu_seqlens_from_grid(grid_thw: torch.Tensor) -> torch.Tensor:
    """Compute cumulative sequence lengths from grid dimensions.

    One sequence per image/video: length = T * H * W patches. Returns an
    int32 tensor of shape ``[num_images + 1]`` starting at 0.
    """
    grid_np = grid_thw.cpu().numpy()
    seq_lens = (grid_np[:, 0] * grid_np[:, 1] * grid_np[:, 2]).astype(np.int32)
    cu_seqlens = np.concatenate([[0], np.cumsum(seq_lens)])
    return torch.tensor(cu_seqlens, dtype=torch.int32)


def _build_cos_sin_cache(
    head_dim: int,
    rope_theta: float,
    max_pos: int,
    dtype: torch.dtype,
) -> torch.Tensor:
    """Build a [max_pos, head_dim] cos/sin cache for M-RoPE.

    Layout: first ``head_dim // 2`` columns are cos values, second half are sin.
    Each row corresponds to one position index.
    """
    inv_freq = 1.0 / (
        rope_theta ** (torch.arange(0, head_dim, 2, dtype=torch.float32) / head_dim)
    )
    t = torch.arange(max_pos, dtype=torch.float32)
    freqs = torch.outer(t, inv_freq)  # [max_pos, head_dim // 2]
    return torch.cat([torch.cos(freqs), torch.sin(freqs)], dim=-1).to(dtype)


def get_rope_index(
    input_ids: torch.Tensor,
    image_grid_thw: Optional[torch.Tensor],
    image_token_id: int,
    vision_start_token_id: int,
    spatial_merge_size: int,
) -> Tuple[torch.Tensor, int]:
    """Compute M-RoPE 3-D position IDs for one sequence.

    For text tokens all three (temporal, height, width) indices are equal to
    the sequential counter. For image tokens the indices follow the spatial
    grid ``(t, h, w)``.

    NOTE(review): assumes exactly ``llm_grid_t * llm_grid_h * llm_grid_w``
    placeholder tokens follow each ``vision_start`` token — confirm against
    the processor's token expansion.

    Args:
        input_ids: Token IDs for one sequence, shape ``[T]``.
        image_grid_thw: Grid dimensions for every image in the sequence,
            shape ``[num_images, 3]``. ``None`` when there are no images.
        image_token_id: Token ID used as placeholder for image patches.
        vision_start_token_id: Token ID that precedes each image block.
        spatial_merge_size: Number of patches merged per spatial dimension
            (e.g. 2 → 2x2 merge, so llm_grid_h = H // 2).

    Returns:
        ``(position_ids, mrope_position_delta)`` where ``position_ids`` has
        shape ``[3, T]`` and ``mrope_position_delta`` is a Python ``int``
        equal to ``max_position_used + 1 - T``.
    """
    total_tokens = input_ids.shape[0]
    device = input_ids.device
    position_ids = torch.zeros(3, total_tokens, dtype=torch.long, device=device)

    # Text-only fast path: all three axes are the sequential counter.
    if image_grid_thw is None or image_grid_thw.shape[0] == 0:
        pos = torch.arange(total_tokens, dtype=torch.long, device=device)
        position_ids[0] = pos
        position_ids[1] = pos
        position_ids[2] = pos
        return position_ids, 0

    input_ids_cpu = input_ids.cpu().tolist()
    grid_thw_list = image_grid_thw.cpu().tolist()

    llm_pos_ids_start = 0
    image_idx = 0
    i = 0

    while i < total_tokens:
        token = input_ids_cpu[i]

        if token == vision_start_token_id and image_idx < len(grid_thw_list):
            # The vision_start token itself gets a regular sequential position.
            position_ids[:, i] = llm_pos_ids_start
            llm_pos_ids_start += 1
            i += 1

            # Compute LLM-side grid dimensions (after spatial merging).
            t_g = int(grid_thw_list[image_idx][0])
            h_g = int(grid_thw_list[image_idx][1])
            w_g = int(grid_thw_list[image_idx][2])
            llm_grid_t = t_g
            llm_grid_h = h_g // spatial_merge_size
            llm_grid_w = w_g // spatial_merge_size
            num_image_tokens = llm_grid_t * llm_grid_h * llm_grid_w

            # Build per-patch 3-D indices.
            t_idx = (
                torch.arange(llm_grid_t, device=device)
                .view(-1, 1, 1)
                .expand(-1, llm_grid_h, llm_grid_w)
                .flatten()
            )
            h_idx = (
                torch.arange(llm_grid_h, device=device)
                .view(1, -1, 1)
                .expand(llm_grid_t, -1, llm_grid_w)
                .flatten()
            )
            w_idx = (
                torch.arange(llm_grid_w, device=device)
                .view(1, 1, -1)
                .expand(llm_grid_t, llm_grid_h, -1)
                .flatten()
            )

            img_start = i
            img_end = i + num_image_tokens
            position_ids[0, img_start:img_end] = t_idx + llm_pos_ids_start
            position_ids[1, img_start:img_end] = h_idx + llm_pos_ids_start
            position_ids[2, img_start:img_end] = w_idx + llm_pos_ids_start

            # Advance the counter past the largest axis extent used.
            llm_pos_ids_start += max(llm_grid_t, llm_grid_h, llm_grid_w)
            i += num_image_tokens
            image_idx += 1
        else:
            # Text token (including vision_end and all non-image tokens).
            position_ids[:, i] = llm_pos_ids_start
            llm_pos_ids_start += 1
            i += 1

    mrope_position_delta = llm_pos_ids_start - total_tokens
    return position_ids, mrope_position_delta


# ---------------------------------------------------------------------------
# Text Decoder (Language Model)
# ---------------------------------------------------------------------------


class Qwen3VLAttention(nn.Module):
    """Attention layer for the Qwen3-VL text decoder.

    Uses QK-norm (per-head RMSNorm on Q and K before RoPE) and
    :class:`RadixAttention` for KV-cached inference. Applies
    interleaved M-RoPE with a precomputed cos/sin cache.
+ """ + + def __init__( + self, + hidden_size: int, + num_heads: int, + num_kv_heads: int, + head_dim: int, + layer_id: int, + rope_theta: float = 5_000_000.0, + rms_norm_eps: float = 1e-6, + mrope_section: Tuple[int, int, int] = (24, 20, 20), + mrope_interleaved: bool = True, + max_position_embeddings: int = 32768, + ): + super().__init__() + self.num_heads = num_heads + self.num_kv_heads = num_kv_heads + self.head_dim = head_dim + self.q_size = num_heads * head_dim + self.kv_size = num_kv_heads * head_dim + self.scaling = head_dim**-0.5 + self.mrope_section = list(mrope_section) + self.mrope_interleaved = mrope_interleaved + + # Fused QKV projection + self.qkv_proj = nn.Linear( + hidden_size, self.q_size + 2 * self.kv_size, bias=False + ) + + # Output projection + self.o_proj = nn.Linear(num_heads * head_dim, hidden_size, bias=False) + + # QK normalization + self.q_norm = RMSNorm(head_dim, eps=rms_norm_eps) + self.k_norm = RMSNorm(head_dim, eps=rms_norm_eps) + + # Precomputed M-RoPE cos/sin cache: [max_pos, head_dim] + cos_sin = _build_cos_sin_cache( + head_dim, rope_theta, max_position_embeddings, torch.float32 + ) + self.register_buffer("cos_sin_cache", cos_sin, persistent=False) + + # Radix attention (single-GPU: heads == tp_heads) + self.attn = RadixAttention( + num_heads=num_heads, + head_dim=head_dim, + scaling=self.scaling, + num_kv_heads=num_kv_heads, + layer_id=layer_id, + ) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + forward_batch: "ForwardBatch", + ) -> torch.Tensor: + qkv = self.qkv_proj(hidden_states) + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + + # Per-head QK normalization + q = self.q_norm(q.view(-1, self.num_heads, self.head_dim)) + k = self.k_norm(k.view(-1, self.num_kv_heads, self.head_dim)) + + # Apply M-RoPE. positions is [3, T] for prefill (3-D) or may arrive + # as [T] for purely text-only batches; expand to [3, T] in that case. 
+ if positions.ndim == 1: + positions = positions.unsqueeze(0).expand(3, -1) + q, k = apply_mrope( + q, + k, + positions, + self.cos_sin_cache.to(q.dtype), + self.mrope_section, + self.mrope_interleaved, + ) + + q = q.reshape(-1, self.q_size) + k = k.reshape(-1, self.kv_size) + + # Attention with KV cache + attn_output = self.attn(q, k, v, forward_batch) + return self.o_proj(attn_output) + + +class Qwen3VLDecoderLayer(nn.Module): + """Single decoder layer for the Qwen3-VL text model.""" + + def __init__( + self, + hidden_size: int, + num_heads: int, + num_kv_heads: int, + head_dim: int, + intermediate_size: int, + layer_id: int, + rope_theta: float = 5_000_000.0, + rms_norm_eps: float = 1e-6, + mrope_section: Tuple[int, int, int] = (24, 20, 20), + mrope_interleaved: bool = True, + max_position_embeddings: int = 32768, + ): + super().__init__() + self.self_attn = Qwen3VLAttention( + hidden_size=hidden_size, + num_heads=num_heads, + num_kv_heads=num_kv_heads, + head_dim=head_dim, + layer_id=layer_id, + rope_theta=rope_theta, + rms_norm_eps=rms_norm_eps, + mrope_section=mrope_section, + mrope_interleaved=mrope_interleaved, + max_position_embeddings=max_position_embeddings, + ) + self.mlp = MLP( + hidden_size=hidden_size, + intermediate_size=intermediate_size, + activation="silu", + use_fused_gate_up_proj=True, + use_bias_gate_up=False, + use_bias_down=False, + ) + self.input_layernorm = RMSNorm(hidden_size, eps=rms_norm_eps) + self.post_attention_layernorm = RMSNorm(hidden_size, eps=rms_norm_eps) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + forward_batch: "ForwardBatch", + deepstack_embeds: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + # Self-attention + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + hidden_states = self.self_attn(positions, hidden_states, forward_batch) + hidden_states = residual + hidden_states + + # Add deepstack embeddings after residual (matches HF ordering) + if 
deepstack_embeds is not None: + hidden_states = hidden_states + deepstack_embeds + + # MLP + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + return hidden_states + + +class Qwen3VLTextModel(nn.Module): + """Qwen3-VL text backbone (embedding + decoder layers + final norm).""" + + def __init__( + self, + vocab_size: int = 151936, + hidden_size: int = 4096, + intermediate_size: int = 22016, + num_hidden_layers: int = 32, + num_attention_heads: int = 32, + num_key_value_heads: int = 32, + head_dim: int = 128, + rope_theta: float = 5_000_000.0, + rms_norm_eps: float = 1e-6, + mrope_section: Tuple[int, int, int] = (24, 20, 20), + mrope_interleaved: bool = True, + max_position_embeddings: int = 32768, + ): + super().__init__() + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + + self.embed_tokens = nn.Embedding(vocab_size, hidden_size) + + self.layers = nn.ModuleList( + [ + Qwen3VLDecoderLayer( + hidden_size=hidden_size, + num_heads=num_attention_heads, + num_kv_heads=num_key_value_heads, + head_dim=head_dim, + intermediate_size=intermediate_size, + layer_id=layer_id, + rope_theta=rope_theta, + rms_norm_eps=rms_norm_eps, + mrope_section=mrope_section, + mrope_interleaved=mrope_interleaved, + max_position_embeddings=max_position_embeddings, + ) + for layer_id in range(num_hidden_layers) + ] + ) + + self.norm = RMSNorm(hidden_size, eps=rms_norm_eps) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + forward_batch: "ForwardBatch", + input_embeds: Optional[torch.Tensor] = None, + input_deepstack_embeds: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + if input_embeds is None: + hidden_states = self.embed_tokens(input_ids) + else: + hidden_states = input_embeds + + for layer_idx, layer in enumerate(self.layers): + ds_embeds = _get_deepstack_embeds( + layer_idx, input_deepstack_embeds, 
self.hidden_size + ) + hidden_states = layer( + positions, + hidden_states, + forward_batch, + deepstack_embeds=ds_embeds, + ) + + return self.norm(hidden_states) + + +def _get_deepstack_embeds( + layer_idx: int, + input_deepstack_embeds: Optional[torch.Tensor], + hidden_size: int, +) -> Optional[torch.Tensor]: + """Extract deepstack embeddings for a specific decoder layer.""" + if input_deepstack_embeds is None: + return None + num_deepstack = input_deepstack_embeds.shape[-1] // hidden_size + if layer_idx >= num_deepstack: + return None + start = hidden_size * layer_idx + return input_deepstack_embeds[:, start : start + hidden_size] + + +# --------------------------------------------------------------------------- +# Full Model: Qwen3VLForConditionalGeneration +# --------------------------------------------------------------------------- + + +class Qwen3VLForConditionalGeneration(nn.Module): + """Qwen3-VL multimodal model for conditional generation. + + Combines a vision encoder and text decoder. During prefill, image/video + tokens are replaced with visual features from the vision encoder. + During decode, the model runs only the text decoder. 
+ + Forward interface:: + + logits = model.forward(input_ids, positions, forward_batch) + """ + + def __init__(self, config) -> None: + super().__init__() + self.config = config + + text_config = getattr(config, "text_config", config) + vision_config = getattr(config, "vision_config", None) + + # Vision encoder + if vision_config is not None: + self.visual = Qwen3VLVisionModel( + depth=getattr(vision_config, "depth", 27), + hidden_size=getattr(vision_config, "hidden_size", 1152), + hidden_act=getattr(vision_config, "hidden_act", "gelu_pytorch_tanh"), + intermediate_size=getattr(vision_config, "intermediate_size", 4304), + num_heads=getattr(vision_config, "num_heads", 16), + in_channels=getattr(vision_config, "in_channels", 3), + patch_size=getattr(vision_config, "patch_size", 16), + spatial_merge_size=getattr(vision_config, "spatial_merge_size", 2), + temporal_patch_size=getattr(vision_config, "temporal_patch_size", 2), + out_hidden_size=getattr(vision_config, "out_hidden_size", 3584), + num_position_embeddings=getattr( + vision_config, "num_position_embeddings", 2304 + ), + deepstack_visual_indexes=getattr( + vision_config, "deepstack_visual_indexes", [8, 16, 24] + ), + norm_eps=getattr(text_config, "rms_norm_eps", 1e-6), + ) + else: + self.visual = None + + # Text decoder + hidden_size = getattr(text_config, "hidden_size", 4096) + vocab_size = getattr(text_config, "vocab_size", 151936) + + # M-RoPE configuration -- mrope_section lives inside rope_scaling, + # NOT as a top-level attribute of text_config. 
+ rope_scaling = getattr(text_config, "rope_scaling", None) or {} + if isinstance(rope_scaling, dict): + mrope_section = rope_scaling.get("mrope_section", [24, 20, 20]) + mrope_interleaved = rope_scaling.get("mrope_interleaved", True) + else: + mrope_section = getattr(rope_scaling, "mrope_section", [24, 20, 20]) + mrope_interleaved = getattr(rope_scaling, "mrope_interleaved", True) + max_position_embeddings = getattr(text_config, "max_position_embeddings", 32768) + + self.model = Qwen3VLTextModel( + vocab_size=vocab_size, + hidden_size=hidden_size, + intermediate_size=getattr(text_config, "intermediate_size", 22016), + num_hidden_layers=getattr(text_config, "num_hidden_layers", 32), + num_attention_heads=getattr(text_config, "num_attention_heads", 32), + num_key_value_heads=getattr(text_config, "num_key_value_heads", 32), + head_dim=getattr(text_config, "head_dim", 128), + rope_theta=getattr(text_config, "rope_theta", 5_000_000.0), + rms_norm_eps=getattr(text_config, "rms_norm_eps", 1e-6), + mrope_section=tuple(mrope_section), + mrope_interleaved=bool(mrope_interleaved), + max_position_embeddings=max_position_embeddings, + ) + + # LM head — following sglang's pattern: always use lm_head.weight + # for matmul in forward(), so it works whether lm_head is nn.Embedding + # (tied) or nn.Linear (untied). 
+ tie_word_embeddings = getattr(config, "tie_word_embeddings", False) + if tie_word_embeddings: + self.lm_head = self.model.embed_tokens + else: + self.lm_head = nn.Linear(hidden_size, vocab_size, bias=False) + + # Token IDs for multimodal + self.image_token_id = getattr(config, "image_token_id", 151655) + self.video_token_id = getattr(config, "video_token_id", 151656) + self.vision_start_token_id = getattr(config, "vision_start_token_id", 151652) + + # Spatial merge size (needed for get_rope_index) + self.spatial_merge_size = ( + getattr(vision_config, "spatial_merge_size", 2) + if vision_config is not None + else 2 + ) + + # Deepstack config + if vision_config is not None: + ds_indexes = getattr(vision_config, "deepstack_visual_indexes", [8, 16, 24]) + self.num_deepstack_embeddings = len(ds_indexes) + else: + self.num_deepstack_embeddings = 0 + + self._hidden_size = hidden_size + + def get_input_embeddings(self) -> nn.Module: + return self.model.embed_tokens + + @torch.no_grad() + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + forward_batch: "ForwardBatch", + ) -> torch.Tensor: + """Run forward pass for Qwen3-VL. + + Args: + input_ids: Flattened input token IDs, shape ``[num_tokens]``. + positions: Position IDs, shape ``[num_tokens]`` (1-D, from model + runner). Overridden internally with 3-D M-RoPE positions. + forward_batch: :class:`ForwardBatch` with attention metadata. + + Returns: + Logits tensor of shape ``[num_tokens, vocab_size]``. + """ + pixel_values = getattr(forward_batch, "pixel_values", None) + image_grid_thw = getattr(forward_batch, "image_grid_thw", None) + + # ------------------------------------------------------------------ + # Build 3-D M-RoPE positions + # ------------------------------------------------------------------ + if forward_batch.forward_mode.is_extend(): + # Prefill: compute per-sequence 3-D position IDs from input_ids + # and image grids, then store per-request deltas for future decode. 
+ mrope_positions_list: List[torch.Tensor] = [] + deltas: List[int] = [] + image_idx_offset = 0 + + for i in range(forward_batch.batch_size): + start = int(forward_batch.extend_start_loc[i].item()) + length = int(forward_batch.extend_seq_lens[i].item()) + seq_ids = input_ids[start : start + length] + + # Determine how many images belong to this sequence. + num_img = int((seq_ids == self.vision_start_token_id).sum().item()) + if image_grid_thw is not None and num_img > 0: + thw_seq = image_grid_thw[ + image_idx_offset : image_idx_offset + num_img + ] + image_idx_offset += num_img + else: + thw_seq = None + + pos3d, delta = get_rope_index( + seq_ids, + thw_seq, + self.image_token_id, + self.vision_start_token_id, + self.spatial_merge_size, + ) + mrope_positions_list.append(pos3d) + deltas.append(delta) + + # Concatenate across sequences: [3, total_extend_tokens] + positions = torch.cat(mrope_positions_list, dim=1) + forward_batch.mrope_position_deltas = torch.tensor( + deltas, dtype=torch.int64, device=input_ids.device + ) + else: + # Decode: each sequence emits exactly one token. Apply the stored + # per-request delta so the position matches the image extent. 
+ stored_deltas = getattr(forward_batch, "mrope_position_deltas", None) + if stored_deltas is not None: + pos_1d = forward_batch.positions + stored_deltas + else: + pos_1d = forward_batch.positions + positions = pos_1d.unsqueeze(0).expand(3, -1) # [3, batch_size] + + input_embeds = None + input_deepstack_embeds = None + + if ( + pixel_values is not None + and image_grid_thw is not None + and self.visual is not None + and not forward_batch.forward_mode.is_decode() + ): + # Run vision encoder + vision_features = self.visual(pixel_values, grid_thw=image_grid_thw) + + # Separate main embeddings and deepstack embeddings + if self.num_deepstack_embeddings > 0: + vision_embeds = vision_features[:, : self._hidden_size] + deepstack_embeds = vision_features[:, self._hidden_size :] + else: + vision_embeds = vision_features + deepstack_embeds = None + + # Get text embeddings and replace image tokens with vision features + input_embeds = self.model.embed_tokens(input_ids) + image_mask = input_ids == self.image_token_id + if image_mask.any(): + input_embeds[image_mask] = vision_embeds.to(input_embeds.dtype) + + # Build per-token deepstack embeddings + if deepstack_embeds is not None and image_mask.any(): + input_deepstack_embeds = torch.zeros( + input_embeds.shape[0], + deepstack_embeds.shape[-1], + dtype=input_embeds.dtype, + device=input_embeds.device, + ) + input_deepstack_embeds[image_mask] = deepstack_embeds.to( + input_embeds.dtype + ) + + # Text decoder + hidden_states = self.model( + input_ids, + positions, + forward_batch, + input_embeds=input_embeds, + input_deepstack_embeds=input_deepstack_embeds, + ) + + # Prune hidden_states before lm_head to avoid a wasteful + # [total_tokens, vocab] matmul during prefill. Following sglang's + # LogitsProcessor._get_pruned_states(): in extend mode only keep + # the last token of each sequence; in decode mode all rows are + # already one-per-sequence. 
+ if forward_batch.forward_mode.is_extend(): + if ( + forward_batch.extend_start_loc is not None + and forward_batch.extend_seq_lens is not None + ): + last_index = ( + forward_batch.extend_start_loc + forward_batch.extend_seq_lens - 1 + ).long() + hidden_states = hidden_states[last_index] + else: + hidden_states = hidden_states[-1:] + + # LM head: always use weight matrix directly for the linear + # projection. Works for both nn.Embedding (tied) and nn.Linear + # (untied). Matches sglang LogitsProcessor._compute_lm_head(). + logits = torch.matmul( + hidden_states.to(self.lm_head.weight.dtype), + self.lm_head.weight.T, + ) + + # Return LogitsProcessorOutput so that ModelRunner._process_logits + # skips redundant last-token gathering. + from pymllm.executor.model_runner import LogitsProcessorOutput + + return LogitsProcessorOutput(next_token_logits=logits) + + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> None: + """Load weights from a HuggingFace checkpoint. + + Handles weight name remapping between HuggingFace Qwen3-VL + checkpoints and this model's parameter names. + """ + stacked_params_mapping = [ + # (param_name, weight_name, shard_id) + (".qkv_proj", ".q_proj", "q"), + (".qkv_proj", ".k_proj", "k"), + (".qkv_proj", ".v_proj", "v"), + (".gate_up_proj", ".up_proj", 1), + (".gate_up_proj", ".gate_proj", 0), + ] + + params_dict = dict(self.named_parameters()) + + tie_word_embeddings = getattr(self.config, "tie_word_embeddings", False) + + for name, loaded_weight in weights: + if "rotary_emb.inv_freq" in name: + continue + + # When weights are tied, lm_head.weight is the same tensor as + # embed_tokens.weight — skip the duplicate from the checkpoint. 
+ if tie_word_embeddings and "lm_head.weight" in name: + continue + + name = _remap_weight_name(name) + + # Handle language model stacked parameters (QKV, gate_up) + handled = False + for param_name, weight_name, shard_id in stacked_params_mapping: + if weight_name not in name or "visual" in name: + continue + name = name.replace(weight_name, param_name) + if name not in params_dict: + continue + _load_stacked_weight(params_dict[name], loaded_weight, shard_id) + handled = True + break + + if handled: + continue + + # Handle vision encoder QKV stacking + if "visual" in name: + for qkv_key in (".attn.q.", ".attn.k.", ".attn.v."): + if qkv_key not in name: + continue + qkv_name = name.replace(qkv_key, ".attn.qkv_proj.") + if qkv_name in params_dict: + shard = {"q": 0, "k": 1, "v": 2}[qkv_key[-2]] + _load_vision_qkv_weight( + params_dict[qkv_name], loaded_weight, shard + ) + handled = True + break + + if handled: + continue + + # Direct parameter loading + if name in params_dict: + param = params_dict[name] + if param.data.shape == loaded_weight.shape: + param.data.copy_(loaded_weight) + else: + logger.warning( + "Shape mismatch: param %s (%s) vs loaded (%s), skipping.", + name, + param.data.shape, + loaded_weight.shape, + ) + + +# --------------------------------------------------------------------------- +# Weight loading helpers +# --------------------------------------------------------------------------- + + +def _remap_weight_name(name: str) -> str: + """Remap HuggingFace weight names to pymllm parameter names.""" + # transformers >= v4.52: model.language_model.* -> model.* + if name.startswith("model.language_model."): + name = name.replace("model.language_model.", "model.", 1) + # model.visual.* -> visual.* + elif name.startswith("model.visual."): + name = name.replace("model.visual.", "visual.", 1) + + # Vision attention QKV renaming (fused weights in checkpoint) + if "visual" in name: + name = name.replace("attn.qkv.", "attn.qkv_proj.") + + return name + + 
+def _load_stacked_weight( + param: nn.Parameter, + loaded_weight: torch.Tensor, + shard_id, +) -> None: + """Load one shard (q/k/v or gate/up) into a fused parameter. + + For QKV with GQA (grouped-query attention), Q has a different size + from K and V. The fused layout is ``[Q, K, V]`` where + ``Q_size = total - 2 * KV_size``. We must use cumulative offsets + rather than ``idx * shard_size`` to handle the asymmetry correctly. + """ + if isinstance(shard_id, str): + # QKV fused layout: [Q, K, V] + # Q may have a different size from K/V (GQA). + total_size = param.data.shape[0] + shard_size = loaded_weight.shape[0] + if shard_id == "q": + param.data[0:shard_size].copy_(loaded_weight) + elif shard_id == "k": + kv_size = shard_size + q_size = total_size - 2 * kv_size + param.data[q_size : q_size + kv_size].copy_(loaded_weight) + elif shard_id == "v": + kv_size = shard_size + q_size = total_size - 2 * kv_size + param.data[q_size + kv_size : q_size + 2 * kv_size].copy_( + loaded_weight + ) + else: + # gate_up: 0 -> gate, 1 -> up (same size, idx*size is correct) + shard_size = loaded_weight.shape[0] + param.data[shard_id * shard_size : (shard_id + 1) * shard_size].copy_( + loaded_weight + ) + + +def _load_vision_qkv_weight( + param: nn.Parameter, + loaded_weight: torch.Tensor, + shard_idx: int, +) -> None: + """Load a Q, K, or V weight shard into a fused QKV parameter.""" + shard_size = param.data.shape[0] // 3 + start = shard_idx * shard_size + param.data[start : start + shard_size].copy_(loaded_weight) diff --git a/pymllm/orchestrator/async_disk_io_process.py b/pymllm/orchestrator/async_disk_io_process.py deleted file mode 100644 index ef3fd5f0..00000000 --- a/pymllm/orchestrator/async_disk_io_process.py +++ /dev/null @@ -1,84 +0,0 @@ -""" -AsyncDiskIoProcess -- optional subprocess for asynchronous disk I/O. - -Handles weight loading, checkpoint saving, or other heavy disk operations -without blocking the scheduler or model runner. 
-""" - -import logging -from multiprocessing.connection import Connection -from typing import Any, Dict, Optional - -import zmq - -from pymllm.orchestrator.ipc_utils import create_zmq_socket - -logger = logging.getLogger(__name__) - - -class AsyncDiskIoProcess: - """Runs inside a subprocess. Performs disk I/O on behalf of the scheduler.""" - - def __init__(self, recv_addr: str): - self._recv_addr = recv_addr - - self._zmq_ctx: Optional[zmq.Context] = None - self._recv_sock: Optional[zmq.Socket] = None - - # ------------------------------------------------------------------ - # Lifecycle - # ------------------------------------------------------------------ - - def init_sockets(self) -> None: - self._zmq_ctx = zmq.Context() - self._recv_sock = create_zmq_socket( - self._zmq_ctx, zmq.PULL, self._recv_addr, bind=True, - ) - - def event_loop(self) -> None: - """Infinite loop: recv I/O request -> execute -> (optionally reply).""" - logger.info("AsyncDiskIoProcess event loop started") - while True: - io_request: Dict[str, Any] = self._recv_sock.recv_pyobj() - self._handle(io_request) - - # ------------------------------------------------------------------ - # I/O handling (placeholder) - # ------------------------------------------------------------------ - - def _handle(self, io_request: Dict[str, Any]) -> None: - """Dispatch an I/O request. - - TODO: implement weight loading, checkpoint save, etc. 
- """ - kind = io_request.get("kind") - logger.debug("AsyncDiskIoProcess received request kind=%s", kind) - - # ------------------------------------------------------------------ - # Cleanup - # ------------------------------------------------------------------ - - def shutdown(self) -> None: - if self._recv_sock is not None: - self._recv_sock.close() - if self._zmq_ctx is not None: - self._zmq_ctx.term() - - -def run_async_disk_io_process( - recv_addr: str, - pipe_writer: Connection, -) -> None: - """Entry point for ``torch.multiprocessing.Process(target=...)``.""" - proc = AsyncDiskIoProcess(recv_addr) - proc.init_sockets() - - pipe_writer.send({"status": "ready", "process": "async_disk_io"}) - pipe_writer.close() - - try: - proc.event_loop() - except KeyboardInterrupt: - pass - finally: - proc.shutdown() diff --git a/pymllm/orchestrator/detokenizer_process.py b/pymllm/orchestrator/detokenizer_process.py index e9d5184b..c2154e44 100644 --- a/pymllm/orchestrator/detokenizer_process.py +++ b/pymllm/orchestrator/detokenizer_process.py @@ -12,7 +12,7 @@ import zmq -from pymllm.orchestrator.ipc_utils import create_zmq_socket +from pymllm.orchestrator.ipc_utils import create_zmq_socket, setup_subprocess_logging logger = logging.getLogger(__name__) @@ -24,16 +24,19 @@ def __init__( self, recv_from_scheduler_addr: str, send_to_rr_addr: str, + tokenizer_cfg: Optional[Dict[str, Any]] = None, ): self._recv_from_scheduler_addr = recv_from_scheduler_addr self._send_to_rr_addr = send_to_rr_addr + self._tokenizer_cfg = tokenizer_cfg or {} self._zmq_ctx: Optional[zmq.Context] = None self._recv_from_scheduler: Optional[zmq.Socket] = None self._send_to_rr: Optional[zmq.Socket] = None - # TODO: initialise the tokenizer (needed for decode) self._tokenizer = None + # Track previous decoded text per rid for incremental (delta) output + self._rid_to_prev_text: Dict[str, str] = {} # ------------------------------------------------------------------ # Lifecycle @@ -54,32 +57,102 @@ def 
init_sockets(self) -> None: bind=False, ) + def init_tokenizer(self) -> None: + """Load the tokenizer from the configured path.""" + tokenizer_path = self._tokenizer_cfg.get("tokenizer_path") + if tokenizer_path is None: + logger.warning( + "No tokenizer_path in tokenizer_cfg; detokenization disabled" + ) + return + + from transformers import AutoTokenizer + + trust_remote_code = self._tokenizer_cfg.get("trust_remote_code", False) + self._tokenizer = AutoTokenizer.from_pretrained( + tokenizer_path, + trust_remote_code=trust_remote_code, + ) + logger.info("Detokenizer loaded tokenizer from %s", tokenizer_path) + def event_loop(self) -> None: """Infinite loop: recv token IDs -> detokenize -> send text to RR.""" logger.info("DetokenizerProcess event loop started") while True: token_id_out = self._recv_from_scheduler.recv_pyobj() - str_out = self._detokenize(token_id_out) - self._send_to_rr.send_pyobj(str_out) + results = self._detokenize(token_id_out) + for result in results: + self._send_to_rr.send_pyobj(result) # ------------------------------------------------------------------ - # Detokenization (placeholder) + # Detokenization # ------------------------------------------------------------------ - def _detokenize(self, token_id_out: Dict[str, Any]) -> Dict[str, Any]: - """Convert token IDs to text. + def _detokenize(self, token_id_out: Dict[str, Any]) -> List[Dict[str, Any]]: + """Convert token IDs to text and fan out one result per rid. - TODO: replace with real tokenizer.decode() call and incremental - detokenization logic. + The scheduler sends a batch dict with parallel lists keyed by + ``"rids"``, ``"output_ids"``, ``"finished_reasons"``, etc. + This method decodes each rid's output_ids and produces one result + dict per rid with keys ``"rid"`` (singular) and ``"finished"`` + (bool) as expected by ``RequestResponseProcess._recv_loop``. 
""" - output_ids: List[int] = token_id_out.get("output_token_ids", []) - # placeholder: join ids as string - text = "" # TODO: self._tokenizer.decode(output_ids) - return { - "rid": token_id_out.get("rid"), - "text": text, - "output_token_ids": output_ids, - } + rids: List[str] = token_id_out.get("rids", []) + output_ids: List[int] = token_id_out.get("output_ids", []) + finished_reasons: List[Optional[str]] = token_id_out.get("finished_reasons", []) + decode_ids: List[int] = token_id_out.get("decode_ids", []) + skip_special_tokens_list: List[bool] = token_id_out.get( + "skip_special_tokens", [] + ) + prompt_tokens_list: List[int] = token_id_out.get("prompt_tokens", []) + completion_tokens_list: List[int] = token_id_out.get("completion_tokens", []) + + results: List[Dict[str, Any]] = [] + + for i, rid in enumerate(rids): + finished_reason = finished_reasons[i] if i < len(finished_reasons) else None + is_finished = finished_reason is not None + skip_special = ( + skip_special_tokens_list[i] + if i < len(skip_special_tokens_list) + else True + ) + prompt_tokens = prompt_tokens_list[i] if i < len(prompt_tokens_list) else 0 + completion_tokens = ( + completion_tokens_list[i] if i < len(completion_tokens_list) else 0 + ) + + # Decode text from output_ids + if self._tokenizer is not None: + text = self._tokenizer.decode( + output_ids, + skip_special_tokens=skip_special, + ) + else: + text = "" + + # Compute incremental delta by diffing against previous text + prev_text = self._rid_to_prev_text.get(rid, "") + delta_text = text[len(prev_text):] + self._rid_to_prev_text[rid] = text + + # Clean up tracking when request finishes + if is_finished: + self._rid_to_prev_text.pop(rid, None) + + result: Dict[str, Any] = { + "rid": rid, + "text": text, + "delta": delta_text, + "output_token_ids": list(output_ids), + "finished": is_finished, + "finished_reason": finished_reason, + "prompt_tokens": prompt_tokens, + "completion_tokens": completion_tokens, + } + results.append(result) + 
+ return results # ------------------------------------------------------------------ # Cleanup @@ -98,10 +171,17 @@ def run_detokenizer_process( recv_from_scheduler_addr: str, send_to_rr_addr: str, pipe_writer: Connection, + tokenizer_cfg: Optional[Dict[str, Any]] = None, ) -> None: """Entry point for ``torch.multiprocessing.Process(target=...)``.""" - proc = DetokenizerProcess(recv_from_scheduler_addr, send_to_rr_addr) + setup_subprocess_logging((tokenizer_cfg or {}).get("log_level", "info")) + proc = DetokenizerProcess( + recv_from_scheduler_addr, + send_to_rr_addr, + tokenizer_cfg=tokenizer_cfg, + ) proc.init_sockets() + proc.init_tokenizer() pipe_writer.send({"status": "ready", "process": "detokenizer"}) pipe_writer.close() diff --git a/pymllm/orchestrator/ipc_utils.py b/pymllm/orchestrator/ipc_utils.py index faaf7a6d..b464a397 100644 --- a/pymllm/orchestrator/ipc_utils.py +++ b/pymllm/orchestrator/ipc_utils.py @@ -4,6 +4,7 @@ ZMQ sockets so that every process uses the same conventions. """ +import logging import os import tempfile from typing import Optional @@ -68,3 +69,24 @@ def close_zmq_socket(sock: zmq.Socket) -> None: sock.close() except zmq.ZMQError: pass + + +def setup_subprocess_logging(log_level: str = "info") -> None: + """Configure logging for a spawned subprocess. + + When Python spawns a subprocess (``mp.set_start_method('spawn')``), the + child starts with a blank logging configuration. Call this function at the + very beginning of every subprocess entry point so that log records are + emitted at the correct level. + + Parameters + ---------- + log_level + Case-insensitive level name, e.g. ``"debug"``, ``"info"``, ``"warning"``. 
+ """ + level = getattr(logging, log_level.upper(), logging.INFO) + logging.basicConfig( + level=level, + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", + ) + logging.getLogger("pymllm").setLevel(level) diff --git a/pymllm/orchestrator/model_runner_process.py b/pymllm/orchestrator/model_runner_process.py index b60966dd..d850dd53 100644 --- a/pymllm/orchestrator/model_runner_process.py +++ b/pymllm/orchestrator/model_runner_process.py @@ -1,143 +1,968 @@ """ -ModelRunnerProcess -- subprocess that executes model forward passes. +ModelRunnerProcess -- GPU-owning component that executes model forward passes. -Receives batches from the SchedulerProcess, runs the model forward + sampling, -and returns the results (logits, next_token_ids) back to the scheduler. +Instantiated **in-process** by :class:`SchedulerProcess` (sglang-style +architecture). The scheduler calls :meth:`_forward_batch` directly — +no inter-process communication is involved. + +This component owns the GPU: it holds a :class:`ModelRunner` with model +weights, KV-cache memory pools, and the attention backend. It also owns +the :class:`RadixCache` for prefix-aware KV reuse. + +RadixCache lifecycle +-------------------- +1. **match_prefix** — called during ``_allocate_extend`` before KV allocation. +2. **inc_lock_ref** — locks matched radix-tree nodes to prevent eviction. +3. **insert (prefill)** — inserts prompt KV indices after prefill. +4. **insert (completion)** — re-inserts the full sequence when a request finishes. +5. **dec_lock_ref** — unlocks radix-tree nodes when a request is freed. +6. **evict** — called when KV allocation fails to free stale cache entries. 
""" import logging -from multiprocessing.connection import Connection -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Optional, Tuple -import zmq +import torch -from pymllm.orchestrator.ipc_utils import create_zmq_socket +from pymllm.mem_cache.radix_cache import RadixCache, RadixKey, TreeNode logger = logging.getLogger(__name__) +# Fraction of KV pool to try evicting when allocation fails. +_EVICT_FRACTION = 0.10 +# Maximum number of eviction retries before giving up. +_MAX_EVICT_RETRIES = 3 + class ModelRunnerProcess: - """Runs inside a subprocess. Owns the model and performs forward passes.""" + """GPU-owning component created in-process by SchedulerProcess.""" def __init__( self, - recv_from_scheduler_addr: str, - send_to_scheduler_addr: str, + gpu_id: int = 0, + server_config: Optional[Any] = None, + model_config: Optional[Any] = None, ): - self._recv_from_scheduler_addr = recv_from_scheduler_addr - self._send_to_scheduler_addr = send_to_scheduler_addr + self._gpu_id = gpu_id + self._server_config = server_config + self._model_config = model_config + + # The ModelRunner instance (created in init_model) + self._runner = None + self._is_hybrid: bool = False + + # RadixCache instance (created in init_model, after memory pools) + self._radix_cache: Optional[RadixCache] = None - self._zmq_ctx: Optional[zmq.Context] = None - self._recv_from_scheduler: Optional[zmq.Socket] = None - self._send_to_scheduler: Optional[zmq.Socket] = None + # GPU resource tracking: maps rid -> req_pool_idx (slot in ReqToTokenPool) + self._rid_to_req_pool_idx: Dict[str, int] = {} + # Maps rid -> kv_indices tensor (all KV-cache token indices for this request) + self._rid_to_kv_indices: Dict[str, torch.Tensor] = {} + # Maps rid -> input_ids used for prefill (needed for radix cache insert) + self._rid_to_input_ids: Dict[str, List[int]] = {} + # Maps rid -> list of generated (decode) token ids, appended each step. 
+ # Used to build the full sequence for radix cache insert at completion. + self._rid_to_output_ids: Dict[str, List[int]] = {} + # Maps rid -> cache_protected_len: the length of the prefix that has + # already been inserted into the radix cache. When insert() returns + # prefix_len > cache_protected_len, the KV indices in the overlap + # range [cache_protected_len, prefix_len) are duplicates that must + # be freed from the allocator (the tree already holds cloned copies). + self._rid_to_cache_protected_len: Dict[str, int] = {} + # Maps rid -> (last_node, swa_boundary_id) for radix cache lock tracking + self._rid_to_radix_lock: Dict[str, Tuple[TreeNode, Optional[int]]] = {} + # Maps rid -> mrope_position_delta (M-RoPE positional offset per request) + # Populated during prefill; used to offset decode-step positions for + # multimodal models (Qwen3-VL) that consume more position indices than + # tokens due to 3-D image grid positions. + self._rid_to_mrope_delta: Dict[str, int] = {} - # TODO: initialise model, attention backend, memory pool, etc. - self._model = None + # GDN prefix cache state tracking (hybrid models only): + # Maps rid -> GDN track slot index in GDNPool (for snapshotting state) + self._rid_to_gdn_track_slot: Dict[str, int] = {} + # Maps radix tree node id -> GDN track slot index + self._node_id_to_gdn_track_slot: Dict[int, int] = {} # ------------------------------------------------------------------ # Lifecycle # ------------------------------------------------------------------ - def init_sockets(self) -> None: - self._zmq_ctx = zmq.Context() - self._recv_from_scheduler = create_zmq_socket( - self._zmq_ctx, - zmq.PULL, - self._recv_from_scheduler_addr, - bind=False, + def init_model(self) -> None: + """Create and initialise the ModelRunner and RadixCache. + + Must run inside the subprocess (after spawn) since it does CUDA init. 
+ """ + from pymllm.executor.model_runner import ModelRunner + + logger.info( + "ModelRunnerProcess: initialising ModelRunner on GPU %d", + self._gpu_id, ) - self._send_to_scheduler = create_zmq_socket( - self._zmq_ctx, - zmq.PUSH, - self._send_to_scheduler_addr, - bind=False, + self._runner = ModelRunner( + server_config=self._server_config, + model_config=self._model_config, + gpu_id=self._gpu_id, ) + self._runner.initialize() - def event_loop(self) -> None: - """Infinite loop: recv batch -> forward -> sample -> send result.""" - logger.info("ModelRunnerProcess event loop started") - while True: - batch = self._recv_from_scheduler.recv_pyobj() - result = self._forward_batch(batch) - self._send_to_scheduler.send_pyobj(result) + # Initialise RadixCache after memory pools are ready. + disable_cache = getattr(self._server_config, "disable_radix_cache", False) + self._is_hybrid = self._runner.num_gdn_layers > 0 + if self._is_hybrid and not disable_cache: + logger.info( + "ModelRunnerProcess: prefix caching ENABLED with GDN state " + "tracking (%d GDN layers)", + self._runner.num_gdn_layers, + ) + sliding_window = self._runner.sliding_window_size + page_size = getattr(self._server_config, "radix_cache_page_size", 1) + # For hybrid models, register an eviction callback so that evicted + # radix nodes free their associated GDN track slots. 
+ evict_cb = self._on_radix_node_evict if self._is_hybrid else None + self._radix_cache = RadixCache( + page_size=page_size, + sliding_window_size=sliding_window, + disable=disable_cache, + token_to_kv_pool_allocator=self._runner.token_to_kv_pool_allocator, + on_node_evict=evict_cb, + ) + logger.info( + "ModelRunnerProcess: RadixCache initialized " + "(disable=%s, sliding_window=%s)", + disable_cache, + sliding_window, + ) + logger.info("ModelRunnerProcess: ModelRunner ready") # ------------------------------------------------------------------ - # Forward pass (placeholder) + # Forward pass # ------------------------------------------------------------------ def _forward_batch(self, batch: Dict[str, Any]) -> Dict[str, Any]: """Run the model forward pass and sampling for *batch*. - *batch* is a dict produced by ``SchedulerProcess.get_next_batch_to_run`` - whose ``"requests"`` list contains - :class:`~pymllm.engine.io_struct.TokenizedGenerateReqInput` objects. - - Returns a dict ``{"batch_id": ..., "finished": [...], "unfinished": [...]}`` - where each element of *finished* / *unfinished* is a plain output dict - containing at least ``"rid"`` and ``"output_token_ids"``. + *batch* is a dict produced by ``ScheduleBatch.to_batch_dict()`` + containing ``"forward_mode"``, ``"input_ids"``, ``"seq_lens"``, + ``"req_pool_indices"``, ``"requests"`` (metadata list), etc. - TODO: implement real forward pass, logits processing, and sampling. + Implements 6 phases: + 1. Cleanup: free GPU resources for rids no longer in the batch + 2. Prefix matching + KV allocation + 3. Build GPU tensors + 4. Forward + sample + 5. Radix cache insert (extend only) + 6. 
Build result dict """ - requests = batch.get("requests", []) - finished: List[Dict[str, Any]] = [] - unfinished: List[Dict[str, Any]] = [] + runner = self._runner + forward_mode = batch.get("forward_mode", "decode") + batch_size = batch.get("batch_size", 0) + requests_meta: List[Dict[str, Any]] = batch.get("requests", []) + + if batch_size == 0: + return {"batch_id": batch.get("batch_id"), "outputs": []} + + device = runner.device + + # Collect current batch rids + current_rids: Set[str] = {m["rid"] for m in requests_meta} + + # ============================================================== + # Phase 2: Prefix matching + KV allocation + # ============================================================== + # For extend batches, match_prefix is done inside _allocate_extend + # which may update extend_prefix_lens and extend_seq_lens. + if forward_mode == "extend": + out_cache_loc, actual_prefix_lens, actual_extend_lens = ( + self._allocate_extend(batch, requests_meta) + ) + else: + out_cache_loc = self._allocate_decode(batch, requests_meta) + actual_prefix_lens = None + actual_extend_lens = None + + # ============================================================== + # Phase 3: Build GPU tensors + # ============================================================== + if forward_mode == "extend" and actual_prefix_lens is not None: + # Rebuild input_ids and seq_lens using actual prefix matches. + # The scheduler sent tokens assuming prefix_len=0; we need to + # trim the input_ids to skip the prefix-matched tokens. 
+ ( + input_ids_tensor, + seq_lens_tensor, + extend_seq_lens_t, + extend_prefix_lens_t, + ) = self._rebuild_extend_tensors( + batch, requests_meta, actual_prefix_lens, actual_extend_lens, device + ) + else: + input_ids_list: List[int] = batch["input_ids"] + seq_lens_list: List[int] = batch["seq_lens"] + input_ids_tensor = torch.tensor( + input_ids_list, dtype=torch.int32, device=device + ) + seq_lens_tensor = torch.tensor( + seq_lens_list, dtype=torch.int32, device=device + ) + extend_seq_lens_t = None + extend_prefix_lens_t = None + + # Build req_pool_indices from our own tracking (NOT from scheduler) + req_pool_indices = torch.tensor( + [self._rid_to_req_pool_idx[m["rid"]] for m in requests_meta], + dtype=torch.int64, + device=device, + ) + + out_cache_loc = out_cache_loc.to(torch.int64) - for req in requests: - # Support both TokenizedGenerateReqInput dataclass (normal path) and - # legacy plain dicts (defensive). - rid: str = req.rid if hasattr(req, "rid") else req.get("rid") - input_ids: List[int] = ( - req.input_ids if hasattr(req, "input_ids") else req.get("input_ids", []) + # ============================================================== + # Phase 4: Forward + sample + # ============================================================== + # Extract per-request sampling params + temperatures = [] + top_ps = [] + top_ks = [] + for m in requests_meta: + sp = m.get("sampling_params") or {} + temperatures.append(sp.get("temperature", 1.0)) + top_ps.append(sp.get("top_p", 1.0)) + top_ks.append(sp.get("top_k", -1)) + + temps_tensor = torch.tensor(temperatures, dtype=torch.float32, device=device) + top_ps_tensor = torch.tensor(top_ps, dtype=torch.float32, device=device) + top_ks_tensor = torch.tensor(top_ks, dtype=torch.int32, device=device) + + if forward_mode == "extend": + if extend_seq_lens_t is None: + extend_seq_lens_list: List[int] = batch["extend_seq_lens"] + extend_prefix_lens_list: List[int] = batch["extend_prefix_lens"] + extend_seq_lens_t = torch.tensor( + 
extend_seq_lens_list, dtype=torch.int32, device=device + ) + extend_prefix_lens_t = torch.tensor( + extend_prefix_lens_list, dtype=torch.int32, device=device + ) + + fb = runner.prepare_forward_batch_extend( + input_ids=input_ids_tensor, + req_pool_indices=req_pool_indices, + seq_lens=seq_lens_tensor, + extend_seq_lens=extend_seq_lens_t, + extend_prefix_lens=extend_prefix_lens_t, + out_cache_loc=out_cache_loc, ) - mm_inputs: Optional[Dict[str, Any]] = ( - req.mm_inputs if hasattr(req, "mm_inputs") else req.get("mm_inputs") + + # Attach multimodal vision inputs to ForwardBatch so the + # model's vision encoder can process images during prefill. + # The tokenizer wraps processor output under "image_inputs"; + # fall back to top-level keys for direct dicts. + pixel_values_list = [] + image_grid_thw_list = [] + for m in requests_meta: + mm = m.get("mm_inputs") + if mm is None: + continue + # AutoProcessor output is nested under "image_inputs" + src = mm.get("image_inputs") if "image_inputs" in mm else mm + if src is None: + continue + pv = src.get("pixel_values") if hasattr(src, "get") else getattr(src, "pixel_values", None) + thw = src.get("image_grid_thw") if hasattr(src, "get") else getattr(src, "image_grid_thw", None) + if pv is not None: + if not isinstance(pv, torch.Tensor): + pv = torch.as_tensor(pv) + pixel_values_list.append(pv.to(device=device)) + if thw is not None: + if not isinstance(thw, torch.Tensor): + thw = torch.as_tensor(thw) + image_grid_thw_list.append(thw.to(device=device)) + if pixel_values_list: + fb.pixel_values = torch.cat(pixel_values_list, dim=0) + if image_grid_thw_list: + fb.image_grid_thw = torch.cat(image_grid_thw_list, dim=0) + else: + # Build mrope_position_deltas tensor for decode batches. 
+ mrope_deltas = [ + self._rid_to_mrope_delta.get(m["rid"], 0) for m in requests_meta + ] + mrope_deltas_tensor = torch.tensor( + mrope_deltas, dtype=torch.int64, device=device ) - # TODO: actual model forward; pass input_ids and mm_inputs to the model. - next_token_ids: List[int] = [] # placeholder + fb = runner.prepare_forward_batch_decode( + input_ids=input_ids_tensor, + req_pool_indices=req_pool_indices, + seq_lens=seq_lens_tensor, + out_cache_loc=out_cache_loc, + mrope_position_deltas=mrope_deltas_tensor, + ) - output: Dict[str, Any] = { + logits_output = runner.forward(fb) + + # Persist M-RoPE position deltas for multimodal models (Qwen3-VL). + # The model sets mrope_position_deltas on the ForwardBatch during + # prefill; we store them here so decode steps can retrieve them. + if ( + forward_mode == "extend" + and getattr(fb, "mrope_position_deltas", None) is not None + ): + deltas_cpu = fb.mrope_position_deltas.cpu().tolist() + for idx, m in enumerate(requests_meta): + self._rid_to_mrope_delta[m["rid"]] = int(deltas_cpu[idx]) + + next_token_ids = runner.sample( + logits_output, + fb, + temperatures=temps_tensor, + top_ps=top_ps_tensor, + top_ks=top_ks_tensor, + ) + + # ============================================================== + # Phase 4.5: Snapshot GDN state after extend (hybrid models) + # ============================================================== + if forward_mode == "extend" and self._is_hybrid: + self._track_gdn_state_after_extend(requests_meta) + + # ============================================================== + # Phase 5: Radix cache insert (extend only) + # ============================================================== + if forward_mode == "extend" and self._radix_cache is not None: + self._insert_into_radix_cache(requests_meta) + + # ============================================================== + # Phase 6: Build result & track output tokens + # ============================================================== + next_ids_cpu = 
next_token_ids.cpu().tolist() + outputs: List[Dict[str, Any]] = [] + for i, m in enumerate(requests_meta): + rid = m["rid"] + token_id = next_ids_cpu[i] if i < len(next_ids_cpu) else 0 + # Track output tokens for radix cache insert at completion + out_ids = self._rid_to_output_ids.get(rid) + if out_ids is not None: + out_ids.append(token_id) + + out: Dict[str, Any] = { "rid": rid, - "output_token_ids": next_token_ids, - "finished": True, + "output_token_ids": [token_id], } - # TODO: check EOS / max_tokens to decide finished vs. unfinished. - finished.append(output) + # Report actual prefix_len back to the scheduler so it can + # update its token budget tracking accurately. + if actual_prefix_lens is not None: + out["prefix_len"] = actual_prefix_lens[i] + outputs.append(out) return { "batch_id": batch.get("batch_id"), - "finished": finished, - "unfinished": unfinished, + "outputs": outputs, } + # ------------------------------------------------------------------ + # Tensor rebuild for prefix-matched extend + # ------------------------------------------------------------------ + + def _rebuild_extend_tensors( + self, + batch: Dict[str, Any], + requests_meta: List[Dict[str, Any]], + actual_prefix_lens: List[int], + actual_extend_lens: List[int], + device: str, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + """Rebuild input_ids and related tensors after prefix matching. + + The scheduler sent input_ids assuming no prefix cache hit. After + radix cache matching, we know the actual prefix lengths and must + trim the input_ids accordingly. + + Returns (input_ids, seq_lens, extend_seq_lens, extend_prefix_lens) + as GPU tensors. + """ + # Reconstruct trimmed input_ids: for each request, take only the + # tokens beyond the matched prefix. 
+ new_input_ids: List[int] = [] + seq_lens_list: List[int] = batch["seq_lens"] + + for i, m in enumerate(requests_meta): + full_input_ids = m.get("input_ids", []) + prefix_len = actual_prefix_lens[i] + # Only send tokens after the prefix + new_input_ids.extend(full_input_ids[prefix_len:]) + + input_ids = torch.tensor(new_input_ids, dtype=torch.int32, device=device) + seq_lens = torch.tensor(seq_lens_list, dtype=torch.int32, device=device) + extend_seq_lens = torch.tensor( + actual_extend_lens, dtype=torch.int32, device=device + ) + extend_prefix_lens = torch.tensor( + actual_prefix_lens, dtype=torch.int32, device=device + ) + return input_ids, seq_lens, extend_seq_lens, extend_prefix_lens + + # ------------------------------------------------------------------ + # Radix cache insert + # ------------------------------------------------------------------ + + def _insert_into_radix_cache(self, requests_meta: List[Dict[str, Any]]) -> None: + """Insert prefill KV indices into the radix cache for future reuse. + + Mirrors sglang's ``cache_unfinished_req`` pattern: + + 1. **Insert** the request's token → KV index mapping into the tree. + 2. **Free duplicates** — indices in ``[cache_protected_len, new_prefix_len)`` + are now owned by the tree; the request's copies are redundant. + 3. **Re-match + write-back** — fetch the tree's *own* indices via + ``match_prefix`` and write them into ``req_to_token_pool``, + replacing the just-freed entries. Without this step the pool + still points at freed slots → use-after-free during decode. + 4. **Update** ``cache_protected_len`` and radix lock. 
+ """ + cache = self._radix_cache + if cache is None or cache.disable: + return + + runner = self._runner + gdn_pool = getattr(runner, "gdn_pool", None) + + for m in requests_meta: + rid = m["rid"] + input_ids = self._rid_to_input_ids.get(rid) + if input_ids is None: + continue + + slot = self._rid_to_req_pool_idx.get(rid) + if slot is None: + continue + + seq_len = len(input_ids) + kv_indices = runner.req_to_token_pool.req_to_token[slot, :seq_len].to( + torch.int64 + ) + + key = RadixKey(input_ids) + result = cache.insert(key, kv_indices) + new_prefix_len = result.prefix_len + + # --- Step 2: free duplicates --- + cache_protected_len = self._rid_to_cache_protected_len.get(rid, 0) + if new_prefix_len > cache_protected_len: + dup_indices = kv_indices[cache_protected_len:new_prefix_len] + if dup_indices.numel() > 0: + runner.token_to_kv_pool_allocator.free(dup_indices) + + # --- Step 3: re-match + write-back --- + # The tree now owns indices for [0, new_prefix_len). Fetch them + # and patch req_to_token_pool so the request reads the tree's + # (still-live) indices instead of the freed ones. + rematch = cache.match_prefix(key) + new_indices = rematch.indices + if len(new_indices) > cache_protected_len: + runner.req_to_token_pool.write( + (slot, slice(cache_protected_len, len(new_indices))), + new_indices[cache_protected_len:].to(torch.int32), + ) + + # --- Step 4: update tracking --- + self._rid_to_cache_protected_len[rid] = len(new_indices) + + # Update radix lock to cover the new (potentially deeper) node. 
+ old_lock = self._rid_to_radix_lock.pop(rid, None) + if old_lock is not None: + old_node, old_swa = old_lock + cache.dec_lock_ref(old_node, old_swa) + new_last_node = rematch.last_node + if new_last_node is not None and len(new_indices) > 0: + swa_id = cache.inc_lock_ref(new_last_node) + self._rid_to_radix_lock[rid] = (new_last_node, swa_id) + + # --- GDN track slot association (hybrid models) --- + if gdn_pool is not None and result.last_node is not None: + track_slot = self._rid_to_gdn_track_slot.get(rid) + if track_slot is not None: + node_id = result.last_node.id + old_ts = self._node_id_to_gdn_track_slot.get(node_id) + if old_ts is None: + self._node_id_to_gdn_track_slot[node_id] = track_slot + else: + gdn_pool.free_track_slot(track_slot) + self._rid_to_gdn_track_slot.pop(rid, None) + + # ------------------------------------------------------------------ + # KV allocation helpers + # ------------------------------------------------------------------ + + def _allocate_extend( + self, batch: Dict[str, Any], requests_meta: List[Dict[str, Any]] + ) -> Tuple[torch.Tensor, List[int], List[int]]: + """Allocate req pool slots and KV tokens for an extend (prefill) batch. + + Performs radix cache prefix matching before allocation: + 1. For each request, call ``match_prefix`` to find cached KV indices. + 2. Write cached indices into ``ReqToTokenPool``. + 3. Only allocate new KV tokens for the non-cached suffix. + 4. Lock matched radix nodes to prevent eviction. + + Returns ``(out_cache_loc, actual_prefix_lens, actual_extend_lens)``. + ``out_cache_loc`` has shape ``[total_new_tokens]``. 
+ """ + runner = self._runner + cache = self._radix_cache + batch_size = batch["batch_size"] + seq_lens: List[int] = batch["seq_lens"] + + # --- Step 1: Radix cache prefix matching --- + actual_prefix_lens: List[int] = [] + actual_extend_lens: List[int] = [] + matched_nodes: List[Optional[TreeNode]] = [] + # Cache the match results so we don't call match_prefix twice + cached_indices_list: List[Optional[torch.Tensor]] = [] + gdn_pool = getattr(runner, "gdn_pool", None) + + for i, m in enumerate(requests_meta): + full_input_ids: List[int] = m.get("input_ids", []) + full_seq_len = seq_lens[i] + + # Store input_ids for later radix cache insert + self._rid_to_input_ids[m["rid"]] = full_input_ids + + if cache is not None and not cache.disable and len(full_input_ids) > 0: + key = RadixKey(full_input_ids) + match_result = cache.match_prefix(key) + prefix_len = match_result.prefix_len + last_node = match_result.last_node + cached_indices = match_result.indices + else: + prefix_len = 0 + last_node = None + cached_indices = None + + # Hybrid model guard: only use a KV cache hit if the matched + # node has a GDN state snapshot. Without it, the full-attention + # layers would use cached KV while GDN layers start from zero, + # causing an attention/GDN state mismatch. Discard the hit so + # the entire prompt is processed from scratch. + if ( + gdn_pool is not None + and prefix_len > 0 + and last_node is not None + and self._node_id_to_gdn_track_slot.get(last_node.id) is None + ): + logger.debug( + "Discarding radix cache hit for rid=%s: no GDN state " + "for matched node (prefix_len=%d)", + m["rid"], prefix_len, + ) + prefix_len = 0 + last_node = None + cached_indices = None + + # Ensure at least 1 token is extended (not fully cached). + # A full cache hit (prefix_len == full_seq_len) would produce a + # 0-length input tensor that crashes CUDA kernels. Back off by 1 + # so the model always sees the last token. 
+ if prefix_len >= full_seq_len: + prefix_len = full_seq_len - 1 + if cached_indices is not None: + cached_indices = cached_indices[:prefix_len] + + extend_len = full_seq_len - prefix_len + actual_prefix_lens.append(prefix_len) + actual_extend_lens.append(extend_len) + matched_nodes.append(last_node) + cached_indices_list.append(cached_indices) + + if prefix_len > 0: + logger.info( + "Radix cache hit for rid=%s: %d/%d tokens reused (%.1f%%)", + m["rid"], + prefix_len, + full_seq_len, + 100.0 * prefix_len / full_seq_len, + ) + + total_new_tokens = sum(actual_extend_lens) + + # --- Step 2: Allocate req pool slots --- + slots = runner.req_to_token_pool.alloc(batch_size) + if slots is None: + raise RuntimeError("Failed to allocate req pool slots for extend batch") + + # --- Step 3: Allocate KV tokens (with eviction retry) --- + out_cache_loc = self._alloc_kv_with_eviction(total_new_tokens) + if out_cache_loc is None: + for s in slots: + runner.req_to_token_pool.free(s) + raise RuntimeError( + f"Failed to allocate {total_new_tokens} KV tokens for extend batch " + f"(even after eviction)" + ) + + # --- Step 4: Write indices into req_to_token_pool --- + offset = 0 + for i, m in enumerate(requests_meta): + rid = m["rid"] + slot = slots[i] + prefix_len = actual_prefix_lens[i] + extend_len = actual_extend_lens[i] + full_seq_len = seq_lens[i] + + # Write cached prefix indices (from the match result we saved) + cached_indices = cached_indices_list[i] + if cached_indices is not None and prefix_len > 0: + runner.req_to_token_pool.write( + (slot, slice(0, prefix_len)), + cached_indices[:prefix_len].to(torch.int32), + ) + + # Write new KV indices for the suffix + kv_indices = out_cache_loc[offset : offset + extend_len] + runner.req_to_token_pool.write( + (slot, slice(prefix_len, full_seq_len)), kv_indices + ) + + self._rid_to_req_pool_idx[rid] = slot + self._rid_to_kv_indices[rid] = kv_indices.clone() + self._rid_to_output_ids[rid] = [] + # The prefix portion is already protected 
in the radix cache + # (from a previous request's insert). We start with this as + # cache_protected_len so that subsequent insert() calls know + # which range is already covered. + self._rid_to_cache_protected_len[rid] = actual_prefix_lens[i] + offset += extend_len + + # GDN state management: restore from track slot on cache hit, or reset + if gdn_pool is not None: + for i, m in enumerate(requests_meta): + rid = m["rid"] + working_slot = slots[i] + prefix_len = actual_prefix_lens[i] + node = matched_nodes[i] + + if prefix_len > 0 and node is not None: + # Cache hit — try to restore GDN state from the track slot + # associated with the matched radix node. + track_slot = self._node_id_to_gdn_track_slot.get(node.id) + if track_slot is not None: + gdn_pool.copy_states(track_slot, working_slot) + logger.debug( + "GDN state restored for rid=%s from track_slot=%d " + "(prefix_len=%d)", + rid, track_slot, prefix_len, + ) + else: + # Cache hit but no GDN snapshot — reset to zero. + # This can happen if the track slot was evicted. + idx = torch.tensor( + [working_slot], dtype=torch.int64, device=runner.device + ) + gdn_pool.reset_states(idx) + logger.debug( + "GDN state reset for rid=%s (cache hit but no " + "track slot, prefix_len=%d)", + rid, prefix_len, + ) + else: + # No cache hit — fresh request, zero-init + idx = torch.tensor( + [working_slot], dtype=torch.int64, device=runner.device + ) + gdn_pool.reset_states(idx) + + # Allocate a track slot only when the radix cache is enabled; + # track slots are freed via the eviction callback so they must + # be associated with a node, which only happens when cache is on. 
+ if cache is not None and not cache.disable: + ts = gdn_pool.alloc_track_slot() + if ts is not None: + self._rid_to_gdn_track_slot[rid] = ts + + # --- Step 5: Lock matched radix nodes --- + if cache is not None and not cache.disable: + for i, m in enumerate(requests_meta): + node = matched_nodes[i] + if node is not None and actual_prefix_lens[i] > 0: + swa_boundary_id = cache.inc_lock_ref(node) + self._rid_to_radix_lock[m["rid"]] = (node, swa_boundary_id) + + return out_cache_loc, actual_prefix_lens, actual_extend_lens + + def _alloc_kv_with_eviction(self, num_tokens: int) -> Optional[torch.Tensor]: + """Try to allocate KV tokens, evicting from radix cache if needed.""" + runner = self._runner + cache = self._radix_cache + + if num_tokens == 0: + return torch.empty(0, dtype=torch.int32, device=runner.device) + + # First attempt: direct allocation + result = runner.token_to_kv_pool_allocator.alloc(num_tokens) + if result is not None: + return result + + # Eviction loop: try evicting from radix cache to free space + if cache is None or cache.disable: + return None + + for attempt in range(_MAX_EVICT_RETRIES): + evictable = cache.evictable_size() + if evictable == 0: + logger.warning( + "KV allocation failed: need %d tokens, no evictable cache entries", + num_tokens, + ) + return None + + # Evict a fraction of the cache (at least what we need) + evict_target = max( + num_tokens, + int(runner.token_to_kv_pool_allocator.size * _EVICT_FRACTION), + ) + evict_result = cache.evict(evict_target) + logger.info( + "Radix cache eviction attempt %d: evicted %d tokens (target=%d)", + attempt + 1, + evict_result.full_evicted, + evict_target, + ) + + # Retry allocation + result = runner.token_to_kv_pool_allocator.alloc(num_tokens) + if result is not None: + return result + + return None + + def _allocate_decode( + self, batch: Dict[str, Any], requests_meta: List[Dict[str, Any]] + ) -> torch.Tensor: + """Allocate 1 KV token per request for a decode step. 
+ + Returns ``out_cache_loc`` tensor of shape ``[batch_size]``. + """ + runner = self._runner + batch_size = batch["batch_size"] + seq_lens: List[int] = batch["seq_lens"] + + # Allocate 1 new KV token per request (with eviction retry) + out_cache_loc = self._alloc_kv_with_eviction(batch_size) + if out_cache_loc is None: + raise RuntimeError( + f"Failed to allocate {batch_size} KV tokens for decode batch" + ) + + # Write the new KV token index into each request's mapping + for i, m in enumerate(requests_meta): + rid = m["rid"] + slot = self._rid_to_req_pool_idx.get(rid) + if slot is None: + logger.warning("Decode step for unknown rid=%s, skipping KV write", rid) + continue + + cur_seq_len = seq_lens[i] + kv_new = out_cache_loc[i : i + 1] + # The scheduler increments req.seq_len by 1 after every step, so + # seq_lens[i] == (number of tokens in the KV cache INCLUDING the + # token being decoded now). The new token's slot must therefore be + # written at index seq_lens[i] - 1, matching the position used by + # prepare_forward_batch_decode (positions = seq_lens - 1) and the + # window FlashInfer reads (req_to_token_pool[slot, 0:seq_lens[i]]). + write_pos = cur_seq_len - 1 + runner.req_to_token_pool.write( + (slot, slice(write_pos, write_pos + 1)), kv_new + ) + + # Append to tracked kv_indices + prev = self._rid_to_kv_indices.get(rid) + if prev is not None: + self._rid_to_kv_indices[rid] = torch.cat([prev, kv_new]) + else: + self._rid_to_kv_indices[rid] = kv_new.clone() + + return out_cache_loc + + # ------------------------------------------------------------------ + # Resource cleanup + # ------------------------------------------------------------------ + + def _free_rid_resources(self, rid: str) -> None: + """Free GPU resources (req pool slot + KV indices) for a finished rid. 
+ + KV index ownership model (when radix cache is enabled): + + ``req_to_token_pool[slot]`` contains three regions after + ``insert()`` returns ``new_prefix_len``:: + + [0, cache_protected_len) + Indices shared with the radix tree from a previous insert. + **Do not free** — the tree already owns them. + + [cache_protected_len, new_prefix_len) + Indices allocated by THIS request that turned out to overlap + with tree nodes inserted concurrently. The tree already + holds cloned copies → these are duplicates → **free them**. + + [new_prefix_len, total_len) + Indices that ``insert()`` just added to the tree (cloned). + The tree now owns the underlying KV pool slots. + **Do not free** — the tree will free during eviction. + + When the radix cache is disabled, all KV indices are freed directly. + """ + runner = self._runner + cache = self._radix_cache + + slot = self._rid_to_req_pool_idx.pop(rid, None) + kv_indices = self._rid_to_kv_indices.pop(rid, None) + input_ids = self._rid_to_input_ids.pop(rid, None) + output_ids = self._rid_to_output_ids.pop(rid, None) + cache_protected_len = self._rid_to_cache_protected_len.pop(rid, 0) + radix_lock = self._rid_to_radix_lock.pop(rid, None) + self._rid_to_mrope_delta.pop(rid, None) + + # Free GDN track slot (if any) — the slot's association with a + # radix node is managed separately via _node_id_to_gdn_track_slot + # and the eviction callback; here we just remove the rid mapping. + self._rid_to_gdn_track_slot.pop(rid, None) + + cache_enabled = cache is not None and not cache.disable + + # ---------------------------------------------------------- + # Phase 1: Read all KV indices BEFORE freeing anything. 
+ # ---------------------------------------------------------- + prompt_len = len(input_ids) if input_ids is not None else 0 + decode_len = len(output_ids) if output_ids else 0 + total_len = prompt_len + decode_len + + all_kv_indices: Optional[torch.Tensor] = None + if slot is not None and input_ids is not None: + all_kv_indices = runner.req_to_token_pool.req_to_token[slot, :total_len].to( + torch.int64 + ) + + # ---------------------------------------------------------- + # Phase 2: Insert into radix cache (if enabled). + # ---------------------------------------------------------- + did_insert = False + if cache_enabled and all_kv_indices is not None: + if self._is_hybrid and decode_len > 0: + # Hybrid model: insert only prompt tokens (not decode) + # because GDN state is only tracked at the prompt boundary. + prompt_kv = all_kv_indices[:prompt_len] + decode_kv = all_kv_indices[prompt_len:] + key = RadixKey(list(input_ids)) + result = cache.insert(key, prompt_kv) + new_prefix_len = result.prefix_len + + # Free duplicate KV indices in the overlap region. + if new_prefix_len > cache_protected_len: + dup_indices = prompt_kv[cache_protected_len:new_prefix_len] + if dup_indices.numel() > 0: + runner.token_to_kv_pool_allocator.free(dup_indices) + + # Free decode KV indices (tree does not own them) + if decode_kv.numel() > 0: + runner.token_to_kv_pool_allocator.free(decode_kv) + else: + # Non-hybrid or no decode tokens: insert full sequence + full_token_ids = list(input_ids) + if output_ids: + full_token_ids.extend(output_ids) + key = RadixKey(full_token_ids) + result = cache.insert(key, all_kv_indices) + new_prefix_len = result.prefix_len + + # Free duplicate KV indices in the overlap region. 
+ if new_prefix_len > cache_protected_len: + dup_indices = all_kv_indices[cache_protected_len:new_prefix_len] + if dup_indices.numel() > 0: + runner.token_to_kv_pool_allocator.free(dup_indices) + + did_insert = True + + # ---------------------------------------------------------- + # Phase 3: Unlock radix cache nodes. + # ---------------------------------------------------------- + if cache_enabled and radix_lock is not None: + node, swa_boundary_id = radix_lock + cache.dec_lock_ref(node, swa_boundary_id) + + # ---------------------------------------------------------- + # Phase 4: Free KV indices not owned by the radix cache. + # ---------------------------------------------------------- + if not did_insert: + if cache_enabled and all_kv_indices is not None: + # Cache enabled but insert skipped (shouldn't happen in + # normal flow). Tree owns [0, cache_protected_len); + # free the rest. + tail = all_kv_indices[cache_protected_len:] + if tail.numel() > 0: + runner.token_to_kv_pool_allocator.free(tail) + elif not cache_enabled: + # Cache disabled — free all newly-allocated KV indices. + if all_kv_indices is not None and all_kv_indices.numel() > 0: + runner.token_to_kv_pool_allocator.free(all_kv_indices) + elif kv_indices is not None and kv_indices.numel() > 0: + runner.token_to_kv_pool_allocator.free(kv_indices) + + # ---------------------------------------------------------- + # Phase 5: Free the req pool slot. 
+ # ---------------------------------------------------------- + if slot is not None: + runner.req_to_token_pool.free(slot) + + logger.debug( + "Freed resources for rid=%s (slot=%s, kv_tokens=%d)", + rid, + slot, + kv_indices.numel() if kv_indices is not None else 0, + ) + + # ------------------------------------------------------------------ + # GDN state tracking helpers (hybrid models) + # ------------------------------------------------------------------ + + def _track_gdn_state_after_extend( + self, requests_meta: List[Dict[str, Any]] + ) -> None: + """Snapshot working GDN state into each request's track slot. + + Called immediately after ``runner.forward()`` for extend batches so + that the FINAL recurrent/conv state (after processing the full prompt) + is saved. The track slot is later associated with a radix node in + ``_insert_into_radix_cache``. + """ + gdn_pool = getattr(self._runner, "gdn_pool", None) + if gdn_pool is None: + return + + for m in requests_meta: + rid = m["rid"] + working_slot = self._rid_to_req_pool_idx.get(rid) + track_slot = self._rid_to_gdn_track_slot.get(rid) + if working_slot is not None and track_slot is not None: + gdn_pool.copy_states(working_slot, track_slot) + + def _on_radix_node_evict(self, node_id: int) -> None: + """Callback invoked by RadixCache when a node is evicted. + + Frees the GDN track slot associated with the evicted node. 
+ """ + track_slot = self._node_id_to_gdn_track_slot.pop(node_id, None) + if track_slot is not None: + gdn_pool = getattr(self._runner, "gdn_pool", None) + if gdn_pool is not None: + gdn_pool.free_track_slot(track_slot) + logger.debug( + "Freed GDN track slot %d for evicted node %d", + track_slot, node_id, + ) + # ------------------------------------------------------------------ # Cleanup # ------------------------------------------------------------------ def shutdown(self) -> None: - if self._recv_from_scheduler is not None: - self._recv_from_scheduler.close() - if self._send_to_scheduler is not None: - self._send_to_scheduler.close() - if self._zmq_ctx is not None: - self._zmq_ctx.term() - - -def run_model_runner_process( - recv_from_scheduler_addr: str, - send_to_scheduler_addr: str, - pipe_writer: Connection, -) -> None: - """Entry point for ``torch.multiprocessing.Process(target=...)``.""" - proc = ModelRunnerProcess(recv_from_scheduler_addr, send_to_scheduler_addr) - proc.init_sockets() - - pipe_writer.send({"status": "ready", "process": "model_runner"}) - pipe_writer.close() - - try: - proc.event_loop() - except KeyboardInterrupt: - pass - finally: - proc.shutdown() + if self._runner is not None: + self._runner.shutdown() diff --git a/pymllm/orchestrator/request_response_process.py b/pymllm/orchestrator/request_response_process.py index fa9d92ec..5c72a14c 100644 --- a/pymllm/orchestrator/request_response_process.py +++ b/pymllm/orchestrator/request_response_process.py @@ -65,8 +65,12 @@ def __init__( self._loop_task: Optional[asyncio.Task] = None - def start(self, loop: asyncio.AbstractEventLoop) -> None: - """Kick off the background send/recv tasks on *loop*.""" + def start(self) -> None: + """Bind ZMQ sockets. Background tasks are started lazily by + :meth:`listen` on the first :meth:`add_request` call, so they + always run on the correct event loop regardless of whether the + caller is uvicorn, ``loop.run_until_complete``, or anything else. 
+ """ self._zmq_ctx = zmq.asyncio.Context() self._send_to_tokenizer = create_zmq_socket( self._zmq_ctx, @@ -80,7 +84,20 @@ def start(self, loop: asyncio.AbstractEventLoop) -> None: self._recv_from_detokenizer_addr, bind=True, ) + + def listen(self) -> None: + """Start the send/recv background tasks on the **current** running + event loop. Idempotent — subsequent calls are no-ops while the + tasks are still alive. + + Called automatically by :meth:`add_request`, so callers never need + to invoke this directly. + """ + if self._loop_task is not None and not self._loop_task.done(): + return + loop = asyncio.get_running_loop() self._loop_task = loop.create_task(self._run()) + logger.debug("RequestResponseProcess: background tasks started") async def add_request( self, request: GenerateReqInput @@ -98,6 +115,8 @@ async def add_request( Callers should ``await state.event.wait()`` in a loop, consuming ``state.out_list`` entries until ``state.finished`` is ``True``. """ + self.listen() + if request.is_single: rid = request.rid if isinstance(request.rid, str) else request.rid[0] state = ReqState() diff --git a/pymllm/orchestrator/scheduler_process.py b/pymllm/orchestrator/scheduler_process.py index 8f2d9a95..8594a899 100644 --- a/pymllm/orchestrator/scheduler_process.py +++ b/pymllm/orchestrator/scheduler_process.py @@ -1,27 +1,28 @@ """ -SchedulerProcess -- the central scheduling hub. +SchedulerProcess -- the central scheduling and inference hub. Receives tokenized requests from the TokenizerProcess, organises them into -batches, dispatches batches to the ModelRunnerProcess for forward passes, -collects results, and streams finished token IDs to the DetokenizerProcess. +batches, runs model forward passes via the **in-process** model runner +(sglang-style), and streams finished token IDs to the DetokenizerProcess. -Supports two modes: - 1. Legacy ZMQ path: Receive TokenizedGenerateReqInput via ZMQ recv_pyobj - 2. 
Shared queue fast path: Read rid from shared queue and metadata from shared memory +Architecture: the scheduler owns the :class:`ModelRunnerProcess` directly +(same process, direct function calls). GPU resources (KV cache, req pool +slots) are freed immediately when requests finish — no cross-process +communication needed. -When the shared queue fast path is active the scheduler also handles CUDA IPC -tensor reconstruction via -:func:`~pymllm.orchestrator.cuda_ipc_transport.unwrap_mm_inputs_from_ipc`. +Request ingestion supports two modes: + 1. ZMQ path: Receive TokenizedGenerateReqInput via ZMQ recv_pyobj + 2. Shared queue fast path: Read from shared memory + multiprocessing queue -The main ``event_loop`` scheduler flow:: +The main ``event_loop``:: while True: recv_requests() process_input_requests() - batch = get_next_batch_to_run() + batch = get_next_batch_to_run() # also frees finished GPU resources if batch: - run_batch(batch) - process_batch_result(batch) + result = run_batch(batch) # direct call to model runner + process_batch_result(batch, result) stream_output() """ @@ -34,16 +35,297 @@ import zmq -from pymllm.engine.io_struct import TokenizedGenerateReqInput +from pymllm.engine.forward_batch import ForwardMode +from pymllm.engine.io_struct import BatchTokenIDOutput, TokenizedGenerateReqInput from pymllm.orchestrator.cuda_ipc_transport import ( TensorTransportMode, unwrap_mm_inputs_from_ipc, ) -from pymllm.orchestrator.ipc_utils import create_zmq_socket +from pymllm.orchestrator.ipc_utils import create_zmq_socket, setup_subprocess_logging from pymllm.orchestrator.shared_memory_queue import SharedMemoryManager, TensorQueue logger = logging.getLogger(__name__) +# Default scheduling limits +_DEFAULT_MAX_RUNNING_REQUESTS = 256 +_DEFAULT_MAX_PREFILL_TOKENS = 8192 +_DEFAULT_MAX_TOTAL_TOKENS = 131072 +_DEFAULT_MAX_NEW_TOKENS = 32768 + + +# ====================================================================== +# Req -- per-request state tracker +# 
====================================================================== + + +class Req: + """Tracks a single request through its lifecycle (prefill -> decode -> finish). + + Created by :meth:`SchedulerProcess.process_input_requests` from a + :class:`~pymllm.engine.io_struct.TokenizedGenerateReqInput`. + """ + + __slots__ = ( + "rid", + "input_ids", + "input_text", + "sampling_params", + "mm_inputs", + "stream", + "return_logprob", + "logprob_start_len", + "top_logprobs_num", + # KV-cache state + "req_pool_idx", + "seq_len", + # Prefix-cache hit (set during scheduling when radix cache is active) + "prefix_len", + # Generation state + "output_ids", + "finished_reason", + "is_prefilled", + # Sampling parameters (parsed) + "max_new_tokens", + "temperature", + "top_p", + "top_k", + "stop_token_ids", + # Streaming + "read_offset", + # Prompt length (for token accounting) + "prompt_len", + ) + + def __init__( + self, + rid: str, + input_ids: List[int], + input_text: str = "", + sampling_params: Optional[Dict[str, Any]] = None, + mm_inputs: Optional[Dict[str, Any]] = None, + stream: bool = False, + return_logprob: bool = False, + logprob_start_len: int = -1, + top_logprobs_num: int = 0, + ): + self.rid = rid + self.input_ids = list(input_ids) + self.input_text = input_text + self.mm_inputs = mm_inputs + self.stream = stream + self.return_logprob = return_logprob + self.logprob_start_len = logprob_start_len + self.top_logprobs_num = top_logprobs_num + + # Parse sampling params + sp = sampling_params or {} + self.sampling_params = sp + self.max_new_tokens: int = sp.get("max_new_tokens", _DEFAULT_MAX_NEW_TOKENS) + self.temperature: float = sp.get("temperature", 1.0) + self.top_p: float = sp.get("top_p", 1.0) + self.top_k: int = sp.get("top_k", -1) + self.stop_token_ids: List[int] = list(sp.get("stop_token_ids", [])) + + # KV-cache state (assigned during scheduling) + self.req_pool_idx: int = -1 + self.seq_len: int = len(input_ids) + # Number of prefix tokens served from the 
radix/KV cache (0 = no hit). + # Updated by process_batch_result when the model runner reports a + # prefix cache hit. Used in _free_req_resources to correctly + # release the token budget. + self.prefix_len: int = 0 + + # Generation state + self.output_ids: List[int] = [] + self.finished_reason: Optional[str] = None + self.is_prefilled: bool = False + + # Streaming + self.read_offset: int = 0 + + # Prompt length + self.prompt_len: int = len(input_ids) + + def check_finished(self, eos_token_id: Optional[int] = None) -> bool: + """Check if this request has reached a finish condition. + + Sets ``finished_reason`` and returns True if finished. + Checks: + 1. EOS token in the latest generated token + 2. ``max_new_tokens`` reached + """ + if self.finished_reason is not None: + return True + + if self.output_ids: + last_token = self.output_ids[-1] + # Check model EOS token + if eos_token_id is not None and last_token == eos_token_id: + self.finished_reason = "eos" + return True + # Check stop token IDs from sampling params + if last_token in self.stop_token_ids: + self.finished_reason = "eos" + return True + + # Check max_new_tokens + if len(self.output_ids) >= self.max_new_tokens: + self.finished_reason = "length" + return True + + return False + + @property + def is_finished(self) -> bool: + return self.finished_reason is not None + + def abort(self) -> None: + """Mark this request as aborted.""" + self.finished_reason = "abort" + + def __repr__(self) -> str: + return ( + f"Req(rid={self.rid!r}, seq_len={self.seq_len}, " + f"out={len(self.output_ids)}, finished={self.finished_reason})" + ) + + +# ====================================================================== +# ScheduleBatch -- batch container +# ====================================================================== + + +class ScheduleBatch: + """Wraps a list of :class:`Req` objects for a single forward pass. 
+ + Provides helpers to assemble the batch dict sent to the ModelRunnerProcess + in the format expected by :class:`~pymllm.engine.forward_batch.ForwardBatch`. + """ + + def __init__(self, reqs: List[Req], forward_mode: ForwardMode): + self.reqs = reqs + self.forward_mode = forward_mode + + @property + def batch_size(self) -> int: + return len(self.reqs) + + def prepare_for_extend(self) -> Dict[str, Any]: + """Assemble a batch dict for prefill / extend forward pass. + + Returns a dict with flattened ``input_ids``, per-request ``positions``, + ``req_pool_indices``, ``seq_lens``, ``extend_seq_lens``, + ``extend_prefix_lens``, and request metadata. + + Note: The scheduler sends the **full** input_ids (no prefix trimming). + The ModelRunnerProcess performs radix cache prefix matching and + rebuilds the tensors with actual prefix lengths before the forward + pass. The ``extend_prefix_lens`` here are always 0 from the + scheduler; they serve as placeholders. + """ + all_input_ids: List[int] = [] + all_positions: List[int] = [] + req_pool_indices: List[int] = [] + seq_lens: List[int] = [] + extend_seq_lens: List[int] = [] + extend_prefix_lens: List[int] = [] + requests_meta: List[Dict[str, Any]] = [] + + for req in self.reqs: + input_len = len(req.input_ids) + + # Send full input_ids; model runner will trim based on prefix + all_input_ids.extend(req.input_ids) + all_positions.extend(range(input_len)) + req_pool_indices.append(req.req_pool_idx) + seq_lens.append(req.seq_len) + extend_seq_lens.append(input_len) + extend_prefix_lens.append(0) + requests_meta.append( + { + "rid": req.rid, + "input_ids": req.input_ids, + "mm_inputs": req.mm_inputs, + "sampling_params": req.sampling_params, + "return_logprob": req.return_logprob, + "logprob_start_len": req.logprob_start_len, + "top_logprobs_num": req.top_logprobs_num, + } + ) + + return { + "forward_mode": "extend", + "batch_size": self.batch_size, + "input_ids": all_input_ids, + "positions": all_positions, + "req_pool_indices": 
req_pool_indices, + "seq_lens": seq_lens, + "extend_seq_lens": extend_seq_lens, + "extend_prefix_lens": extend_prefix_lens, + "requests": requests_meta, + "batch_id": id(self), + "created_at": time.time(), + } + + def prepare_for_decode(self) -> Dict[str, Any]: + """Assemble a batch dict for decode forward pass (one token per request). + + Returns a dict with one input token per request (the last generated + token), positions at ``seq_len``, and request metadata. + """ + all_input_ids: List[int] = [] + all_positions: List[int] = [] + req_pool_indices: List[int] = [] + seq_lens: List[int] = [] + requests_meta: List[Dict[str, Any]] = [] + + for req in self.reqs: + # For decode, the input is the last generated token + if req.output_ids: + all_input_ids.append(req.output_ids[-1]) + else: + # Fallback: last input token (shouldn't happen normally) + all_input_ids.append(req.input_ids[-1]) + all_positions.append(req.seq_len) + req_pool_indices.append(req.req_pool_idx) + seq_lens.append(req.seq_len) + requests_meta.append( + { + "rid": req.rid, + "sampling_params": req.sampling_params, + "return_logprob": req.return_logprob, + "logprob_start_len": req.logprob_start_len, + "top_logprobs_num": req.top_logprobs_num, + } + ) + + return { + "forward_mode": "decode", + "batch_size": self.batch_size, + "input_ids": all_input_ids, + "positions": all_positions, + "req_pool_indices": req_pool_indices, + "seq_lens": seq_lens, + "requests": requests_meta, + "batch_id": id(self), + "created_at": time.time(), + } + + def to_batch_dict(self) -> Dict[str, Any]: + """Build the batch dict appropriate for the current forward mode.""" + if self.forward_mode.is_extend(): + return self.prepare_for_extend() + else: + return self.prepare_for_decode() + + def __repr__(self) -> str: + return f"ScheduleBatch(mode={self.forward_mode.name}, size={self.batch_size})" + + +# ====================================================================== +# SchedulerProcess +# 
====================================================================== + class SchedulerProcess: """Runs inside a subprocess. Central hub that drives the inference loop.""" @@ -51,19 +333,29 @@ class SchedulerProcess: def __init__( self, recv_from_tokenizer_addr: str, - send_to_model_runner_addr: str, - recv_from_model_runner_addr: str, send_to_detokenizer_addr: str, + server_config: Optional[Any] = None, + model_config: Optional[Any] = None, + gpu_id: int = 0, shared_queue: Optional[TensorQueue] = None, enable_shared_queue: bool = False, tensor_transport_mode: TensorTransportMode = "default", + # Scheduling limits + max_running_requests: int = _DEFAULT_MAX_RUNNING_REQUESTS, + max_prefill_tokens: int = _DEFAULT_MAX_PREFILL_TOKENS, + max_total_tokens: int = _DEFAULT_MAX_TOTAL_TOKENS, + eos_token_ids: Optional[List[int]] = None, + default_max_new_tokens: int = _DEFAULT_MAX_NEW_TOKENS, ): - # ZMQ addresses + # ZMQ addresses (tokenizer + detokenizer only) self._recv_from_tokenizer_addr = recv_from_tokenizer_addr - self._send_to_model_runner_addr = send_to_model_runner_addr - self._recv_from_model_runner_addr = recv_from_model_runner_addr self._send_to_detokenizer_addr = send_to_detokenizer_addr + # Model config (for in-process model runner, sglang-style) + self._server_config = server_config + self._model_config = model_config + self._gpu_id = gpu_id + # Shared queue configuration self._shared_queue = shared_queue self._enable_shared_queue = enable_shared_queue @@ -72,16 +364,53 @@ def __init__( # ZMQ runtime objects (initialised in init_sockets) self._zmq_ctx: Optional[zmq.Context] = None self._recv_from_tokenizer: Optional[zmq.Socket] = None - self._send_to_model_runner: Optional[zmq.Socket] = None - self._recv_from_model_runner: Optional[zmq.Socket] = None self._send_to_detokenizer: Optional[zmq.Socket] = None self._poller: Optional[zmq.Poller] = None - # Request management + # In-process model runner (initialised in init_model) + self._model_runner = None + + # 
Request management -- three-stage pipeline self._waiting_queue: Deque[TokenizedGenerateReqInput] = deque() - self._running_batch: Optional[Dict[str, Any]] = None + self._pending_queue: List[Req] = [] + self._running_batch: List[Req] = [] self._finished: List[Dict[str, Any]] = [] + # Scheduling limits + self._max_running_requests = max_running_requests + self._max_prefill_tokens = max_prefill_tokens + + # KV-cache token budget (simplified single-GPU tracking). + self._max_total_tokens = max_total_tokens + self._used_tokens: int = 0 + + # EOS token(s) for finish detection + self._eos_token_ids: List[int] = list(eos_token_ids) if eos_token_ids else [] + + # Default max_new_tokens (from model config or fallback) + self._default_max_new_tokens = default_max_new_tokens + + # Monotonic request-slot counter (simplified; no GPU pool access) + self._next_req_pool_idx: int = 0 + + # ------ Throughput metrics (sglang-style interval logging) ------ + # How often (in decode batches) to log throughput stats. 
+ self._decode_log_interval: int = ( + server_config.decode_log_interval + if server_config is not None and hasattr(server_config, "decode_log_interval") + else 40 + ) + # Accumulators reset at each log interval + self._num_prefill_tokens: int = 0 # new prefill tokens (excluding cache hits) + self._num_prefill_cache_tokens: int = 0 # prefill tokens served from cache + self._num_decode_tokens: int = 0 # generated decode tokens + self._num_prefill_reqs: int = 0 # prefill requests count + # Timestamps for throughput calculation + self._last_prefill_stats_tic: float = time.time() + self._last_decode_stats_tic: float = time.time() + # Forward pass counters + self._forward_ct_decode: int = 0 + # ------------------------------------------------------------------ # Lifecycle # ------------------------------------------------------------------ @@ -95,18 +424,6 @@ def init_sockets(self) -> None: self._recv_from_tokenizer_addr, bind=False, ) - self._send_to_model_runner = create_zmq_socket( - self._zmq_ctx, - zmq.PUSH, - self._send_to_model_runner_addr, - bind=True, - ) - self._recv_from_model_runner = create_zmq_socket( - self._zmq_ctx, - zmq.PULL, - self._recv_from_model_runner_addr, - bind=True, - ) self._send_to_detokenizer = create_zmq_socket( self._zmq_ctx, zmq.PUSH, @@ -118,6 +435,22 @@ def init_sockets(self) -> None: self._poller = zmq.Poller() self._poller.register(self._recv_from_tokenizer, zmq.POLLIN) + def init_model(self) -> None: + """Create and initialise the in-process model runner (sglang-style). + + Must be called after ``init_sockets`` and inside the subprocess + (after spawn) since it performs CUDA initialisation. 
+ """ + from pymllm.orchestrator.model_runner_process import ModelRunnerProcess + + self._model_runner = ModelRunnerProcess( + gpu_id=self._gpu_id, + server_config=self._server_config, + model_config=self._model_config, + ) + self._model_runner.init_model() + logger.info("In-process model runner initialised on GPU %d", self._gpu_id) + def event_loop(self) -> None: """Infinite scheduling loop.""" logger.info( @@ -170,6 +503,8 @@ def _recv_from_zmq(self) -> None: self._waiting_queue = type(self._waiting_queue)( r for r in self._waiting_queue if r.rid != rid ) + # Also abort from pending queue + self._abort_request(rid) else: self._waiting_queue.append(msg) @@ -236,90 +571,398 @@ def _recv_from_shared_queue(self) -> None: # ------------------------------------------------------------------ def process_input_requests(self) -> None: - """Pre-process and validate requests sitting in ``_waiting_queue``. - - TODO: attach sampling params, allocate KV-cache slots, etc. + """Convert raw :class:`TokenizedGenerateReqInput` in ``_waiting_queue`` + into :class:`Req` objects and move them to ``_pending_queue``. + + For each request: + 1. Parse sampling params (max_new_tokens, temperature, top_p, top_k, + stop_token_ids with defaults from EOS token). + 2. Create a ``Req`` object. + 3. Move from ``_waiting_queue`` to ``_pending_queue``. 
"""
-        pass
+        while self._waiting_queue:
+            raw = self._waiting_queue.popleft()
+
+            # Merge EOS token into stop_token_ids if not already present
+            sp = dict(raw.sampling_params) if raw.sampling_params else {}
+            # Inject model-aware default for max_new_tokens when not provided
+            if "max_new_tokens" not in sp:
+                sp["max_new_tokens"] = self._default_max_new_tokens
+            stop_ids = list(sp.get("stop_token_ids", []))
+            for eid in self._eos_token_ids:
+                if eid not in stop_ids:
+                    stop_ids.append(eid)
+            sp["stop_token_ids"] = stop_ids
+
+            req = Req(
+                rid=raw.rid,
+                input_ids=raw.input_ids,
+                input_text=raw.input_text,
+                sampling_params=sp,
+                mm_inputs=raw.mm_inputs,
+                stream=raw.stream,
+                return_logprob=raw.return_logprob,
+                logprob_start_len=raw.logprob_start_len,
+                top_logprobs_num=raw.top_logprobs_num,
+            )
+            self._pending_queue.append(req)
+            logger.debug("Processed input request %s (len=%d)", req.rid, req.seq_len)
 
     # ------------------------------------------------------------------
     # Step 3: build the next batch
     # ------------------------------------------------------------------
 
-    def get_next_batch_to_run(self) -> Optional[Dict[str, Any]]:
-        """Select requests from ``_waiting_queue`` and form a batch.
-
-        TODO: implement real batching / scheduling policy.
+    def get_next_batch_to_run(self) -> Optional[ScheduleBatch]:
+        """Implements continuous batching with three phases.
+
+        1. **Filter finished**: Remove finished requests from
+           ``_running_batch`` and free their token budget.
+        2. **Schedule new prefills**: From ``_pending_queue``, admit
+           requests that fit within the token budget and
+           ``max_running_requests``.
+        3. **Build batch**:
+           - If new prefill requests exist -> EXTEND batch
+           - Else if running decode requests exist -> DECODE batch
+           - Else -> None (idle)
+
+        Note on prefix cache: The actual prefix matching is done by the
+        ModelRunnerProcess (which owns the RadixCache). The scheduler
+        uses ``input_len`` as a conservative budget estimate. 
The model + runner reports back actual ``prefix_len`` in results, and the + scheduler adjusts ``_used_tokens`` accordingly in + ``process_batch_result``. """ - if not self._waiting_queue: - return None + # Phase 1: filter finished requests from running batch + still_running: List[Req] = [] + for req in self._running_batch: + if req.is_finished: + self._model_runner._free_rid_resources(req.rid) + self._free_req_resources(req) + else: + still_running.append(req) + self._running_batch = still_running + + # Phase 2: schedule new prefill requests from pending queue + new_prefill: List[Req] = [] + remaining_pending: List[Req] = [] + prefill_token_budget = self._max_prefill_tokens + + for req in self._pending_queue: + input_len = len(req.input_ids) + total_running = len(self._running_batch) + len(new_prefill) + + # Check capacity constraints. + # We reserve the full input_len as KV budget (conservative). + # If the model runner finds a prefix cache hit, some tokens + # won't need new KV allocation; the budget is corrected in + # process_batch_result. + can_fit_request = total_running < self._max_running_requests + can_fit_tokens = (self._used_tokens + input_len) <= self._max_total_tokens + can_fit_prefill = input_len <= prefill_token_budget + + if can_fit_request and can_fit_tokens and can_fit_prefill: + # Allocate req pool slot + req.req_pool_idx = self._next_req_pool_idx + self._next_req_pool_idx += 1 + # Reserve token budget (full input_len as conservative estimate) + self._used_tokens += input_len + prefill_token_budget -= input_len + new_prefill.append(req) + logger.debug( + "Scheduled prefill for %s (len=%d, used=%d/%d)", + req.rid, + input_len, + self._used_tokens, + self._max_total_tokens, + ) + else: + remaining_pending.append(req) - batch_requests: List[Dict[str, Any]] = [] - # TODO: respect max_running_requests, memory budget, etc. 
- while self._waiting_queue: - batch_requests.append(self._waiting_queue.popleft()) + self._pending_queue = remaining_pending - batch = { - "requests": batch_requests, - "batch_id": id(batch_requests), - "created_at": time.time(), - } - return batch + # Phase 3: build batch + if new_prefill: + return ScheduleBatch(new_prefill, ForwardMode.EXTEND) + elif self._running_batch: + return ScheduleBatch(self._running_batch, ForwardMode.DECODE) + else: + return None # ------------------------------------------------------------------ # Step 4: run the batch via ModelRunnerProcess # ------------------------------------------------------------------ - def run_batch(self, batch: Dict[str, Any]) -> Dict[str, Any]: - """Send *batch* to ModelRunnerProcess and wait for the result. + def run_batch(self, batch: ScheduleBatch) -> Dict[str, Any]: + """Execute the batch via the in-process model runner (sglang-style). - This is a **blocking** call: the scheduler is synchronous with the - model runner for simplicity. Overlap scheduling can be added later. + Direct function call — no ZMQ serialisation overhead. """ - self._send_to_model_runner.send_pyobj(batch) - result = self._recv_from_model_runner.recv_pyobj() - return result + batch_dict = batch.to_batch_dict() + return self._model_runner._forward_batch(batch_dict) # ------------------------------------------------------------------ # Step 5: process batch result # ------------------------------------------------------------------ def process_batch_result( - self, batch: Dict[str, Any], result: Dict[str, Any] + self, batch: ScheduleBatch, result: Dict[str, Any] ) -> None: """Handle the result returned by the ModelRunnerProcess. - TODO: check completion status (EOS, max_tokens), manage KV-cache, - split finished vs. unfinished requests. + For each request in the result: + 1. Update ``prefix_len`` from the model runner's radix cache hit. + 2. 
Adjust ``_used_tokens`` if a prefix cache hit was found (the + scheduler over-reserved during scheduling). + 3. Append new token(s) to ``req.output_ids``. + 4. Increment ``req.seq_len``. + 5. Call ``req.check_finished()`` (EOS token, max_new_tokens). + 6. If prefill request: mark ``req.is_prefilled = True``, move to + running batch for decode. + 7. If finished: collect for output, free KV-cache budget. """ - finished_requests = result.get("finished", []) - unfinished_requests = result.get("unfinished", []) - - self._finished.extend(finished_requests) - - # Put unfinished requests back for the next iteration - for req in unfinished_requests: - self._waiting_queue.appendleft(req) + # Build a rid -> Req lookup for the batch + rid_to_req: Dict[str, Req] = {req.rid: req for req in batch.reqs} + + # The result may contain per-request outputs in "finished" and + # "unfinished" lists, or a flat "outputs" list. Handle both. + output_items: List[Dict[str, Any]] = [] + output_items.extend(result.get("finished", [])) + output_items.extend(result.get("unfinished", [])) + if "outputs" in result: + output_items.extend(result["outputs"]) + + for out in output_items: + rid = out.get("rid") + req = rid_to_req.get(rid) + if req is None: + logger.warning("Result for unknown rid=%s, skipping", rid) + continue + + # Update prefix_len from model runner's radix cache matching. + # The model runner reports the actual prefix_len it found. + # The scheduler originally reserved full input_len in + # get_next_batch_to_run; correct the over-reservation now. + if "prefix_len" in out and batch.forward_mode.is_extend(): + actual_prefix_len = out["prefix_len"] + if actual_prefix_len > req.prefix_len: + saved = actual_prefix_len - req.prefix_len + req.prefix_len = actual_prefix_len + # Give back the over-reserved tokens. The model runner + # reused cached KV for `saved` tokens, so those tokens + # do not consume new KV pool slots. 
+ self._used_tokens = max(0, self._used_tokens - saved) + logger.info( + "Prefix cache hit for rid=%s: %d tokens reused, " + "budget adjusted by -%d (used=%d/%d)", + rid, + actual_prefix_len, + saved, + self._used_tokens, + self._max_total_tokens, + ) + + # Append generated token(s) + new_token_ids = out.get("output_token_ids", []) + if isinstance(new_token_ids, int): + new_token_ids = [new_token_ids] + req.output_ids.extend(new_token_ids) + req.seq_len += len(new_token_ids) + + # Update token budget for newly generated tokens + self._used_tokens += len(new_token_ids) + + # Check finish conditions + req.check_finished(eos_token_id=self._eos_token_ids[0] if self._eos_token_ids else None) + + # Process batch requests based on forward mode + if batch.forward_mode.is_extend(): + # Prefill batch: mark as prefilled and route + for req in batch.reqs: + req.is_prefilled = True + if req.is_finished: + self._collect_finished_output(req) + self._model_runner._free_rid_resources(req.rid) + self._free_req_resources(req) + else: + self._running_batch.append(req) + + # --- Accumulate prefill metrics --- + total_input = 0 + total_cached = 0 + for req in batch.reqs: + total_input += req.prompt_len + total_cached += req.prefix_len + self._num_prefill_tokens += total_input - total_cached + self._num_prefill_cache_tokens += total_cached + self._num_prefill_reqs += len(batch.reqs) + self._log_prefill_stats() + else: + # Decode batch: check finish and collect + new_running: List[Req] = [] + for req in batch.reqs: + if req.is_finished: + self._collect_finished_output(req) + self._model_runner._free_rid_resources(req.rid) + self._free_req_resources(req) + else: + new_running.append(req) + self._running_batch = new_running + + # --- Accumulate decode metrics --- + self._num_decode_tokens += batch.batch_size # 1 token per request + self._forward_ct_decode += 1 + if ( + self._decode_log_interval > 0 + and self._forward_ct_decode % self._decode_log_interval == 0 + ): + 
self._log_decode_stats() # ------------------------------------------------------------------ # Step 6: stream output to DetokenizerProcess # ------------------------------------------------------------------ def stream_output(self) -> None: - """Send finished token-ID outputs to the DetokenizerProcess.""" + """Send finished/streaming outputs to the DetokenizerProcess. + + Produces :class:`~pymllm.engine.io_struct.BatchTokenIDOutput`-compatible + dicts. For streaming requests, intermediate tokens are also sent. + """ + # Collect streaming outputs from running requests + for req in self._running_batch: + if req.stream and len(req.output_ids) > req.read_offset: + decode_ids = req.output_ids[req.read_offset :] + output = { + "rids": [req.rid], + "finished_reasons": [None], + "decode_ids": decode_ids, + "read_offsets": [req.read_offset], + "output_ids": list(req.output_ids), + "skip_special_tokens": [True], + "prompt_tokens": [req.prompt_len], + "completion_tokens": [len(req.output_ids)], + } + req.read_offset = len(req.output_ids) + self._send_to_detokenizer.send_pyobj(output) + + # Send finished outputs while self._finished: item = self._finished.pop(0) self._send_to_detokenizer.send_pyobj(item) + # ------------------------------------------------------------------ + # Internal helpers + # ------------------------------------------------------------------ + + def _log_prefill_stats(self) -> None: + """Log prefill throughput at INFO level (called after each prefill batch).""" + now = time.time() + elapsed = now - self._last_prefill_stats_tic + self._last_prefill_stats_tic = now + + if elapsed > 0: + input_throughput = self._num_prefill_tokens / elapsed + else: + input_throughput = 0.0 + + logger.info( + "Prefill batch: %d reqs, " + "new tokens: %d, " + "cached tokens: %d, " + "input throughput: %.2f token/s", + self._num_prefill_reqs, + self._num_prefill_tokens, + self._num_prefill_cache_tokens, + input_throughput, + ) + # Reset accumulators + self._num_prefill_tokens 
= 0 + self._num_prefill_cache_tokens = 0 + self._num_prefill_reqs = 0 + + def _log_decode_stats(self) -> None: + """Log decode throughput at INFO level (called every decode_log_interval batches).""" + now = time.time() + elapsed = now - self._last_decode_stats_tic + self._last_decode_stats_tic = now + + if elapsed > 0: + gen_throughput = self._num_decode_tokens / elapsed + else: + gen_throughput = 0.0 + + logger.info( + "Decode: %d steps, " + "gen tokens: %d, " + "running: %d reqs, " + "gen throughput: %.2f token/s", + self._forward_ct_decode, + self._num_decode_tokens, + len(self._running_batch), + gen_throughput, + ) + # Reset accumulators + self._num_decode_tokens = 0 + self._forward_ct_decode = 0 + + def _collect_finished_output(self, req: Req) -> None: + """Build a finished output dict and add it to ``_finished``.""" + decode_ids = req.output_ids[req.read_offset :] + output: Dict[str, Any] = { + "rids": [req.rid], + "finished_reasons": [req.finished_reason], + "decode_ids": decode_ids, + "read_offsets": [req.read_offset], + "output_ids": list(req.output_ids), + "skip_special_tokens": [True], + "prompt_tokens": [req.prompt_len], + "completion_tokens": [len(req.output_ids)], + } + self._finished.append(output) + logger.debug( + "Request %s finished: reason=%s, tokens=%d", + req.rid, + req.finished_reason, + len(req.output_ids), + ) + + def _free_req_resources(self, req: Req) -> None: + """Release KV-cache token budget for a finished request. + + The budget was charged as follows: + - At scheduling: ``+input_len`` (full prompt as conservative estimate) + - After prefix correction: ``-prefix_len`` (cached prefix doesn't need + new KV allocation; model runner manages those via radix cache) + - At each decode step: ``+1`` per generated token + + So the net charge for this request is: + ``(input_len - prefix_len) + num_decode_tokens`` + = ``seq_len - prefix_len`` + + We release exactly that amount. 
+ """ + tokens_to_free = req.seq_len - req.prefix_len + self._used_tokens = max(0, self._used_tokens - tokens_to_free) + req.req_pool_idx = -1 + + def _abort_request(self, rid: str) -> None: + """Abort a request by rid from pending or running queues.""" + # Remove from pending queue + self._pending_queue = [r for r in self._pending_queue if r.rid != rid] + # Abort in running batch + for req in self._running_batch: + if req.rid == rid: + req.abort() + break + # ------------------------------------------------------------------ # Cleanup # ------------------------------------------------------------------ def shutdown(self) -> None: + if self._model_runner is not None: + self._model_runner.shutdown() for sock in ( self._recv_from_tokenizer, - self._send_to_model_runner, - self._recv_from_model_runner, self._send_to_detokenizer, ): if sock is not None: @@ -330,25 +973,38 @@ def shutdown(self) -> None: def run_scheduler_process( recv_from_tokenizer_addr: str, - send_to_model_runner_addr: str, - recv_from_model_runner_addr: str, send_to_detokenizer_addr: str, pipe_writer: Connection, shared_queue: Optional[TensorQueue] = None, enable_shared_queue: bool = False, tensor_transport_mode: TensorTransportMode = "default", + log_level: str = "info", + default_max_new_tokens: int = _DEFAULT_MAX_NEW_TOKENS, + eos_token_ids: Optional[List[int]] = None, + server_config: Optional[Any] = None, + model_config: Optional[Any] = None, + gpu_id: int = 0, ) -> None: - """Entry point for ``torch.multiprocessing.Process(target=...)``.""" + """Entry point for ``torch.multiprocessing.Process(target=...)``. + + The scheduler process now also owns the model runner (sglang-style), + so model initialisation happens here. 
+ """ + setup_subprocess_logging(log_level) proc = SchedulerProcess( recv_from_tokenizer_addr, - send_to_model_runner_addr, - recv_from_model_runner_addr, send_to_detokenizer_addr, + server_config=server_config, + model_config=model_config, + gpu_id=gpu_id, shared_queue=shared_queue, enable_shared_queue=enable_shared_queue, tensor_transport_mode=tensor_transport_mode, + default_max_new_tokens=default_max_new_tokens, + eos_token_ids=eos_token_ids, ) proc.init_sockets() + proc.init_model() pipe_writer.send({"status": "ready", "process": "scheduler"}) pipe_writer.close() diff --git a/pymllm/orchestrator/tokenizer_process.py b/pymllm/orchestrator/tokenizer_process.py index 587a7c1e..703618a4 100644 --- a/pymllm/orchestrator/tokenizer_process.py +++ b/pymllm/orchestrator/tokenizer_process.py @@ -35,7 +35,7 @@ from pymllm.engine.io_struct import TokenizedGenerateReqInput from pymllm.orchestrator.cuda_ipc_transport import MmItemMemoryPool, TensorTransportMode -from pymllm.orchestrator.ipc_utils import create_zmq_socket +from pymllm.orchestrator.ipc_utils import create_zmq_socket, setup_subprocess_logging from pymllm.orchestrator.shared_memory_queue import SharedMemoryManager, TensorQueue logger = logging.getLogger(__name__) @@ -352,6 +352,7 @@ def _tokenize( ) # Accept a list for robustness; take the first element. 
input_text = str(text[0]) if isinstance(text, list) else str(text) + logger.debug(f"Tokenizing input text {input_text}") encode_kwargs: Dict[str, Any] = { "add_special_tokens": True, @@ -485,6 +486,7 @@ def run_tokenizer_process( shared_queue: Optional[TensorQueue] = None, ) -> None: """Entry point for ``torch.multiprocessing.Process(target=...)``.""" + setup_subprocess_logging(tokenizer_cfg.get("log_level", "info")) proc = TokenizerProcess( recv_from_rr_addr, send_to_scheduler_addr, tokenizer_cfg, shared_queue ) diff --git a/pymllm/parsers/__init__.py b/pymllm/parsers/__init__.py new file mode 100644 index 00000000..5ac5c292 --- /dev/null +++ b/pymllm/parsers/__init__.py @@ -0,0 +1,10 @@ +"""Output parsers for reasoning (thinking) content and tool calls.""" + +from pymllm.parsers.reasoning_parser import ReasoningParser +from pymllm.parsers.tool_call_parser import ToolCallParser, ToolCallItem + +__all__ = [ + "ReasoningParser", + "ToolCallParser", + "ToolCallItem", +] diff --git a/pymllm/parsers/reasoning_parser.py b/pymllm/parsers/reasoning_parser.py new file mode 100644 index 00000000..1f73c788 --- /dev/null +++ b/pymllm/parsers/reasoning_parser.py @@ -0,0 +1,212 @@ +"""Reasoning / thinking content parser. + +Separates ``...`` (or model-specific markers) from normal +assistant content. Supports both one-shot and incremental streaming modes. 
+
+Usage::
+
+    # Non-streaming
+    parser = ReasoningParser("qwen3")
+    reasoning, content = parser.parse_non_stream(full_text)
+
+    # Streaming
+    parser = ReasoningParser("qwen3")
+    for delta in deltas:
+        reasoning_delta, content_delta = parser.parse_stream_chunk(delta)
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Dict, Optional, Tuple, Type
+
+
+# ---------------------------------------------------------------------------
+# Detector registry
+# ---------------------------------------------------------------------------
+
+
+@dataclass(frozen=True)
+class _DetectorConfig:
+    start: str
+    end: str
+    force: bool  # True = always assume reasoning at start
+
+
+_DETECTOR_MAP: Dict[str, _DetectorConfig] = {
+    # DeepSeek-R1: always starts in reasoning mode
+    "deepseek-r1": _DetectorConfig("<think>", "</think>", force=True),
+    # Qwen3: optional thinking (controlled by request)
+    "qwen3": _DetectorConfig("<think>", "</think>", force=False),
+    # Qwen3 forced thinking
+    "qwen3-thinking": _DetectorConfig("<think>", "</think>", force=True),
+    # GLM-4.5
+    "glm45": _DetectorConfig("<think>", "</think>", force=False),
+    # Kimi
+    "kimi": _DetectorConfig("\u25c1think\u25b7", "\u25c1/think\u25b7", force=False),
+}
+
+
+# ---------------------------------------------------------------------------
+# ReasoningParser
+# ---------------------------------------------------------------------------
+
+
+class ReasoningParser:
+    """Model-agnostic reasoning content parser.
+
+    Parameters
+    ----------
+    model_type
+        Key into the detector registry (e.g. ``"qwen3"``, ``"deepseek-r1"``).
+    stream_reasoning
+        If ``True``, stream reasoning content incrementally as it arrives.
+        If ``False``, buffer reasoning until the end tag is found.
+    """
+
+    SUPPORTED = set(_DETECTOR_MAP)
+
+    def __init__(self, model_type: str, stream_reasoning: bool = True):
+        cfg = _DETECTOR_MAP.get(model_type)
+        if cfg is None:
+            raise ValueError(
+                f"Unknown reasoning parser {model_type!r}. 
" + f"Supported: {sorted(_DETECTOR_MAP)}" + ) + self._start = cfg.start + self._end = cfg.end + self._force = cfg.force + self._stream_reasoning = stream_reasoning + + # -- streaming state -- + self._buffer = "" + self._in_reasoning = cfg.force + self._start_consumed = False # True once start tag has been stripped + self._done = False # True once end tag has been seen + + # ------------------------------------------------------------------ # + # Non-streaming + # ------------------------------------------------------------------ # + + def parse_non_stream(self, text: str) -> Tuple[Optional[str], str]: + """Parse complete text. + + Returns ``(reasoning_content, content)`` where either may be empty. + """ + start_idx = text.find(self._start) + end_idx = text.find(self._end) + + if start_idx == -1 and not self._force: + return None, text + + # Determine boundaries + if self._force and start_idx == -1: + # Model didn't emit explicit start tag; treat prefix as reasoning + reason_start = 0 + else: + reason_start = start_idx + len(self._start) + + before = text[:start_idx] if start_idx != -1 else "" + + if end_idx != -1 and end_idx >= reason_start: + reasoning = text[reason_start:end_idx] + after = text[end_idx + len(self._end) :] + else: + reasoning = text[reason_start:] + after = "" + + content = (before + after).strip() + reasoning = reasoning.strip() + return reasoning or None, content + + # ------------------------------------------------------------------ # + # Streaming + # ------------------------------------------------------------------ # + + def parse_stream_chunk(self, delta: str) -> Tuple[str, str]: + """Parse an incremental streaming delta. + + Returns ``(reasoning_delta, content_delta)``. Either may be ``""``. 
+ """ + if not delta: + return "", "" + + if self._done: + return "", delta + + self._buffer += delta + reasoning_out = "" + content_out = "" + + # In forced reasoning mode, consume the start tag if it appears + # (the model may or may not emit it explicitly). + if self._in_reasoning and not self._start_consumed: + idx = self._buffer.find(self._start) + if idx != -1: + # Start tag found — strip it and any text before it + self._buffer = self._buffer[idx + len(self._start) :] + self._start_consumed = True + elif _could_be_partial(self._buffer, self._start): + # Might be a partial start tag — hold the buffer + return "", "" + else: + # No start tag coming — mark consumed and continue + self._start_consumed = True + + if not self._in_reasoning: + # --- look for start tag --- + idx = self._buffer.find(self._start) + if idx != -1: + content_out += self._buffer[:idx] + self._buffer = self._buffer[idx + len(self._start) :] + self._in_reasoning = True + self._start_consumed = True + elif _could_be_partial(self._buffer, self._start): + # Potential partial match at tail — hold the buffer + safe = len(self._buffer) - len(self._start) + 1 + if safe > 0: + content_out += self._buffer[:safe] + self._buffer = self._buffer[safe:] + return "", content_out + else: + content_out += self._buffer + self._buffer = "" + return "", content_out + + if self._in_reasoning: + # --- look for end tag --- + idx = self._buffer.find(self._end) + if idx != -1: + reasoning_out += self._buffer[:idx] + after = self._buffer[idx + len(self._end) :] + self._buffer = "" + self._in_reasoning = False + self._done = True + if after: + content_out += after + elif _could_be_partial(self._buffer, self._end): + safe = len(self._buffer) - len(self._end) + 1 + if safe > 0: + reasoning_out += self._buffer[:safe] + self._buffer = self._buffer[safe:] + else: + reasoning_out += self._buffer + self._buffer = "" + + if not self._stream_reasoning: + reasoning_out = "" + + return reasoning_out, content_out + + +# 
--------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _could_be_partial(text: str, pattern: str) -> bool: + """Return True if *text* ends with a prefix of *pattern*.""" + for i in range(1, len(pattern)): + if text.endswith(pattern[:i]): + return True + return False diff --git a/pymllm/parsers/tool_call_parser.py b/pymllm/parsers/tool_call_parser.py new file mode 100644 index 00000000..fdfe9391 --- /dev/null +++ b/pymllm/parsers/tool_call_parser.py @@ -0,0 +1,433 @@ +"""Tool-call (function-calling) output parser. + +Extracts structured tool calls from model output text. Supports both +one-shot and incremental streaming modes. + +Formats supported: + +* **qwen25** — ``{"name":...,"arguments":...}`` +* **llama3** — ``<|python_tag|>{"name":...,"parameters":...}`` +* **hermes** — ``{"name":...,"arguments":...}`` (same tags, Hermes schema) + +Usage:: + + # Non-streaming + parser = ToolCallParser("qwen25", tools=tools_list) + content, tool_calls = parser.parse_non_stream(full_text) + + # Streaming + parser = ToolCallParser("qwen25", tools=tools_list) + for delta in deltas: + content_delta, tool_call_deltas = parser.parse_stream_chunk(delta) +""" + +from __future__ import annotations + +import json +import re +import uuid +from dataclasses import dataclass, field +from typing import Any, Dict, List, Optional, Tuple + + +# --------------------------------------------------------------------------- +# Data structures +# --------------------------------------------------------------------------- + + +@dataclass +class ToolCallItem: + """A single parsed tool call.""" + + name: Optional[str] = None + arguments: str = "" + tool_call_id: str = "" + index: int = 0 + + def to_openai_dict(self, streaming: bool = True) -> Dict[str, Any]: + """Convert to OpenAI ``tool_calls[]`` element format. 
+
+        Parameters
+        ----------
+        streaming
+            If True, include ``index`` (streaming delta format).
+            If False, omit ``index`` (non-streaming message format).
+        """
+        d: Dict[str, Any] = {"type": "function", "function": {}}
+        if streaming:
+            d["index"] = self.index
+        if self.tool_call_id:
+            d["id"] = self.tool_call_id
+        fn: Dict[str, Any] = d["function"]
+        if self.name is not None:
+            fn["name"] = self.name
+        fn["arguments"] = self.arguments or ""
+        return d
+
+
+# ---------------------------------------------------------------------------
+# Detector base
+# ---------------------------------------------------------------------------
+
+
+@dataclass(frozen=True)
+class _FormatConfig:
+    bot_token: str
+    end_token: str
+    # Regex to extract individual call bodies between bot/end tokens.
+    # If None, the entire text between bot and end tokens is one call.
+    call_regex: Optional[str] = None
+
+
+_FORMAT_MAP: Dict[str, _FormatConfig] = {
+    "qwen25": _FormatConfig(
+        bot_token="<tool_call>\n",
+        end_token="\n</tool_call>",
+    ),
+    "qwen3_coder": _FormatConfig(
+        bot_token="<tool_call>",
+        end_token="</tool_call>",
+    ),
+    "hermes": _FormatConfig(
+        bot_token="<tool_call>\n",
+        end_token="\n</tool_call>",
+    ),
+    "llama3": _FormatConfig(
+        bot_token="<|python_tag|>",
+        end_token="",  # Llama3 uses EOT, detected via EOS
+    ),
+}
+
+
+# ---------------------------------------------------------------------------
+# ToolCallParser
+# ---------------------------------------------------------------------------
+
+
+class ToolCallParser:
+    """Model-agnostic tool-call parser.
+
+    Parameters
+    ----------
+    model_type
+        Key into the format registry (e.g. ``"qwen25"``, ``"llama3"``).
+    tools
+        The ``tools`` list from the OpenAI chat request (used to resolve
+        function names).
+    """
+
+    SUPPORTED = set(_FORMAT_MAP)
+
+    def __init__(self, model_type: str, tools: Optional[List[Any]] = None):
+        cfg = _FORMAT_MAP.get(model_type)
+        if cfg is None:
+            raise ValueError(
+                f"Unknown tool-call parser {model_type!r}. 
" + f"Supported: {sorted(_FORMAT_MAP)}" + ) + self._bot = cfg.bot_token + self._end = cfg.end_token + self._model_type = model_type + self._tools = tools or [] + + # -- streaming state -- + self._buffer = "" + self._in_call = False + self._current_tool_idx = 0 + self._current_call_buf = "" + self._prev_args_len = 0 + self._name_sent = False + self._completed_calls: List[ToolCallItem] = [] + + # ------------------------------------------------------------------ # + # Non-streaming + # ------------------------------------------------------------------ # + + def has_tool_call(self, text: str) -> bool: + """Return True if *text* contains a tool-call pattern.""" + return self._bot in text + + def parse_non_stream( + self, text: str + ) -> Tuple[str, List[ToolCallItem]]: + """Parse complete text. + + Returns ``(remaining_content, tool_calls)``. + """ + if not self.has_tool_call(text): + return text, [] + + tool_calls: List[ToolCallItem] = [] + normal_parts: List[str] = [] + + remaining = text + idx = 0 + while True: + bot_pos = remaining.find(self._bot) + if bot_pos == -1: + normal_parts.append(remaining) + break + normal_parts.append(remaining[:bot_pos]) + remaining = remaining[bot_pos + len(self._bot) :] + + if self._end: + end_pos = remaining.find(self._end) + if end_pos == -1: + call_body = remaining + remaining = "" + else: + call_body = remaining[:end_pos] + remaining = remaining[end_pos + len(self._end) :] + else: + call_body = remaining + remaining = "" + + parsed = self._parse_call_body(call_body.strip()) + if parsed is not None: + parsed.index = idx + parsed.tool_call_id = _make_tool_call_id() + tool_calls.append(parsed) + idx += 1 + + content = "".join(normal_parts).strip() + return content, tool_calls + + # ------------------------------------------------------------------ # + # Streaming + # ------------------------------------------------------------------ # + + def parse_stream_chunk( + self, delta: str + ) -> Tuple[str, List[ToolCallItem]]: + """Parse an 
incremental streaming delta. + + Returns ``(content_delta, tool_call_items)``. + + For tool call items: + - First item for a call: ``name`` is set, ``arguments`` is ``""``. + - Subsequent items: ``name`` is ``None``, ``arguments`` is the new + characters appended (argument delta). + """ + if not delta: + return "", [] + + self._buffer += delta + content_out = "" + items: List[ToolCallItem] = [] + + while True: + if not self._in_call: + # --- look for bot token --- + bot_pos = self._buffer.find(self._bot) + if bot_pos != -1: + content_out += self._buffer[:bot_pos] + self._buffer = self._buffer[bot_pos + len(self._bot) :] + self._in_call = True + self._current_call_buf = "" + self._prev_args_len = 0 + self._name_sent = False + continue # try to process call content + else: + # Check for partial bot token at tail + if self._bot and _could_be_partial(self._buffer, self._bot): + safe = len(self._buffer) - len(self._bot) + 1 + if safe > 0: + content_out += self._buffer[:safe] + self._buffer = self._buffer[safe:] + else: + content_out += self._buffer + self._buffer = "" + break + + if self._in_call: + # --- look for end token --- + if self._end: + end_pos = self._buffer.find(self._end) + if end_pos != -1: + self._current_call_buf += self._buffer[:end_pos] + self._buffer = self._buffer[end_pos + len(self._end) :] + # Emit final tool call + item = self._finalize_call() + if item is not None: + items.append(item) + self._in_call = False + self._current_tool_idx += 1 + continue # there may be more calls + else: + # Accumulate and stream arguments + self._current_call_buf += self._buffer + self._buffer = "" + item = self._stream_partial_call() + if item is not None: + items.append(item) + break + else: + # No end token (e.g. 
Llama3) — accumulate everything
+                    self._current_call_buf += self._buffer
+                    self._buffer = ""
+                    item = self._stream_partial_call()
+                    if item is not None:
+                        items.append(item)
+                    break
+
+        return content_out, items
+
+    def flush(self) -> List[ToolCallItem]:
+        """Flush any remaining buffered tool call (call at request end)."""
+        items: List[ToolCallItem] = []
+        if self._in_call and self._current_call_buf.strip():
+            item = self._finalize_call()
+            if item is not None:
+                items.append(item)
+            self._in_call = False
+        return items
+
+    # ------------------------------------------------------------------ #
+    # Internal helpers
+    # ------------------------------------------------------------------ #
+
+    def _parse_call_body(self, body: str) -> Optional[ToolCallItem]:
+        """Parse a single call body (JSON or qwen3_coder XML-style)."""
+        if self._model_type == "qwen3_coder":
+            return self._parse_qwen3_coder_body(body)
+        try:
+            obj = json.loads(body)
+        except json.JSONDecodeError:
+            return None
+        name = obj.get("name")
+        args = obj.get("arguments") or obj.get("parameters") or {}
+        if isinstance(args, dict):
+            args = json.dumps(args, ensure_ascii=False)
+        return ToolCallItem(name=name, arguments=args)
+
+    @staticmethod
+    def _parse_qwen3_coder_body(body: str) -> Optional[ToolCallItem]:
+        """Parse qwen3_coder XML-style: ``<function=F><parameter=K>V</parameter>...</function>``."""
+        # Extract function name
+        func_m = re.search(r"<function=([^>]+)>", body)
+        if func_m is None:
+            return None
+        name = func_m.group(1)
+        # Extract parameters
+        params: Dict[str, Any] = {}
+        for pm in re.finditer(
+            r"<parameter=([^>]+)>(.*?)(?:</parameter>|(?=<parameter=))",
+            body,
+            re.DOTALL,
+        ):
+            key = pm.group(1)
+            val = pm.group(2).strip()
+            # Try to parse as JSON value, otherwise keep as string
+            try:
+                params[key] = json.loads(val)
+            except (json.JSONDecodeError, ValueError):
+                params[key] = val
+        return ToolCallItem(
+            name=name,
+            arguments=json.dumps(params, ensure_ascii=False),
+        )
+
+    def _stream_partial_call(self) -> Optional[ToolCallItem]:
+        """Try to extract streaming information from the 
partial call."""
+        body = self._current_call_buf.strip()
+        if not body:
+            return None
+
+        # Try to extract name first
+        if not self._name_sent:
+            name = self._try_extract_name(body)
+            if name is not None:
+                self._name_sent = True
+                return ToolCallItem(
+                    name=name,
+                    arguments="",
+                    tool_call_id=_make_tool_call_id(),
+                    index=self._current_tool_idx,
+                )
+            return None
+
+        # Stream argument characters
+        args_str = self._try_extract_args_partial(body)
+        if args_str is not None and len(args_str) > self._prev_args_len:
+            new_chars = args_str[self._prev_args_len :]
+            self._prev_args_len = len(args_str)
+            return ToolCallItem(
+                name=None,
+                arguments=new_chars,
+                index=self._current_tool_idx,
+            )
+        return None
+
+    def _finalize_call(self) -> Optional[ToolCallItem]:
+        """Finalize a complete call — emit any remaining argument chars."""
+        parsed = self._parse_call_body(self._current_call_buf.strip())
+        if parsed is None:
+            return None
+
+        if not self._name_sent:
+            # Entire call came at once
+            parsed.index = self._current_tool_idx
+            parsed.tool_call_id = _make_tool_call_id()
+            return parsed
+
+        # Name was already sent — emit remaining arguments
+        full_args = parsed.arguments
+        new_chars = full_args[self._prev_args_len :]
+        if new_chars:
+            return ToolCallItem(
+                name=None,
+                arguments=new_chars,
+                index=self._current_tool_idx,
+            )
+        return None
+
+    def _try_extract_name(self, partial: str) -> Optional[str]:
+        """Try to extract function name from partial call body."""
+        if self._model_type == "qwen3_coder":
+            m = re.search(r"<function=([^>]+)>", partial)
+            return m.group(1) if m else None
+        m = re.search(r'"name"\s*:\s*"([^"]+)"', partial)
+        return m.group(1) if m else None
+
+    def _try_extract_args_partial(self, partial: str) -> Optional[str]:
+        """Try to extract partial arguments from call body."""
+        if self._model_type == "qwen3_coder":
+            # Build JSON incrementally from <parameter=K>V</parameter> tags
+            params: Dict[str, Any] = {}
+            for pm in re.finditer(
+                r"<parameter=([^>]+)>(.*?)(?:</parameter>)",
+                partial,
+                re.DOTALL,
+            ):
+                key = 
pm.group(1) + val = pm.group(2).strip() + try: + params[key] = json.loads(val) + except (json.JSONDecodeError, ValueError): + params[key] = val + if params: + return json.dumps(params, ensure_ascii=False) + return None + m = re.search(r'"arguments"\s*:\s*(\{.*)', partial, re.DOTALL) + if m: + return m.group(1) + m = re.search(r'"parameters"\s*:\s*(\{.*)', partial, re.DOTALL) + if m: + return m.group(1) + return None + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _make_tool_call_id() -> str: + return f"call_{uuid.uuid4().hex[:24]}" + + +def _could_be_partial(text: str, pattern: str) -> bool: + for i in range(1, len(pattern)): + if text.endswith(pattern[:i]): + return True + return False diff --git a/pymllm/server/launch.py b/pymllm/server/launch.py index 83a222f7..b9f60322 100644 --- a/pymllm/server/launch.py +++ b/pymllm/server/launch.py @@ -1,17 +1,936 @@ +"""pymllm HTTP server -- RESTful API entry point. + +This module implements a FastAPI-based HTTP server that wraps the pymllm +:class:`Engine` and exposes OpenAI-compatible and native REST endpoints, +following the architecture of sglang's ``http_server.py``. 
+ +Endpoints +--------- +* ``GET /health`` -- liveness probe +* ``GET /v1/models`` -- list served models (OpenAI-compatible) +* ``POST /generate`` -- native generate (streaming via SSE) +* ``POST /v1/completions`` -- OpenAI-compatible completions +* ``POST /v1/chat/completions`` -- OpenAI-compatible chat completions +* ``GET /model_info`` -- model metadata +* ``GET /server_info`` -- runtime config dump +* ``POST /flush_cache`` -- flush internal caches +* ``POST /abort_request`` -- cancel a running request +""" + +import asyncio +import logging +import os +import time +import uuid +from contextlib import asynccontextmanager +from typing import Any, AsyncIterator, Dict, List, Optional, Union + +import orjson +import uvicorn +import uvloop +from fastapi import FastAPI, HTTPException, Request +from fastapi.middleware.cors import CORSMiddleware +from fastapi.responses import ORJSONResponse, Response, StreamingResponse +from pydantic import BaseModel, Field + +from pymllm.configs.global_config import get_global_config, make_args, read_args from pymllm.engine.launch import Engine -from pymllm.configs.global_config import make_args, read_args + +logger = logging.getLogger(__name__) +asyncio.set_event_loop_policy(uvloop.EventLoopPolicy()) + +# --------------------------------------------------------------------------- +# Global handles (populated at startup) +# --------------------------------------------------------------------------- +_engine: Optional[Engine] = None +_tokenizer: Optional[Any] = None + + +def _get_engine() -> Engine: + """Return the running engine or raise.""" + if _engine is None: + raise RuntimeError("Engine not initialised") + return _engine + + +# --------------------------------------------------------------------------- +# Pydantic request / response models +# --------------------------------------------------------------------------- + + +class GenerateRequest(BaseModel): + """Body for ``POST /generate``.""" + + text: Optional[Union[List[str], 
str]] = None + input_ids: Optional[Union[List[List[int]], List[int]]] = None + sampling_params: Optional[Union[List[Dict[str, Any]], Dict[str, Any]]] = None + image_data: Optional[Any] = None + audio_data: Optional[Any] = None + video_data: Optional[Any] = None + return_logprob: Optional[Union[List[bool], bool]] = None + logprob_start_len: Optional[Union[List[int], int]] = None + top_logprobs_num: Optional[Union[List[int], int]] = None + lora_path: Optional[Union[List[Optional[str]], str]] = None + session_params: Optional[Union[List[Dict[str, Any]], Dict[str, Any]]] = None + stream: bool = False + rid: Optional[Union[List[str], str]] = None + + model_config = {"extra": "allow"} # forward unknown keys as extra_options + + +# -- OpenAI-compatible models ----------------------------------------------- + + +class ImageUrl(BaseModel): + url: str + detail: Optional[str] = "auto" + + +class ContentPart(BaseModel): + type: str + text: Optional[str] = None + image_url: Optional[ImageUrl] = None + + +class ChatMessage(BaseModel): + role: str + content: Optional[Union[str, List[ContentPart]]] = None + name: Optional[str] = None + tool_calls: Optional[List[Any]] = None + tool_call_id: Optional[str] = None + + model_config = {"extra": "allow"} + + +class StreamOptions(BaseModel): + include_usage: Optional[bool] = False + continuous_usage_stats: Optional[bool] = False + + +class ToolFunction(BaseModel): + name: str + description: Optional[str] = None + parameters: Optional[Dict[str, Any]] = None + + +class Tool(BaseModel): + type: str = "function" + function: ToolFunction + + +class ChatCompletionRequest(BaseModel): + """OpenAI ``POST /v1/chat/completions`` body.""" + + model: str = "" + messages: List[ChatMessage] + temperature: Optional[float] = None + top_p: Optional[float] = None + top_k: Optional[int] = None + max_tokens: Optional[int] = None + max_completion_tokens: Optional[int] = None + stream: bool = False + stream_options: Optional[StreamOptions] = None + stop: 
Optional[Union[str, List[str]]] = None + n: int = 1 + frequency_penalty: Optional[float] = None + presence_penalty: Optional[float] = None + repetition_penalty: Optional[float] = None + seed: Optional[int] = None + logprobs: Optional[bool] = None + top_logprobs: Optional[int] = None + user: Optional[str] = None + # Tool calling + tools: Optional[List[Tool]] = None + tool_choice: Optional[Union[str, Dict[str, Any]]] = None + # Reasoning control + separate_reasoning: bool = True + stream_reasoning: bool = True + # Pass-through to tokenizer.apply_chat_template (e.g. enable_thinking) + chat_template_kwargs: Optional[Dict[str, Any]] = None + + model_config = {"extra": "allow"} + + +class CompletionRequest(BaseModel): + """OpenAI ``POST /v1/completions`` body.""" + + model: str = "" + prompt: Union[str, List[str]] + temperature: Optional[float] = None + top_p: Optional[float] = None + top_k: Optional[int] = None + max_tokens: Optional[int] = None + stream: bool = False + stream_options: Optional[StreamOptions] = None + stop: Optional[Union[str, List[str]]] = None + n: int = 1 + frequency_penalty: Optional[float] = None + presence_penalty: Optional[float] = None + repetition_penalty: Optional[float] = None + seed: Optional[int] = None + echo: bool = False + logprobs: Optional[int] = None + user: Optional[str] = None + + model_config = {"extra": "allow"} + + +class AbortRequest(BaseModel): + rid: Optional[str] = None + + +# --------------------------------------------------------------------------- +# FastAPI application & lifespan +# --------------------------------------------------------------------------- + + +@asynccontextmanager +async def lifespan(app: FastAPI): + """Startup / shutdown hooks for the FastAPI app.""" + global _engine, _tokenizer + _engine = app.state.engine # type: ignore[attr-defined] + + # Load tokenizer in server process for apply_chat_template + cfg = get_global_config() + try: + from transformers import AutoTokenizer + + _tokenizer = 
AutoTokenizer.from_pretrained( + str(cfg.server.tokenizer_path), + trust_remote_code=cfg.server.trust_remote_code, + ) + logger.info( + "Loaded tokenizer for chat template: %s", cfg.server.tokenizer_path + ) + except Exception as e: + logger.warning("Failed to load tokenizer for chat template: %s", e) + + logger.info( + "HTTP server ready at http://%s:%s", + cfg.server.host, + cfg.server.port, + ) + yield + # Shutdown + if _engine is not None: + _engine.shutdown() + _engine = None + + +app = FastAPI(lifespan=lifespan) +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) + + +# --------------------------------------------------------------------------- +# Exception handlers +# --------------------------------------------------------------------------- + + +@app.exception_handler(HTTPException) +async def http_exception_handler(request: Request, exc: HTTPException): + return ORJSONResponse( + content={"error": {"message": exc.detail, "code": exc.status_code}}, + status_code=exc.status_code, + ) + + +# --------------------------------------------------------------------------- +# Health / info endpoints +# --------------------------------------------------------------------------- + + +@app.get("/health") +@app.get("/health_generate") +async def health(): + """Liveness probe.""" + return Response(status_code=200) + + +@app.get("/model_info") +async def model_info(): + """Return basic model metadata.""" + cfg = get_global_config() + hf_cfg = cfg.model.hf_config + return { + "model_path": str(cfg.server.model_path), + "tokenizer_path": str(cfg.server.tokenizer_path), + "served_model_name": cfg.server.served_model_name, + "model_type": getattr(hf_cfg, "model_type", None) if hf_cfg else None, + "architectures": getattr(hf_cfg, "architectures", None) if hf_cfg else None, + } + + +@app.get("/server_info") +async def server_info(): + """Dump runtime server configuration.""" + import 
dataclasses as _dc + + cfg = get_global_config() + return _dc.asdict(cfg.server) + + +@app.get("/v1/models") +async def list_models(): + """OpenAI-compatible model listing.""" + cfg = get_global_config() + model_name = cfg.server.served_model_name or str(cfg.server.model_path) + return { + "object": "list", + "data": [_model_card(model_name)], + } + + +@app.get("/v1/models/{model_id:path}") +async def retrieve_model(model_id: str): + """OpenAI-compatible single model retrieval.""" + cfg = get_global_config() + model_name = cfg.server.served_model_name or str(cfg.server.model_path) + if model_id != model_name: + raise HTTPException( + status_code=404, + detail=f"Model '{model_id}' not found. Available: '{model_name}'", + ) + return _model_card(model_name) + + +def _model_card(model_name: str) -> Dict[str, Any]: + """Build an OpenAI-compatible Model object.""" + return { + "id": model_name, + "object": "model", + "created": int(time.time()), + "owned_by": "pymllm", + } + + +# --------------------------------------------------------------------------- +# Shared helpers +# --------------------------------------------------------------------------- + +# Map internal finish reasons to OpenAI-standard values. 
+_FINISH_REASON_MAP = { + "eos": "stop", + "stop": "stop", + "length": "length", + "abort": "stop", +} + + +def _normalize_finish_reason(reason: Optional[str]) -> Optional[str]: + """Convert internal finish reason to OpenAI-compatible value.""" + if reason is None: + return None + return _FINISH_REASON_MAP.get(reason, reason) + + +def _build_sampling_params( + temperature: Optional[float] = None, + top_p: Optional[float] = None, + top_k: Optional[int] = None, + max_tokens: Optional[int] = None, + stop: Optional[Union[str, List[str]]] = None, + frequency_penalty: Optional[float] = None, + presence_penalty: Optional[float] = None, + repetition_penalty: Optional[float] = None, + seed: Optional[int] = None, + **extra: Any, +) -> Dict[str, Any]: + """Build a sampling_params dict from OpenAI-style fields.""" + params: Dict[str, Any] = {} + if temperature is not None: + params["temperature"] = temperature + if top_p is not None: + params["top_p"] = top_p + if top_k is not None: + params["top_k"] = top_k + if max_tokens is not None: + params["max_new_tokens"] = max_tokens + if stop is not None: + params["stop"] = stop if isinstance(stop, list) else [stop] + if frequency_penalty is not None: + params["frequency_penalty"] = frequency_penalty + if presence_penalty is not None: + params["presence_penalty"] = presence_penalty + if repetition_penalty is not None: + params["repetition_penalty"] = repetition_penalty + if seed is not None: + params["seed"] = seed + params.update(extra) + return params + + +def _messages_to_prompt( + messages: List[ChatMessage], + chat_template_kwargs: Optional[Dict[str, Any]] = None, +) -> str: + """Render chat messages into a prompt string via the model's chat template. + + Uses ``tokenizer.apply_chat_template()`` when available (handles Llama, + Qwen, Mistral, etc. automatically). Falls back to ChatML format. + + Parameters + ---------- + chat_template_kwargs + Extra keyword arguments forwarded to ``apply_chat_template`` + (e.g. 
``enable_thinking=True`` for Qwen3). + """ + # Flatten each message into a plain dict for the tokenizer. + msg_dicts: List[Dict[str, Any]] = [] + for msg in messages: + content = msg.content + if isinstance(content, list): + # Multimodal: extract only text parts for the prompt string. + text_parts = [p.text for p in content if p.type == "text" and p.text] + content = "\n".join(text_parts) if text_parts else "" + elif content is None: + content = "" + d: Dict[str, Any] = {"role": msg.role, "content": content} + if msg.name is not None: + d["name"] = msg.name + msg_dicts.append(d) + + tokenizer = _tokenizer + if tokenizer is not None and hasattr(tokenizer, "apply_chat_template"): + try: + extra = dict(chat_template_kwargs) if chat_template_kwargs else {} + return tokenizer.apply_chat_template( + msg_dicts, + tokenize=False, + add_generation_prompt=True, + **extra, + ) + except Exception as e: + logger.warning("apply_chat_template failed, using fallback: %s", e) + + # Fallback: ChatML format (Qwen-style) + parts: List[str] = [] + for m in msg_dicts: + parts.append(f"<|im_start|>{m['role']}\n{m['content']}<|im_end|>") + parts.append("<|im_start|>assistant\n") + return "\n".join(parts) + + +def _extract_image_data(messages: List[ChatMessage]) -> Optional[List[str]]: + """Extract image URLs / base64 strings from multimodal content parts.""" + images: List[str] = [] + for msg in messages: + if not isinstance(msg.content, list): + continue + for part in msg.content: + if part.type == "image_url" and part.image_url is not None: + images.append(part.image_url.url) + return images if images else None + + +def _make_completion_id() -> str: + return f"cmpl-{uuid.uuid4().hex[:24]}" + + +def _make_chat_completion_id() -> str: + return f"chatcmpl-{uuid.uuid4().hex[:24]}" + + +# --------------------------------------------------------------------------- +# Native generate endpoint +# --------------------------------------------------------------------------- + + 
+@app.api_route("/generate", methods=["POST", "PUT"]) +async def generate(obj: GenerateRequest, request: Request): + """Native generation endpoint. Supports SSE streaming.""" + engine = _get_engine() + + # Collect extra fields as extra_options + known = set(GenerateRequest.model_fields.keys()) + extra_options = {k: v for k, v in obj.model_dump().items() if k not in known} + + kwargs: Dict[str, Any] = { + "prompt": obj.text, + "input_ids": obj.input_ids, + "sampling_params": obj.sampling_params, + "image_data": obj.image_data, + "audio_data": obj.audio_data, + "video_data": obj.video_data, + "return_logprob": obj.return_logprob, + "logprob_start_len": obj.logprob_start_len, + "top_logprobs_num": obj.top_logprobs_num, + "lora_path": obj.lora_path, + "session_params": obj.session_params, + "stream": obj.stream, + "rid": obj.rid, + **extra_options, + } + # Strip None values so Engine defaults are used + kwargs = {k: v for k, v in kwargs.items() if v is not None} + + if obj.stream: + + async def _stream() -> AsyncIterator[bytes]: + try: + async for chunk in engine.generate_async(**kwargs): + if await request.is_disconnected(): + break + # Skip empty intermediate chunks (e.g. 
special tokens + # stripped by the detokenizer) + if not chunk.get("delta") and not chunk.get("finished"): + continue + yield b"data: " + orjson.dumps(chunk) + b"\n\n" + except Exception as e: + err = {"error": {"message": str(e)}} + yield b"data: " + orjson.dumps(err) + b"\n\n" + yield b"data: [DONE]\n\n" + + return StreamingResponse(_stream(), media_type="text/event-stream") + + try: + results = [] + async for item in engine.generate_async(**kwargs): + results.append(item) + result = results[0] if len(results) == 1 else results + return ORJSONResponse(result) + except Exception as e: + logger.error("[generate] Error: %s", e) + raise HTTPException(status_code=400, detail=str(e)) + + +# --------------------------------------------------------------------------- +# OpenAI-compatible /v1/completions +# --------------------------------------------------------------------------- + + +@app.post("/v1/completions") +async def openai_completions(obj: CompletionRequest, request: Request): + """OpenAI-compatible text completion endpoint.""" + engine = _get_engine() + sp = _build_sampling_params( + temperature=obj.temperature, + top_p=obj.top_p, + top_k=obj.top_k, + max_tokens=obj.max_tokens, + stop=obj.stop, + frequency_penalty=obj.frequency_penalty, + presence_penalty=obj.presence_penalty, + repetition_penalty=obj.repetition_penalty, + seed=obj.seed, + ) + cfg = get_global_config() + model_name = obj.model or cfg.server.served_model_name or str(cfg.server.model_path) + include_usage = ( + obj.stream_options is not None and obj.stream_options.include_usage + ) + + if obj.stream: + + async def _stream() -> AsyncIterator[bytes]: + comp_id = _make_completion_id() + prompt_tokens = 0 + completion_tokens = 0 + try: + async for chunk in engine.generate_async( + prompt=obj.prompt, sampling_params=sp, stream=True + ): + if await request.is_disconnected(): + break + prompt_tokens = chunk.get("prompt_tokens", prompt_tokens) + completion_tokens = chunk.get("completion_tokens", 
completion_tokens) + delta_text = chunk.get("delta", "") + finish_reason = _normalize_finish_reason( + chunk.get("finished_reason") + ) + # Skip empty intermediate chunks + if not delta_text and finish_reason is None: + continue + sse: Dict[str, Any] = { + "id": comp_id, + "object": "text_completion", + "created": int(time.time()), + "model": model_name, + "choices": [ + { + "index": 0, + "text": delta_text, + "logprobs": None, + "finish_reason": finish_reason, + } + ], + } + yield b"data: " + orjson.dumps(sse) + b"\n\n" + except Exception as e: + err = {"error": {"message": str(e)}} + yield b"data: " + orjson.dumps(err) + b"\n\n" + # Final usage-only chunk (OpenAI stream_options.include_usage) + if include_usage: + usage_chunk: Dict[str, Any] = { + "id": comp_id, + "object": "text_completion", + "created": int(time.time()), + "model": model_name, + "choices": [], + "usage": { + "prompt_tokens": prompt_tokens, + "completion_tokens": completion_tokens, + "total_tokens": prompt_tokens + completion_tokens, + }, + } + yield b"data: " + orjson.dumps(usage_chunk) + b"\n\n" + yield b"data: [DONE]\n\n" + + return StreamingResponse(_stream(), media_type="text/event-stream") + + try: + results = [] + async for item in engine.generate_async( + prompt=obj.prompt, sampling_params=sp + ): + results.append(item) + choices = [] + prompt_tokens = 0 + completion_tokens = 0 + for i, r in enumerate(results): + choices.append( + { + "index": i, + "text": r.get("text", ""), + "logprobs": None, + "finish_reason": _normalize_finish_reason( + r.get("finished_reason", "stop") + ), + } + ) + prompt_tokens += r.get("prompt_tokens", 0) + completion_tokens += r.get("completion_tokens", 0) + + return ORJSONResponse( + { + "id": _make_completion_id(), + "object": "text_completion", + "created": int(time.time()), + "model": model_name, + "choices": choices, + "usage": { + "prompt_tokens": prompt_tokens, + "completion_tokens": completion_tokens, + "total_tokens": prompt_tokens + completion_tokens, 
+ }, + } + ) + except Exception as e: + logger.error("[v1/completions] Error: %s", e) + raise HTTPException(status_code=400, detail=str(e)) + + +# --------------------------------------------------------------------------- +# OpenAI-compatible /v1/chat/completions +# --------------------------------------------------------------------------- + + +@app.post("/v1/chat/completions") +async def openai_chat_completions(obj: ChatCompletionRequest, request: Request): + """OpenAI-compatible chat completion endpoint with reasoning & tool-call parsing.""" + engine = _get_engine() + cfg = get_global_config() + # Auto-enable thinking when reasoning_parser is configured and the + # client didn't explicitly set enable_thinking. + chat_kwargs = dict(obj.chat_template_kwargs) if obj.chat_template_kwargs else {} + if cfg.server.reasoning_parser and "enable_thinking" not in chat_kwargs: + chat_kwargs["enable_thinking"] = True + prompt = _messages_to_prompt(obj.messages, chat_template_kwargs=chat_kwargs or None) + image_data = _extract_image_data(obj.messages) + + # max_completion_tokens takes precedence over max_tokens (OpenAI convention) + max_tokens = obj.max_completion_tokens if obj.max_completion_tokens is not None else obj.max_tokens + + sp = _build_sampling_params( + temperature=obj.temperature, + top_p=obj.top_p, + top_k=obj.top_k, + max_tokens=max_tokens, + stop=obj.stop, + frequency_penalty=obj.frequency_penalty, + presence_penalty=obj.presence_penalty, + repetition_penalty=obj.repetition_penalty, + seed=obj.seed, + ) + cfg = get_global_config() + model_name = obj.model or cfg.server.served_model_name or str(cfg.server.model_path) + include_usage = ( + obj.stream_options is not None and obj.stream_options.include_usage + ) + + # Resolve parsers from server config + reasoning_type = cfg.server.reasoning_parser + tool_call_type = cfg.server.tool_call_parser + + gen_kwargs: Dict[str, Any] = { + "prompt": prompt, + "sampling_params": sp, + } + if image_data is not None: + 
gen_kwargs["image_data"] = image_data + + if obj.stream: + + async def _stream() -> AsyncIterator[bytes]: + from pymllm.parsers import ReasoningParser, ToolCallParser + + comp_id = _make_chat_completion_id() + created = int(time.time()) + first = True + prompt_tokens = 0 + completion_tokens = 0 + has_tool_calls = False # track across entire stream + + # Instantiate streaming parsers + r_parser = ( + ReasoningParser(reasoning_type, stream_reasoning=obj.stream_reasoning) + if reasoning_type and obj.separate_reasoning + else None + ) + tc_parser = ( + ToolCallParser(tool_call_type, tools=obj.tools) + if tool_call_type and obj.tools + else None + ) + + def _make_sse(delta: Dict[str, Any], finish: Optional[str] = None) -> bytes: + sse: Dict[str, Any] = { + "id": comp_id, + "object": "chat.completion.chunk", + "created": created, + "model": model_name, + "choices": [ + { + "index": 0, + "delta": delta, + "logprobs": None, + "finish_reason": finish, + } + ], + } + return b"data: " + orjson.dumps(sse) + b"\n\n" + + try: + async for chunk in engine.generate_async(**gen_kwargs, stream=True): + if await request.is_disconnected(): + break + prompt_tokens = chunk.get("prompt_tokens", prompt_tokens) + completion_tokens = chunk.get("completion_tokens", completion_tokens) + + raw_delta = chunk.get("delta", "") + finish_reason = _normalize_finish_reason( + chunk.get("finished_reason") + ) + + # --- Phase 1: reasoning parser --- + reasoning_delta = "" + content_delta = raw_delta + if r_parser and raw_delta: + reasoning_delta, content_delta = r_parser.parse_stream_chunk( + raw_delta + ) + + # --- Phase 2: tool-call parser --- + tool_items: list = [] + if tc_parser and content_delta: + content_delta, tool_items = tc_parser.parse_stream_chunk( + content_delta + ) + + # --- Emit chunks --- + # Role chunk (first) + if first: + yield _make_sse({"role": "assistant"}) + first = False + + # Reasoning content + if reasoning_delta: + yield _make_sse({"reasoning_content": reasoning_delta}) + + 
# Tool call deltas + if tool_items: + has_tool_calls = True + for tc in tool_items: + yield _make_sse({"tool_calls": [tc.to_openai_dict()]}) + + # Normal content + if content_delta: + yield _make_sse({"content": content_delta}) + + # Finish + if finish_reason is not None: + # Flush remaining tool call data + if tc_parser: + remaining = tc_parser.flush() + for tc in remaining: + has_tool_calls = True + yield _make_sse({"tool_calls": [tc.to_openai_dict()]}) + if has_tool_calls: + finish_reason = "tool_calls" + yield _make_sse({}, finish=finish_reason) + + except Exception as e: + err = {"error": {"message": str(e)}} + yield b"data: " + orjson.dumps(err) + b"\n\n" + # Final usage-only chunk + if include_usage: + usage_chunk: Dict[str, Any] = { + "id": comp_id, + "object": "chat.completion.chunk", + "created": created, + "model": model_name, + "choices": [], + "usage": { + "prompt_tokens": prompt_tokens, + "completion_tokens": completion_tokens, + "total_tokens": prompt_tokens + completion_tokens, + }, + } + yield b"data: " + orjson.dumps(usage_chunk) + b"\n\n" + yield b"data: [DONE]\n\n" + + return StreamingResponse(_stream(), media_type="text/event-stream") + + # -- Non-streaming -- + try: + from pymllm.parsers import ReasoningParser, ToolCallParser + + r = {} + async for item in engine.generate_async(**gen_kwargs): + r = item + prompt_tokens = r.get("prompt_tokens", 0) + completion_tokens = r.get("completion_tokens", 0) + text = r.get("text", "") + finish_reason = _normalize_finish_reason(r.get("finished_reason", "stop")) + + # Parse reasoning + reasoning_content = None + if reasoning_type and obj.separate_reasoning: + rp = ReasoningParser(reasoning_type) + reasoning_content, text = rp.parse_non_stream(text) + + # Parse tool calls + tool_calls_list = None + if tool_call_type and obj.tools: + tp = ToolCallParser(tool_call_type, tools=obj.tools) + if tp.has_tool_call(text): + text, parsed_calls = tp.parse_non_stream(text) + if parsed_calls: + tool_calls_list = 
[tc.to_openai_dict(streaming=False) for tc in parsed_calls] + finish_reason = "tool_calls" + + message: Dict[str, Any] = {"role": "assistant", "content": text or None} + if reasoning_content: + message["reasoning_content"] = reasoning_content + if tool_calls_list: + message["tool_calls"] = tool_calls_list + + return ORJSONResponse( + { + "id": _make_chat_completion_id(), + "object": "chat.completion", + "created": int(time.time()), + "model": model_name, + "choices": [ + { + "index": 0, + "message": message, + "logprobs": None, + "finish_reason": finish_reason, + } + ], + "usage": { + "prompt_tokens": prompt_tokens, + "completion_tokens": completion_tokens, + "total_tokens": prompt_tokens + completion_tokens, + }, + } + ) + except Exception as e: + logger.error("[v1/chat/completions] Error: %s", e) + raise HTTPException(status_code=400, detail=str(e)) + + +# --------------------------------------------------------------------------- +# Administrative endpoints +# --------------------------------------------------------------------------- + + +@app.api_route("/flush_cache", methods=["GET", "POST"]) +async def flush_cache(): + """Placeholder cache flush.""" + return Response(content="Cache flushed.\n", status_code=200) + + +@app.post("/abort_request") +async def abort_request(obj: AbortRequest): + """Abort a running request by rid.""" + engine = _get_engine() + if obj.rid and engine._rr_process is not None: + await engine._rr_process.abort_request(obj.rid) + return Response(status_code=200) + raise HTTPException(status_code=400, detail="Missing or invalid rid") + + +# --------------------------------------------------------------------------- +# Prepare args helper +# --------------------------------------------------------------------------- def _prepare_args(): + """Parse CLI arguments into the global config singleton.""" parser = make_args() read_args(parser=parser) -def main(): +# --------------------------------------------------------------------------- +# 
Server launcher +# --------------------------------------------------------------------------- + + +def launch_server(): + """Launch the pymllm Engine then start the uvicorn HTTP server. + + This function mirrors sglang's ``launch_server``: it first boots all engine + subprocesses (tokenizer, scheduler, model-runner, detokenizer) and then + hands off to uvicorn to serve HTTP traffic. + """ _prepare_args() + cfg = get_global_config() + engine = Engine() engine.launch() + # Attach engine to app.state so the lifespan hook can pick it up. + app.state.engine = engine # type: ignore[attr-defined] + + logger.info( + "Starting HTTP server on %s:%s (root_path=%r)", + cfg.server.host, + cfg.server.port, + cfg.server.fastapi_root_path, + ) + + uvicorn.run( + app, + host=cfg.server.host, + port=cfg.server.port, + root_path=cfg.server.fastapi_root_path, + log_level=cfg.server.log_level, + timeout_keep_alive=5, + loop="uvloop", + ) + + +def main(): + """CLI entry point.""" + launch_server() + if __name__ == "__main__": main()