Commit 3dc3d49

Partial prefix match support via chunked prefill
1 parent bfc0665

5 files changed: +233 -164 lines changed

deepseek_r1_jax/scripts/convert_hf_r1_checkpoint.py

Lines changed: 19 additions & 6 deletions
@@ -19,13 +19,15 @@
 
 import jax
 from jax.sharding import PartitionSpec as P
+from argparse import ArgumentParser
 
-from deepseek_r1_jax.model import ShardingRules, Config
-from deepseek_r1_jax import chkpt_utils as utils
 
-def main():
-    root_path = Path("/mnt/storage/DeepSeek-R1")
-    dest_path = Path("/mnt/storage/deepseek-r1-jax-chkpt")
+def main(root_path, dest_path):
+    from deepseek_r1_jax.model import ShardingRules, Config
+    from deepseek_r1_jax import chkpt_utils as utils
+
+    root_path, dest_path = Path(root_path), Path(dest_path)
+    dest_path.mkdir(exist_ok=True, parents=True)
 
     cfg = Config()
     cfg.quantize_mlp = False
@@ -39,4 +41,15 @@ def main():
     utils.convert_hf_checkpoint(params_map, root_path, dest_path, cfg)
 
 if __name__ == "__main__":
-    main()
+    parser = ArgumentParser()
+    parser.add_argument(
+        "--source-path", default="/mnt/storage/DeepSeek-R1-weights-only", required=True, help="HF model directory path"
+    )
+    parser.add_argument(
+        "--dest-path",
+        default="~/deepseek_r1_jax",
+        required=True,
+        help="JAX model directory (to be created).",
+    )
+    args = parser.parse_args()
+    main(args.source_path, args.dest_path)
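With the new command-line interface, the conversion script would be invoked along these lines (the flag values shown are simply the parser defaults above; running from the repository root is assumed):

python deepseek_r1_jax/scripts/convert_hf_r1_checkpoint.py --source-path /mnt/storage/DeepSeek-R1-weights-only --dest-path ~/deepseek_r1_jax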

llama3/llama3_jax/model.py

Lines changed: 22 additions & 4 deletions
@@ -21,8 +21,8 @@
 import math
 from functools import partial
 from typing import Callable, Any, TypeVar
-from types import ModuleType
 from inspect import signature
+from collections import OrderedDict as odict
 
 import jax
 import jax.numpy as jnp
@@ -31,7 +31,8 @@
 from jax.experimental.pallas.ops.tpu.splash_attention import splash_attention_kernel as splash
 from jax.experimental.pallas.ops.tpu.splash_attention import splash_attention_mask as mask_lib
 from jax.experimental.shard_map import shard_map
-from jax.sharding import PartitionSpec as P, use_mesh
+from jax.sharding import PartitionSpec as P
+from jax.experimental.array_serialization import pytree_serialization as ser
 try:
     from jax.experimental.shard import auto_axes as _auto_axes, reshard
 except ModuleNotFoundError:
@@ -213,6 +214,7 @@ class ArrayInfo:
 # module reload friendly isinstance check
 is_type = lambda x, cls: (type(x).__name__ == cls.__name__) and (type(x).__module__ == cls.__module__)
 is_param = lambda x: is_type(x, ArrayInfo)
+which_platform = lambda cfg: cfg.mesh.devices.reshape(-1)[0].platform
 _count_left_padding = lambda ids, pad_id=0: auto_axes(
     lambda ids: jnp.sum(jnp.cumsum(ids != pad_id, axis=-1) == 0, axis=-1), out_sharding=P(None)
 )(ids)
@@ -404,15 +406,18 @@ def abstract(cls, cfg: Config):
 )
 
 
-@partial(jax_pytree_struct, meta_fields=("batch_size", "size", "time_axis"))
+@partial(jax_pytree_struct, meta_fields=("batch_size", "size", "time_axis", "insert_sequences"))
 class KVCache(_Init):
     k: list[tuple[jax.Array | QuantArray, ...]]  # (batch_size, key_heads, max_seq_len, head_dim)
     v: list[tuple[jax.Array | QuantArray, ...]]  # (batch_size, key_heads, max_seq_len, head_dim)
     iter: jax.Array  # []  # sequences are right-aligned for slice update performance
     starts: jax.Array  # [batch_size]  # sequences are right-aligned, we need start indices
-    batch_size: int = 0
+    batch_size: int = 1
     size: int = 2 ** 30
     time_axis: int = 2
+    #update_slice: Callable = None
+    insert_sequences: Callable = None
+    #get_sequence: Callable = None
 
     @classmethod
     def abstract(cls, cfg: Config, batch_size: int):
@@ -798,6 +803,8 @@ def _f(q, k, v, q_segment_ids, kv_segment_ids, starts, lengths, k_scale, v_scale
 
 
 def paged_attention_kernel(q, k, v, block_tables, lengths, cfg: Config):
+    if which_platform(cfg) not in ("gpu", "cuda"):
+        raise ValueError("Paged attention is only supported on GPU.")
     k, k_scale = (k.quant, k.scale) if is_type(k, QuantArray) else (k, None)
     v, v_scale = (v.quant, v.scale) if is_type(v, QuantArray) else (v, None)
 
@@ -1030,6 +1037,17 @@ def prepare_chunk(chunk, pad_to: int, pad_id: int):
     return chunk, segment_ids
 
 
+## serialization
+#def save_pytree(data, path):
+#    flat_data = odict(("weights" + "".join(map(str, k)), v) for k, v in jax.tree.flatten_with_path(data)[0])
+#    ser.save(flat_data, path)  # save a flatten with path to avoid custom
+#
+#
+#def load_pytree(path, sharding=None):
+#    flat_sharding = odict(("weights" + "".join(map(str, k)), v) for k, v in jax.tree.flatten_with_path(sharding)[0])
+#    return jax.tree.unflatten(jax.tree.structure(sharding), jax.tree.leaves(ser.load(path, flat_sharding)))
+
+
 def prefill(tokens: jax.Array, weights: Weights, cache: KVCache | None, cfg: Config, pad_id: int = 0):
     """Samples from a prompt."""
     # Calculate the next power of 2 for padding, up to cfg.max_seq.
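A note on the KVCache change above: insert_sequences is declared both as a field and in the decorator's meta_fields, so it travels as static pytree metadata rather than as a traced array leaf. The snippet below is a self-contained illustration of that behavior using JAX's public register_dataclass; it is not code from this repo, TinyCache is a made-up stand-in for KVCache, and a recent JAX version is assumed.

import dataclasses
from typing import Callable, Optional

import jax
import jax.numpy as jnp

@dataclasses.dataclass
class TinyCache:
    k: jax.Array                                   # array leaf, participates in jax.tree.map / jit
    insert_sequences: Optional[Callable] = None    # static metadata, like KVCache.insert_sequences

jax.tree_util.register_dataclass(TinyCache, data_fields=["k"], meta_fields=["insert_sequences"])

cache = TinyCache(k=jnp.zeros((2, 4)), insert_sequences=lambda c, seq: c)
doubled = jax.tree.map(lambda x: x * 2, cache)  # only the array field k is mapped over
print(doubled.insert_sequences is cache.insert_sequences)  # True: the callable passes through untouched

Because the callable is metadata, transformations over the cache arrays leave it alone, and the serving code below simply assigns it after construction (cache.insert_sequences = attention_cache_utils.kvcache_update_cache).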

serving/main_serving.py

Lines changed: 40 additions & 20 deletions
@@ -8,8 +8,8 @@
 import time
 from typing import AsyncGenerator
 from contextlib import asynccontextmanager
-import os
 from argparse import ArgumentParser
+from typing import Any
 
 import jax
 from jax import random
@@ -24,16 +24,15 @@
 import serving_jax as serving
 from serving_jax import attention_cache_utils
 
+Config = Any
 
 TOKENIZER, SERVE_LOOP, SERVING_THREAD, ARGS = None, None, None, None
 
 jax.config.update("jax_explain_cache_misses", True)
-jax.config.update("jax_compilation_cache_dir", str(Path("~/.cache/jax").expanduser()))
-jax.config.update("jax_enable_empty_arrays", True)
+#jax.config.update("jax_compilation_cache_dir", str(Path("~/.cache/jax").expanduser()))
 
 try: # newer JAX only
-    assert False
-    my_id = int(socket.gethostname().split("-")[-1]) - 1
+    my_id = int(socket.gethostname().split("-")[-1])
     my_ip = socket.getaddrinfo(socket.gethostname(), 80)[0][-1][0]
     jax.config.update("jax_cross_host_transfer_socket_address", f"{my_ip}:{17007 + my_id}")
     jax.config.update("jax_cross_host_transport_addresses", ",".join([f"{my_ip}:0"] * 8))
@@ -60,39 +59,60 @@ def load_model():
 
     #process_idx = int(socket.gethostname().split("-")[-1]) - 1 # a scheme where hosts are (host-1, host-2, ...)
     #jax.distributed.initialize(os.environ["COORDINATOR_ADDRESS"], 2, process_idx)
+    jax.distributed.initialize()
     print(jax.devices())
     print("-" * 80)
     print(jax.local_devices())
 
-    model_name = "Llama-3.1-8B-Instruct"
-    ckpt_path = Path(f"~/{model_name}").expanduser()
+    #model_name = "Llama-3.1-8B-Instruct"
+    #ckpt_path = Path(f"~/{model_name}").expanduser()
+    #model_name = "Llama-3.1-8B-Instruct-quant"
+    model_name = "Llama-3.1-70B-Instruct-quant"
+    ckpt_path = Path(f"~/bucket/llama3_jax_old/{model_name}").expanduser()
     cfg = l3jax.load_config(ckpt_path / "config.json")
     TOKENIZER = l3jax.load_tokenizer(ckpt_path / "tokenizer.json", ckpt_path / "tokenizer_config.json")
     assert ckpt_path.is_dir()
     print("---> Model config loaded")
 
     # two hosts, different device and host meshes
-    local_mesh = jax.make_mesh((1, 8, 1), P("x", "y", "z"), devices=jax.local_devices(), axis_types=(AxisType.Explicit,) * 3)
-    decode_mesh, prefill_mesh = local_mesh, local_mesh
+    #local_mesh = jax.make_mesh((1, 8, 1), P("x", "y", "z"), devices=jax.local_devices(), axis_types=(AxisType.Explicit,) * 3)
+    #local_mesh = jax.make_mesh((1, 1, 1), P("x", "y", "z"), devices=jax.local_devices(), axis_types=(AxisType.Explicit,) * 3)
+    #decode_mesh, prefill_mesh = local_mesh, local_mesh
+    decode_mesh = jax.make_mesh((1, 8, 1), P("x", "y", "z"), devices=jax.devices()[:8], axis_types=(AxisType.Explicit,) * 3)
+    prefill_mesh = jax.make_mesh((1, 8, 1), P("x", "y", "z"), devices=jax.devices()[8:], axis_types=(AxisType.Explicit,) * 3)
+    #decode_mesh = jax.make_mesh((1, 8, 2), P("x", "y", "z"), devices=jax.devices(), axis_types=(AxisType.Explicit,) * 3)
+    #prefill_mesh = jax.make_mesh((1, 8, 2), P("x", "y", "z"), devices=jax.devices(), axis_types=(AxisType.Explicit,) * 3)
     cfg = dataclasses.replace(cfg, mesh=decode_mesh, quant_layer=True, quant_cache=True)
-    cfg = dataclasses.replace(cfg, use_prefill_attn_kernel=False, use_decode_attn_kernel=False, max_seq_len=8192)
-    cfg = dataclasses.replace(cfg, quant_layer=False, quant_cache=False)
-    cfg.quant_cache = True
+    cfg = dataclasses.replace(cfg, use_prefill_attn_kernel=False, use_decode_attn_kernel=False, max_seq_len=2048)
+    cfg.quant_cache = False
 
     decode_weights = l3jax.load_pytree(ckpt_path, l3jax.Weights.shardings(dataclasses.replace(cfg, mesh=decode_mesh)))
     prefill_weights = l3jax.load_pytree(ckpt_path, l3jax.Weights.shardings(dataclasses.replace(cfg, mesh=prefill_mesh)))
 
     print("---> Weights loaded")
 
-    serve_cfg = serving.ServingConfig(decode_steps=32, max_decode_length=64)
-    #decode_cache = l3jax.KVCache.init(random.key(0), cfg, serve_cfg.decode_batch_size)
-    #decode_cache.get_sequence = attention_cache_utils.kvcache_get_entry
-    #decode_cache.insert_sequences = attention_cache_utils.kvcache_update_cache
-    decode_cache = l3jax.PagedKVCache.init(random.key(0), cfg, serve_cfg.decode_batch_size, 2048, 32)
-    decode_cache.get_sequence = attention_cache_utils.batch_paged_get_entry
-    decode_cache.insert_sequences = attention_cache_utils.batch_paged_update_sequences
+    serve_cfg = serving.ServingConfig(decode_steps=32, max_decode_length=64, prefix_chunk_size=64)
+    decode_cache = l3jax.KVCache.init(random.key(0), cfg, serve_cfg.decode_batch_size)
+    decode_cache.get_sequence = attention_cache_utils.kvcache_get_entry
+    decode_cache.insert_sequences = attention_cache_utils.kvcache_update_cache
+    #decode_cache = l3jax.PagedKVCache.init(random.key(0), cfg, serve_cfg.decode_batch_size, 2048, 32)
+    #decode_cache.get_sequence = attention_cache_utils.batch_paged_get_entry
+    #decode_cache.insert_sequences = attention_cache_utils.batch_paged_update_sequences
+
+    def init_cache(cfg: Config, batch_size: int, actual_len: int):
+        cache = l3jax.KVCache.init(random.key(0), cfg, batch_size)
+        cache.get_sequence = attention_cache_utils.kvcache_get_entry
+        cache.insert_sequences = attention_cache_utils.kvcache_update_cache
+        cache.iter = actual_len
+        return cache
+
+    with jax.sharding.set_mesh(prefill_mesh):
+        prefill_cache = init_cache(dataclasses.replace(cfg, mesh=prefill_mesh), serve_cfg.prefill_batch_size, 8192)
+
+    forward_fn = l3jax.decode_step  # TODO: the model file needs to call it forward explicitly
     SERVE_LOOP = serving.ServingLoop(
-        serve_cfg, cfg, l3jax.prefill, prefill_weights, l3jax.decode_step, decode_weights, decode_cache, ARGS.server
+        #serve_cfg, cfg, init_cache, l3jax.decode_step, prefill_weights, decode_weights, decode_cache, ARGS.server
+        serve_cfg, cfg, forward_fn, prefill_weights, prefill_cache, decode_weights, decode_cache, ARGS.server
     )
     print("---> Created the serving loop")
 
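The commit title, "Partial prefix match support via chunked prefill", points at reusing previously computed KV cache at chunk granularity: the chunks of a new prompt that exactly match an earlier prompt keep their cached keys and values, and only the remainder is prefilled. The sketch below is purely illustrative of that idea and is not the repo's implementation; chunk_size stands in for the new prefix_chunk_size serving option.

import numpy as np

def matching_prefix_chunks(cached_ids: np.ndarray, new_ids: np.ndarray, chunk_size: int) -> int:
    """Number of leading chunk_size-token chunks shared by a cached prompt and a new prompt."""
    n = min(cached_ids.size, new_ids.size) // chunk_size * chunk_size  # compare whole chunks only
    same = cached_ids[:n] == new_ids[:n]
    matched = n if same.all() else int(np.argmin(same))  # length of the exactly matching prefix
    return matched // chunk_size

cached = np.array([1, 2, 3, 4, 5, 6, 7, 8])
new = np.array([1, 2, 3, 4, 9, 9, 9, 9, 10])
k = matching_prefix_chunks(cached, new, chunk_size=4)
print(k)  # 1: only the first 4-token chunk matches, so its cache entries could be reused
# only new[k * chunk_size:] would still need to go through (chunked) prefill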