From 195ef79a6ca36a37d068cb4ad14d732a5edcda12 Mon Sep 17 00:00:00 2001
From: pass_lin <935499957@qq.com>
Date: Sun, 28 Sep 2025 22:48:17 +0800
Subject: [PATCH 01/10] add RWKV

---
 keras_hub/api/__init__.py                     |  16 +-
 keras_hub/api/layers/__init__.py              | 144 ++--
 keras_hub/api/metrics/__init__.py             |  10 +-
 keras_hub/api/models/__init__.py              | 742 +++++++-----------
 keras_hub/api/samplers/__init__.py            |  22 +-
 keras_hub/api/tokenizers/__init__.py          | 143 ++--
 keras_hub/api/utils/__init__.py               |  18 +-
 keras_hub/src/models/rwkv7/rwkv7_backbone.py  | 119 +++
 keras_hub/src/models/rwkv7/rwkv7_causal_lm.py |  50 ++
 .../rwkv7/rwkv7_causal_lm_preprocessor.py     |  88 +++
 keras_hub/src/models/rwkv7/rwkv7_layer.py     | 612 +++++++++++++++
 keras_hub/src/models/rwkv7/rwkv7_tokenizer.py | 224 ++++++
 .../convert_rwkv7_checkpoints.py              | 464 +++++++++++
 13 files changed, 1946 insertions(+), 706 deletions(-)
 create mode 100644 keras_hub/src/models/rwkv7/rwkv7_backbone.py
 create mode 100644 keras_hub/src/models/rwkv7/rwkv7_causal_lm.py
 create mode 100644 keras_hub/src/models/rwkv7/rwkv7_causal_lm_preprocessor.py
 create mode 100644 keras_hub/src/models/rwkv7/rwkv7_layer.py
 create mode 100644 keras_hub/src/models/rwkv7/rwkv7_tokenizer.py
 create mode 100644 tools/checkpoint_conversion/convert_rwkv7_checkpoints.py

diff --git a/keras_hub/api/__init__.py b/keras_hub/api/__init__.py
index 2aa98bf3f9..3796e4c7f4 100644
--- a/keras_hub/api/__init__.py
+++ b/keras_hub/api/__init__.py
@@ -4,12 +4,12 @@
 since your modifications would be overwritten.
 """
 
-from keras_hub import layers as layers
-from keras_hub import metrics as metrics
-from keras_hub import models as models
-from keras_hub import samplers as samplers
-from keras_hub import tokenizers as tokenizers
-from keras_hub import utils as utils
-from keras_hub.src.utils.preset_utils import upload_preset as upload_preset
+from keras_hub import layers
+from keras_hub import metrics
+from keras_hub import models
+from keras_hub import samplers
+from keras_hub import tokenizers
+from keras_hub import utils
+from keras_hub.src.utils.preset_utils import upload_preset
 from keras_hub.src.version import __version__ as __version__
-from keras_hub.src.version import version as version
+from keras_hub.src.version import version
diff --git a/keras_hub/api/layers/__init__.py b/keras_hub/api/layers/__init__.py
index 4550cf8689..c4411ba889 100644
--- a/keras_hub/api/layers/__init__.py
+++ b/keras_hub/api/layers/__init__.py
@@ -4,149 +4,105 @@
 since your modifications would be overwritten.
""" -from keras_hub.src.layers.modeling.alibi_bias import AlibiBias as AlibiBias -from keras_hub.src.layers.modeling.anchor_generator import ( - AnchorGenerator as AnchorGenerator, -) -from keras_hub.src.layers.modeling.box_matcher import BoxMatcher as BoxMatcher +from keras_hub.src.layers.modeling.alibi_bias import AlibiBias +from keras_hub.src.layers.modeling.anchor_generator import AnchorGenerator +from keras_hub.src.layers.modeling.box_matcher import BoxMatcher from keras_hub.src.layers.modeling.cached_multi_head_attention import ( - CachedMultiHeadAttention as CachedMultiHeadAttention, -) -from keras_hub.src.layers.modeling.f_net_encoder import ( - FNetEncoder as FNetEncoder, -) -from keras_hub.src.layers.modeling.masked_lm_head import ( - MaskedLMHead as MaskedLMHead, -) -from keras_hub.src.layers.modeling.non_max_supression import ( - NonMaxSuppression as NonMaxSuppression, -) -from keras_hub.src.layers.modeling.position_embedding import ( - PositionEmbedding as PositionEmbedding, + CachedMultiHeadAttention, ) +from keras_hub.src.layers.modeling.f_net_encoder import FNetEncoder +from keras_hub.src.layers.modeling.masked_lm_head import MaskedLMHead +from keras_hub.src.layers.modeling.non_max_supression import NonMaxSuppression +from keras_hub.src.layers.modeling.position_embedding import PositionEmbedding from keras_hub.src.layers.modeling.reversible_embedding import ( - ReversibleEmbedding as ReversibleEmbedding, -) -from keras_hub.src.layers.modeling.rms_normalization import ( - RMSNormalization as RMSNormalization, -) -from keras_hub.src.layers.modeling.rotary_embedding import ( - RotaryEmbedding as RotaryEmbedding, + ReversibleEmbedding, ) +from keras_hub.src.layers.modeling.rms_normalization import RMSNormalization +from keras_hub.src.layers.modeling.rotary_embedding import RotaryEmbedding from keras_hub.src.layers.modeling.sine_position_encoding import ( - SinePositionEncoding as SinePositionEncoding, + SinePositionEncoding, ) from keras_hub.src.layers.modeling.token_and_position_embedding import ( - TokenAndPositionEmbedding as TokenAndPositionEmbedding, -) -from keras_hub.src.layers.modeling.transformer_decoder import ( - TransformerDecoder as TransformerDecoder, -) -from keras_hub.src.layers.modeling.transformer_encoder import ( - TransformerEncoder as TransformerEncoder, -) -from keras_hub.src.layers.preprocessing.audio_converter import ( - AudioConverter as AudioConverter, -) -from keras_hub.src.layers.preprocessing.image_converter import ( - ImageConverter as ImageConverter, + TokenAndPositionEmbedding, ) +from keras_hub.src.layers.modeling.transformer_decoder import TransformerDecoder +from keras_hub.src.layers.modeling.transformer_encoder import TransformerEncoder +from keras_hub.src.layers.preprocessing.audio_converter import AudioConverter +from keras_hub.src.layers.preprocessing.image_converter import ImageConverter from keras_hub.src.layers.preprocessing.masked_lm_mask_generator import ( - MaskedLMMaskGenerator as MaskedLMMaskGenerator, + MaskedLMMaskGenerator, ) from keras_hub.src.layers.preprocessing.multi_segment_packer import ( - MultiSegmentPacker as MultiSegmentPacker, -) -from keras_hub.src.layers.preprocessing.random_deletion import ( - RandomDeletion as RandomDeletion, -) -from keras_hub.src.layers.preprocessing.random_swap import ( - RandomSwap as RandomSwap, -) -from keras_hub.src.layers.preprocessing.start_end_packer import ( - StartEndPacker as StartEndPacker, + MultiSegmentPacker, ) +from keras_hub.src.layers.preprocessing.random_deletion import 
RandomDeletion +from keras_hub.src.layers.preprocessing.random_swap import RandomSwap +from keras_hub.src.layers.preprocessing.start_end_packer import StartEndPacker from keras_hub.src.models.basnet.basnet_image_converter import ( - BASNetImageConverter as BASNetImageConverter, -) -from keras_hub.src.models.clip.clip_image_converter import ( - CLIPImageConverter as CLIPImageConverter, + BASNetImageConverter, ) +from keras_hub.src.models.clip.clip_image_converter import CLIPImageConverter from keras_hub.src.models.cspnet.cspnet_image_converter import ( - CSPNetImageConverter as CSPNetImageConverter, + CSPNetImageConverter, ) from keras_hub.src.models.d_fine.d_fine_image_converter import ( - DFineImageConverter as DFineImageConverter, + DFineImageConverter, ) from keras_hub.src.models.deeplab_v3.deeplab_v3_image_converter import ( - DeepLabV3ImageConverter as DeepLabV3ImageConverter, -) -from keras_hub.src.models.deit.deit_image_converter import ( - DeiTImageConverter as DeiTImageConverter, + DeepLabV3ImageConverter, ) +from keras_hub.src.models.deit.deit_image_converter import DeiTImageConverter from keras_hub.src.models.densenet.densenet_image_converter import ( - DenseNetImageConverter as DenseNetImageConverter, + DenseNetImageConverter, ) from keras_hub.src.models.depth_anything.depth_anything_image_converter import ( - DepthAnythingImageConverter as DepthAnythingImageConverter, + DepthAnythingImageConverter, ) from keras_hub.src.models.dinov2.dinov2_image_converter import ( - DINOV2ImageConverter as DINOV2ImageConverter, + DINOV2ImageConverter, ) from keras_hub.src.models.efficientnet.efficientnet_image_converter import ( - EfficientNetImageConverter as EfficientNetImageConverter, + EfficientNetImageConverter, ) from keras_hub.src.models.gemma3.gemma3_image_converter import ( - Gemma3ImageConverter as Gemma3ImageConverter, + Gemma3ImageConverter, ) from keras_hub.src.models.hgnetv2.hgnetv2_image_converter import ( - HGNetV2ImageConverter as HGNetV2ImageConverter, -) -from keras_hub.src.models.mit.mit_image_converter import ( - MiTImageConverter as MiTImageConverter, + HGNetV2ImageConverter, ) +from keras_hub.src.models.mit.mit_image_converter import MiTImageConverter from keras_hub.src.models.mobilenet.mobilenet_image_converter import ( - MobileNetImageConverter as MobileNetImageConverter, + MobileNetImageConverter, ) from keras_hub.src.models.moonshine.moonshine_audio_converter import ( - MoonshineAudioConverter as MoonshineAudioConverter, + MoonshineAudioConverter, ) from keras_hub.src.models.pali_gemma.pali_gemma_image_converter import ( - PaliGemmaImageConverter as PaliGemmaImageConverter, + PaliGemmaImageConverter, ) from keras_hub.src.models.parseq.parseq_image_converter import ( - PARSeqImageConverter as PARSeqImageConverter, + PARSeqImageConverter, ) from keras_hub.src.models.resnet.resnet_image_converter import ( - ResNetImageConverter as ResNetImageConverter, + ResNetImageConverter, ) from keras_hub.src.models.retinanet.retinanet_image_converter import ( - RetinaNetImageConverter as RetinaNetImageConverter, -) -from keras_hub.src.models.sam.sam_image_converter import ( - SAMImageConverter as SAMImageConverter, -) -from keras_hub.src.models.sam.sam_mask_decoder import ( - SAMMaskDecoder as SAMMaskDecoder, -) -from keras_hub.src.models.sam.sam_prompt_encoder import ( - SAMPromptEncoder as SAMPromptEncoder, + RetinaNetImageConverter, ) +from keras_hub.src.models.sam.sam_image_converter import SAMImageConverter +from keras_hub.src.models.sam.sam_mask_decoder import SAMMaskDecoder 
+from keras_hub.src.models.sam.sam_prompt_encoder import SAMPromptEncoder from keras_hub.src.models.segformer.segformer_image_converter import ( - SegFormerImageConverter as SegFormerImageConverter, + SegFormerImageConverter, ) from keras_hub.src.models.siglip.siglip_image_converter import ( - SigLIPImageConverter as SigLIPImageConverter, -) -from keras_hub.src.models.vgg.vgg_image_converter import ( - VGGImageConverter as VGGImageConverter, -) -from keras_hub.src.models.vit.vit_image_converter import ( - ViTImageConverter as ViTImageConverter, + SigLIPImageConverter, ) +from keras_hub.src.models.vgg.vgg_image_converter import VGGImageConverter +from keras_hub.src.models.vit.vit_image_converter import ViTImageConverter from keras_hub.src.models.whisper.whisper_audio_converter import ( - WhisperAudioConverter as WhisperAudioConverter, + WhisperAudioConverter, ) from keras_hub.src.models.xception.xception_image_converter import ( - XceptionImageConverter as XceptionImageConverter, + XceptionImageConverter, ) diff --git a/keras_hub/api/metrics/__init__.py b/keras_hub/api/metrics/__init__.py index 100c2c66fb..88a0a7df2b 100644 --- a/keras_hub/api/metrics/__init__.py +++ b/keras_hub/api/metrics/__init__.py @@ -4,8 +4,8 @@ since your modifications would be overwritten. """ -from keras_hub.src.metrics.bleu import Bleu as Bleu -from keras_hub.src.metrics.edit_distance import EditDistance as EditDistance -from keras_hub.src.metrics.perplexity import Perplexity as Perplexity -from keras_hub.src.metrics.rouge_l import RougeL as RougeL -from keras_hub.src.metrics.rouge_n import RougeN as RougeN +from keras_hub.src.metrics.bleu import Bleu +from keras_hub.src.metrics.edit_distance import EditDistance +from keras_hub.src.metrics.perplexity import Perplexity +from keras_hub.src.metrics.rouge_l import RougeL +from keras_hub.src.metrics.rouge_n import RougeN diff --git a/keras_hub/api/models/__init__.py b/keras_hub/api/models/__init__.py index 308321717c..403bfb65ad 100644 --- a/keras_hub/api/models/__init__.py +++ b/keras_hub/api/models/__init__.py @@ -4,743 +4,535 @@ since your modifications would be overwritten. 
""" -from keras_hub.src.models.albert.albert_backbone import ( - AlbertBackbone as AlbertBackbone, -) -from keras_hub.src.models.albert.albert_masked_lm import ( - AlbertMaskedLM as AlbertMaskedLM, -) +from keras_hub.src.models.albert.albert_backbone import AlbertBackbone +from keras_hub.src.models.albert.albert_masked_lm import AlbertMaskedLM from keras_hub.src.models.albert.albert_masked_lm_preprocessor import ( - AlbertMaskedLMPreprocessor as AlbertMaskedLMPreprocessor, + AlbertMaskedLMPreprocessor, ) from keras_hub.src.models.albert.albert_text_classifier import ( - AlbertTextClassifier as AlbertClassifier, + AlbertTextClassifier, ) from keras_hub.src.models.albert.albert_text_classifier import ( - AlbertTextClassifier as AlbertTextClassifier, + AlbertTextClassifier as AlbertClassifier, ) from keras_hub.src.models.albert.albert_text_classifier_preprocessor import ( - AlbertTextClassifierPreprocessor as AlbertPreprocessor, + AlbertTextClassifierPreprocessor, ) from keras_hub.src.models.albert.albert_text_classifier_preprocessor import ( - AlbertTextClassifierPreprocessor as AlbertTextClassifierPreprocessor, -) -from keras_hub.src.models.albert.albert_tokenizer import ( - AlbertTokenizer as AlbertTokenizer, -) -from keras_hub.src.models.backbone import Backbone as Backbone -from keras_hub.src.models.bart.bart_backbone import BartBackbone as BartBackbone -from keras_hub.src.models.bart.bart_seq_2_seq_lm import ( - BartSeq2SeqLM as BartSeq2SeqLM, + AlbertTextClassifierPreprocessor as AlbertPreprocessor, ) +from keras_hub.src.models.albert.albert_tokenizer import AlbertTokenizer +from keras_hub.src.models.backbone import Backbone +from keras_hub.src.models.bart.bart_backbone import BartBackbone +from keras_hub.src.models.bart.bart_seq_2_seq_lm import BartSeq2SeqLM from keras_hub.src.models.bart.bart_seq_2_seq_lm_preprocessor import ( - BartSeq2SeqLMPreprocessor as BartSeq2SeqLMPreprocessor, -) -from keras_hub.src.models.bart.bart_tokenizer import ( - BartTokenizer as BartTokenizer, -) -from keras_hub.src.models.basnet.basnet import ( - BASNetImageSegmenter as BASNetImageSegmenter, -) -from keras_hub.src.models.basnet.basnet_backbone import ( - BASNetBackbone as BASNetBackbone, -) -from keras_hub.src.models.basnet.basnet_preprocessor import ( - BASNetPreprocessor as BASNetPreprocessor, -) -from keras_hub.src.models.bert.bert_backbone import BertBackbone as BertBackbone -from keras_hub.src.models.bert.bert_masked_lm import ( - BertMaskedLM as BertMaskedLM, -) + BartSeq2SeqLMPreprocessor, +) +from keras_hub.src.models.bart.bart_tokenizer import BartTokenizer +from keras_hub.src.models.basnet.basnet import BASNetImageSegmenter +from keras_hub.src.models.basnet.basnet_backbone import BASNetBackbone +from keras_hub.src.models.basnet.basnet_preprocessor import BASNetPreprocessor +from keras_hub.src.models.bert.bert_backbone import BertBackbone +from keras_hub.src.models.bert.bert_masked_lm import BertMaskedLM from keras_hub.src.models.bert.bert_masked_lm_preprocessor import ( - BertMaskedLMPreprocessor as BertMaskedLMPreprocessor, + BertMaskedLMPreprocessor, ) +from keras_hub.src.models.bert.bert_text_classifier import BertTextClassifier from keras_hub.src.models.bert.bert_text_classifier import ( BertTextClassifier as BertClassifier, ) -from keras_hub.src.models.bert.bert_text_classifier import ( - BertTextClassifier as BertTextClassifier, -) from keras_hub.src.models.bert.bert_text_classifier_preprocessor import ( - BertTextClassifierPreprocessor as BertPreprocessor, + 
BertTextClassifierPreprocessor, ) from keras_hub.src.models.bert.bert_text_classifier_preprocessor import ( - BertTextClassifierPreprocessor as BertTextClassifierPreprocessor, -) -from keras_hub.src.models.bert.bert_tokenizer import ( - BertTokenizer as BertTokenizer, -) -from keras_hub.src.models.bloom.bloom_backbone import ( - BloomBackbone as BloomBackbone, -) -from keras_hub.src.models.bloom.bloom_causal_lm import ( - BloomCausalLM as BloomCausalLM, + BertTextClassifierPreprocessor as BertPreprocessor, ) +from keras_hub.src.models.bert.bert_tokenizer import BertTokenizer +from keras_hub.src.models.bloom.bloom_backbone import BloomBackbone +from keras_hub.src.models.bloom.bloom_causal_lm import BloomCausalLM from keras_hub.src.models.bloom.bloom_causal_lm_preprocessor import ( - BloomCausalLMPreprocessor as BloomCausalLMPreprocessor, -) -from keras_hub.src.models.bloom.bloom_tokenizer import ( - BloomTokenizer as BloomTokenizer, -) -from keras_hub.src.models.causal_lm import CausalLM as CausalLM -from keras_hub.src.models.causal_lm_preprocessor import ( - CausalLMPreprocessor as CausalLMPreprocessor, -) -from keras_hub.src.models.clip.clip_backbone import CLIPBackbone as CLIPBackbone -from keras_hub.src.models.clip.clip_preprocessor import ( - CLIPPreprocessor as CLIPPreprocessor, -) -from keras_hub.src.models.clip.clip_text_encoder import ( - CLIPTextEncoder as CLIPTextEncoder, -) -from keras_hub.src.models.clip.clip_tokenizer import ( - CLIPTokenizer as CLIPTokenizer, -) -from keras_hub.src.models.clip.clip_vision_encoder import ( - CLIPVisionEncoder as CLIPVisionEncoder, -) -from keras_hub.src.models.cspnet.cspnet_backbone import ( - CSPNetBackbone as CSPNetBackbone, -) + BloomCausalLMPreprocessor, +) +from keras_hub.src.models.bloom.bloom_tokenizer import BloomTokenizer +from keras_hub.src.models.causal_lm import CausalLM +from keras_hub.src.models.causal_lm_preprocessor import CausalLMPreprocessor +from keras_hub.src.models.clip.clip_backbone import CLIPBackbone +from keras_hub.src.models.clip.clip_preprocessor import CLIPPreprocessor +from keras_hub.src.models.clip.clip_text_encoder import CLIPTextEncoder +from keras_hub.src.models.clip.clip_tokenizer import CLIPTokenizer +from keras_hub.src.models.clip.clip_vision_encoder import CLIPVisionEncoder +from keras_hub.src.models.cspnet.cspnet_backbone import CSPNetBackbone from keras_hub.src.models.cspnet.cspnet_image_classifier import ( - CSPNetImageClassifier as CSPNetImageClassifier, + CSPNetImageClassifier, ) from keras_hub.src.models.cspnet.cspnet_image_classifier_preprocessor import ( - CSPNetImageClassifierPreprocessor as CSPNetImageClassifierPreprocessor, -) -from keras_hub.src.models.d_fine.d_fine_backbone import ( - DFineBackbone as DFineBackbone, + CSPNetImageClassifierPreprocessor, ) +from keras_hub.src.models.d_fine.d_fine_backbone import DFineBackbone from keras_hub.src.models.d_fine.d_fine_object_detector import ( - DFineObjectDetector as DFineObjectDetector, + DFineObjectDetector, ) from keras_hub.src.models.d_fine.d_fine_object_detector_preprocessor import ( - DFineObjectDetectorPreprocessor as DFineObjectDetectorPreprocessor, + DFineObjectDetectorPreprocessor, ) from keras_hub.src.models.deberta_v3.deberta_v3_backbone import ( - DebertaV3Backbone as DebertaV3Backbone, + DebertaV3Backbone, ) from keras_hub.src.models.deberta_v3.deberta_v3_masked_lm import ( - DebertaV3MaskedLM as DebertaV3MaskedLM, + DebertaV3MaskedLM, ) from keras_hub.src.models.deberta_v3.deberta_v3_masked_lm_preprocessor import ( - 
DebertaV3MaskedLMPreprocessor as DebertaV3MaskedLMPreprocessor, + DebertaV3MaskedLMPreprocessor, ) from keras_hub.src.models.deberta_v3.deberta_v3_text_classifier import ( - DebertaV3TextClassifier as DebertaV3Classifier, + DebertaV3TextClassifier, ) from keras_hub.src.models.deberta_v3.deberta_v3_text_classifier import ( - DebertaV3TextClassifier as DebertaV3TextClassifier, + DebertaV3TextClassifier as DebertaV3Classifier, ) from keras_hub.src.models.deberta_v3.deberta_v3_text_classifier_preprocessor import ( - DebertaV3TextClassifierPreprocessor as DebertaV3Preprocessor, + DebertaV3TextClassifierPreprocessor, ) from keras_hub.src.models.deberta_v3.deberta_v3_text_classifier_preprocessor import ( - DebertaV3TextClassifierPreprocessor as DebertaV3TextClassifierPreprocessor, + DebertaV3TextClassifierPreprocessor as DebertaV3Preprocessor, ) from keras_hub.src.models.deberta_v3.deberta_v3_tokenizer import ( - DebertaV3Tokenizer as DebertaV3Tokenizer, + DebertaV3Tokenizer, ) from keras_hub.src.models.deeplab_v3.deeplab_v3_backbone import ( - DeepLabV3Backbone as DeepLabV3Backbone, + DeepLabV3Backbone, ) from keras_hub.src.models.deeplab_v3.deeplab_v3_image_segmeter_preprocessor import ( - DeepLabV3ImageSegmenterPreprocessor as DeepLabV3ImageSegmenterPreprocessor, + DeepLabV3ImageSegmenterPreprocessor, ) from keras_hub.src.models.deeplab_v3.deeplab_v3_segmenter import ( - DeepLabV3ImageSegmenter as DeepLabV3ImageSegmenter, -) -from keras_hub.src.models.deit.deit_backbone import DeiTBackbone as DeiTBackbone -from keras_hub.src.models.deit.deit_image_classifier import ( - DeiTImageClassifier as DeiTImageClassifier, + DeepLabV3ImageSegmenter, ) +from keras_hub.src.models.deit.deit_backbone import DeiTBackbone +from keras_hub.src.models.deit.deit_image_classifier import DeiTImageClassifier from keras_hub.src.models.deit.deit_image_classifier_preprocessor import ( - DeiTImageClassifierPreprocessor as DeiTImageClassifierPreprocessor, -) -from keras_hub.src.models.densenet.densenet_backbone import ( - DenseNetBackbone as DenseNetBackbone, + DeiTImageClassifierPreprocessor, ) +from keras_hub.src.models.densenet.densenet_backbone import DenseNetBackbone from keras_hub.src.models.densenet.densenet_image_classifier import ( - DenseNetImageClassifier as DenseNetImageClassifier, + DenseNetImageClassifier, ) from keras_hub.src.models.densenet.densenet_image_classifier_preprocessor import ( - DenseNetImageClassifierPreprocessor as DenseNetImageClassifierPreprocessor, + DenseNetImageClassifierPreprocessor, ) from keras_hub.src.models.depth_anything.depth_anything_backbone import ( - DepthAnythingBackbone as DepthAnythingBackbone, + DepthAnythingBackbone, ) from keras_hub.src.models.depth_anything.depth_anything_depth_estimator import ( - DepthAnythingDepthEstimator as DepthAnythingDepthEstimator, + DepthAnythingDepthEstimator, ) from keras_hub.src.models.depth_anything.depth_anything_depth_estimator_preprocessor import ( - DepthAnythingDepthEstimatorPreprocessor as DepthAnythingDepthEstimatorPreprocessor, -) -from keras_hub.src.models.depth_estimator import ( - DepthEstimator as DepthEstimator, + DepthAnythingDepthEstimatorPreprocessor, ) +from keras_hub.src.models.depth_estimator import DepthEstimator from keras_hub.src.models.depth_estimator_preprocessor import ( - DepthEstimatorPreprocessor as DepthEstimatorPreprocessor, -) -from keras_hub.src.models.dinov2.dinov2_backbone import ( - DINOV2Backbone as DINOV2Backbone, + DepthEstimatorPreprocessor, ) +from keras_hub.src.models.dinov2.dinov2_backbone import 
DINOV2Backbone from keras_hub.src.models.distil_bert.distil_bert_backbone import ( - DistilBertBackbone as DistilBertBackbone, + DistilBertBackbone, ) from keras_hub.src.models.distil_bert.distil_bert_masked_lm import ( - DistilBertMaskedLM as DistilBertMaskedLM, + DistilBertMaskedLM, ) from keras_hub.src.models.distil_bert.distil_bert_masked_lm_preprocessor import ( - DistilBertMaskedLMPreprocessor as DistilBertMaskedLMPreprocessor, + DistilBertMaskedLMPreprocessor, ) from keras_hub.src.models.distil_bert.distil_bert_text_classifier import ( - DistilBertTextClassifier as DistilBertClassifier, + DistilBertTextClassifier, ) from keras_hub.src.models.distil_bert.distil_bert_text_classifier import ( - DistilBertTextClassifier as DistilBertTextClassifier, + DistilBertTextClassifier as DistilBertClassifier, ) from keras_hub.src.models.distil_bert.distil_bert_text_classifier_preprocessor import ( - DistilBertTextClassifierPreprocessor as DistilBertPreprocessor, + DistilBertTextClassifierPreprocessor, ) from keras_hub.src.models.distil_bert.distil_bert_text_classifier_preprocessor import ( - DistilBertTextClassifierPreprocessor as DistilBertTextClassifierPreprocessor, + DistilBertTextClassifierPreprocessor as DistilBertPreprocessor, ) from keras_hub.src.models.distil_bert.distil_bert_tokenizer import ( - DistilBertTokenizer as DistilBertTokenizer, + DistilBertTokenizer, ) from keras_hub.src.models.efficientnet.efficientnet_backbone import ( - EfficientNetBackbone as EfficientNetBackbone, + EfficientNetBackbone, ) from keras_hub.src.models.efficientnet.efficientnet_image_classifier import ( - EfficientNetImageClassifier as EfficientNetImageClassifier, + EfficientNetImageClassifier, ) from keras_hub.src.models.efficientnet.efficientnet_image_classifier_preprocessor import ( - EfficientNetImageClassifierPreprocessor as EfficientNetImageClassifierPreprocessor, -) -from keras_hub.src.models.electra.electra_backbone import ( - ElectraBackbone as ElectraBackbone, -) -from keras_hub.src.models.electra.electra_tokenizer import ( - ElectraTokenizer as ElectraTokenizer, + EfficientNetImageClassifierPreprocessor, ) +from keras_hub.src.models.electra.electra_backbone import ElectraBackbone +from keras_hub.src.models.electra.electra_tokenizer import ElectraTokenizer +from keras_hub.src.models.esm.esm_backbone import ESMBackbone from keras_hub.src.models.esm.esm_backbone import ESMBackbone as ESM2Backbone -from keras_hub.src.models.esm.esm_backbone import ESMBackbone as ESMBackbone -from keras_hub.src.models.esm.esm_classifier import ( - ESMProteinClassifier as ESMProteinClassifier, -) +from keras_hub.src.models.esm.esm_classifier import ESMProteinClassifier from keras_hub.src.models.esm.esm_classifier_preprocessor import ( - ESMProteinClassifierPreprocessor as ESMProteinClassifierPreprocessor, + ESMProteinClassifierPreprocessor, ) +from keras_hub.src.models.esm.esm_masked_plm import ESMMaskedPLM from keras_hub.src.models.esm.esm_masked_plm import ( ESMMaskedPLM as ESM2MaskedPLM, ) -from keras_hub.src.models.esm.esm_masked_plm import ESMMaskedPLM as ESMMaskedPLM from keras_hub.src.models.esm.esm_masked_plm_preprocessor import ( - ESMMaskedPLMPreprocessor as ESMMaskedPLMPreprocessor, -) -from keras_hub.src.models.esm.esm_tokenizer import ESMTokenizer as ESMTokenizer -from keras_hub.src.models.f_net.f_net_backbone import ( - FNetBackbone as FNetBackbone, -) -from keras_hub.src.models.f_net.f_net_masked_lm import ( - FNetMaskedLM as FNetMaskedLM, + ESMMaskedPLMPreprocessor, ) +from 
keras_hub.src.models.esm.esm_tokenizer import ESMTokenizer +from keras_hub.src.models.f_net.f_net_backbone import FNetBackbone +from keras_hub.src.models.f_net.f_net_masked_lm import FNetMaskedLM from keras_hub.src.models.f_net.f_net_masked_lm_preprocessor import ( - FNetMaskedLMPreprocessor as FNetMaskedLMPreprocessor, + FNetMaskedLMPreprocessor, ) +from keras_hub.src.models.f_net.f_net_text_classifier import FNetTextClassifier from keras_hub.src.models.f_net.f_net_text_classifier import ( FNetTextClassifier as FNetClassifier, ) -from keras_hub.src.models.f_net.f_net_text_classifier import ( - FNetTextClassifier as FNetTextClassifier, -) from keras_hub.src.models.f_net.f_net_text_classifier_preprocessor import ( - FNetTextClassifierPreprocessor as FNetPreprocessor, + FNetTextClassifierPreprocessor, ) from keras_hub.src.models.f_net.f_net_text_classifier_preprocessor import ( - FNetTextClassifierPreprocessor as FNetTextClassifierPreprocessor, -) -from keras_hub.src.models.f_net.f_net_tokenizer import ( - FNetTokenizer as FNetTokenizer, -) -from keras_hub.src.models.falcon.falcon_backbone import ( - FalconBackbone as FalconBackbone, -) -from keras_hub.src.models.falcon.falcon_causal_lm import ( - FalconCausalLM as FalconCausalLM, + FNetTextClassifierPreprocessor as FNetPreprocessor, ) +from keras_hub.src.models.f_net.f_net_tokenizer import FNetTokenizer +from keras_hub.src.models.falcon.falcon_backbone import FalconBackbone +from keras_hub.src.models.falcon.falcon_causal_lm import FalconCausalLM from keras_hub.src.models.falcon.falcon_causal_lm_preprocessor import ( - FalconCausalLMPreprocessor as FalconCausalLMPreprocessor, -) -from keras_hub.src.models.falcon.falcon_tokenizer import ( - FalconTokenizer as FalconTokenizer, -) -from keras_hub.src.models.feature_pyramid_backbone import ( - FeaturePyramidBackbone as FeaturePyramidBackbone, -) -from keras_hub.src.models.flux.flux_model import FluxBackbone as FluxBackbone -from keras_hub.src.models.flux.flux_text_to_image import ( - FluxTextToImage as FluxTextToImage, + FalconCausalLMPreprocessor, ) +from keras_hub.src.models.falcon.falcon_tokenizer import FalconTokenizer +from keras_hub.src.models.feature_pyramid_backbone import FeaturePyramidBackbone +from keras_hub.src.models.flux.flux_model import FluxBackbone +from keras_hub.src.models.flux.flux_text_to_image import FluxTextToImage from keras_hub.src.models.flux.flux_text_to_image_preprocessor import ( - FluxTextToImagePreprocessor as FluxTextToImagePreprocessor, -) -from keras_hub.src.models.gemma.gemma_backbone import ( - GemmaBackbone as GemmaBackbone, -) -from keras_hub.src.models.gemma.gemma_causal_lm import ( - GemmaCausalLM as GemmaCausalLM, + FluxTextToImagePreprocessor, ) +from keras_hub.src.models.gemma.gemma_backbone import GemmaBackbone +from keras_hub.src.models.gemma.gemma_causal_lm import GemmaCausalLM from keras_hub.src.models.gemma.gemma_causal_lm_preprocessor import ( - GemmaCausalLMPreprocessor as GemmaCausalLMPreprocessor, -) -from keras_hub.src.models.gemma.gemma_tokenizer import ( - GemmaTokenizer as GemmaTokenizer, -) -from keras_hub.src.models.gemma3.gemma3_backbone import ( - Gemma3Backbone as Gemma3Backbone, -) -from keras_hub.src.models.gemma3.gemma3_causal_lm import ( - Gemma3CausalLM as Gemma3CausalLM, + GemmaCausalLMPreprocessor, ) +from keras_hub.src.models.gemma.gemma_tokenizer import GemmaTokenizer +from keras_hub.src.models.gemma3.gemma3_backbone import Gemma3Backbone +from keras_hub.src.models.gemma3.gemma3_causal_lm import Gemma3CausalLM from 
keras_hub.src.models.gemma3.gemma3_causal_lm_preprocessor import ( - Gemma3CausalLMPreprocessor as Gemma3CausalLMPreprocessor, -) -from keras_hub.src.models.gemma3.gemma3_tokenizer import ( - Gemma3Tokenizer as Gemma3Tokenizer, + Gemma3CausalLMPreprocessor, ) +from keras_hub.src.models.gemma3.gemma3_tokenizer import Gemma3Tokenizer from keras_hub.src.models.gemma3.gemma3_vision_encoder import ( - Gemma3VisionEncoder as Gemma3VisionEncoder, -) -from keras_hub.src.models.gpt2.gpt2_backbone import GPT2Backbone as GPT2Backbone -from keras_hub.src.models.gpt2.gpt2_causal_lm import ( - GPT2CausalLM as GPT2CausalLM, + Gemma3VisionEncoder, ) +from keras_hub.src.models.gpt2.gpt2_backbone import GPT2Backbone +from keras_hub.src.models.gpt2.gpt2_causal_lm import GPT2CausalLM from keras_hub.src.models.gpt2.gpt2_causal_lm_preprocessor import ( - GPT2CausalLMPreprocessor as GPT2CausalLMPreprocessor, -) -from keras_hub.src.models.gpt2.gpt2_preprocessor import ( - GPT2Preprocessor as GPT2Preprocessor, -) -from keras_hub.src.models.gpt2.gpt2_tokenizer import ( - GPT2Tokenizer as GPT2Tokenizer, -) -from keras_hub.src.models.gpt_neo_x.gpt_neo_x_backbone import ( - GPTNeoXBackbone as GPTNeoXBackbone, -) -from keras_hub.src.models.gpt_neo_x.gpt_neo_x_causal_lm import ( - GPTNeoXCausalLM as GPTNeoXCausalLM, + GPT2CausalLMPreprocessor, ) +from keras_hub.src.models.gpt2.gpt2_preprocessor import GPT2Preprocessor +from keras_hub.src.models.gpt2.gpt2_tokenizer import GPT2Tokenizer +from keras_hub.src.models.gpt_neo_x.gpt_neo_x_backbone import GPTNeoXBackbone +from keras_hub.src.models.gpt_neo_x.gpt_neo_x_causal_lm import GPTNeoXCausalLM from keras_hub.src.models.gpt_neo_x.gpt_neo_x_causal_lm_preprocessor import ( - GPTNeoXCausalLMPreprocessor as GPTNeoXCausalLMPreprocessor, -) -from keras_hub.src.models.gpt_neo_x.gpt_neo_x_tokenizer import ( - GPTNeoXTokenizer as GPTNeoXTokenizer, -) -from keras_hub.src.models.hgnetv2.hgnetv2_backbone import ( - HGNetV2Backbone as HGNetV2Backbone, + GPTNeoXCausalLMPreprocessor, ) +from keras_hub.src.models.gpt_neo_x.gpt_neo_x_tokenizer import GPTNeoXTokenizer +from keras_hub.src.models.hgnetv2.hgnetv2_backbone import HGNetV2Backbone from keras_hub.src.models.hgnetv2.hgnetv2_image_classifier import ( - HGNetV2ImageClassifier as HGNetV2ImageClassifier, + HGNetV2ImageClassifier, ) from keras_hub.src.models.hgnetv2.hgnetv2_image_classifier_preprocessor import ( - HGNetV2ImageClassifierPreprocessor as HGNetV2ImageClassifierPreprocessor, -) -from keras_hub.src.models.image_classifier import ( - ImageClassifier as ImageClassifier, + HGNetV2ImageClassifierPreprocessor, ) +from keras_hub.src.models.image_classifier import ImageClassifier from keras_hub.src.models.image_classifier_preprocessor import ( - ImageClassifierPreprocessor as ImageClassifierPreprocessor, -) -from keras_hub.src.models.image_segmenter import ( - ImageSegmenter as ImageSegmenter, + ImageClassifierPreprocessor, ) +from keras_hub.src.models.image_segmenter import ImageSegmenter from keras_hub.src.models.image_segmenter_preprocessor import ( - ImageSegmenterPreprocessor as ImageSegmenterPreprocessor, -) -from keras_hub.src.models.image_to_image import ImageToImage as ImageToImage -from keras_hub.src.models.inpaint import Inpaint as Inpaint -from keras_hub.src.models.llama.llama_backbone import ( - LlamaBackbone as LlamaBackbone, -) -from keras_hub.src.models.llama.llama_causal_lm import ( - LlamaCausalLM as LlamaCausalLM, + ImageSegmenterPreprocessor, ) +from keras_hub.src.models.image_to_image import ImageToImage +from 
keras_hub.src.models.inpaint import Inpaint +from keras_hub.src.models.llama.llama_backbone import LlamaBackbone +from keras_hub.src.models.llama.llama_causal_lm import LlamaCausalLM from keras_hub.src.models.llama.llama_causal_lm_preprocessor import ( - LlamaCausalLMPreprocessor as LlamaCausalLMPreprocessor, -) -from keras_hub.src.models.llama.llama_tokenizer import ( - LlamaTokenizer as LlamaTokenizer, -) -from keras_hub.src.models.llama3.llama3_backbone import ( - Llama3Backbone as Llama3Backbone, -) -from keras_hub.src.models.llama3.llama3_causal_lm import ( - Llama3CausalLM as Llama3CausalLM, + LlamaCausalLMPreprocessor, ) +from keras_hub.src.models.llama.llama_tokenizer import LlamaTokenizer +from keras_hub.src.models.llama3.llama3_backbone import Llama3Backbone +from keras_hub.src.models.llama3.llama3_causal_lm import Llama3CausalLM from keras_hub.src.models.llama3.llama3_causal_lm_preprocessor import ( - Llama3CausalLMPreprocessor as Llama3CausalLMPreprocessor, -) -from keras_hub.src.models.llama3.llama3_tokenizer import ( - Llama3Tokenizer as Llama3Tokenizer, -) -from keras_hub.src.models.masked_lm import MaskedLM as MaskedLM -from keras_hub.src.models.masked_lm_preprocessor import ( - MaskedLMPreprocessor as MaskedLMPreprocessor, -) -from keras_hub.src.models.mistral.mistral_backbone import ( - MistralBackbone as MistralBackbone, -) -from keras_hub.src.models.mistral.mistral_causal_lm import ( - MistralCausalLM as MistralCausalLM, + Llama3CausalLMPreprocessor, ) +from keras_hub.src.models.llama3.llama3_tokenizer import Llama3Tokenizer +from keras_hub.src.models.masked_lm import MaskedLM +from keras_hub.src.models.masked_lm_preprocessor import MaskedLMPreprocessor +from keras_hub.src.models.mistral.mistral_backbone import MistralBackbone +from keras_hub.src.models.mistral.mistral_causal_lm import MistralCausalLM from keras_hub.src.models.mistral.mistral_causal_lm_preprocessor import ( - MistralCausalLMPreprocessor as MistralCausalLMPreprocessor, -) -from keras_hub.src.models.mistral.mistral_tokenizer import ( - MistralTokenizer as MistralTokenizer, -) -from keras_hub.src.models.mit.mit_backbone import MiTBackbone as MiTBackbone -from keras_hub.src.models.mit.mit_image_classifier import ( - MiTImageClassifier as MiTImageClassifier, + MistralCausalLMPreprocessor, ) +from keras_hub.src.models.mistral.mistral_tokenizer import MistralTokenizer +from keras_hub.src.models.mit.mit_backbone import MiTBackbone +from keras_hub.src.models.mit.mit_image_classifier import MiTImageClassifier from keras_hub.src.models.mit.mit_image_classifier_preprocessor import ( - MiTImageClassifierPreprocessor as MiTImageClassifierPreprocessor, -) -from keras_hub.src.models.mixtral.mixtral_backbone import ( - MixtralBackbone as MixtralBackbone, -) -from keras_hub.src.models.mixtral.mixtral_causal_lm import ( - MixtralCausalLM as MixtralCausalLM, + MiTImageClassifierPreprocessor, ) +from keras_hub.src.models.mixtral.mixtral_backbone import MixtralBackbone +from keras_hub.src.models.mixtral.mixtral_causal_lm import MixtralCausalLM from keras_hub.src.models.mixtral.mixtral_causal_lm_preprocessor import ( - MixtralCausalLMPreprocessor as MixtralCausalLMPreprocessor, -) -from keras_hub.src.models.mixtral.mixtral_tokenizer import ( - MixtralTokenizer as MixtralTokenizer, -) -from keras_hub.src.models.mobilenet.mobilenet_backbone import ( - MobileNetBackbone as MobileNetBackbone, + MixtralCausalLMPreprocessor, ) +from keras_hub.src.models.mixtral.mixtral_tokenizer import MixtralTokenizer +from 
keras_hub.src.models.mobilenet.mobilenet_backbone import MobileNetBackbone from keras_hub.src.models.mobilenet.mobilenet_image_classifier import ( - MobileNetImageClassifier as MobileNetImageClassifier, + MobileNetImageClassifier, ) from keras_hub.src.models.mobilenet.mobilenet_image_classifier_preprocessor import ( - MobileNetImageClassifierPreprocessor as MobileNetImageClassifierPreprocessor, + MobileNetImageClassifierPreprocessor, ) from keras_hub.src.models.moonshine.moonshine_audio_to_text import ( - MoonshineAudioToText as MoonshineAudioToText, + MoonshineAudioToText, ) from keras_hub.src.models.moonshine.moonshine_audio_to_text_preprocessor import ( - MoonshineAudioToTextPreprocessor as MoonshineAudioToTextPreprocessor, -) -from keras_hub.src.models.moonshine.moonshine_backbone import ( - MoonshineBackbone as MoonshineBackbone, + MoonshineAudioToTextPreprocessor, ) +from keras_hub.src.models.moonshine.moonshine_backbone import MoonshineBackbone from keras_hub.src.models.moonshine.moonshine_tokenizer import ( - MoonshineTokenizer as MoonshineTokenizer, + MoonshineTokenizer, ) +from keras_hub.src.models.object_detector import ObjectDetector from keras_hub.src.models.object_detector import ( ObjectDetector as ImageObjectDetector, ) -from keras_hub.src.models.object_detector import ( - ObjectDetector as ObjectDetector, -) from keras_hub.src.models.object_detector_preprocessor import ( - ObjectDetectorPreprocessor as ImageObjectDetectorPreprocessor, + ObjectDetectorPreprocessor, ) from keras_hub.src.models.object_detector_preprocessor import ( - ObjectDetectorPreprocessor as ObjectDetectorPreprocessor, + ObjectDetectorPreprocessor as ImageObjectDetectorPreprocessor, ) -from keras_hub.src.models.opt.opt_backbone import OPTBackbone as OPTBackbone -from keras_hub.src.models.opt.opt_causal_lm import OPTCausalLM as OPTCausalLM +from keras_hub.src.models.opt.opt_backbone import OPTBackbone +from keras_hub.src.models.opt.opt_causal_lm import OPTCausalLM from keras_hub.src.models.opt.opt_causal_lm_preprocessor import ( - OPTCausalLMPreprocessor as OPTCausalLMPreprocessor, + OPTCausalLMPreprocessor, ) -from keras_hub.src.models.opt.opt_tokenizer import OPTTokenizer as OPTTokenizer +from keras_hub.src.models.opt.opt_tokenizer import OPTTokenizer from keras_hub.src.models.pali_gemma.pali_gemma_backbone import ( - PaliGemmaBackbone as PaliGemmaBackbone, + PaliGemmaBackbone, ) from keras_hub.src.models.pali_gemma.pali_gemma_causal_lm import ( - PaliGemmaCausalLM as PaliGemmaCausalLM, + PaliGemmaCausalLM, ) from keras_hub.src.models.pali_gemma.pali_gemma_causal_lm_preprocessor import ( - PaliGemmaCausalLMPreprocessor as PaliGemmaCausalLMPreprocessor, + PaliGemmaCausalLMPreprocessor, ) from keras_hub.src.models.pali_gemma.pali_gemma_tokenizer import ( - PaliGemmaTokenizer as PaliGemmaTokenizer, -) -from keras_hub.src.models.parseq.parseq_backbone import ( - PARSeqBackbone as PARSeqBackbone, -) -from keras_hub.src.models.parseq.parseq_causal_lm import ( - PARSeqCausalLM as PARSeqCausalLM, + PaliGemmaTokenizer, ) +from keras_hub.src.models.parseq.parseq_backbone import PARSeqBackbone +from keras_hub.src.models.parseq.parseq_causal_lm import PARSeqCausalLM from keras_hub.src.models.parseq.parseq_causal_lm_preprocessor import ( - PARSeqCausalLMPreprocessor as PARSeqCausalLMPreprocessor, -) -from keras_hub.src.models.parseq.parseq_tokenizer import ( - PARSeqTokenizer as PARSeqTokenizer, -) -from keras_hub.src.models.phi3.phi3_backbone import Phi3Backbone as Phi3Backbone -from 
keras_hub.src.models.phi3.phi3_causal_lm import ( - Phi3CausalLM as Phi3CausalLM, + PARSeqCausalLMPreprocessor, ) +from keras_hub.src.models.parseq.parseq_tokenizer import PARSeqTokenizer +from keras_hub.src.models.phi3.phi3_backbone import Phi3Backbone +from keras_hub.src.models.phi3.phi3_causal_lm import Phi3CausalLM from keras_hub.src.models.phi3.phi3_causal_lm_preprocessor import ( - Phi3CausalLMPreprocessor as Phi3CausalLMPreprocessor, + Phi3CausalLMPreprocessor, ) -from keras_hub.src.models.phi3.phi3_tokenizer import ( - Phi3Tokenizer as Phi3Tokenizer, -) -from keras_hub.src.models.preprocessor import Preprocessor as Preprocessor +from keras_hub.src.models.phi3.phi3_tokenizer import Phi3Tokenizer +from keras_hub.src.models.preprocessor import Preprocessor +from keras_hub.src.models.qwen.qwen_backbone import QwenBackbone from keras_hub.src.models.qwen.qwen_backbone import ( QwenBackbone as Qwen2Backbone, ) -from keras_hub.src.models.qwen.qwen_backbone import QwenBackbone as QwenBackbone +from keras_hub.src.models.qwen.qwen_causal_lm import QwenCausalLM from keras_hub.src.models.qwen.qwen_causal_lm import ( QwenCausalLM as Qwen2CausalLM, ) -from keras_hub.src.models.qwen.qwen_causal_lm import ( - QwenCausalLM as QwenCausalLM, -) from keras_hub.src.models.qwen.qwen_causal_lm_preprocessor import ( - QwenCausalLMPreprocessor as Qwen2CausalLMPreprocessor, + QwenCausalLMPreprocessor, ) from keras_hub.src.models.qwen.qwen_causal_lm_preprocessor import ( - QwenCausalLMPreprocessor as QwenCausalLMPreprocessor, + QwenCausalLMPreprocessor as Qwen2CausalLMPreprocessor, ) +from keras_hub.src.models.qwen.qwen_tokenizer import QwenTokenizer from keras_hub.src.models.qwen.qwen_tokenizer import ( QwenTokenizer as Qwen2Tokenizer, ) -from keras_hub.src.models.qwen.qwen_tokenizer import ( - QwenTokenizer as QwenTokenizer, -) -from keras_hub.src.models.qwen3.qwen3_backbone import ( - Qwen3Backbone as Qwen3Backbone, -) -from keras_hub.src.models.qwen3.qwen3_causal_lm import ( - Qwen3CausalLM as Qwen3CausalLM, -) +from keras_hub.src.models.qwen3.qwen3_backbone import Qwen3Backbone +from keras_hub.src.models.qwen3.qwen3_causal_lm import Qwen3CausalLM from keras_hub.src.models.qwen3.qwen3_causal_lm_preprocessor import ( - Qwen3CausalLMPreprocessor as Qwen3CausalLMPreprocessor, -) -from keras_hub.src.models.qwen3.qwen3_tokenizer import ( - Qwen3Tokenizer as Qwen3Tokenizer, -) -from keras_hub.src.models.qwen3_moe.qwen3_moe_backbone import ( - Qwen3MoeBackbone as Qwen3MoeBackbone, -) -from keras_hub.src.models.qwen3_moe.qwen3_moe_causal_lm import ( - Qwen3MoeCausalLM as Qwen3MoeCausalLM, + Qwen3CausalLMPreprocessor, ) +from keras_hub.src.models.qwen3.qwen3_tokenizer import Qwen3Tokenizer +from keras_hub.src.models.qwen3_moe.qwen3_moe_backbone import Qwen3MoeBackbone +from keras_hub.src.models.qwen3_moe.qwen3_moe_causal_lm import Qwen3MoeCausalLM from keras_hub.src.models.qwen3_moe.qwen3_moe_causal_lm_preprocessor import ( - Qwen3MoeCausalLMPreprocessor as Qwen3MoeCausalLMPreprocessor, -) -from keras_hub.src.models.qwen_moe.qwen_moe_backbone import ( - QwenMoeBackbone as QwenMoeBackbone, -) -from keras_hub.src.models.qwen_moe.qwen_moe_causal_lm import ( - QwenMoeCausalLM as QwenMoeCausalLM, + Qwen3MoeCausalLMPreprocessor, ) +from keras_hub.src.models.qwen_moe.qwen_moe_backbone import QwenMoeBackbone +from keras_hub.src.models.qwen_moe.qwen_moe_causal_lm import QwenMoeCausalLM from keras_hub.src.models.qwen_moe.qwen_moe_causal_lm_preprocessor import ( - QwenMoeCausalLMPreprocessor as QwenMoeCausalLMPreprocessor, 
-) -from keras_hub.src.models.resnet.resnet_backbone import ( - ResNetBackbone as ResNetBackbone, + QwenMoeCausalLMPreprocessor, ) +from keras_hub.src.models.resnet.resnet_backbone import ResNetBackbone from keras_hub.src.models.resnet.resnet_image_classifier import ( - ResNetImageClassifier as ResNetImageClassifier, + ResNetImageClassifier, ) from keras_hub.src.models.resnet.resnet_image_classifier_preprocessor import ( - ResNetImageClassifierPreprocessor as ResNetImageClassifierPreprocessor, -) -from keras_hub.src.models.retinanet.retinanet_backbone import ( - RetinaNetBackbone as RetinaNetBackbone, + ResNetImageClassifierPreprocessor, ) +from keras_hub.src.models.retinanet.retinanet_backbone import RetinaNetBackbone from keras_hub.src.models.retinanet.retinanet_object_detector import ( - RetinaNetObjectDetector as RetinaNetObjectDetector, + RetinaNetObjectDetector, ) from keras_hub.src.models.retinanet.retinanet_object_detector_preprocessor import ( - RetinaNetObjectDetectorPreprocessor as RetinaNetObjectDetectorPreprocessor, -) -from keras_hub.src.models.roberta.roberta_backbone import ( - RobertaBackbone as RobertaBackbone, -) -from keras_hub.src.models.roberta.roberta_masked_lm import ( - RobertaMaskedLM as RobertaMaskedLM, + RetinaNetObjectDetectorPreprocessor, ) +from keras_hub.src.models.roberta.roberta_backbone import RobertaBackbone +from keras_hub.src.models.roberta.roberta_masked_lm import RobertaMaskedLM from keras_hub.src.models.roberta.roberta_masked_lm_preprocessor import ( - RobertaMaskedLMPreprocessor as RobertaMaskedLMPreprocessor, + RobertaMaskedLMPreprocessor, ) from keras_hub.src.models.roberta.roberta_text_classifier import ( - RobertaTextClassifier as RobertaClassifier, + RobertaTextClassifier, ) from keras_hub.src.models.roberta.roberta_text_classifier import ( - RobertaTextClassifier as RobertaTextClassifier, + RobertaTextClassifier as RobertaClassifier, ) from keras_hub.src.models.roberta.roberta_text_classifier_preprocessor import ( - RobertaTextClassifierPreprocessor as RobertaPreprocessor, + RobertaTextClassifierPreprocessor, ) from keras_hub.src.models.roberta.roberta_text_classifier_preprocessor import ( - RobertaTextClassifierPreprocessor as RobertaTextClassifierPreprocessor, -) -from keras_hub.src.models.roberta.roberta_tokenizer import ( - RobertaTokenizer as RobertaTokenizer, + RobertaTextClassifierPreprocessor as RobertaPreprocessor, ) +from keras_hub.src.models.roberta.roberta_tokenizer import RobertaTokenizer from keras_hub.src.models.roformer_v2.roformer_v2_backbone import ( - RoformerV2Backbone as RoformerV2Backbone, + RoformerV2Backbone, ) from keras_hub.src.models.roformer_v2.roformer_v2_masked_lm import ( - RoformerV2MaskedLM as RoformerV2MaskedLM, + RoformerV2MaskedLM, ) from keras_hub.src.models.roformer_v2.roformer_v2_masked_lm_preprocessor import ( - RoformerV2MaskedLMPreprocessor as RoformerV2MaskedLMPreprocessor, + RoformerV2MaskedLMPreprocessor, ) from keras_hub.src.models.roformer_v2.roformer_v2_text_classifier import ( - RoformerV2TextClassifier as RoformerV2TextClassifier, + RoformerV2TextClassifier, ) from keras_hub.src.models.roformer_v2.roformer_v2_text_classifier_preprocessor import ( - RoformerV2TextClassifierPreprocessor as RoformerV2TextClassifierPreprocessor, + RoformerV2TextClassifierPreprocessor, ) from keras_hub.src.models.roformer_v2.roformer_v2_tokenizer import ( - RoformerV2Tokenizer as RoformerV2Tokenizer, + RoformerV2Tokenizer, ) -from keras_hub.src.models.sam.sam_backbone import SAMBackbone as SAMBackbone -from 
-from keras_hub.src.models.sam.sam_image_segmenter import (
-    SAMImageSegmenter as SAMImageSegmenter,
+from keras_hub.src.models.rwkv7.rwkv7_backbone import RWKV7Backbone
+from keras_hub.src.models.rwkv7.rwkv7_causal_lm import RWKV7CausalLM
+from keras_hub.src.models.rwkv7.rwkv7_causal_lm_preprocessor import (
+    RWKV7CausalLMPreprocessor,
 )
+from keras_hub.src.models.rwkv7.rwkv7_tokenizer import RWKVTokenizer
+from keras_hub.src.models.sam.sam_backbone import SAMBackbone
+from keras_hub.src.models.sam.sam_image_segmenter import SAMImageSegmenter
 from keras_hub.src.models.sam.sam_image_segmenter_preprocessor import (
-    SAMImageSegmenterPreprocessor as SAMImageSegmenterPreprocessor,
-)
-from keras_hub.src.models.segformer.segformer_backbone import (
-    SegFormerBackbone as SegFormerBackbone,
+    SAMImageSegmenterPreprocessor,
 )
+from keras_hub.src.models.segformer.segformer_backbone import SegFormerBackbone
 from keras_hub.src.models.segformer.segformer_image_segmenter import (
-    SegFormerImageSegmenter as SegFormerImageSegmenter,
+    SegFormerImageSegmenter,
 )
 from keras_hub.src.models.segformer.segformer_image_segmenter_preprocessor import (
-    SegFormerImageSegmenterPreprocessor as SegFormerImageSegmenterPreprocessor,
-)
-from keras_hub.src.models.seq_2_seq_lm import Seq2SeqLM as Seq2SeqLM
-from keras_hub.src.models.seq_2_seq_lm_preprocessor import (
-    Seq2SeqLMPreprocessor as Seq2SeqLMPreprocessor,
-)
-from keras_hub.src.models.siglip.siglip_backbone import (
-    SigLIPBackbone as SigLIPBackbone,
-)
-from keras_hub.src.models.siglip.siglip_preprocessor import (
-    SigLIPPreprocessor as SigLIPPreprocessor,
-)
-from keras_hub.src.models.siglip.siglip_text_encoder import (
-    SigLIPTextEncoder as SigLIPTextEncoder,
-)
-from keras_hub.src.models.siglip.siglip_tokenizer import (
-    SigLIPTokenizer as SigLIPTokenizer,
-)
+    SegFormerImageSegmenterPreprocessor,
+)
+from keras_hub.src.models.seq_2_seq_lm import Seq2SeqLM
+from keras_hub.src.models.seq_2_seq_lm_preprocessor import Seq2SeqLMPreprocessor
+from keras_hub.src.models.siglip.siglip_backbone import SigLIPBackbone
+from keras_hub.src.models.siglip.siglip_preprocessor import SigLIPPreprocessor
+from keras_hub.src.models.siglip.siglip_text_encoder import SigLIPTextEncoder
+from keras_hub.src.models.siglip.siglip_tokenizer import SigLIPTokenizer
 from keras_hub.src.models.siglip.siglip_vision_encoder import (
-    SigLIPVisionEncoder as SigLIPVisionEncoder,
+    SigLIPVisionEncoder,
 )
 from keras_hub.src.models.stable_diffusion_3.stable_diffusion_3_backbone import (
-    StableDiffusion3Backbone as StableDiffusion3Backbone,
+    StableDiffusion3Backbone,
 )
 from keras_hub.src.models.stable_diffusion_3.stable_diffusion_3_image_to_image import (
-    StableDiffusion3ImageToImage as StableDiffusion3ImageToImage,
+    StableDiffusion3ImageToImage,
 )
 from keras_hub.src.models.stable_diffusion_3.stable_diffusion_3_inpaint import (
-    StableDiffusion3Inpaint as StableDiffusion3Inpaint,
+    StableDiffusion3Inpaint,
 )
 from keras_hub.src.models.stable_diffusion_3.stable_diffusion_3_text_to_image import (
-    StableDiffusion3TextToImage as StableDiffusion3TextToImage,
+    StableDiffusion3TextToImage,
 )
 from keras_hub.src.models.stable_diffusion_3.stable_diffusion_3_text_to_image_preprocessor import (
-    StableDiffusion3TextToImagePreprocessor as StableDiffusion3TextToImagePreprocessor,
-)
-from keras_hub.src.models.t5.t5_backbone import T5Backbone as T5Backbone
-from keras_hub.src.models.t5.t5_preprocessor import (
-    T5Preprocessor as T5Preprocessor,
-)
-from keras_hub.src.models.t5.t5_tokenizer import T5Tokenizer as T5Tokenizer
-from keras_hub.src.models.t5gemma.t5gemma_backbone import (
-    T5GemmaBackbone as T5GemmaBackbone,
-)
-from keras_hub.src.models.t5gemma.t5gemma_seq_2_seq_lm import (
-    T5GemmaSeq2SeqLM as T5GemmaSeq2SeqLM,
+    StableDiffusion3TextToImagePreprocessor,
 )
+from keras_hub.src.models.t5.t5_backbone import T5Backbone
+from keras_hub.src.models.t5.t5_preprocessor import T5Preprocessor
+from keras_hub.src.models.t5.t5_tokenizer import T5Tokenizer
+from keras_hub.src.models.t5gemma.t5gemma_backbone import T5GemmaBackbone
+from keras_hub.src.models.t5gemma.t5gemma_seq_2_seq_lm import T5GemmaSeq2SeqLM
 from keras_hub.src.models.t5gemma.t5gemma_seq_2_seq_lm_preprocessor import (
-    T5GemmaSeq2SeqLMPreprocessor as T5GemmaSeq2SeqLMPreprocessor,
-)
-from keras_hub.src.models.t5gemma.t5gemma_tokenizer import (
-    T5GemmaTokenizer as T5GemmaTokenizer,
+    T5GemmaSeq2SeqLMPreprocessor,
 )
-from keras_hub.src.models.task import Task as Task
+from keras_hub.src.models.t5gemma.t5gemma_tokenizer import T5GemmaTokenizer
+from keras_hub.src.models.task import Task
+from keras_hub.src.models.text_classifier import TextClassifier
 from keras_hub.src.models.text_classifier import TextClassifier as Classifier
-from keras_hub.src.models.text_classifier import (
-    TextClassifier as TextClassifier,
-)
 from keras_hub.src.models.text_classifier_preprocessor import (
-    TextClassifierPreprocessor as TextClassifierPreprocessor,
+    TextClassifierPreprocessor,
 )
-from keras_hub.src.models.text_to_image import TextToImage as TextToImage
+from keras_hub.src.models.text_to_image import TextToImage
 from keras_hub.src.models.text_to_image_preprocessor import (
-    TextToImagePreprocessor as TextToImagePreprocessor,
-)
-from keras_hub.src.models.vgg.vgg_backbone import VGGBackbone as VGGBackbone
-from keras_hub.src.models.vgg.vgg_image_classifier import (
-    VGGImageClassifier as VGGImageClassifier,
+    TextToImagePreprocessor,
 )
+from keras_hub.src.models.vgg.vgg_backbone import VGGBackbone
+from keras_hub.src.models.vgg.vgg_image_classifier import VGGImageClassifier
 from keras_hub.src.models.vgg.vgg_image_classifier_preprocessor import (
-    VGGImageClassifierPreprocessor as VGGImageClassifierPreprocessor,
-)
-from keras_hub.src.models.vit.vit_backbone import ViTBackbone as ViTBackbone
-from keras_hub.src.models.vit.vit_image_classifier import (
-    ViTImageClassifier as ViTImageClassifier,
+    VGGImageClassifierPreprocessor,
 )
+from keras_hub.src.models.vit.vit_backbone import ViTBackbone
+from keras_hub.src.models.vit.vit_image_classifier import ViTImageClassifier
 from keras_hub.src.models.vit.vit_image_classifier_preprocessor import (
-    ViTImageClassifierPreprocessor as ViTImageClassifierPreprocessor,
-)
-from keras_hub.src.models.vit_det.vit_det_backbone import (
-    ViTDetBackbone as ViTDetBackbone,
-)
-from keras_hub.src.models.whisper.whisper_backbone import (
-    WhisperBackbone as WhisperBackbone,
-)
-from keras_hub.src.models.whisper.whisper_tokenizer import (
-    WhisperTokenizer as WhisperTokenizer,
-)
-from keras_hub.src.models.xception.xception_backbone import (
-    XceptionBackbone as XceptionBackbone,
+    ViTImageClassifierPreprocessor,
 )
+from keras_hub.src.models.vit_det.vit_det_backbone import ViTDetBackbone
+from keras_hub.src.models.whisper.whisper_backbone import WhisperBackbone
+from keras_hub.src.models.whisper.whisper_tokenizer import WhisperTokenizer
+from keras_hub.src.models.xception.xception_backbone import XceptionBackbone
 from keras_hub.src.models.xception.xception_image_classifier import (
-    XceptionImageClassifier as XceptionImageClassifier,
+    XceptionImageClassifier,
 )
 from keras_hub.src.models.xception.xception_image_classifier_preprocessor import (
-    XceptionImageClassifierPreprocessor as XceptionImageClassifierPreprocessor,
+    XceptionImageClassifierPreprocessor,
 )
 from keras_hub.src.models.xlm_roberta.xlm_roberta_backbone import (
-    XLMRobertaBackbone as XLMRobertaBackbone,
+    XLMRobertaBackbone,
 )
 from keras_hub.src.models.xlm_roberta.xlm_roberta_masked_lm import (
-    XLMRobertaMaskedLM as XLMRobertaMaskedLM,
+    XLMRobertaMaskedLM,
 )
 from keras_hub.src.models.xlm_roberta.xlm_roberta_masked_lm_preprocessor import (
-    XLMRobertaMaskedLMPreprocessor as XLMRobertaMaskedLMPreprocessor,
+    XLMRobertaMaskedLMPreprocessor,
 )
 from keras_hub.src.models.xlm_roberta.xlm_roberta_text_classifier import (
-    XLMRobertaTextClassifier as XLMRobertaClassifier,
+    XLMRobertaTextClassifier,
 )
 from keras_hub.src.models.xlm_roberta.xlm_roberta_text_classifier import (
-    XLMRobertaTextClassifier as XLMRobertaTextClassifier,
+    XLMRobertaTextClassifier as XLMRobertaClassifier,
 )
 from keras_hub.src.models.xlm_roberta.xlm_roberta_text_classifier_preprocessor import (
-    XLMRobertaTextClassifierPreprocessor as XLMRobertaPreprocessor,
+    XLMRobertaTextClassifierPreprocessor,
 )
 from keras_hub.src.models.xlm_roberta.xlm_roberta_text_classifier_preprocessor import (
-    XLMRobertaTextClassifierPreprocessor as XLMRobertaTextClassifierPreprocessor,
+    XLMRobertaTextClassifierPreprocessor as XLMRobertaPreprocessor,
 )
 from keras_hub.src.models.xlm_roberta.xlm_roberta_tokenizer import (
-    XLMRobertaTokenizer as XLMRobertaTokenizer,
-)
-from keras_hub.src.models.xlnet.xlnet_backbone import (
-    XLNetBackbone as XLNetBackbone,
+    XLMRobertaTokenizer,
 )
-from keras_hub.src.tokenizers.tokenizer import Tokenizer as Tokenizer
+from keras_hub.src.models.xlnet.xlnet_backbone import XLNetBackbone
+from keras_hub.src.tokenizers.tokenizer import Tokenizer
diff --git a/keras_hub/api/samplers/__init__.py b/keras_hub/api/samplers/__init__.py
index 29bfef00fc..9feb76c669 100644
--- a/keras_hub/api/samplers/__init__.py
+++ b/keras_hub/api/samplers/__init__.py
@@ -4,15 +4,13 @@
 since your modifications would be overwritten.
 """
 
-from keras_hub.src.samplers.beam_sampler import BeamSampler as BeamSampler
-from keras_hub.src.samplers.contrastive_sampler import (
-    ContrastiveSampler as ContrastiveSampler,
-)
-from keras_hub.src.samplers.greedy_sampler import GreedySampler as GreedySampler
-from keras_hub.src.samplers.random_sampler import RandomSampler as RandomSampler
-from keras_hub.src.samplers.sampler import Sampler as Sampler
-from keras_hub.src.samplers.serialization import deserialize as deserialize
-from keras_hub.src.samplers.serialization import get as get
-from keras_hub.src.samplers.serialization import serialize as serialize
-from keras_hub.src.samplers.top_k_sampler import TopKSampler as TopKSampler
-from keras_hub.src.samplers.top_p_sampler import TopPSampler as TopPSampler
+from keras_hub.src.samplers.beam_sampler import BeamSampler
+from keras_hub.src.samplers.contrastive_sampler import ContrastiveSampler
+from keras_hub.src.samplers.greedy_sampler import GreedySampler
+from keras_hub.src.samplers.random_sampler import RandomSampler
+from keras_hub.src.samplers.sampler import Sampler
+from keras_hub.src.samplers.serialization import deserialize
+from keras_hub.src.samplers.serialization import get
+from keras_hub.src.samplers.serialization import serialize
+from keras_hub.src.samplers.top_k_sampler import TopKSampler
+from keras_hub.src.samplers.top_p_sampler import TopPSampler
diff --git a/keras_hub/api/tokenizers/__init__.py b/keras_hub/api/tokenizers/__init__.py
index b155d0e6e1..b13023ef3e 100644
--- a/keras_hub/api/tokenizers/__init__.py
+++ b/keras_hub/api/tokenizers/__init__.py
@@ -4,124 +4,69 @@
 since your modifications would be overwritten.
 """
 
-from keras_hub.src.models.albert.albert_tokenizer import (
-    AlbertTokenizer as AlbertTokenizer,
-)
-from keras_hub.src.models.bart.bart_tokenizer import (
-    BartTokenizer as BartTokenizer,
-)
-from keras_hub.src.models.bert.bert_tokenizer import (
-    BertTokenizer as BertTokenizer,
-)
-from keras_hub.src.models.bloom.bloom_tokenizer import (
-    BloomTokenizer as BloomTokenizer,
-)
-from keras_hub.src.models.clip.clip_tokenizer import (
-    CLIPTokenizer as CLIPTokenizer,
-)
+from keras_hub.src.models.albert.albert_tokenizer import AlbertTokenizer
+from keras_hub.src.models.bart.bart_tokenizer import BartTokenizer
+from keras_hub.src.models.bert.bert_tokenizer import BertTokenizer
+from keras_hub.src.models.bloom.bloom_tokenizer import BloomTokenizer
+from keras_hub.src.models.clip.clip_tokenizer import CLIPTokenizer
 from keras_hub.src.models.deberta_v3.deberta_v3_tokenizer import (
-    DebertaV3Tokenizer as DebertaV3Tokenizer,
+    DebertaV3Tokenizer,
 )
 from keras_hub.src.models.distil_bert.distil_bert_tokenizer import (
-    DistilBertTokenizer as DistilBertTokenizer,
-)
-from keras_hub.src.models.electra.electra_tokenizer import (
-    ElectraTokenizer as ElectraTokenizer,
-)
-from keras_hub.src.models.esm.esm_tokenizer import ESMTokenizer as ESMTokenizer
-from keras_hub.src.models.f_net.f_net_tokenizer import (
-    FNetTokenizer as FNetTokenizer,
-)
-from keras_hub.src.models.falcon.falcon_tokenizer import (
-    FalconTokenizer as FalconTokenizer,
-)
-from keras_hub.src.models.gemma.gemma_tokenizer import (
-    GemmaTokenizer as GemmaTokenizer,
-)
-from keras_hub.src.models.gemma3.gemma3_tokenizer import (
-    Gemma3Tokenizer as Gemma3Tokenizer,
-)
-from keras_hub.src.models.gpt2.gpt2_tokenizer import (
-    GPT2Tokenizer as GPT2Tokenizer,
-)
-from keras_hub.src.models.gpt_neo_x.gpt_neo_x_tokenizer import (
-    GPTNeoXTokenizer as GPTNeoXTokenizer,
-)
-from keras_hub.src.models.llama.llama_tokenizer import (
-    LlamaTokenizer as LlamaTokenizer,
-)
-from keras_hub.src.models.llama3.llama3_tokenizer import (
-    Llama3Tokenizer as Llama3Tokenizer,
-)
-from keras_hub.src.models.mistral.mistral_tokenizer import (
-    MistralTokenizer as MistralTokenizer,
-)
-from keras_hub.src.models.mixtral.mixtral_tokenizer import (
-    MixtralTokenizer as MixtralTokenizer,
-)
+    DistilBertTokenizer,
+)
+from keras_hub.src.models.electra.electra_tokenizer import ElectraTokenizer
+from keras_hub.src.models.esm.esm_tokenizer import ESMTokenizer
+from keras_hub.src.models.f_net.f_net_tokenizer import FNetTokenizer
+from keras_hub.src.models.falcon.falcon_tokenizer import FalconTokenizer
+from keras_hub.src.models.gemma.gemma_tokenizer import GemmaTokenizer
+from keras_hub.src.models.gemma3.gemma3_tokenizer import Gemma3Tokenizer
+from keras_hub.src.models.gpt2.gpt2_tokenizer import GPT2Tokenizer
+from keras_hub.src.models.gpt_neo_x.gpt_neo_x_tokenizer import GPTNeoXTokenizer
+from keras_hub.src.models.llama.llama_tokenizer import LlamaTokenizer
+from keras_hub.src.models.llama3.llama3_tokenizer import Llama3Tokenizer
+from keras_hub.src.models.mistral.mistral_tokenizer import MistralTokenizer
+from keras_hub.src.models.mixtral.mixtral_tokenizer import MixtralTokenizer
 from keras_hub.src.models.moonshine.moonshine_tokenizer import (
-    MoonshineTokenizer as MoonshineTokenizer,
+    MoonshineTokenizer,
 )
-from keras_hub.src.models.opt.opt_tokenizer import OPTTokenizer as OPTTokenizer
+from keras_hub.src.models.opt.opt_tokenizer import OPTTokenizer
 from keras_hub.src.models.pali_gemma.pali_gemma_tokenizer import (
-    PaliGemmaTokenizer as PaliGemmaTokenizer,
-)
-from keras_hub.src.models.parseq.parseq_tokenizer import (
-    PARSeqTokenizer as PARSeqTokenizer,
-)
-from keras_hub.src.models.phi3.phi3_tokenizer import (
-    Phi3Tokenizer as Phi3Tokenizer,
+    PaliGemmaTokenizer,
 )
+from keras_hub.src.models.parseq.parseq_tokenizer import PARSeqTokenizer
+from keras_hub.src.models.phi3.phi3_tokenizer import Phi3Tokenizer
+from keras_hub.src.models.qwen.qwen_tokenizer import QwenTokenizer
 from keras_hub.src.models.qwen.qwen_tokenizer import (
     QwenTokenizer as Qwen2Tokenizer,
 )
-from keras_hub.src.models.qwen.qwen_tokenizer import (
-    QwenTokenizer as QwenTokenizer,
-)
-from keras_hub.src.models.qwen3_moe.qwen3_moe_tokenizer import (
-    Qwen3MoeTokenizer as Qwen3MoeTokenizer,
-)
-from keras_hub.src.models.qwen_moe.qwen_moe_tokenizer import (
-    QwenMoeTokenizer as QwenMoeTokenizer,
-)
-from keras_hub.src.models.roberta.roberta_tokenizer import (
-    RobertaTokenizer as RobertaTokenizer,
-)
+from keras_hub.src.models.qwen3_moe.qwen3_moe_tokenizer import Qwen3MoeTokenizer
+from keras_hub.src.models.qwen_moe.qwen_moe_tokenizer import QwenMoeTokenizer
+from keras_hub.src.models.roberta.roberta_tokenizer import RobertaTokenizer
 from keras_hub.src.models.roformer_v2.roformer_v2_tokenizer import (
-    RoformerV2Tokenizer as RoformerV2Tokenizer,
-)
-from keras_hub.src.models.siglip.siglip_tokenizer import (
-    SigLIPTokenizer as SigLIPTokenizer,
-)
-from keras_hub.src.models.t5.t5_tokenizer import T5Tokenizer as T5Tokenizer
-from keras_hub.src.models.t5gemma.t5gemma_tokenizer import (
-    T5GemmaTokenizer as T5GemmaTokenizer,
-)
-from keras_hub.src.models.whisper.whisper_tokenizer import (
-    WhisperTokenizer as WhisperTokenizer,
+    RoformerV2Tokenizer,
 )
+from keras_hub.src.models.rwkv7.rwkv7_tokenizer import RWKVTokenizer
+from keras_hub.src.models.siglip.siglip_tokenizer import SigLIPTokenizer
+from keras_hub.src.models.t5.t5_tokenizer import T5Tokenizer
+from keras_hub.src.models.t5gemma.t5gemma_tokenizer import T5GemmaTokenizer
+from keras_hub.src.models.whisper.whisper_tokenizer import WhisperTokenizer
 from keras_hub.src.models.xlm_roberta.xlm_roberta_tokenizer import (
-    XLMRobertaTokenizer as XLMRobertaTokenizer,
-)
-from keras_hub.src.tokenizers.byte_pair_tokenizer import (
-    BytePairTokenizer as BytePairTokenizer,
-)
-from keras_hub.src.tokenizers.byte_tokenizer import (
-    ByteTokenizer as ByteTokenizer,
+    XLMRobertaTokenizer,
 )
+from keras_hub.src.tokenizers.byte_pair_tokenizer import BytePairTokenizer
+from keras_hub.src.tokenizers.byte_tokenizer import ByteTokenizer
 from keras_hub.src.tokenizers.sentence_piece_tokenizer import (
-    SentencePieceTokenizer as SentencePieceTokenizer,
+    SentencePieceTokenizer,
 )
 from keras_hub.src.tokenizers.sentence_piece_tokenizer_trainer import (
-    compute_sentence_piece_proto as compute_sentence_piece_proto,
+    compute_sentence_piece_proto,
 )
-from keras_hub.src.tokenizers.tokenizer import Tokenizer as Tokenizer
+from keras_hub.src.tokenizers.tokenizer import Tokenizer
 from keras_hub.src.tokenizers.unicode_codepoint_tokenizer import (
-    UnicodeCodepointTokenizer as UnicodeCodepointTokenizer,
-)
-from keras_hub.src.tokenizers.word_piece_tokenizer import (
-    WordPieceTokenizer as WordPieceTokenizer,
+    UnicodeCodepointTokenizer,
 )
+from keras_hub.src.tokenizers.word_piece_tokenizer import WordPieceTokenizer
 from keras_hub.src.tokenizers.word_piece_tokenizer_trainer import (
-    compute_word_piece_vocabulary as compute_word_piece_vocabulary,
+    compute_word_piece_vocabulary,
 )
diff --git a/keras_hub/api/utils/__init__.py b/keras_hub/api/utils/__init__.py
index 0bd8cb642e..8ce47790b0 100644
--- a/keras_hub/api/utils/__init__.py
+++ b/keras_hub/api/utils/__init__.py
@@ -4,18 +4,10 @@
 since your modifications would be overwritten.
""" -from keras_hub.src.utils.coco.coco_utils import ( - coco_id_to_name as coco_id_to_name, -) -from keras_hub.src.utils.coco.coco_utils import ( - coco_name_to_id as coco_name_to_id, -) -from keras_hub.src.utils.imagenet.imagenet_utils import ( - decode_imagenet_predictions as decode_imagenet_predictions, -) -from keras_hub.src.utils.imagenet.imagenet_utils import ( - imagenet_id_to_name as imagenet_id_to_name, -) +from keras_hub.src.utils.coco.coco_utils import coco_id_to_name +from keras_hub.src.utils.coco.coco_utils import coco_name_to_id from keras_hub.src.utils.imagenet.imagenet_utils import ( - imagenet_name_to_id as imagenet_name_to_id, + decode_imagenet_predictions, ) +from keras_hub.src.utils.imagenet.imagenet_utils import imagenet_id_to_name +from keras_hub.src.utils.imagenet.imagenet_utils import imagenet_name_to_id diff --git a/keras_hub/src/models/rwkv7/rwkv7_backbone.py b/keras_hub/src/models/rwkv7/rwkv7_backbone.py new file mode 100644 index 0000000000..d6d3d9a36b --- /dev/null +++ b/keras_hub/src/models/rwkv7/rwkv7_backbone.py @@ -0,0 +1,119 @@ +import keras +from keras import ops + +from keras_hub.src.api_export import keras_hub_export +from keras_hub.src.models.backbone import Backbone +from keras_hub.src.models.rwkv7.rwkv7_layer import RWKV7_Block + + +def rwkv7_kernel_initializer(stddev=0.02): + return keras.initializers.TruncatedNormal(stddev=stddev) + + +@keras_hub_export("keras_hub.models.RWKV7Backbone") +class RWKV7Backbone(Backbone): + def __init__( + self, + hidden_size, + head_size, + num_layers, + vocabulary_size, + intermediate_dim, + gate_lora=128, + mv_lora=32, + aaa_lora=64, + decay_lora=64, + dtype=None, + dropout_rate=0, + **kwargs, + ): + # === Layers === + self.token_embedding = keras.layers.Embedding( + input_dim=vocabulary_size, + output_dim=hidden_size, + embeddings_initializer=rwkv7_kernel_initializer(), + dtype=dtype, + name="token_embedding", + ) + self.token_embedding.build([None, None]) + + self.output_layer_norm = keras.layers.LayerNormalization( + epsilon=1e-5, name="output_norm" + ) + self.output_layer_norm.build([None, None, hidden_size]) + self.dropout = keras.layers.Dropout( + dropout_rate, + dtype=dtype, + name="dropout", + ) + self.rwkv_layers = [] + for i in range(num_layers): + layer = RWKV7_Block( + hidden_size, + head_size, + intermediate_dim, + gate_lora, + mv_lora, + aaa_lora, + decay_lora, + use_initial_norm=i == 0, + kernel_initializer=rwkv7_kernel_initializer(), + dtype=dtype, + name=f"rwkv_layer_{i}", + ) + + self.rwkv_layers.append(layer) + self.head = keras.layers.Dense( + units=vocabulary_size, + kernel_initializer=rwkv7_kernel_initializer(), + use_bias=False, + name="head", + ) + # === Functional Model === + token_id_input = keras.Input( + shape=(None,), dtype="int32", name="token_ids" + ) + + padding_mask = ops.not_equal(token_id_input, 0) + + x = self.token_embedding(token_id_input) + padding_mask = ops.cast(padding_mask, dtype=x.dtype) + v_first = None + for rwkv_layer in self.rwkv_layers: + x, v_first = rwkv_layer(x, v_first, padding_mask) + x = self.dropout(x) + sequence_output = self.output_layer_norm(x) + sequence_output = self.head(sequence_output) + super().__init__( + inputs=token_id_input, + outputs=sequence_output, + dtype=dtype, + **kwargs, + ) + + self.num_layers = num_layers + self.head_size = head_size + self.hidden_size = hidden_size + self.gate_lora = gate_lora + self.mv_lora = mv_lora + self.aaa_lora = aaa_lora + self.decay_lora = decay_lora + self.vocabulary_size = vocabulary_size + self.dropout_rate = 
dropout_rate + self.intermediate_dim = intermediate_dim + + def get_config(self): + config = { + "hidden_size": self.hidden_size, + "head_size": self.head_size, + "gate_lora": self.gate_lora, + "mv_lora": self.mv_lora, + "aaa_lora": self.aaa_lora, + "decay_lora": self.decay_lora, + "vocabulary_size": self.vocabulary_size, + "dropout_rate": self.dropout_rate, + "intermediate_dim": self.intermediate_dim, + "num_layers": self.num_layers, + } + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) diff --git a/keras_hub/src/models/rwkv7/rwkv7_casual_lm.py b/keras_hub/src/models/rwkv7/rwkv7_casual_lm.py new file mode 100644 index 0000000000..c78154129b --- /dev/null +++ b/keras_hub/src/models/rwkv7/rwkv7_casual_lm.py @@ -0,0 +1,50 @@ +from keras_hub.src.api_export import keras_hub_export +from keras_hub.src.models.causal_lm import CausalLM +from keras_hub.src.models.rwkv7.rwkv7_backbone import RWKV7Backbone +from keras_hub.src.models.rwkv7.rwkv7_causal_lm_preprocessor import ( + RWKV7CausalLMPreprocessor, +) + + +@keras_hub_export("keras_hub.models.RWKV7CausalLM") +class RWKV7CausalLM(CausalLM): + backbone_cls = RWKV7Backbone + preprocessor_cls = RWKV7CausalLMPreprocessor + + def __init__(self, backbone, preprocessor=None, **kwargs): + # === Layers === + self.backbone = backbone + self.preprocessor = preprocessor + super().__init__( + inputs=backbone.inputs, + outputs=backbone.outputs, + **kwargs, + ) + + def call_with_cache( + self, + token_ids, + cache, + cache_update_index, + ): + pass # TODO + + def _build_cache(self, token_ids): + pass # TODO + + def generate_step( + self, + inputs, + stop_token_ids=None, + ): + pass # TODO + + def score( + self, + token_ids, + padding_mask=None, + scoring_mode="logits", + layer_intercept_fn=None, + target_ids=None, + ): + pass # TODO diff --git a/keras_hub/src/models/rwkv7/rwkv7_causal_lm_preprocessor.py b/keras_hub/src/models/rwkv7/rwkv7_causal_lm_preprocessor.py new file mode 100644 index 0000000000..9a8a88211c --- /dev/null +++ b/keras_hub/src/models/rwkv7/rwkv7_causal_lm_preprocessor.py @@ -0,0 +1,88 @@ +import keras + +from keras_hub.src.api_export import keras_hub_export +from keras_hub.src.models.causal_lm_preprocessor import CausalLMPreprocessor +from keras_hub.src.models.rwkv7.rwkv7_backbone import RWKV7Backbone +from keras_hub.src.models.rwkv7.rwkv7_tokenizer import RWKVTokenizer +from keras_hub.src.utils.tensor_utils import strip_to_ragged + + +@keras_hub_export("keras_hub.models.RWKV7CausalLMPreprocessor") +class RWKV7CausalLMPreprocessor(CausalLMPreprocessor): + backbone_cls = RWKV7Backbone + tokenizer_cls = RWKVTokenizer + + def __init__( + self, + tokenizer, + add_start_token=False, + **kwargs, + ): + super().__init__( + tokenizer=tokenizer, add_start_token=add_start_token, **kwargs + ) + + def call( + self, + x, + y=None, + sample_weight=None, + sequence_length=None, + ): + sequence_length = sequence_length or self.sequence_length + x = self.tokenizer(x) + # Pad with one extra token to account for the truncation below. + token_ids, padding_mask = self.packer( + x, + sequence_length=sequence_length + 1, + add_start_value=self.add_start_token, + add_end_value=self.add_end_token, + ) + # The last token does not have a next token, so we truncate it out. + x = token_ids[..., :-1] + # Target `y` will be the next token. 
+        y, sample_weight = token_ids[..., 1:], padding_mask[..., 1:]
+        return keras.utils.pack_x_y_sample_weight(x, y, sample_weight)
+
+    def generate_preprocess(
+        self,
+        x,
+        sequence_length=None,
+    ):
+        """Convert strings to integer token input for generation.
+
+        Similar to calling the layer for training, this method takes in strings
+        or tensor strings, tokenizes and packs the input, and computes a padding
+        mask masking all inputs not filled in with a padded value.
+
+        Unlike calling the layer for training, this method does not compute
+        labels and will never append a `tokenizer.end_token_id` to the end of
+        the sequence (as generation is expected to continue at the end of the
+        inputted prompt).
+        """
+        if not self.built:
+            self.build(None)
+
+        x = self.tokenizer(x)
+        token_ids, padding_mask = self.packer(
+            x, sequence_length=sequence_length, add_end_value=False
+        )
+        return token_ids
+
+    def generate_postprocess(
+        self,
+        x,
+    ):
+        """Convert integer token output to strings for generation.
+
+        This method reverses `generate_preprocess()`, by first removing all
+        padding and start/end tokens, and then converting the integer sequence
+        back to a string.
+        """
+        if not self.built:
+            self.build(None)
+
+        token_ids, padding_mask = x["token_ids"], x["padding_mask"]
+        ids_to_strip = self.tokenizer.special_token_ids
+        token_ids = strip_to_ragged(token_ids, padding_mask, ids_to_strip)
+        return self.tokenizer.detokenize(token_ids)
diff --git a/keras_hub/src/models/rwkv7/rwkv7_layer.py b/keras_hub/src/models/rwkv7/rwkv7_layer.py
new file mode 100644
index 0000000000..217f42ee32
--- /dev/null
+++ b/keras_hub/src/models/rwkv7/rwkv7_layer.py
@@ -0,0 +1,612 @@
+import warnings
+
+import keras
+from keras import initializers
+from keras import ops
+from keras.layers import Layer
+
+
+def transpose_head(x, head_first):
+    """
+    Transpose the input tensor.
+
+    Parameters:
+        x: Input tensor.
+        head_first: Boolean flag indicating whether to transpose.
+
+    Returns:
+        Transposed tensor if head_first is True, otherwise the original
+        tensor.
+    """
+    x = ops.cast(x, "float32")
+    if head_first:
+        return ops.transpose(x, (0, 2, 1, 3))
+    else:
+        return x
+
+
+def rnn_generalized_delta_rule(
+    r,
+    w,
+    k,
+    v,
+    a,
+    b,
+    initial_state=None,
+    output_final_state: bool = True,
+    head_first: bool = False,
+):
+    """
+    Implements the generalized delta rule as a step-by-step RNN scan.
+
+    Parameters:
+        r: Receptance tensor.
+        w: Log-decay tensor.
+        k: Key tensor.
+        v: Value tensor.
+        a, b: Tensors forming the rank-one delta-rule state update.
+        initial_state: Initial state tensor.
+        output_final_state: Whether to also return the final state.
+        head_first: Whether to place the head dimension first during
+            computation.
+
+    Returns:
+        An `(output, final_state)` tuple if output_final_state is True,
+        otherwise only the output.
+    """
+    DTYPE = r.dtype
+    B, T, H, N = ops.shape(r)
+    r = transpose_head(r, head_first)
+
+    k = transpose_head(k, head_first)
+
+    v = transpose_head(v, head_first)
+    a = transpose_head(a, head_first)
+    b = transpose_head(b, head_first)
+    w = transpose_head(w, head_first)
+    w = ops.exp(-ops.exp(w))
+
+    if initial_state is not None:
+        state = initial_state
+        if ops.shape(state)[0] == 1:
+            state = ops.broadcast_to(state, (B, H, N, N))
+    else:
+        state = ops.zeros((B, H, N, N), dtype="float32")
+    out = ops.zeros((B, T, H, N), dtype=r.dtype)
+
+    def step(t, inputs):
+        """
+        Performs computation for a single time step.
+
+        Parameters:
+            t: Current time step.
+            inputs: List containing current state and output.
+
+        Returns:
+            Updated state and output.
+ """ + state, out = inputs + kk = ops.reshape(k[:, t, :], (B, H, 1, N)) + rr = ops.reshape(r[:, t, :], (B, H, N, 1)) + vv = ops.reshape(v[:, t, :], (B, H, N, 1)) + aa = ops.reshape(a[:, t, :], (B, H, N, 1)) + bb = ops.reshape(b[:, t, :], (B, H, 1, N)) + state = state * w[:, t, :, None, :] + state @ aa @ bb + vv @ kk + out = ops.slice_update( + out, [0, t, 0, 0], ops.reshape((state @ rr), (B, 1, H, N)) + ) + return [state, out] + + state, out = ops.fori_loop(0, T, step, [state, out]) + + if output_final_state: + return ops.cast(out, DTYPE), state + return ops.cast(out, DTYPE) + + +class TimeShift(Layer): + def __init__(self, name="time_shift"): + super(TimeShift, self).__init__(name=name) + + def call(self, inputs, cache_x=None): + x = ops.pad(inputs, [[0, 0], [1, 0], [0, 0]], constant_values=0.0)[ + :, :-1, : + ] + if cache_x is not None: + x = ops.slice_update(x, [0, 0, 0], cache_x) + return x + + def compute_output_shape(self, input_shape): + return input_shape + + +class RWKV7_ChannelMix(Layer): + def __init__(self, dim_ffn, kernel_initializer="glorot_uniform", **kwargs): + super().__init__(**kwargs) + self.dim_ffn = dim_ffn + self.kernel_initializer = initializers.get(kernel_initializer) + + def call(self, x, last_cache_x=None): + if last_cache_x is None: + xx = self.time_shift(x) - x + else: + xx = self.time_shift(x, last_cache_x) - x + last_cache_x = x[:, -1:, :] + k = x + xx * self.x_k + k = ops.relu(self.key(k)) ** 2 + output = self.value(k) + if last_cache_x is not None: + output = [output, last_cache_x] + return output + + def compute_output_shape(self, input_shape): + if isinstance(input_shape, list): + return input_shape[0] + return input_shape + + def build(self, input_shape): + super().build(input_shape) + if isinstance(input_shape, list): + input_shape = input_shape[0] + self.x_k = self.add_weight( + shape=(1, 1, input_shape[-1]), + name="time_mix_k", + initializer=self.kernel_initializer, + ) + self.time_shift = TimeShift() + self.key = keras.layers.Dense( + self.dim_ffn, + use_bias=False, + name="dense_k", + kernel_initializer=self.kernel_initializer, + ) + self.value = keras.layers.Dense( + input_shape[-1], + use_bias=False, + name="dense_v", + kernel_initializer=self.kernel_initializer, + ) + self.key.build(input_shape) + self.value.build([None, None, self.dim_ffn]) + + def get_config(self): + config = { + "dim_ffn": self.dim_ffn, + "kernel_initializer": initializers.serialize( + self.kernel_initializer + ), + } + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + +class GroupNorm(keras.layers.GroupNormalization): + def call(self, inputs): + if keras.config.backend() == "torch": + import torch.nn.functional as F + + return F.group_norm( + inputs, self.groups, self.gamma, self.beta, self.epsilon + ) + return super().call(inputs) + + +class RWKV7_TimeMix(Layer): + def __init__( + self, + hidden_size, + head_size, + gate_lora=128, + mv_lora=32, + aaa_lora=64, + decay_lora=64, + kernel_initializer="glorot_uniform", + **kwargs, + ): + super().__init__(**kwargs) + self.head_size = head_size + self.hidden_size = hidden_size + self.n_head = hidden_size // self.head_size + self.gate_lora = gate_lora + self.mv_lora = mv_lora + self.aaa_lora = aaa_lora + self.decay_lora = decay_lora + self.kernel_initializer = initializers.get(kernel_initializer) + self.initial_state = None + try: + from rwkv_ops import RWKV7_USE_KERNEL + from rwkv_ops import generalized_delta_rule + except ImportError: + warnings.warn( + "The 'rwkv_ops' package 
is not installed. "
+                "Falling back to a pure-Python operator, which will be very slow. "
+                "Please install 'rwkv_ops' to enable the optimized kernels.",
+                UserWarning,
+                stacklevel=2,
+            )
+            generalized_delta_rule = rnn_generalized_delta_rule
+            RWKV7_USE_KERNEL = False
+        self.RWKV7_OP, self.USE_KERNEL = (
+            generalized_delta_rule,
+            RWKV7_USE_KERNEL,
+        )
+        assert self.hidden_size % self.n_head == 0
+
+    def build(self, input_shape):
+        super().build(input_shape)
+        if isinstance(input_shape[0], list):
+            input_shape = input_shape[0]
+        H = self.n_head
+        N = self.head_size
+        B, T, C = input_shape
+
+        self.x_r = self.add_weight(
+            shape=(1, 1, C), name="x_r", initializer=self.kernel_initializer
+        )
+        self.x_w = self.add_weight(
+            shape=(1, 1, C), name="x_w", initializer=self.kernel_initializer
+        )
+        self.x_k = self.add_weight(
+            shape=(1, 1, C), name="x_k", initializer=self.kernel_initializer
+        )
+        self.x_v = self.add_weight(
+            shape=(1, 1, C), name="x_v", initializer=self.kernel_initializer
+        )
+        self.x_a = self.add_weight(
+            shape=(1, 1, C), name="x_a", initializer=self.kernel_initializer
+        )
+        self.x_g = self.add_weight(
+            shape=(1, 1, C), name="x_g", initializer=self.kernel_initializer
+        )
+
+        self.w0 = self.add_weight(
+            shape=(1, 1, C), name="w0", initializer=self.kernel_initializer
+        )
+        self.w1 = self.add_weight(
+            shape=(C, self.decay_lora),
+            name="w1",
+            initializer=self.kernel_initializer,
+        )
+        self.w2 = self.add_weight(
+            shape=(self.decay_lora, C),
+            name="w2",
+            initializer=self.kernel_initializer,
+        )
+
+        self.a0 = self.add_weight(
+            shape=(1, 1, C), name="a0", initializer=self.kernel_initializer
+        )
+        self.a1 = self.add_weight(
+            shape=(C, self.aaa_lora),
+            name="a1",
+            initializer=self.kernel_initializer,
+        )
+        self.a2 = self.add_weight(
+            shape=(self.aaa_lora, C),
+            name="a2",
+            initializer=self.kernel_initializer,
+        )
+
+        self.v0 = self.add_weight(
+            shape=(1, 1, C), name="v0", initializer=self.kernel_initializer
+        )
+        self.v1 = self.add_weight(
+            shape=(C, self.mv_lora),
+            name="v1",
+            initializer=self.kernel_initializer,
+        )
+        self.v2 = self.add_weight(
+            shape=(self.mv_lora, C),
+            name="v2",
+            initializer=self.kernel_initializer,
+        )
+
+        self.g1 = self.add_weight(
+            shape=(C, self.gate_lora),
+            name="g1",
+            initializer=self.kernel_initializer,
+        )
+        self.g2 = self.add_weight(
+            shape=(self.gate_lora, C),
+            name="g2",
+            initializer=self.kernel_initializer,
+        )
+
+        self.k_k = self.add_weight(
+            shape=(1, 1, C), name="k_k", initializer=self.kernel_initializer
+        )
+        self.k_a = self.add_weight(
+            shape=(1, 1, C), name="k_a", initializer=self.kernel_initializer
+        )
+        self.r_k = self.add_weight(
+            shape=(H, N), name="r_k", initializer=self.kernel_initializer
+        )
+
+        self.time_shift = TimeShift()
+        self.receptance = keras.layers.Dense(
+            C,
+            use_bias=False,
+            kernel_initializer=self.kernel_initializer,
+            name="receptance",
+        )
+        self.key = keras.layers.Dense(
+            C,
+            use_bias=False,
+            kernel_initializer=self.kernel_initializer,
+            name="key",
+        )
+        self.value = keras.layers.Dense(
+            C,
+            use_bias=False,
+            kernel_initializer=self.kernel_initializer,
+            name="value",
+        )
+        self.output_layer = keras.layers.Dense(
+            C,
+            use_bias=False,
+            kernel_initializer=self.kernel_initializer,
+            name="output_layer",
+        )
+        self.ln_x = GroupNorm(groups=H, epsilon=64e-5)
+
+        self.receptance.build(input_shape)
+        self.value.build(input_shape)
+        self.key.build(input_shape)
+        self.output_layer.build(input_shape)
+        self.ln_x.build((None, C))
+
+    def call(
+        self,
+        x,
+        v_first=None,
+        padding_mask=None,
+
last_cache_x=None,
+        cache_state=None,
+        rnn_mode=False,
+    ):
+        if cache_state is None:
+            initial_state = self.initial_state
+        else:
+            initial_state = cache_state
+        if padding_mask is not None:
+            if ops.ndim(padding_mask) == 2:
+                padding_mask = padding_mask[..., None]
+            padding_mask = ops.cast(padding_mask, x.dtype)
+            x *= padding_mask
+        B, T, C = ops.shape(x)
+        H = self.n_head
+        if last_cache_x is None:
+            xx = self.time_shift(x) - x
+        else:
+            xx = self.time_shift(x, last_cache_x) - x
+            last_cache_x = x[:, -1:, :]
+
+        xr = x + xx * self.x_r
+        xw = x + xx * self.x_w
+        xk = x + xx * self.x_k
+        xv = x + xx * self.x_v
+        xa = x + xx * self.x_a
+        xg = x + xx * self.x_g
+
+        r = self.receptance(xr)
+        w = (
+            -ops.softplus(
+                -(
+                    self.w0
+                    + ops.matmul(ops.tanh(ops.matmul(xw, self.w1)), self.w2)
+                )
+            )
+            - 0.5
+        )  # soft-clamp to (-inf, -0.5)
+        k = self.key(xk)
+        v = self.value(xv)
+        if v_first is None:
+            v_first = v
+        else:
+            v = v + (v_first - v) * ops.sigmoid(
+                self.v0 + ops.matmul(ops.matmul(xv, self.v1), self.v2)
+            )
+
+        a = ops.sigmoid(
+            self.a0 + ops.matmul(ops.matmul(xa, self.a1), self.a2)
+        )  # a is "in-context learning rate"
+        g = ops.matmul(ops.sigmoid(ops.matmul(xg, self.g1)), self.g2)
+
+        kk = k * self.k_k
+
+        kk = self.normalize(ops.reshape(kk, (B, T, H, -1)))
+        kk = ops.reshape(kk, (B, T, C))
+
+        k = k * (1 + (a - 1) * self.k_a)
+        if padding_mask is not None:
+            v *= padding_mask
+            if self.USE_KERNEL:
+                w += (1 - padding_mask) * -1e9
+            else:
+                w = w * padding_mask + 1 - padding_mask
+        if rnn_mode:
+            rwkv7_op = rnn_generalized_delta_rule
+        else:
+            rwkv7_op = self.RWKV7_OP
+        x, final_state = rwkv7_op(
+            ops.reshape(r, (B, T, self.n_head, self.head_size)),
+            ops.reshape(w, (B, T, self.n_head, self.head_size)),
+            ops.reshape(k, (B, T, self.n_head, self.head_size)),
+            ops.reshape(v, (B, T, self.n_head, self.head_size)),
+            ops.reshape(-kk, (B, T, self.n_head, self.head_size)),
+            ops.reshape(kk * a, (B, T, self.n_head, self.head_size)),
+            initial_state=initial_state,
+        )
+
+        x = ops.reshape(x, (B, T, C))
+        x = ops.reshape(self.ln_x(ops.reshape(x, (B * T, C))), (B, T, C))
+
+        r = ops.reshape(r, (B, T, H, -1))
+        k = ops.reshape(k, (B, T, H, -1))
+        v = ops.reshape(v, (B, T, C))
+
+        rwkv = ops.sum(r * k * self.r_k, axis=-1, keepdims=True) * ops.reshape(
+            v, (B, T, H, -1)
+        )
+
+        x = x + ops.reshape(rwkv, (B, T, C))
+        x = self.output_layer(x * g)
+        output = [x, v_first]
+        if last_cache_x is not None:
+            output.extend([last_cache_x, final_state])
+        return output
+
+    def compute_output_shape(self, input_shape):
+        output_shapes = [
+            [None, None, self.hidden_size],
+            [None, None, self.hidden_size],
+        ]
+        return output_shapes
+
+    def normalize(
+        self,
+        x,
+        eps: float = 1e-12,
+    ):
+        # F.normalize-like API
+        if keras.config.backend() == "torch":
+            import torch.nn.functional as F
+
+            return F.normalize(x, dim=-1, p=2.0)
+        square_sum = ops.sum(ops.square(x), axis=-1, keepdims=True)
+        inv_norm = ops.rsqrt(square_sum + eps)
+        inv_norm = ops.maximum(inv_norm, eps)
+        return x * inv_norm
+
+    def get_config(self):
+        config = {
+            "hidden_size": self.hidden_size,
+            "head_size": self.head_size,
+            "gate_lora": self.gate_lora,
+            "mv_lora": self.mv_lora,
+            "aaa_lora": self.aaa_lora,
+            "decay_lora": self.decay_lora,
+            "kernel_initializer": initializers.serialize(
+                self.kernel_initializer
+            ),
+        }
+        base_config = super().get_config()
+        return dict(list(base_config.items()) + list(config.items()))
+
+
+class RWKV7_Block(Layer):
+    def __init__(
+
self, + hidden_size, + head_size, + intermediate_dim, + gate_lora=128, + mv_lora=32, + aaa_lora=64, + decay_lora=64, + use_initial_norm=False, + kernel_initializer="glorot_uniform", + **kwargs, + ): + super().__init__(**kwargs) + self.head_size = head_size + self.hidden_size = hidden_size + self.gate_lora = gate_lora + self.mv_lora = mv_lora + self.aaa_lora = aaa_lora + self.decay_lora = decay_lora + self.intermediate_dim = intermediate_dim + self.use_initial_norm = use_initial_norm + self.kernel_initializer = initializers.get(kernel_initializer) + + def build(self, input_shape): + super().build(input_shape) + if self.use_initial_norm: + self.ln0 = keras.layers.LayerNormalization( + epsilon=1e-5, name="init_norm" + ) + self.ln0.build(input_shape) + + self.ln1 = keras.layers.LayerNormalization( + epsilon=1e-5, name="att_norm" + ) + self.ln1.build(input_shape) + + self.ln2 = keras.layers.LayerNormalization( + epsilon=1e-5, name="ffn_norm" + ) + self.ln2.build(input_shape) + + self.att = RWKV7_TimeMix( + self.hidden_size, + self.head_size, + self.gate_lora, + self.mv_lora, + self.aaa_lora, + self.decay_lora, + name="RWKV_TIME_MIX", + kernel_initializer=self.kernel_initializer, + ) + self.att.build(input_shape) + + self.ffn = RWKV7_ChannelMix( + self.intermediate_dim, + name="RWKV_CMIX", + kernel_initializer=self.kernel_initializer, + ) + self.ffn.build(input_shape) + + def call( + self, + x, + v_first=None, + padding_mask=None, + cache_state=None, + cache_tmix_x=None, + cache_cmix_x=None, + rnn_mode=False, + ): + if self.use_initial_norm: + x = self.ln0(x) + if cache_state is None: + xx, v_first = self.att( + self.ln1(x), v_first=v_first, padding_mask=padding_mask + ) + x = x + xx + x = x + self.ffn(self.ln2(x)) + return x, v_first + else: + xx, v_first, cache_tmix_x, cache_state = self.att( + self.ln1(x), + v_first=v_first, + padding_mask=padding_mask, + last_cache_x=cache_tmix_x, + cache_state=cache_state, + rnn_mode=rnn_mode, + ) + x = x + xx + xx, cache_cmix_x = self.ffn(self.ln2(x), cache_cmix_x) + x = x + xx + return x, v_first, cache_state, cache_tmix_x, cache_cmix_x + + def compute_output_shape(self, input_shape): + output_shapes = [ + [None, None, self.hidden_size], + [None, None, self.hidden_size], + ] + return output_shapes + + def get_config(self): + config = { + "hidden_size": self.hidden_size, + "head_size": self.head_size, + "gate_lora": self.gate_lora, + "mv_lora": self.mv_lora, + "aaa_lora": self.aaa_lora, + "decay_lora": self.decay_lora, + "intermediate_dim": self.intermediate_dim, + "use_initial_norm": self.use_initial_norm, + "kernel_initializer": initializers.serialize( + self.kernel_initializer + ), + } + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) diff --git a/keras_hub/src/models/rwkv7/rwkv7_tokenizer.py b/keras_hub/src/models/rwkv7/rwkv7_tokenizer.py new file mode 100644 index 0000000000..ce2e49535a --- /dev/null +++ b/keras_hub/src/models/rwkv7/rwkv7_tokenizer.py @@ -0,0 +1,224 @@ +import os + +import keras + +from keras_hub.src.api_export import keras_hub_export +from keras_hub.src.tokenizers import tokenizer +from keras_hub.src.utils.tensor_utils import is_int_dtype +from keras_hub.src.utils.tensor_utils import is_string_dtype +from keras_hub.src.utils.tensor_utils import tensor_to_list + +VOCAB_FILENAME = "vocab.txt" + + +class TRIE: + __slots__ = tuple("ch,to,values,front".split(",")) + to: list + values: set + + def __init__(self, front=None, ch=None): + self.ch = ch + self.to = [None for ch in range(256)] + 
self.values = set()
+        self.front = front
+
+    def __repr__(self):
+        fr = self
+        ret = []
+        while fr is not None:
+            if fr.ch is not None:
+                ret.append(fr.ch)
+            fr = fr.front
+        return "<TRIE %s %s>" % (ret[::-1], self.values)
+
+    def add(self, key: bytes, idx: int = 0, val=None):
+        if idx == len(key):
+            if val is None:
+                val = key
+            self.values.add(val)
+            return self
+        ch = key[idx]
+        if self.to[ch] is None:
+            self.to[ch] = TRIE(front=self, ch=ch)
+        return self.to[ch].add(key, idx=idx + 1, val=val)
+
+    def find_longest(self, key: bytes, idx: int = 0):
+        u: TRIE = self
+        ch: int = key[idx]
+
+        while u.to[ch] is not None:
+            u = u.to[ch]
+            idx += 1
+            if u.values:
+                ret = idx, u, u.values
+            if idx == len(key):
+                break
+            ch = key[idx]
+        return ret
+
+
+class RWKV_TOKENIZER:
+    def __init__(self, vocabs):
+        self.idx2token = {}
+        sorted = []  # must be already sorted
+        for l in vocabs:
+            idx = int(l[: l.index(" ")])
+            x = eval(l[l.index(" ") : l.rindex(" ")])
+            x = x.encode("utf-8") if isinstance(x, str) else x
+            assert isinstance(x, bytes)
+            assert len(x) == int(l[l.rindex(" ") :])
+            sorted += [x]
+            self.idx2token[idx] = x
+
+        self.token2idx = {}
+        for k, v in self.idx2token.items():
+            self.token2idx[v] = int(k)
+
+        self.root = TRIE()
+        for t, i in self.token2idx.items():
+            _ = self.root.add(t, val=(t, i))
+
+    def encodeBytes(self, src: bytes):
+        idx: int = 0
+        tokens = []
+        while idx < len(src):
+            _idx: int = idx
+            idx, _, values = self.root.find_longest(src, idx)
+            assert idx != _idx
+            _, token = next(iter(values))
+            tokens.append(token)
+        return tokens
+
+    def decodeBytes(self, tokens):
+        return b"".join(map(lambda i: self.idx2token[i], tokens))
+
+    def encode(self, src):
+        if isinstance(src, str):
+            return self.encodeBytes(src.encode("utf-8"))
+        else:
+            return [self.encodeBytes(s.encode("utf-8")) for s in src]
+
+    def decode(self, tokens):
+        return [self.decodeBytes(batch).decode("utf-8") for batch in tokens]
+        # try:
+        #     return self.decodeBytes(tokens).decode('utf-8')
+        # except:
+        #     return '\ufffd' # bad utf-8
+
+    def printTokens(self, tokens):
+        for i in tokens:
+            s = self.idx2token[i]
+            try:
+                s = s.decode("utf-8")
+            except BaseException:
+                pass
+            print(f"{repr(s)}{i}", end=" ")
+        print()
+
+
+@keras_hub_export(
+    [
+        "keras_hub.tokenizers.RWKVTokenizer",
+        "keras_hub.models.RWKVTokenizer",
+    ]
+)
+class RWKVTokenizer(tokenizer.Tokenizer):
+    def __init__(
+        self,
+        vocabulary=None,
+        dtype="int32",
+        **kwargs,
+    ) -> None:
+        if not is_int_dtype(dtype) and not is_string_dtype(dtype):
+            raise ValueError(
+                "Output dtype must be an integer type or a string. "
+                f"Received: dtype={dtype}"
+            )
+
+        super().__init__(dtype=dtype, **kwargs)
+
+        self.vocabulary = None
+        if vocabulary is not None:
+            self.set_vocabulary(vocabulary)
+        self.file_assets = [VOCAB_FILENAME]
+
+    def set_vocabulary(self, vocabulary):
+        self.vocabulary = vocabulary
+        self._tokenizer = RWKV_TOKENIZER(vocabulary)
+        self.pad_token_id = 0
+        self.start_token_id = None
+        self.end_token_id = self.tokenize(["\n\n"])[0][0]
+
+    def save_assets(self, dir_path):
+        path = os.path.join(dir_path, VOCAB_FILENAME)
+        with open(path, "w", encoding="utf-8") as file:
+            file.write("\n".join(self.vocabulary))
+
+    def load_assets(self, dir_path=""):
+        path = os.path.join(dir_path, VOCAB_FILENAME)
+        with open(path, "r", encoding="utf-8") as f:
+            vocabulary = f.readlines()
+        self.set_vocabulary(vocabulary)
+
+    def _check_vocabulary(self):
+        if self.vocabulary is None:
+            raise ValueError(
+                "No vocabulary has been set for RWKVTokenizer. 
Make " + "sure to pass a `vocabulary` argument when creating the layer." + ) + + def vocabulary_size(self): + self._check_vocabulary() + return int(len(self.vocabulary)) + + def get_vocabulary(self): + self._check_vocabulary() + return tensor_to_list(self.vocabulary) + + def id_to_token(self, id): + self._check_vocabulary() + if id >= self.vocabulary_size() or id < 0: + raise ValueError( + f"`id` must be in range [0, {self.vocabulary_size() - 1}]. " + f"Received: {id}" + ) + return self._tokenizer.idx2token[id] + + def token_to_id(self, token): + """Convert a string token to an integer id.""" + self._check_vocabulary() + return int(self._tokenizer.token2idx[token]) + + def get_config(self): + config = super().get_config() + config.update( + { + "vocabulary": None, # Save vocabulary via an asset! + } + ) + return config + + def tokenize(self, inputs): + self._check_vocabulary() + tokens = self._tokenizer.encode(inputs) + + def tokens2ids(x): + return [self.token_to_id(t) for t in x] + + if is_string_dtype(self.dtype): + if isinstance(inputs, str): + return tokens2ids(tokens) + return [tokens2ids(t) for t in tokens] + return tokens + + def detokenize(self, inputs): + self._check_vocabulary() + return self._tokenizer.decode(inputs) + + def compute_output_spec(self, input_spec): + return keras.KerasTensor( + input_spec.shape + (None,), dtype=self.compute_dtype + ) + + def call(self, inputs): + return self.tokenize(inputs) diff --git a/tools/checkpoint_conversion/convert_rwkv7_checkpoints.py b/tools/checkpoint_conversion/convert_rwkv7_checkpoints.py new file mode 100644 index 0000000000..e51e0c4d79 --- /dev/null +++ b/tools/checkpoint_conversion/convert_rwkv7_checkpoints.py @@ -0,0 +1,464 @@ +# ============================================================================== +# Environment & Dependency Setup +# ============================================================================== +import os + +import numpy as np +import requests +import torch +from absl import app +from absl import flags + +# Force CPU only (GPU index -1 disables CUDA) +os.environ["CUDA_VISIBLE_DEVICES"] = "-1" +# Use native kernel implementations +os.environ["KERNEL_TYPE"] = "native" + +# Keras-Ops is imported **after** environment variables are set +import types + +import torch.nn as nn +import torch.nn.functional as F +from keras import ops # noqa: E402 +from modelscope import snapshot_download + +from keras_hub.src.models.rwkv7.rwkv7_backbone import RWKV7Backbone +from keras_hub.src.models.rwkv7.rwkv7_casual_lm import RWKV7CausalLM + +# Local modules +from keras_hub.src.models.rwkv7.rwkv7_tokenizer import RWKVTokenizer + +# ============================================================================== +# Model Preset Registry +# ============================================================================== +PRESET_MAP = { + "rwkv7_world_0.1B": "RWKV-x070-World-0.1B-v2.8-20241210-ctx4096.pth", + "rwkv7_world_0.3B": "RWKV-x070-World-0.4B-v2.9-20250107-ctx4096.pth", + "rwkv7_world_1.5B": "RWKV-x070-World-1.5B-v3-20250127-ctx4096.pth", + "rwkv7_world_2.9B": "RWKV-x070-World-2.9B-v3-20250211-ctx4096.pth", +} + +# ============================================================================== +# Command-line Interface +# ============================================================================== +FLAGS = flags.FLAGS +flags.DEFINE_string( + "preset", None, f"Must be one of {','.join(PRESET_MAP.keys())}" +) + +# ============================================================================== +# RWKV-v7 official PyTorch 
implementation +# From https://github.com/BlinkDL/RWKV-LM/blob/main/RWKV-v7/rwkv_v7_demo.py +# ============================================================================== +HEAD_SIZE = 64 +D_DECAY_LORA = 64 +D_AAA_LORA = 64 +D_MV_LORA = 32 +D_GATE_LORA = 128 + + +def RWKV7_OP(r, w, k, v, a, b): + """ + Official RWKV-7 core operator. + Performs the time-mix recurrence with delta-rule based learning. + """ + DTYPE = r.dtype + B, T, C = r.size() + H = C // HEAD_SIZE + N = HEAD_SIZE + r = r.view(B, T, H, N).float() + k = k.view(B, T, H, N).float() + v = v.view(B, T, H, N).float() + a = a.view(B, T, H, N).float() + b = b.view(B, T, H, N).float() + + # Compute decay factor (log-space) + w = torch.exp(-torch.exp(w.view(B, T, H, N).float())) + out = torch.zeros((B, T, H, N), device=r.device, dtype=torch.float) + state = torch.zeros((B, H, N, N), device=r.device, dtype=torch.float) + + # Recurrent inference loop over time + for t in range(T): + kk = k[:, t, :].view(B, H, 1, N) + rr = r[:, t, :].view(B, H, N, 1) + vv = v[:, t, :].view(B, H, N, 1) + aa = a[:, t, :].view(B, H, N, 1) + bb = b[:, t, :].view(B, H, 1, N) + # State update: decay + delta-rule + residual + state = state * w[:, t, :, None, :] + state @ aa @ bb + vv @ kk + # Read-out for current position + out[:, t, :] = (state @ rr).view(B, H, N) + return out.view(B, T, C).to(DTYPE) + + +# ============================================================================== +# RWKV Time-Mix Layer (Attention) +# ============================================================================== +class RWKV_Tmix_x070(nn.Module): + def __init__(self, args, layer_id): + super().__init__() + self.args = args + self.layer_id = layer_id + self.head_size = args.head_size_a + self.n_head = args.dim_att // self.head_size + assert args.dim_att % self.n_head == 0 + + H, N, C = self.n_head, self.head_size, args.n_embd + + # Low-rank adaptation & shift scalars + self.x_r = nn.Parameter(torch.empty(1, 1, C)) + self.x_w = nn.Parameter(torch.empty(1, 1, C)) + self.x_k = nn.Parameter(torch.empty(1, 1, C)) + self.x_v = nn.Parameter(torch.empty(1, 1, C)) + self.x_a = nn.Parameter(torch.empty(1, 1, C)) + self.x_g = nn.Parameter(torch.empty(1, 1, C)) + + # Decay (w) modulation + self.w0 = nn.Parameter(torch.empty(1, 1, C)) + self.w1 = nn.Parameter(torch.empty(C, D_DECAY_LORA)) + self.w2 = nn.Parameter(torch.empty(D_DECAY_LORA, C)) + + # In-context learning rate (a) modulation + self.a0 = nn.Parameter(torch.empty(1, 1, C)) + self.a1 = nn.Parameter(torch.empty(C, D_AAA_LORA)) + self.a2 = nn.Parameter(torch.empty(D_AAA_LORA, C)) + + # Value residual modulation + self.v0 = nn.Parameter(torch.empty(1, 1, C)) + self.v1 = nn.Parameter(torch.empty(C, D_MV_LORA)) + self.v2 = nn.Parameter(torch.empty(D_MV_LORA, C)) + + # Gate modulation + self.g1 = nn.Parameter(torch.empty(C, D_GATE_LORA)) + self.g2 = nn.Parameter(torch.empty(D_GATE_LORA, C)) + + # Normalization & positional factors + self.k_k = nn.Parameter(torch.empty(1, 1, C)) + self.k_a = nn.Parameter(torch.empty(1, 1, C)) + self.r_k = nn.Parameter(torch.empty(H, N)) + + self.time_shift = nn.ZeroPad2d((0, 0, 1, -1)) + self.receptance = nn.Linear(C, C, bias=False) + self.key = nn.Linear(C, C, bias=False) + self.value = nn.Linear(C, C, bias=False) + self.output = nn.Linear(C, C, bias=False) + # GroupNorm with very small epsilon for numerical stability + self.ln_x = nn.GroupNorm(H, C, eps=64e-5) + + # -------------------------------------------------------------------------- + def forward(self, x, v_first=None): + B, T, C = x.size() 
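+        # Shape conventions in this reference module: B=batch, T=sequence
+        # length, C=n_embd, split into H heads of size N (C == H * N).
+        # `time_shift` (nn.ZeroPad2d((0, 0, 1, -1))) shifts the sequence right
+        # by one token, so `xx` below is each token's predecessor minus the
+        # token itself, and the learned `x_*` vectors blend the two per branch.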
+ H = self.n_head + xx = self.time_shift(x) - x # Difference token shift + + # Apply token-shift to each branch + xr = x + xx * self.x_r + xw = x + xx * self.x_w + xk = x + xx * self.x_k + xv = x + xx * self.x_v + xa = x + xx * self.x_a + xg = x + xx * self.x_g + + r = self.receptance(xr) + w = ( + -F.softplus(-(self.w0 + torch.tanh(xw @ self.w1) @ self.w2)) - 0.5 + ) # Clamp + k = self.key(xk) + v = self.value(xv) + + # Value residual: only active on non-first layers + if self.layer_id == 0: + v_first = v + else: + v = v + (v_first - v) * torch.sigmoid( + self.v0 + (xv @ self.v1) @ self.v2 + ) + + a = torch.sigmoid(self.a0 + (xa @ self.a1) @ self.a2) # In-context LR + g = torch.sigmoid(xg @ self.g1) @ self.g2 # Gate + + # Normalize keys for stability + kk = k * self.k_k + kk = F.normalize(kk.view(B, T, H, -1), dim=-1, p=2.0).view(B, T, C) + k = k * (1 + (a - 1) * self.k_a) + + # Core recurrence + x = RWKV7_OP(r, w, k, v, -kk, kk * a).to(r.dtype) + x = self.ln_x(x.view(B * T, C)).view(B, T, C) + + # Additional local mix (receptance * key * r_k) * value + x = x + ( + (r.view(B, T, H, -1) * k.view(B, T, H, -1) * self.r_k).sum( + dim=-1, keepdim=True + ) + * v.view(B, T, H, -1) + ).view(B, T, C) + x = self.output(x * g) + return x, v_first + + +# ============================================================================== +# RWKV Channel-Mix Layer (Feed-Forward) +# ============================================================================== +class RWKV_CMix_x070(nn.Module): + def __init__(self, args, layer_id): + super().__init__() + self.args = args + self.layer_id = layer_id + self.time_shift = nn.ZeroPad2d((0, 0, 1, -1)) + with torch.no_grad(): + self.x_k = nn.Parameter(torch.empty(1, 1, args.n_embd)) + + self.key = nn.Linear(args.n_embd, args.dim_ffn, bias=False) + self.value = nn.Linear(args.dim_ffn, args.n_embd, bias=False) + + def forward(self, x): + xx = self.time_shift(x) - x + k = x + xx * self.x_k + k = torch.relu(self.key(k)) ** 2 # Squared ReLU + return self.value(k) + + +# ============================================================================== +# RWKV Building Block (Time-Mix + Channel-Mix + Norms) +# ============================================================================== +class Block(nn.Module): + def __init__(self, args, layer_id): + super().__init__() + self.args = args + self.layer_id = layer_id + self.ln0 = nn.LayerNorm(args.n_embd) if layer_id == 0 else None + self.ln1 = nn.LayerNorm(args.n_embd) + self.ln2 = nn.LayerNorm(args.n_embd) + + self.att = RWKV_Tmix_x070(args, layer_id) + self.ffn = RWKV_CMix_x070(args, layer_id) + + def forward(self, x, v_first): + if self.layer_id == 0: + x = self.ln0(x) + xx, v_first = self.att(self.ln1(x), v_first) + x = x + xx + x = x + self.ffn(self.ln2(x)) + return x, v_first + + +# ============================================================================== +# Full RWKV Model +# ============================================================================== +class RWKV(nn.Module): + def __init__(self, args): + super().__init__() + args.dim_att = args.n_embd + args.dim_ffn = args.n_embd * 4 + self.emb = nn.Embedding(args.vocab_size, args.n_embd) + + self.blocks = nn.ModuleList( + [Block(args, i) for i in range(args.n_layer)] + ) + self.ln_out = nn.LayerNorm(args.n_embd) + self.head = nn.Linear(args.n_embd, args.vocab_size, bias=False) + + def forward(self, idx): + x = self.emb(idx) + v_first = torch.empty_like(x) + for block in self.blocks: + x, v_first = block(x, v_first) + x = self.ln_out(x) + x = self.head(x) + return 
x
+
+
+# ==============================================================================
+# Weight Conversion Utilities (PyTorch ↔ Keras)
+# ==============================================================================
+def convert_cmix(my_channel_mix, weights, i):
+    my_channel_mix.set_weights(
+        [
+            weights.pop("blocks.%d.ffn.x_k" % i),
+            weights.pop("blocks.%d.ffn.key.weight" % i).T,
+            weights.pop("blocks.%d.ffn.value.weight" % i).T,
+        ]
+    )
+
+
+def convert_tmix(my_time_mix, weights, i):
+    weights_list = [
+        weights.pop("blocks.%d.att.x_r" % i),
+        weights.pop("blocks.%d.att.x_w" % i),
+        weights.pop("blocks.%d.att.x_k" % i),
+        weights.pop("blocks.%d.att.x_v" % i),
+        weights.pop("blocks.%d.att.x_a" % i),
+        weights.pop("blocks.%d.att.x_g" % i),
+        weights.pop("blocks.%d.att.w0" % i),
+        weights.pop("blocks.%d.att.w1" % i),
+        weights.pop("blocks.%d.att.w2" % i),
+        weights.pop("blocks.%d.att.a0" % i),
+        weights.pop("blocks.%d.att.a1" % i),
+        weights.pop("blocks.%d.att.a2" % i),
+        weights.pop("blocks.%d.att.v0" % i),
+        weights.pop("blocks.%d.att.v1" % i),
+        weights.pop("blocks.%d.att.v2" % i),
+        weights.pop("blocks.%d.att.g1" % i),
+        weights.pop("blocks.%d.att.g2" % i),
+        weights.pop("blocks.%d.att.k_k" % i),
+        weights.pop("blocks.%d.att.k_a" % i),
+        weights.pop("blocks.%d.att.r_k" % i),
+        weights.pop("blocks.%d.att.receptance.weight" % i).T,
+        weights.pop("blocks.%d.att.key.weight" % i).T,
+        weights.pop("blocks.%d.att.value.weight" % i).T,
+        weights.pop("blocks.%d.att.output.weight" % i).T,
+        weights.pop("blocks.%d.att.ln_x.weight" % i),
+        weights.pop("blocks.%d.att.ln_x.bias" % i),
+    ]
+    my_time_mix.set_weights(weights_list)
+
+
+def convert_layernorm(myln, weights, ln_id, layer_id):
+    myln.set_weights(
+        [
+            weights.pop("blocks.%d.ln%d.weight" % (layer_id, ln_id)),
+            weights.pop("blocks.%d.ln%d.bias" % (layer_id, ln_id)),
+        ]
+    )
+
+
+def convert_block(my_block, weights, i):
+    convert_cmix(my_block.ffn, weights, i)
+    convert_tmix(my_block.att, weights, i)
+    if my_block.use_initial_norm:
+        convert_layernorm(my_block.ln0, weights, 0, i)
+    convert_layernorm(my_block.ln1, weights, 1, i)
+    convert_layernorm(my_block.ln2, weights, 2, i)
+
+
+def convert_backbone(my_backbone, weights):
+    # Convenience wrapper that mirrors the inline conversion performed in
+    # `convert_rwkv7_checkpoints` below.
+    for i in range(my_backbone.num_layers):
+        convert_block(my_backbone.rwkv_layers[i], weights, i)
+    my_backbone.token_embedding.set_weights([weights.pop("emb.weight")])
+    my_backbone.output_layer_norm.set_weights(
+        [
+            weights.pop("ln_out.weight"),
+            weights.pop("ln_out.bias"),
+        ]
+    )
+
+
+# ==============================================================================
+# Checkpoint Conversion Entry Point
+# ==============================================================================
+def convert_rwkv7_checkpoints(weights_path):
+    weights = torch.load(weights_path, map_location="cpu")
+    weights = {k: v.float().numpy() for k, v in weights.items()}
+    w = weights
+    n_layer = 0
+    for k in w.keys():
+        layer_id = int(k.split(".")[1]) if ("blocks." 
in k) else 0
+        n_layer = max(n_layer, layer_id + 1)
+
+    config = {
+        "hidden_size": w["emb.weight"].shape[1],
+        "num_layers": n_layer,
+        "intermediate_dim": w["blocks.0.ffn.key.weight"].shape[0],
+        "vocabulary_size": 65536,
+        "head_size": 64,
+    }
+    my_backbone = RWKV7Backbone(**config)
+
+    # Copy layer-1 value-residual params to layer-0 (compatibility)
+    weights["blocks.0.att.v0"] = weights["blocks.1.att.v0"]
+    weights["blocks.0.att.v1"] = weights["blocks.1.att.v1"]
+    weights["blocks.0.att.v2"] = weights["blocks.1.att.v2"]
+
+    my_backbone.get_layer("token_embedding").set_weights(
+        [weights.pop("emb.weight")]
+    )
+    for i in range(config["num_layers"]):
+        my_block = my_backbone.get_layer(f"rwkv_layer_{i}")
+        convert_block(my_block, weights, i)
+
+    my_backbone.output_layer_norm.set_weights(
+        [
+            weights.pop("ln_out.weight"),
+            weights.pop("ln_out.bias"),
+        ]
+    )
+    model = RWKV7CausalLM(my_backbone)
+    my_backbone.head.set_weights([weights.pop("head.weight").T])
+    return model
+
+
+# ==============================================================================
+# Main Script
+# ==============================================================================
+url = "https://raw.githubusercontent.com/BlinkDL/RWKV-LM/main/RWKV-v7/rwkv_vocab_v20230424.txt"
+
+
+def main(_):
+    if not os.path.exists(FLAGS.preset):
+        os.makedirs(FLAGS.preset)
+
+    source_model_name = PRESET_MAP[FLAGS.preset]
+
+    # Download vocabulary file
+    vocabs = requests.get(url, timeout=30).text
+    with open(
+        os.path.join(FLAGS.preset, "vocab.txt"), "w", encoding="utf-8"
+    ) as f:
+        f.write(vocabs)
+    tokenizer = RWKVTokenizer(FLAGS.preset)
+    tokenizer.load_assets()
+
+    # Download checkpoint
+    download_path = snapshot_download(
+        repo_id="Blink_DL/rwkv-7-world",
+        allow_patterns=source_model_name,
+    )
+    weights_path = os.path.join(download_path, source_model_name)
+
+    # Convert to Keras format
+    my_model = convert_rwkv7_checkpoints(weights_path)
+
+    # Re-build PyTorch reference model
+    args = types.SimpleNamespace()
+    args.n_layer = my_model.backbone.num_layers
+    args.n_embd = my_model.backbone.hidden_size
+    args.vocab_size = my_model.backbone.vocabulary_size
+    args.head_size_a = 64
+    args.dim_att = args.n_embd
+    args.dim_ffn = my_model.backbone.intermediate_dim
+
+    if os.environ["CUDA_VISIBLE_DEVICES"] != "-1":
+        standard_model = RWKV(args).cuda()
+    else:
+        standard_model = RWKV(args)
+
+    weights = torch.load(weights_path, map_location="cpu")
+    standard_model.load_state_dict(weights, strict=False)
+
+    # Sanity check: tokenize & compare outputs
+    x = tokenizer(["i love u"])
+    x = np.reshape(x, [1, -1])
+    my_output = my_model(ops.convert_to_tensor(x, "int32"))
+    xx = torch.from_numpy(x).int()
+    if torch.cuda.is_available():
+        xx = xx.cuda()
+    standard_output = standard_model(xx)
+
+    standard_output = standard_output.cpu().float().detach().numpy()
+    my_output = ops.convert_to_numpy(ops.cast(my_output, "float32"))
+
+    try:
+        np.testing.assert_allclose(my_output, standard_output, atol=1e-4)
+    except AssertionError as err:
+        print("\n")
+        print(err.args[0])
+        print("\n")
+
+    # Export final Keras model
+    my_model.backbone.save_to_preset(f"./{FLAGS.preset}")
+
+
+# ==============================================================================
+# Entry Guard
+# ==============================================================================
+if __name__ == "__main__":
+    flags.mark_flag_as_required("preset")
+    app.run(main)

From 7bc36b59921d8aa5dbffda4937ed331099e136e7 Mon Sep 17 00:00:00 2001
From: pass_lin <935499957@qq.com>
Date: Sun, 28 
Sep 2025 22:52:26 +0800 Subject: [PATCH 02/10] fix --- tools/checkpoint_conversion/convert_rwkv7_checkpoints.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/checkpoint_conversion/convert_rwkv7_checkpoints.py b/tools/checkpoint_conversion/convert_rwkv7_checkpoints.py index e51e0c4d79..e4d5f00d5e 100644 --- a/tools/checkpoint_conversion/convert_rwkv7_checkpoints.py +++ b/tools/checkpoint_conversion/convert_rwkv7_checkpoints.py @@ -403,8 +403,8 @@ def main(_): os.path.join(FLAGS.preset, "vocab.txt"), "w", encoding="utf-8" ) as f: f.write(vocabs) - tokenizer = RWKVTokenizer(FLAGS.preset) - tokenizer.load_assets() + tokenizer = RWKVTokenizer() + tokenizer.load_assets(FLAGS.preset) # Download checkpoint download_path = snapshot_download( From 7d4a7a1c48c382cf110189a37487d60014bd64d9 Mon Sep 17 00:00:00 2001 From: pass_lin <935499957@qq.com> Date: Sun, 28 Sep 2025 23:04:59 +0800 Subject: [PATCH 03/10] fix --- tools/checkpoint_conversion/convert_rwkv7_checkpoints.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/checkpoint_conversion/convert_rwkv7_checkpoints.py b/tools/checkpoint_conversion/convert_rwkv7_checkpoints.py index e4d5f00d5e..5c4b76467e 100644 --- a/tools/checkpoint_conversion/convert_rwkv7_checkpoints.py +++ b/tools/checkpoint_conversion/convert_rwkv7_checkpoints.py @@ -447,6 +447,7 @@ def main(_): try: np.testing.assert_allclose(my_output, standard_output, atol=1e-4) + print("Successfully passed the numerical verification! 🎯✅📊") except AssertionError as err: print("\n") print(err.args[0]) From e5bb446e582f7149960c384d06e89b619c940b20 Mon Sep 17 00:00:00 2001 From: pass_lin <935499957@qq.com> Date: Tue, 7 Oct 2025 23:15:10 +0800 Subject: [PATCH 04/10] add inference --- keras_hub/api/models/__init__.py | 2 +- keras_hub/src/models/rwkv7/rwkv7_backbone.py | 1 + keras_hub/src/models/rwkv7/rwkv7_casual_lm.py | 50 ----- keras_hub/src/models/rwkv7/rwkv7_causal_lm.py | 182 ++++++++++++++++++ .../rwkv7/rwkv7_causal_lm_preprocessor.py | 59 ++++-- keras_hub/src/models/rwkv7/rwkv7_layer.py | 181 ++++++++--------- keras_hub/src/models/rwkv7/rwkv7_tokenizer.py | 17 +- .../convert_rwkv7_checkpoints.py | 13 +- 8 files changed, 324 insertions(+), 181 deletions(-) delete mode 100644 keras_hub/src/models/rwkv7/rwkv7_casual_lm.py create mode 100644 keras_hub/src/models/rwkv7/rwkv7_causal_lm.py diff --git a/keras_hub/api/models/__init__.py b/keras_hub/api/models/__init__.py index 403bfb65ad..714b0e64ad 100644 --- a/keras_hub/api/models/__init__.py +++ b/keras_hub/api/models/__init__.py @@ -430,7 +430,7 @@ RoformerV2Tokenizer, ) from keras_hub.src.models.rwkv7.rwkv7_backbone import RWKV7Backbone -from keras_hub.src.models.rwkv7.rwkv7_casual_lm import RWKV7CausalLM +from keras_hub.src.models.rwkv7.rwkv7_causal_lm import RWKV7CausalLM from keras_hub.src.models.rwkv7.rwkv7_causal_lm_preprocessor import ( RWKV7CausalLMPreprocessor, ) diff --git a/keras_hub/src/models/rwkv7/rwkv7_backbone.py b/keras_hub/src/models/rwkv7/rwkv7_backbone.py index d6d3d9a36b..3c3fcc85b8 100644 --- a/keras_hub/src/models/rwkv7/rwkv7_backbone.py +++ b/keras_hub/src/models/rwkv7/rwkv7_backbone.py @@ -90,6 +90,7 @@ def __init__( dtype=dtype, **kwargs, ) + self.call(ops.ones([1, 16], "int32")) self.num_layers = num_layers self.head_size = head_size diff --git a/keras_hub/src/models/rwkv7/rwkv7_casual_lm.py b/keras_hub/src/models/rwkv7/rwkv7_casual_lm.py deleted file mode 100644 index c78154129b..0000000000 --- a/keras_hub/src/models/rwkv7/rwkv7_casual_lm.py +++ /dev/null @@ -1,50 +0,0 @@ -from 
keras_hub.src.api_export import keras_hub_export
-from keras_hub.src.models.causal_lm import CausalLM
-from keras_hub.src.models.rwkv7.rwkv7_backbone import RWKV7Backbone
-from keras_hub.src.models.rwkv7.rwkv7_causal_lm_preprocessor import (
-    RWKV7CausalLMPreprocessor,
-)
-
-
-@keras_hub_export("keras_hub.models.RWKV7CausalLM")
-class RWKV7CausalLM(CausalLM):
-    backbone_cls = RWKV7Backbone
-    preprocessor_cls = RWKV7CausalLMPreprocessor
-
-    def __init__(self, backbone, preprocessor=None, **kwargs):
-        # === Layers ===
-        self.backbone = backbone
-        self.preprocessor = preprocessor
-        super().__init__(
-            inputs=backbone.inputs,
-            outputs=backbone.outputs,
-            **kwargs,
-        )
-
-    def call_with_cache(
-        self,
-        token_ids,
-        cache,
-        cache_update_index,
-    ):
-        pass  # TODO
-
-    def _build_cache(self, token_ids):
-        pass  # TODO
-
-    def generate_step(
-        self,
-        inputs,
-        stop_token_ids=None,
-    ):
-        pass  # TODO
-
-    def score(
-        self,
-        token_ids,
-        padding_mask=None,
-        scoring_mode="logits",
-        layer_intercept_fn=None,
-        target_ids=None,
-    ):
-        pass  # TODO
diff --git a/keras_hub/src/models/rwkv7/rwkv7_causal_lm.py b/keras_hub/src/models/rwkv7/rwkv7_causal_lm.py
new file mode 100644
index 0000000000..b19ce735c2
--- /dev/null
+++ b/keras_hub/src/models/rwkv7/rwkv7_causal_lm.py
@@ -0,0 +1,182 @@
+from keras import ops
+
+from keras_hub.src.api_export import keras_hub_export
+from keras_hub.src.models.causal_lm import CausalLM
+from keras_hub.src.models.rwkv7.rwkv7_backbone import RWKV7Backbone
+from keras_hub.src.models.rwkv7.rwkv7_causal_lm_preprocessor import (
+    RWKV7CausalLMPreprocessor,
+)
+from keras_hub.src.utils.tensor_utils import any_equal
+
+
+@keras_hub_export("keras_hub.models.RWKV7CausalLM")
+class RWKV7CausalLM(CausalLM):
+    backbone_cls = RWKV7Backbone
+    preprocessor_cls = RWKV7CausalLMPreprocessor
+
+    def __init__(self, backbone, preprocessor=None, **kwargs):
+        # === Layers ===
+        self.backbone = backbone
+        self.preprocessor = preprocessor
+        super().__init__(
+            inputs=backbone.inputs,
+            outputs=backbone.outputs,
+            **kwargs,
+        )
+        self.call(ops.ones([1, 16], "int32"))
+
+    def call_with_cache(
+        self,
+        token_ids,
+        cache,
+        compute_head=True,
+        padding_mask=None,
+        rnn_mode=True,
+    ):
+        state_cache, last_token_cache = cache
+        x = self.backbone.token_embedding(token_ids)
+        if padding_mask is None:
+            padding_mask = ops.not_equal(token_ids, 0)
+        v_first = None
+        updated_state_cache = []
+        updated_last_token_cache = []
+
+        for i in range(self.backbone.num_layers):
+            current_state_cache = state_cache[:, i, ...]
+            current_token_cache = last_token_cache[:, i, ...]
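+            # Per-layer cache slices (shapes defined in `_build_cache`):
+            #   current_state_cache: [batch, num_heads, head_size, head_size],
+            #     the recurrent WKV state of this layer.
+            #   current_token_cache: [batch, 2, 1, hidden_size], the last
+            #     input token seen by the time-mix (index 0) and channel-mix
+            #     (index 1) token-shift branches.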
+ x, v_first, new_cache_state, cache_tmix_x, cache_cmix_x = ( + self.backbone.rwkv_layers[i].call( + x, + v_first=v_first, + padding_mask=padding_mask, + cache_state=current_state_cache, + cache_tmix_x=current_token_cache[:, 0], + cache_cmix_x=current_token_cache[:, 1], + rnn_mode=rnn_mode, + train_mode=False, + ) + ) + new_token_cache = ops.stack([cache_tmix_x, cache_cmix_x], axis=1) + updated_state_cachce.append(new_cache_state) + updated_last_token_cache.append(new_token_cache) + cache = [ + ops.stack(updated_state_cachce, axis=1), + ops.stack(updated_last_token_cache, axis=1), + ] + hidden_states = x = self.backbone.output_layer_norm(x) + if compute_head: + logits = self.backbone.head(x) + else: + logits = None + return logits, hidden_states, cache + + def _build_cache(self, token_ids): + """Build an empty cache for use with `call_with_cache()`.""" + batch_size = ops.shape(token_ids)[0] + num_layers = self.backbone.num_layers + head_dim = self.backbone.head_size + hidden_size = self.backbone.hidden_size + num_heads = hidden_size // head_dim + + state_cachce = ops.zeros( + [batch_size, num_layers, num_heads, head_dim, head_dim], + dtype=self.compute_dtype, + ) + last_token_cache = ops.zeros( + [batch_size, num_layers, 2, 1, hidden_size], + dtype=self.compute_dtype, + ) + cache = [state_cachce, last_token_cache] + + # Seed the cache. + # prefill阶段可以使用kernel,要快一点 + _, hidden_states, cache = self.call_with_cache( + token_ids, + cache, + rnn_mode=False, + compute_head=False, + ) + + return hidden_states, cache + + def generate_step( + self, + inputs, + stop_token_ids=None, + ): + """A compilable generation function for a single batch of inputs. + + This function represents the inner, XLA-compilable, generation function + for a single batch of inputs. Inputs should have the same structure as + model inputs, a dictionary with keys `"token_ids"` and `"padding_mask"`. + + Args: + inputs: A dictionary with two keys `"token_ids"` and + `"padding_mask"` and batched tensor values. + stop_token_ids: Tuple of id's of the end token to stop on. If all + sequences have produced a new stop token, generation + will stop. + """ + token_ids, padding_mask, predict_token_ids = ( + inputs["token_ids"], + inputs["padding_mask"], + inputs["predict_token_ids"], + ) + # Create and seed cache with a single forward pass. + + hidden_states, cache = self._build_cache(token_ids) + + def next(prompt, cache, index): + # The cache index is the index of our previous token. + cache_update_index = index - 1 + batch_size = ops.shape(prompt)[0] + prompt = ops.slice(prompt, [0, cache_update_index], [batch_size, 1]) + logits, hidden_states, cache = self.call_with_cache( + prompt, + cache, + ) + return ( + ops.squeeze(logits, axis=1), + ops.squeeze(hidden_states, axis=1), + cache, + ) + + output_ids = self.sampler( + next=next, + prompt=predict_token_ids, + cache=cache, + index=1, + mask=padding_mask, + stop_token_ids=stop_token_ids, + hidden_states=hidden_states, + model=self, + ) + padding_mask = ops.concatenate( + [ + ops.cast(ops.not_equal(token_ids, 0), padding_mask.dtype), + padding_mask, + ], + axis=1, + ) + token_ids = ops.concatenate([token_ids, output_ids], axis=1) + + # Compute an output padding mask with the token ids we updated. + if stop_token_ids is not None: + # Build a mask of stop token locations not in the original + # prompt (not in locations where `padding_mask` is True). 
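A worked NumPy example (illustrative) of the cumsum trick that the next few lines implement:

```python
import numpy as np

end_locations = np.array([0, 0, 1, 0, 0])  # stop token generated at index 2
cumsum = np.cumsum(end_locations)          # [0, 0, 1, 1, 1]
overflow = cumsum - end_locations          # [0, 0, 0, 1, 1]
padding_mask = ~overflow.astype(bool)      # keep up to and including the stop
print(padding_mask)                        # [ True  True  True False False]
```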
+ end_locations = any_equal( + token_ids, stop_token_ids, ops.logical_not(padding_mask) + ) + end_locations = ops.cast(end_locations, "int32") + # Use cumsum to get ones in all locations after end_locations. + cumsum = ops.cast(ops.cumsum(end_locations, axis=-1), "int32") + overflow = cumsum - end_locations + # Our padding mask is the inverse of these overflow locations. + padding_mask = ops.logical_not(ops.cast(overflow, "bool")) + else: + # Without early stopping, all locations will have been updated. + padding_mask = ops.ones_like(token_ids, dtype="bool") + return { + "token_ids": token_ids, + "padding_mask": padding_mask, + } diff --git a/keras_hub/src/models/rwkv7/rwkv7_causal_lm_preprocessor.py b/keras_hub/src/models/rwkv7/rwkv7_causal_lm_preprocessor.py index 9a8a88211c..064e143c9e 100644 --- a/keras_hub/src/models/rwkv7/rwkv7_causal_lm_preprocessor.py +++ b/keras_hub/src/models/rwkv7/rwkv7_causal_lm_preprocessor.py @@ -1,10 +1,11 @@ import keras +from keras import ops from keras_hub.src.api_export import keras_hub_export +from keras_hub.src.layers.preprocessing.start_end_packer import StartEndPacker from keras_hub.src.models.causal_lm_preprocessor import CausalLMPreprocessor from keras_hub.src.models.rwkv7.rwkv7_backbone import RWKV7Backbone from keras_hub.src.models.rwkv7.rwkv7_tokenizer import RWKVTokenizer -from keras_hub.src.utils.tensor_utils import strip_to_ragged @keras_hub_export("keras_hub.models.RWKV7CausalLMPreprocessor") @@ -30,20 +31,33 @@ def call( sequence_length=None, ): sequence_length = sequence_length or self.sequence_length + # padding 长度到16的倍数,适应kernel的需求 + sequence_length = sequence_length + (16 - sequence_length % 16) x = self.tokenizer(x) - # Pad with one extra token to account for the truncation below. + token_ids, padding_mask = self.packer( - x, - sequence_length=sequence_length + 1, - add_start_value=self.add_start_token, - add_end_value=self.add_end_token, + x, sequence_length=sequence_length, add_end_value=False ) + # The last token does not have a next token, so we truncate it out. x = token_ids[..., :-1] # Target `y` will be the next token. y, sample_weight = token_ids[..., 1:], padding_mask[..., 1:] return keras.utils.pack_x_y_sample_weight(x, y, sample_weight) + def build(self, input_shape): + # Defer packer creation to `build()` so that we can be sure tokenizer + # assets have loaded when restoring a saved model. 
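One note on the rounding used in `call()` above: `n + (16 - n % 16)` always adds a full block when `n` is already a multiple of 16. A quick check (illustrative):

```python
def round_up(n, multiple=16):
    return n + (multiple - n % multiple)

assert round_up(10) == 16
assert round_up(17) == 32
assert round_up(32) == 48  # an exact multiple is still bumped a full block

# If exact multiples should pass through unchanged, the usual alternative
# is n + (-n % 16), which maps 32 -> 32.
```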
+ self.packer = StartEndPacker( + start_value=None, + end_value=None, + pad_value=self.tokenizer.pad_token_id, + sequence_length=self.sequence_length, + return_padding_mask=True, + padding_side="left", + ) + self.built = True + def generate_preprocess( self, x, @@ -62,12 +76,33 @@ def generate_preprocess( """ if not self.built: self.build(None) + # 这么做的目的是为了对齐keras的api + # 输入的sequence_length是生成的最大长度 + # 而本身sequence_length则对应于prefill的最大长度 + generate_length = sequence_length + sequence_length = self.sequence_length - x = self.tokenizer(x) - token_ids, padding_mask = self.packer( + # padding 长度到16的倍数,适应kernel的需求 + sequence_length = sequence_length + (16 - sequence_length % 16) + generate_length = generate_length + (16 - generate_length % 16) + + x = [t[-sequence_length:] for t in self.tokenizer(x)] + y = ops.zeros((len(x), generate_length), "int32") + start_token = [[t[-1]] for t in x] + x = [t[:-1] if len(t) > 1 else [0] for t in x] + + token_ids, __ = self.packer( x, sequence_length=sequence_length, add_end_value=False ) - return token_ids + start_token = ops.convert_to_tensor(start_token, "int32") + y = ops.slice_update(y, [0, 0], start_token) + padding_mask = ops.not_equal(y, 0) + + return { + "token_ids": token_ids, + "padding_mask": padding_mask, + "predict_token_ids": y, + } def generate_postprocess( self, @@ -83,6 +118,6 @@ def generate_postprocess( self.build(None) token_ids, padding_mask = x["token_ids"], x["padding_mask"] - ids_to_strip = self.tokenizer.special_token_ids - token_ids = strip_to_ragged(token_ids, padding_mask, ids_to_strip) - return self.tokenizer.detokenize(token_ids) + token_ids = ops.convert_to_numpy(token_ids) + padding_mask = ops.convert_to_numpy(padding_mask) + return self.tokenizer.detokenize(token_ids * padding_mask) diff --git a/keras_hub/src/models/rwkv7/rwkv7_layer.py b/keras_hub/src/models/rwkv7/rwkv7_layer.py index 217f42ee32..0df1c14322 100644 --- a/keras_hub/src/models/rwkv7/rwkv7_layer.py +++ b/keras_hub/src/models/rwkv7/rwkv7_layer.py @@ -1,23 +1,12 @@ -import warnings - import keras from keras import initializers from keras import ops from keras.layers import Layer +import warnings def transpose_head(x, head_first): - """ - Transpose the input tensor. - - Parameters: - x: Input tensor. - head_first: Boolean flag indicating whether to transpose. - - Returns: - Transposed tensor if head_first is True, otherwise the original tensor. - """ - x = ops.cast(x, "float32") + x = ops.cast(x, dtype="float32") if head_first: return ops.transpose(x, (0, 2, 1, 3)) else: @@ -66,20 +55,11 @@ def rnn_generalized_delta_rule( if ops.shape(state)[0] == 1: state = ops.broadcast_to(state, (B, H, N, N)) else: - state = ops.zeros((B, H, N, N), dtype="float32") - out = ops.zeros((B, T, H, N), dtype=r.dtype) + state = ops.zeros((B, H, N, N)) + state = ops.cast(state, "float32") + out = ops.zeros((B, T, H, N), DTYPE) def step(t, inputs): - """ - Performs computation for a single time step. - - Parameters: - t: Current time step. - inputs: List containing current state and output. - - Returns: - Updated state and output. 
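The recurrence that `step` computes, written out for a single head in plain NumPy (a reference sketch, not the patch's code):

```python
import numpy as np

N = 4  # head size
rng = np.random.default_rng(0)
S = np.zeros((N, N))  # recurrent state
r, w, k, v, a, b = (rng.standard_normal(N) for _ in range(6))

# One time step: decay the state columns by w, apply the rank-1 "delta"
# correction through a and b, then write the new value/key outer product.
S = S * w[None, :] + S @ a[:, None] @ b[None, :] + v[:, None] @ k[None, :]
o = S @ r[:, None]  # per-step output read out by the receptance r
```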
- """ state, out = inputs kk = ops.reshape(k[:, t, :], (B, H, 1, N)) rr = ops.reshape(r[:, t, :], (B, H, N, 1)) @@ -87,9 +67,8 @@ def step(t, inputs): aa = ops.reshape(a[:, t, :], (B, H, N, 1)) bb = ops.reshape(b[:, t, :], (B, H, 1, N)) state = state * w[:, t, :, None, :] + state @ aa @ bb + vv @ kk - out = ops.slice_update( - out, [0, t, 0, 0], ops.reshape((state @ rr), (B, 1, H, N)) - ) + o = ops.cast((state @ rr), out.dtype) + out = ops.slice_update(out, [0, t, 0, 0], ops.reshape(o, (B, 1, H, N))) return [state, out] state, out = ops.fori_loop(0, T, step, [state, out]) @@ -104,12 +83,11 @@ def __init__(self, name="time_shift"): super(TimeShift, self).__init__(name=name) def call(self, inputs, cache_x=None): - x = ops.pad(inputs, [[0, 0], [1, 0], [0, 0]], constant_values=0.0)[ - :, :-1, : - ] if cache_x is not None: - x = ops.slice_update(x, [0, 0, 0], cache_x) - return x + x = ops.concatenate([cache_x, inputs], axis=1) + else: + x = ops.pad(inputs, [[0, 0], [1, 0], [0, 0]], constant_values=0.0) + return x[:, :-1, :] def compute_output_shape(self, input_shape): return input_shape @@ -121,18 +99,16 @@ def __init__(self, dim_ffn, kernel_initializer="glorot_uniform", **kwargs): self.dim_ffn = dim_ffn self.kernel_initializer = initializers.get(kernel_initializer) - def call(self, x, last_cache_x=None): - if last_cache_x is None: - xx = self.time_shift(x) - x - else: - xx = self.time_shift(x, last_cache_x) - x - last_cache_x = x[:, -1:, :] + def call(self, x, last_cache_x=None, train_mode=True): + xx = self.time_shift(x, last_cache_x) - x + if last_cache_x is not None or not train_mode: + last_cache_x = x[:, -1:] k = x + xx * self.x_k k = ops.relu(self.key(k)) ** 2 output = self.value(k) - if last_cache_x is not None: - output = [output, last_cache_x] - return output + if train_mode: + return output + return output, last_cache_x def compute_output_shape(self, input_shape): if isinstance(input_shape, list): @@ -167,9 +143,7 @@ def build(self, input_shape): def get_config(self): config = { "dim_ffn": self.dim_ffn, - "kernel_initializer": initializers.serialize( - self.kernel_initializer - ), + "kernel_initializer": initializers.serialize(self.kernel_initializer), } base_config = super().get_config() return dict(list(base_config.items()) + list(config.items())) @@ -209,22 +183,19 @@ def __init__( self.kernel_initializer = initializers.get(kernel_initializer) self.initial_state = None try: - from rwkv_ops import RWKV7_USE_KERNEL from rwkv_ops import generalized_delta_rule + + self.RWKV7_OP = generalized_delta_rule except ImportError: warnings.warn( "The 'rwkv_ops' package is not installed. " - "Falling back to a pure-Python operator,that will very slow." + "Falling back to the default (pure-Python) operators, which will be very slow. 
" "Please install 'rwkv_ops' to enable the optimized kernels.", UserWarning, stacklevel=2, ) - generalized_delta_rule = rnn_generalized_delta_rule - RWKV7_USE_KERNEL = False - self.RWKV7_OP, self.USE_KERNEL = ( - generalized_delta_rule, - RWKV7_USE_KERNEL, - ) + self.RWKV7_OP = rnn_generalized_delta_rule + assert self.hidden_size % self.n_head == 0 def build(self, input_shape): @@ -358,8 +329,9 @@ def call( last_cache_x=None, cache_state=None, rnn_mode=False, + train_mode=True, ): - if cache_state is None: + if cache_state == None: initial_state = self.initial_state else: initial_state = cache_state @@ -370,11 +342,11 @@ def call( x *= padding_mask B, T, C = ops.shape(x) H = self.n_head - if last_cache_x is None: - xx = self.time_shift(x) - x - else: - xx = self.time_shift(x, last_cache_x) - x - last_cache_x = x[:, -1:, :] + xx = self.time_shift(x, last_cache_x) - x + if last_cache_x is not None or not train_mode: + last_cache_x = x[:, -1:] + if padding_mask is not None: + xx *= padding_mask xr = x + xx * self.x_r xw = x + xx * self.x_w @@ -386,10 +358,7 @@ def call( r = self.receptance(xr) w = ( -ops.softplus( - -( - self.w0 - + ops.matmul(ops.tanh(ops.matmul(xw, self.w1)), self.w2) - ) + -(self.w0 + ops.matmul(ops.tanh(ops.matmul(xw, self.w1)), self.w2)) ) - 0.5 ) # soft-clamp to (-inf, -0.5) @@ -414,27 +383,30 @@ def call( k = k * (1 + (a - 1) * self.k_a) if padding_mask is not None: - v *= padding_mask - if self.USE_KERNEL: - w += (1 - padding_mask) * -1e9 - else: - w = w * padding_mask + 1 - padding_mask - # N = self.head_size + w = ops.where(padding_mask, w, -1e9) if rnn_mode: rwkv7_op = rnn_generalized_delta_rule else: rwkv7_op = self.RWKV7_OP - x, finnal_state = rwkv7_op( - ops.reshape(r, (B, T, self.n_head, self.head_size)), - ops.reshape(w, (B, T, self.n_head, self.head_size)), - ops.reshape(k, (B, T, self.n_head, self.head_size)), - ops.reshape(v, (B, T, self.n_head, self.head_size)), - ops.reshape(-kk, (B, T, self.n_head, self.head_size)), - ops.reshape(kk * a, (B, T, self.n_head, self.head_size)), - initial_state=initial_state, - ) - x = ops.reshape(x, (B, T, C)) + def reshape_and_cast(x, new_shape, dtype="float32"): + x = ops.reshape(x, new_shape) + if rnn_mode: + return x + return ops.cast(x, dtype) + + x, finnal_state = rwkv7_op( + reshape_and_cast(r, (B, T, self.n_head, self.head_size)), + reshape_and_cast(w, (B, T, self.n_head, self.head_size)), + reshape_and_cast(k, (B, T, self.n_head, self.head_size)), + reshape_and_cast(v, (B, T, self.n_head, self.head_size)), + reshape_and_cast(-kk, (B, T, self.n_head, self.head_size)), + reshape_and_cast(kk * a, (B, T, self.n_head, self.head_size)), + initial_state=ops.cast(initial_state, "float32") + if initial_state is not None + else None, + ) + x = reshape_and_cast(x, (B, T, C), self.compute_dtype) x = ops.reshape(self.ln_x(ops.reshape(x, (B * T, C))), ops.shape(x)) @@ -449,10 +421,9 @@ def call( x = x + ops.reshape(rwkv, (B, T, C)) x = self.output_layer(x * g) - output = [x, v_first] - if last_cache_x is not None: - output.extend([last_cache_x, finnal_state]) - return output + if train_mode: + return x, v_first + return x, v_first, last_cache_x, finnal_state def compute_output_shape(self, input_shape): output_shapes = [ @@ -484,9 +455,7 @@ def get_config(self): "mv_lora": self.mv_lora, "aaa_lora": self.aaa_lora, "decay_lora": self.decay_lora, - "kernel_initializer": initializers.serialize( - self.kernel_initializer - ), + "kernel_initializer": initializers.serialize(self.kernel_initializer), } base_config = super().get_config() 
return dict(list(base_config.items()) + list(config.items())) @@ -520,19 +489,13 @@ def __init__( def build(self, input_shape): super().build(input_shape) if self.use_initial_norm: - self.ln0 = keras.layers.LayerNormalization( - epsilon=1e-5, name="init_norm" - ) + self.ln0 = keras.layers.LayerNormalization(epsilon=1e-5, name="init_norm") self.ln0.build(input_shape) - self.ln1 = keras.layers.LayerNormalization( - epsilon=1e-5, name="att_norm" - ) + self.ln1 = keras.layers.LayerNormalization(epsilon=1e-5, name="att_norm") self.ln1.build(input_shape) - self.ln2 = keras.layers.LayerNormalization( - epsilon=1e-5, name="ffn_norm" - ) + self.ln2 = keras.layers.LayerNormalization(epsilon=1e-5, name="ffn_norm") self.ln2.build(input_shape) self.att = RWKV7_TimeMix( @@ -563,27 +526,41 @@ def call( cache_tmix_x=None, cache_cmix_x=None, rnn_mode=False, + train_mode=True, ): + if padding_mask is not None: + padding_mask = ops.cast(padding_mask, x.dtype) + padding_mask = ops.expand_dims(padding_mask, axis=-1) if self.use_initial_norm: x = self.ln0(x) - if cache_state is None: + if train_mode: xx, v_first = self.att( - self.ln1(x), v_first=v_first, padding_mask=padding_mask + self.ln1(x), + v_first=v_first, + padding_mask=padding_mask, + train_mode=train_mode, ) x = x + xx - x = x + self.ffn(self.ln2(x)) + xx = self.ln2(x) + if padding_mask is not None: + xx = xx * padding_mask + x = x + self.ffn(xx, train_mode=train_mode) return x, v_first else: - xx, v_first, cache_tmix_x, cache_state = self.att( + xx, v_first, cache_tmix_x, cache_state = self.att.call( self.ln1(x), v_first=v_first, padding_mask=padding_mask, last_cache_x=cache_tmix_x, cache_state=cache_state, rnn_mode=rnn_mode, + train_mode=train_mode, ) x = x + xx - xx, cache_cmix_x = self.ffn(self.ln2(x), cache_cmix_x) + xx = self.ln2(x) + if padding_mask is not None: + xx = xx * padding_mask + xx, cache_cmix_x = self.ffn(xx, cache_cmix_x, train_mode=train_mode) x = x + xx return x, v_first, cache_state, cache_tmix_x, cache_cmix_x @@ -604,9 +581,7 @@ def get_config(self): "decay_lora": self.decay_lora, "intermediate_dim": self.intermediate_dim, "use_initial_norm": self.use_initial_norm, - "kernel_initializer": initializers.serialize( - self.kernel_initializer - ), + "kernel_initializer": initializers.serialize(self.kernel_initializer), } base_config = super().get_config() return dict(list(base_config.items()) + list(config.items())) diff --git a/keras_hub/src/models/rwkv7/rwkv7_tokenizer.py b/keras_hub/src/models/rwkv7/rwkv7_tokenizer.py index ce2e49535a..5f5f990e4f 100644 --- a/keras_hub/src/models/rwkv7/rwkv7_tokenizer.py +++ b/keras_hub/src/models/rwkv7/rwkv7_tokenizer.py @@ -1,7 +1,6 @@ import os import keras - from keras_hub.src.api_export import keras_hub_export from keras_hub.src.tokenizers import tokenizer from keras_hub.src.utils.tensor_utils import is_int_dtype @@ -115,13 +114,7 @@ def printTokens(self, tokens): print(f"{repr(s)}{i}", end=" ") print() - -@keras_hub_export( - [ - "keras_hub.tokenizers.RWKVTokenizer", - "keras_hub.models.RWKVTokenizer", - ] -) +@keras_hub_export("keras_hub.tokenizers.RWKVTokenizer") class RWKVTokenizer(tokenizer.Tokenizer): def __init__( self, @@ -203,7 +196,7 @@ def tokenize(self, inputs): tokens = self._tokenizer.encode(inputs) def tokens2ids(x): - return [self.token_to_id(t) for t in x] + return [self.id_to_token(t) for t in x] if is_string_dtype(self.dtype): if isinstance(inputs, str): @@ -213,7 +206,11 @@ def tokens2ids(x): def detokenize(self, inputs): self._check_vocabulary() - return 
self._tokenizer.decode(inputs) + strip_zero_inputs = [] + for t in inputs: + strip_zero_inputs.append([x for x in t if x != 0]) + + return self._tokenizer.decode(strip_zero_inputs) def compute_output_spec(self, input_spec): return keras.KerasTensor( diff --git a/tools/checkpoint_conversion/convert_rwkv7_checkpoints.py b/tools/checkpoint_conversion/convert_rwkv7_checkpoints.py index 5c4b76467e..28cb1c0aff 100644 --- a/tools/checkpoint_conversion/convert_rwkv7_checkpoints.py +++ b/tools/checkpoint_conversion/convert_rwkv7_checkpoints.py @@ -23,7 +23,7 @@ from modelscope import snapshot_download from keras_hub.src.models.rwkv7.rwkv7_backbone import RWKV7Backbone -from keras_hub.src.models.rwkv7.rwkv7_casual_lm import RWKV7CausalLM +from keras_hub.src.models.rwkv7.rwkv7_causal_lm import RWKV7CausalLM # Local modules from keras_hub.src.models.rwkv7.rwkv7_tokenizer import RWKVTokenizer @@ -32,10 +32,11 @@ # Model Preset Registry # ============================================================================== PRESET_MAP = { - "rwkv7_world_0.1B": "RWKV-x070-World-0.1B-v2.8-20241210-ctx4096.pth", - "rwkv7_world_0.3B": "RWKV-x070-World-0.4B-v2.9-20250107-ctx4096.pth", - "rwkv7_world_1.5B": "RWKV-x070-World-1.5B-v3-20250127-ctx4096.pth", - "rwkv7_world_2.9B": "RWKV-x070-World-2.9B-v3-20250211-ctx4096.pth", + "RWKV7_G1a_0.1B": "rwkv7-g1a-0.1b-20250728-ctx4096.pth", + "RWKV7_G1a_0.3B": "rwkv7-g1a-0.4b-20250905-ctx4096.pth", + "RWKV7_G1a_1.5B": "rwkv7-g1a-1.5b-20250922-ctx4096.pth", + "RWKV7_G1a_2.9B": "rwkv7-g1a-2.9b-20250924-ctx4096.pth", + "RWKV7_G0a_7.2B": "rwkv7-g0a-7.2b-20250829-ctx4096.pth", } # ============================================================================== @@ -431,6 +432,8 @@ def main(_): standard_model = RWKV(args) weights = torch.load(weights_path, map_location="cpu") + # Some parameters are not present in the weights, but this does not matter. + # This is because these parameters are not used standard_model.load_state_dict(weights, strict=False) # Sanity check: tokenize & compare outputs From afcff31bf865222f338ee40d6999d7d9e3da5f74 Mon Sep 17 00:00:00 2001 From: pass_lin <935499957@qq.com> Date: Tue, 7 Oct 2025 23:19:52 +0800 Subject: [PATCH 05/10] add inference --- keras_hub/src/models/rwkv7/rwkv7_layer.py | 3 +-- tools/checkpoint_conversion/convert_rwkv7_checkpoints.py | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/keras_hub/src/models/rwkv7/rwkv7_layer.py b/keras_hub/src/models/rwkv7/rwkv7_layer.py index 0df1c14322..65acf81195 100644 --- a/keras_hub/src/models/rwkv7/rwkv7_layer.py +++ b/keras_hub/src/models/rwkv7/rwkv7_layer.py @@ -184,13 +184,12 @@ def __init__( self.initial_state = None try: from rwkv_ops import generalized_delta_rule - self.RWKV7_OP = generalized_delta_rule except ImportError: warnings.warn( "The 'rwkv_ops' package is not installed. " "Falling back to the default (pure-Python) operators, which will be very slow. 
" - "Please install 'rwkv_ops' to enable the optimized kernels.", + "Please 'pip install rwkv_ops' to enable the optimized kernels.", UserWarning, stacklevel=2, ) diff --git a/tools/checkpoint_conversion/convert_rwkv7_checkpoints.py b/tools/checkpoint_conversion/convert_rwkv7_checkpoints.py index 28cb1c0aff..4504b09253 100644 --- a/tools/checkpoint_conversion/convert_rwkv7_checkpoints.py +++ b/tools/checkpoint_conversion/convert_rwkv7_checkpoints.py @@ -409,7 +409,7 @@ def main(_): # Download checkpoint download_path = snapshot_download( - repo_id="Blink_DL/rwkv-7-world", + repo_id="RWKV/rwkv7-g1", allow_patterns=souce_model_name, ) weights_path = os.path.join(download_path, souce_model_name) From ec0baf3b27eb4d52784b73d39429c7539af55345 Mon Sep 17 00:00:00 2001 From: pass_lin <935499957@qq.com> Date: Wed, 8 Oct 2025 00:10:06 +0800 Subject: [PATCH 06/10] add tokenizer doc --- keras_hub/src/models/rwkv7/rwkv7_layer.py | 15 +- keras_hub/src/models/rwkv7/rwkv7_tokenizer.py | 179 +++++++++++++++++- 2 files changed, 180 insertions(+), 14 deletions(-) diff --git a/keras_hub/src/models/rwkv7/rwkv7_layer.py b/keras_hub/src/models/rwkv7/rwkv7_layer.py index 65acf81195..1a46a59bd8 100644 --- a/keras_hub/src/models/rwkv7/rwkv7_layer.py +++ b/keras_hub/src/models/rwkv7/rwkv7_layer.py @@ -24,19 +24,8 @@ def rnn_generalized_delta_rule( output_final_state: bool = True, head_first: bool = False, ): - """ - Implements the generalized delta rule. - - Parameters: - r: Input tensor. - w: Weight tensor. - k, v, a, b: Other input tensors. - initial_state: Initial state tensor. - output_final_state: Whether to return the final state. - head_first: Whether to place the head dimension first during computation. - - Returns: - Final state if output_final_state is True, otherwise only the output. + """Implements the generalized delta rule. + """ DTYPE = r.dtype B, T, H, N = ops.shape(r) diff --git a/keras_hub/src/models/rwkv7/rwkv7_tokenizer.py b/keras_hub/src/models/rwkv7/rwkv7_tokenizer.py index 5f5f990e4f..e2ca50e418 100644 --- a/keras_hub/src/models/rwkv7/rwkv7_tokenizer.py +++ b/keras_hub/src/models/rwkv7/rwkv7_tokenizer.py @@ -7,21 +7,34 @@ from keras_hub.src.utils.tensor_utils import is_string_dtype from keras_hub.src.utils.tensor_utils import tensor_to_list +# Vocabulary file name constant VOCAB_FILENAME = "vocab.txt" class TRIE: + """Byte-level Trie structure for longest prefix matching. + + This class implements a trie data structure that stores byte + sequences and allows efficient longest prefix matching. + """ __slots__ = tuple("ch,to,values,front".split(",")) to: list values: set def __init__(self, front=None, ch=None): + """Initialize a TRIE node. + + Args: + front: Parent node reference. + ch: Byte value for this node. + """ self.ch = ch self.to = [None for ch in range(256)] self.values = set() self.front = front def __repr__(self): + """String representation of the TRIE node.""" fr = self ret = [] while fr is not None: @@ -31,6 +44,16 @@ def __repr__(self): return "" % (ret[::-1], self.values) def add(self, key: bytes, idx: int = 0, val=None): + """Add a key-value pair to the trie. + + Args: + key: Byte sequence to add. + idx: Current index in key processing. + val: Value to store (defaults to key). + + Returns: + Final node where key was inserted. 
+ """ if idx == len(key): if val is None: val = key @@ -42,6 +65,15 @@ def add(self, key: bytes, idx: int = 0, val=None): return self.to[ch].add(key, idx=idx + 1, val=val) def find_longest(self, key: bytes, idx: int = 0): + """Find longest match in trie for given key. + + Args: + key: Byte sequence to search for. + idx: Starting index for search. + + Returns: + Tuple of (end_index, node, values) for match. + """ u: TRIE = self ch: int = key[idx] @@ -57,7 +89,18 @@ def find_longest(self, key: bytes, idx: int = 0): class RWKV_TOKENIZER: + """RWKV tokenizer implementation using byte-level trie. + + Implements tokenization using a fixed vocabulary and greedy + longest-match algorithm on byte sequences. + """ def __init__(self, vocabs): + """Initialize tokenizer with vocabulary. + + Args: + vocabs: List of vocabulary entries in format + " ". + """ self.idx2token = {} sorted = [] # must be already sorted for l in vocabs: @@ -78,6 +121,14 @@ def __init__(self, vocabs): _ = self.root.add(t, val=(t, i)) def encodeBytes(self, src: bytes): + """Encode byte sequence to token IDs. + + Args: + src: Byte sequence to encode. + + Returns: + List of token IDs. + """ idx: int = 0 tokens = [] while idx < len(src): @@ -89,15 +140,39 @@ def encodeBytes(self, src: bytes): return tokens def decodeBytes(self, tokens): + """Decode token IDs to byte sequence. + + Args: + tokens: List of token IDs. + + Returns: + Decoded byte sequence. + """ return b"".join(map(lambda i: self.idx2token[i], tokens)) def encode(self, src): + """Encode text to token IDs. + + Args: + src: Text string or list of strings. + + Returns: + Token IDs or list of token ID lists. + """ if isinstance(src, str): return self.encodeBytes(src.encode("utf-8")) else: return [self.encodeBytes(s.encode("utf-8")) for s in src] def decode(self, tokens): + """Decode token IDs to text. + + Args: + tokens: Token IDs or list of token ID lists. + + Returns: + List of decoded text strings. + """ return [self.decodeBytes(batch).decode("utf-8") for batch in tokens] # try: # return self.decodeBytes(tokens).decode('utf-8') @@ -105,6 +180,11 @@ def decode(self, tokens): # return '\ufffd' # bad utf-8 def printTokens(self, tokens): + """Print tokens with their string representations. + + Args: + tokens: List of token IDs to print. + """ for i in tokens: s = self.idx2token[i] try: @@ -114,14 +194,40 @@ def printTokens(self, tokens): print(f"{repr(s)}{i}", end=" ") print() + @keras_hub_export("keras_hub.tokenizers.RWKVTokenizer") class RWKVTokenizer(tokenizer.Tokenizer): + """RWKV byte-level tokenizer with longest-match trie search. + + This tokenizer maps raw text to a sequence of integer token ids + using a fixed vocabulary and a greedy longest-match algorithm. + + Args: + vocabulary: list of strings, each line formatted as + " ". + dtype: output dtype for tensor operations. Must be integer + or string type. + + Examples: + + >>> vocab = ["0 ' ' 1", "1 '\\n' 1", "2 'the' 3", "3 'hello' 5"] + >>> tok = RWKVTokenizer(vocabulary=vocab) + >>> tok("hello the") + [3, 0, 2] + """ def __init__( self, vocabulary=None, dtype="int32", **kwargs, ) -> None: + """Initialize RWKV tokenizer. + + Args: + vocabulary: Vocabulary list. + dtype: Output data type. + **kwargs: Additional keyword arguments. + """ if not is_int_dtype(dtype) and not is_string_dtype(dtype): raise ValueError( "Output dtype must be an integer type or a string. " @@ -136,6 +242,11 @@ def __init__( self.file_assets = [VOCAB_FILENAME] def set_vocabulary(self, vocabulary): + """Set the tokenizer vocabulary. 
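The greedy longest-match behavior documented above, reduced to a standalone toy (illustrative vocabulary, not the real vocab.txt):

```python
vocab = {b"h": 1, b"he": 2, b"hello": 3, b" ": 4, b"the": 5}

def encode(src: bytes):
    ids, i = [], 0
    while i < len(src):
        # Take the longest vocabulary entry matching at position i.
        match = max((t for t in vocab if src.startswith(t, i)), key=len)
        ids.append(vocab[match])
        i += len(match)
    return ids

print(encode(b"hello the"))  # [3, 4, 5]
```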
+ + Args: + vocabulary: Vocabulary list to set. + """ self.vocabulary = vocabulary self._tokenizer = RWKV_TOKENIZER(vocabulary) self.pad_token_id = 0 @@ -143,17 +254,28 @@ def set_vocabulary(self, vocabulary): self.end_token_id = self.tokenize(["\n\n"])[0][0] def save_assets(self, dir_path): + """Save vocabulary to directory. + + Args: + dir_path: Directory path to save to. + """ path = os.path.join(dir_path, VOCAB_FILENAME) with open(path, "wb") as file: file.write("\n".join(self.vocabulary)) def load_assets(self, dir_path=""): + """Load vocabulary from directory. + + Args: + dir_path: Directory path to load from. + """ path = os.path.join(dir_path, VOCAB_FILENAME) with open(path, "r", encoding="utf-8") as f: vocabulary = f.readlines() self.set_vocabulary(vocabulary) def _check_vocabulary(self): + """Check if vocabulary is set, raise error if not.""" if self.vocabulary is None: raise ValueError( "No vocabulary has been set for RWKVTokenizer. Make " @@ -161,14 +283,32 @@ def _check_vocabulary(self): ) def vocabulary_size(self): + """Get the size of the vocabulary. + + Returns: + Number of tokens in vocabulary. + """ self._check_vocabulary() return int(len(self.vocabulary)) def get_vocabulary(self): + """Get the current vocabulary. + + Returns: + Current vocabulary list. + """ self._check_vocabulary() return tensor_to_list(self.vocabulary) def id_to_token(self, id): + """Convert token ID to string representation. + + Args: + id: Token ID to convert. + + Returns: + String representation of token. + """ self._check_vocabulary() if id >= self.vocabulary_size() or id < 0: raise ValueError( @@ -183,6 +323,11 @@ def token_to_id(self, token): return int(self._tokenizer.token2idx[token]) def get_config(self): + """Get tokenizer configuration. + + Returns: + Configuration dictionary. + """ config = super().get_config() config.update( { @@ -192,6 +337,14 @@ def get_config(self): return config def tokenize(self, inputs): + """Tokenize input text. + + Args: + inputs: Text to tokenize. + + Returns: + Tokenized representation. + """ self._check_vocabulary() tokens = self._tokenizer.encode(inputs) @@ -205,6 +358,14 @@ def tokens2ids(x): return tokens def detokenize(self, inputs): + """Convert tokens back to text. + + Args: + inputs: Tokens to convert. + + Returns: + Detokenized text. + """ self._check_vocabulary() strip_zero_inputs = [] for t in inputs: @@ -213,9 +374,25 @@ def detokenize(self, inputs): return self._tokenizer.decode(strip_zero_inputs) def compute_output_spec(self, input_spec): + """Compute output specification. + + Args: + input_spec: Input specification. + + Returns: + Output tensor specification. + """ return keras.KerasTensor( input_spec.shape + (None,), dtype=self.compute_dtype ) def call(self, inputs): - return self.tokenize(inputs) + """Call the tokenizer on inputs. + + Args: + inputs: Input text. + + Returns: + Tokenized output. 
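Putting the asset-loading and detokenization pieces documented above together (hypothetical paths; `0` is the pad id that `detokenize` strips):

```python
tok = RWKVTokenizer()
tok.load_assets("path/to/preset")  # reads vocab.txt from the directory

ids = tok.tokenize(["Hello World"])
padded = [[0, 0] + ids[0]]         # left-padding with the pad id
text = tok.detokenize(padded)      # zeros are stripped before decoding
```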
+ """ + return self.tokenize(inputs) \ No newline at end of file From bd6c6187449d3bf5e284a4f776ed5a1cf464ec71 Mon Sep 17 00:00:00 2001 From: pass_lin <935499957@qq.com> Date: Wed, 8 Oct 2025 00:45:04 +0800 Subject: [PATCH 07/10] add doc --- keras_hub/src/models/rwkv7/rwkv7_backbone.py | 65 ++++++++ keras_hub/src/models/rwkv7/rwkv7_causal_lm.py | 78 +++++++++- .../rwkv7/rwkv7_causal_lm_preprocessor.py | 141 +++++++++++++++--- keras_hub/src/models/rwkv7/rwkv7_layer.py | 129 ++++++++++++++-- keras_hub/src/models/rwkv7/rwkv7_tokenizer.py | 74 ++++----- 5 files changed, 415 insertions(+), 72 deletions(-) diff --git a/keras_hub/src/models/rwkv7/rwkv7_backbone.py b/keras_hub/src/models/rwkv7/rwkv7_backbone.py index 3c3fcc85b8..de460d95e9 100644 --- a/keras_hub/src/models/rwkv7/rwkv7_backbone.py +++ b/keras_hub/src/models/rwkv7/rwkv7_backbone.py @@ -12,6 +12,54 @@ def rwkv7_kernel_initializer(stddev=0.02): @keras_hub_export("keras_hub.models.RWKV7Backbone") class RWKV7Backbone(Backbone): + """The [RWKV-7](https://arxiv.org/abs/2503.14456) core architecture. + + This network implements a Modern RNN architecture based on linear + attention mechanisms with recurrent processing, as described in the + RWKV papers. It includes the embedding lookups and RWKV-7 blocks. + + The default constructor gives a fully customizable, randomly initialized + RWKV-7 model with any number of layers, heads, and embedding dimensions. + To load preset architectures and weights, use the `from_preset` + constructor. + + Args: + hidden_size: int. The size of the transformer encoding and pooling + layers. + head_size: int. The size of each attention head. + num_layers: int. The number of transformer layers. + vocabulary_size: int. The size of the token vocabulary. + intermediate_dim: int. The output dimension of the first Dense layer in + a two-layer feedforward network for each transformer. + gate_lora: int. LoRA dimension for gating. + mv_lora: int. LoRA dimension for value mixing. + aaa_lora: int. LoRA dimension for alpha parameters. + decay_lora: int. LoRA dimension for decay parameters. + dtype: string or `keras.mixed_precision.DTypePolicy`. The dtype to use + for model computations and weights. Note that some computations, + such as softmax and layer normalization, will always be done at + float32 precision regardless of dtype. + dropout_rate: float. Dropout rate for the dropout layer. + + Examples: + + ```python + input_data = np.ones(shape=(1, 12), dtype="int32") + + + # Randomly initialized RWKV-7 decoder with custom config. + model = keras_hub.models.RWKV7Backbone( + vocabulary_size=10, + hidden_size=512, + num_layers=2, + head_size=64, + intermediate_dim=1024, + dtype="float32" + ) + model(input_data) + ``` + """ + def __init__( self, hidden_size, @@ -27,6 +75,22 @@ def __init__( dropout_rate=0, **kwargs, ): + """Initialize RWKV7 backbone. + + Args: + hidden_size: Hidden dimension size. + head_size: Attention head size. + num_layers: Number of RWKV blocks. + vocabulary_size: Size of vocabulary. + intermediate_dim: Intermediate dimension for FFN. + gate_lora: LoRA dimension for gating. + mv_lora: LoRA dimension for value mixing. + aaa_lora: LoRA dimension for alpha parameters. + decay_lora: LoRA dimension for decay parameters. + dtype: Data type for the layer. + dropout_rate: Dropout rate for regularization. + **kwargs: Additional arguments. 
+ """ # === Layers === self.token_embedding = keras.layers.Embedding( input_dim=vocabulary_size, @@ -90,6 +154,7 @@ def __init__( dtype=dtype, **kwargs, ) + # Initialize the graph to avoid potential errors in some cases self.call(ops.ones([1, 16], "int32")) self.num_layers = num_layers diff --git a/keras_hub/src/models/rwkv7/rwkv7_causal_lm.py b/keras_hub/src/models/rwkv7/rwkv7_causal_lm.py index b19ce735c2..a57d1d1773 100644 --- a/keras_hub/src/models/rwkv7/rwkv7_causal_lm.py +++ b/keras_hub/src/models/rwkv7/rwkv7_causal_lm.py @@ -11,10 +11,60 @@ @keras_hub_export("keras_hub.models.RWKV7CausalLM") class RWKV7CausalLM(CausalLM): + """An end-to-end RWKV-7 model for causal language modeling. + + A causal language model (LM) predicts the next token based on previous + tokens. This task setup can be used to train the model unsupervised on + plain text input, or to autoregressively generate plain text similar to + the data used for training. This task can be used for pre-training or + fine-tuning a RWKV-7 model, simply by calling `fit()`. + + This model has a generate() method, which generates text based on a + prompt. The generation strategy used is controlled by an additional + sampler argument on `compile()`. You can recompile the model with + different `keras_hub.samplers` objects to control the generation. By + default, `"greedy"` sampling will be used. + + Args: + backbone: A `keras_hub.models.RWKV7Backbone` instance. + preprocessor: A `keras_hub.models.RWKV7CausalLMPreprocessor` or `None`. + If `None`, this model will not apply preprocessing, and inputs + should be preprocessed before calling the model. + + Examples: + ```python + # Initialize the tokenizer and load assets from a local path. + tokenizer = RWKVTokenizer() + tokenizer.load_assets(rwkv_path) + + # Create a preprocessor with a sequence length of 8. + preprocessor = RWKV7CausalLMPreprocessor(tokenizer, sequence_length=8) + + # Initialize the model with a backbone and preprocessor. + causal_lm = RWKV7CausalLM(backbone, preprocessor) + + prompts = ["Bubble sort\n```python", "Hello World\n```python\n"] + + causal_lm.compile(sampler="greedy") + + outputs = causal_lm.generate(prompts, max_length=128) + for out in outputs: + print(out) + print("-" * 100) + ``` + """ + backbone_cls = RWKV7Backbone preprocessor_cls = RWKV7CausalLMPreprocessor def __init__(self, backbone, preprocessor=None, **kwargs): + """Initialize the RWKV-7 causal language model. + + Args: + backbone: The backbone model. + preprocessor: The preprocessor for tokenization. + **kwargs: Additional keyword arguments. + """ # === Layers === self.backbone = backbone self.preprocessor = preprocessor @@ -33,6 +83,26 @@ def call_with_cache( padding_mask=None, rnn_mode=True, ): + """Forward pass of `RWKV7CausalLM` with cache. + + `call_with_cache` adds an additional forward pass for the model for + autoregressive inference. Unlike calling the model directly, this method + allows caching previous state Tensors in RWKV layers, and avoids + recomputing the outputs of seen tokens. + + Args: + token_ids: a dense int Tensor with shape `(batch_size, max_length)`. + cache: a dense float Tensor, the cache of state and token values. + compute_head: bool, whether to compute the output head. + padding_mask: a dense bool Tensor, the padding mask. + rnn_mode: bool, whether to use RNN mode. + + Returns: + A (logits, hidden_states, cache) tuple. 
Where `logits` is the + language model logits for the input token_ids, `hidden_states` is + the final hidden representation of the input tokens, and `cache` is + the decoding cache. + """ state_cachce, last_token_cache = cache x = self.backbone.token_embedding(token_ids) if padding_mask is None: @@ -89,7 +159,7 @@ def _build_cache(self, token_ids): cache = [state_cachce, last_token_cache] # Seed the cache. - # prefill阶段可以使用kernel,要快一点 + # Prefill stage can use kernel for better performance _, hidden_states, cache = self.call_with_cache( token_ids, cache, @@ -111,8 +181,8 @@ def generate_step( model inputs, a dictionary with keys `"token_ids"` and `"padding_mask"`. Args: - inputs: A dictionary with two keys `"token_ids"` and - `"padding_mask"` and batched tensor values. + inputs: A dictionary with keys `"token_ids"`, `"padding_mask"`, and + `"predict_token_ids"` with batched tensor values. stop_token_ids: Tuple of id's of the end token to stop on. If all sequences have produced a new stop token, generation will stop. @@ -179,4 +249,4 @@ def next(prompt, cache, index): return { "token_ids": token_ids, "padding_mask": padding_mask, - } + } \ No newline at end of file diff --git a/keras_hub/src/models/rwkv7/rwkv7_causal_lm_preprocessor.py b/keras_hub/src/models/rwkv7/rwkv7_causal_lm_preprocessor.py index 064e143c9e..6187a07f35 100644 --- a/keras_hub/src/models/rwkv7/rwkv7_causal_lm_preprocessor.py +++ b/keras_hub/src/models/rwkv7/rwkv7_causal_lm_preprocessor.py @@ -10,6 +10,87 @@ @keras_hub_export("keras_hub.models.RWKV7CausalLMPreprocessor") class RWKV7CausalLMPreprocessor(CausalLMPreprocessor): + """RWKV-7 Causal LM preprocessor. + + This preprocessing layer is meant for use with + `keras_hub.models.RWKV7CausalLM`. By default, it will take in batches of + strings, and return outputs in a `(x, y, sample_weight)` format, where the + `y` label is the next token id in the `x` sequence. + + For use with generation, the layer also exposes two methods + `generate_preprocess()` and `generate_postprocess()`. When this preprocessor + is attached to a `keras_hub.models.RWKV7CausalLM` instance, these methods + will be called implicitly in generate(). They can also be called + standalone (e.g. to precompute preprocessing inputs for generation in a + separate process). + + Args: + tokenizer: A `keras_hub.models.RWKVTokenizer` instance. + sequence_length: The length of the packed inputs. + add_start_token: If `True`, the preprocessor will prepend the tokenizer + start token to each input sequence. Default is `False`. + + Call arguments: + x: A string, `tf.Tensor` or list of python strings. + y: Label data. Should always be `None` as the layer generates labels. + sample_weight: Label weights. Should always be `None` as the layer + generates label weights. + sequence_length: Pass to override the configured sequence_length of + the layer. + + + Examples: + ```python + # Initialize the tokenizer and load assets from a local path. + tokenizer = RWKVTokenizer() + tokenizer.load_assets(rwkv_path) + + # Create a preprocessor with a sequence length of 8. + preprocessor = RWKV7CausalLMPreprocessor(tokenizer, sequence_length=8) + + # Tokenize and pack a batch of sentences. + preprocessor(["Bubble sort\n```python", "Hello World\n```python\n"]) + + # Preprocess inputs for generation with a maximum generation length of 16. 
+ preprocessor.generate_preprocess( + ["Bubble sort\n```python", "Hello World\n```python\n"], 16 + ) + ``` + Outputs (torch Backend) : + tensor([[ 0, 0, 0, 0, 0, 0, 0, 0, 0, 893, + 1760, 2011, 32082, 11, 6884], + [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 33155, 37576, 11, 6884, 42114]], dtype=torch.int32), + tensor([[ 0, 0, 0, 0, 0, 0, 0, 0, 893, 1760, + 2011, 32082, 11, 6884, 42114], + [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 33155, + 37576, 11, 6884, 42114, 11]], dtype=torch.int32), + tensor([[False, False, False, False, False, False, False, False, True, + True, True, True, True, True, True], + [False, False, False, False, False, False, False, False, False, + True, True, True, True, True, True]]) + + {'token_ids': tensor([[ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 893, 1760, 2011, 32082, 11, 6884], + [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 33155, 37576, 11, 6884, 42114]], dtype=torch.int32), + 'padding_mask': tensor([[ True, False, False, False, False, False, False, + False, False, False, False, False, False, False, False, False, + False, False, False, False, False, False, False, False, False, + False, False, False, False, False, False, False], + [True, False, False, False, False, False, False, False, False, + False, False, False, False, False, False, False, False, False, + False, False, False, False, False, False, False, False, False, + False, False, False, False, False]]), + 'predict_token_ids': tensor([[42114, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0], + [ 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0]], dtype=torch.int32)} + """ backbone_cls = RWKV7Backbone tokenizer_cls = RWKVTokenizer @@ -19,6 +100,13 @@ def __init__( add_start_token=False, **kwargs, ): + """Initialize the preprocessor. + + Args: + tokenizer: The tokenizer to use. + add_start_token: Whether to add start token. + **kwargs: Additional arguments. + """ super().__init__( tokenizer=tokenizer, add_start_token=add_start_token, **kwargs ) @@ -30,8 +118,19 @@ def call( sample_weight=None, sequence_length=None, ): + """Preprocess the input for training. + + Args: + x: Input text data. + y: Target data (optional). + sample_weight: Sample weights (optional). + sequence_length: Desired sequence length. + + Returns: + Preprocessed data tuple (x, y, sample_weight). + """ sequence_length = sequence_length or self.sequence_length - # padding 长度到16的倍数,适应kernel的需求 + # Pad length to multiples of 16 to meet kernel requirements sequence_length = sequence_length + (16 - sequence_length % 16) x = self.tokenizer(x) @@ -46,15 +145,13 @@ def call( return keras.utils.pack_x_y_sample_weight(x, y, sample_weight) def build(self, input_shape): - # Defer packer creation to `build()` so that we can be sure tokenizer - # assets have loaded when restoring a saved model. self.packer = StartEndPacker( start_value=None, end_value=None, pad_value=self.tokenizer.pad_token_id, sequence_length=self.sequence_length, return_padding_mask=True, - padding_side="left", + padding_side="left", # RWKV uses left-padding exclusively ) self.built = True @@ -63,31 +160,31 @@ def generate_preprocess( x, sequence_length=None, ): - """Convert strings to integer token input for generation. - - Similar to calling the layer for training, this method takes in strings - or tensor strings, tokenizes and packs the input, and computes a padding - mask masking all inputs not filled in with a padded value. 
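A concrete walk-through (hypothetical ids) of the prefill/decode split that `generate_preprocess` sets up:

```python
# Prompt ids after tokenization; assume prefill and decode lengths of 16.
prompt = [5, 6, 7]

start_token = prompt[-1]            # 7: the first token the decoder sees
prefill = prompt[:-1]               # [5, 6], left-padded out to length 16
predict = [start_token] + [0] * 15  # decode buffer seeded at index 0

# "token_ids" carries the left-padded prefill, "predict_token_ids" the
# seeded decode buffer, and generation then starts at index 1.
```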
- - Unlike calling the layer for training, this method does not compute - labels and will never append a `tokenizer.end_token_id` to the end of - the sequence (as generation is expected to continue at the end of the - inputted prompt). + """Preprocess input for generation. + + Args: + x: Input text data. + sequence_length: Maximum generation length. + + Returns: + Dictionary with preprocessed inputs for generation. """ if not self.built: self.build(None) - # 这么做的目的是为了对齐keras的api - # 输入的sequence_length是生成的最大长度 - # 而本身sequence_length则对应于prefill的最大长度 + # Align with Keras API + # Input sequence_length is the maximum generation length + # While self.sequence_length corresponds to the prefill max length generate_length = sequence_length sequence_length = self.sequence_length - # padding 长度到16的倍数,适应kernel的需求 + # Pad length to multiples of 16 to meet kernel requirements sequence_length = sequence_length + (16 - sequence_length % 16) generate_length = generate_length + (16 - generate_length % 16) x = [t[-sequence_length:] for t in self.tokenizer(x)] y = ops.zeros((len(x), generate_length), "int32") + # Utilize RNN characteristics where prefill and decode are two sequences + # But the first token of decode should be the last token of prefill start_token = [[t[-1]] for t in x] x = [t[:-1] if len(t) > 1 else [0] for t in x] @@ -109,10 +206,16 @@ def generate_postprocess( x, ): """Convert integer token output to strings for generation. - + This method reverses `generate_preprocess()`, by first removing all padding and start/end tokens, and then converting the integer sequence back to a string. + + Args: + x: Dictionary containing token_ids and padding_mask. + + Returns: + Detokenized string output. """ if not self.built: self.build(None) diff --git a/keras_hub/src/models/rwkv7/rwkv7_layer.py b/keras_hub/src/models/rwkv7/rwkv7_layer.py index 1a46a59bd8..309767eb80 100644 --- a/keras_hub/src/models/rwkv7/rwkv7_layer.py +++ b/keras_hub/src/models/rwkv7/rwkv7_layer.py @@ -1,8 +1,9 @@ +import warnings + import keras from keras import initializers from keras import ops from keras.layers import Layer -import warnings def transpose_head(x, head_first): @@ -24,9 +25,7 @@ def rnn_generalized_delta_rule( output_final_state: bool = True, head_first: bool = False, ): - """Implements the generalized delta rule. - - """ + """Implements the generalized delta rule for RWKV.""" DTYPE = r.dtype B, T, H, N = ops.shape(r) r = transpose_head(r, head_first) @@ -68,6 +67,10 @@ def step(t, inputs): class TimeShift(Layer): + """Time shift layer that shifts input sequence by one step. + It also be called short conv + """ + def __init__(self, name="time_shift"): super(TimeShift, self).__init__(name=name) @@ -83,12 +86,31 @@ def compute_output_shape(self, input_shape): class RWKV7_ChannelMix(Layer): + """RWKV-7 channel mixing layer.""" + def __init__(self, dim_ffn, kernel_initializer="glorot_uniform", **kwargs): + """Initialize RWKV7 channel mixer. + + Args: + dim_ffn: Feed-forward dimension. + kernel_initializer: Weight initializer. + **kwargs: Additional layer arguments. + """ super().__init__(**kwargs) self.dim_ffn = dim_ffn self.kernel_initializer = initializers.get(kernel_initializer) def call(self, x, last_cache_x=None, train_mode=True): + """Process input through channel mixer. + + Args: + x: Input tensor. + last_cache_x: Cached previous values. + train_mode: Whether in training mode. + + Returns: + Mixed output tensor. 
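What `TimeShift` computes, in plain NumPy (an illustrative sketch of the pad-and-drop form above):

```python
import numpy as np

x = np.arange(6.0).reshape(1, 3, 2)  # (batch, time, channels)
shifted = np.pad(x, ((0, 0), (1, 0), (0, 0)))[:, :-1, :]
# shifted[:, 0] is zeros; shifted[:, t] equals x[:, t - 1].
delta = shifted - x  # the "xx" that both mixers blend back in
```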
+ """ xx = self.time_shift(x, last_cache_x) - x if last_cache_x is not None or not train_mode: last_cache_x = x[:, -1:] @@ -132,13 +154,20 @@ def build(self, input_shape): def get_config(self): config = { "dim_ffn": self.dim_ffn, - "kernel_initializer": initializers.serialize(self.kernel_initializer), + "kernel_initializer": initializers.serialize( + self.kernel_initializer + ), } base_config = super().get_config() return dict(list(base_config.items()) + list(config.items())) class GroupNorm(keras.layers.GroupNormalization): + """Group normalization with backend-specific handling. + + Extends Keras GroupNormalization with PyTorch backend support. + """ + def call(self, inputs): if keras.config.backend() == "torch": import torch.nn.functional as F @@ -150,6 +179,8 @@ def call(self, inputs): class RWKV7_TimeMix(Layer): + """RWKV-7 time mixing layer.""" + def __init__( self, hidden_size, @@ -161,6 +192,18 @@ def __init__( kernel_initializer="glorot_uniform", **kwargs, ): + """Initialize RWKV7 time mixer. + + Args: + hidden_size: Hidden dimension size. + head_size: Attention head size. + gate_lora: LoRA dimension for gating. + mv_lora: LoRA dimension for value mixing. + aaa_lora: LoRA dimension for alpha parameters. + decay_lora: LoRA dimension for decay parameters. + kernel_initializer: Weight initializer. + **kwargs: Additional layer arguments. + """ super().__init__(**kwargs) self.head_size = head_size self.hidden_size = hidden_size @@ -173,12 +216,14 @@ def __init__( self.initial_state = None try: from rwkv_ops import generalized_delta_rule + self.RWKV7_OP = generalized_delta_rule except ImportError: warnings.warn( "The 'rwkv_ops' package is not installed. " - "Falling back to the default (pure-Python) operators, which will be very slow. " - "Please 'pip install rwkv_ops' to enable the optimized kernels.", + "Falling back to the default (pure-Python) operators" + "pure-Python which will be very slow. " + "Please 'pip install rwkv_ops' to enable the optimized kernels", UserWarning, stacklevel=2, ) @@ -319,7 +364,21 @@ def call( rnn_mode=False, train_mode=True, ): - if cache_state == None: + """Process input through time mixer. + + Args: + x: Input tensor. + v_first: First value for mixing. + padding_mask: Mask for padding tokens. + last_cache_x: Cached previous values. + cache_state: Cached recurrent state. + rnn_mode: Whether to use RNN mode. + train_mode: Whether in training mode. + + Returns: + Mixed output tensor and state information. + """ + if cache_state is None: initial_state = self.initial_state else: initial_state = cache_state @@ -346,7 +405,10 @@ def call( r = self.receptance(xr) w = ( -ops.softplus( - -(self.w0 + ops.matmul(ops.tanh(ops.matmul(xw, self.w1)), self.w2)) + -( + self.w0 + + ops.matmul(ops.tanh(ops.matmul(xw, self.w1)), self.w2) + ) ) - 0.5 ) # soft-clamp to (-inf, -0.5) @@ -443,7 +505,9 @@ def get_config(self): "mv_lora": self.mv_lora, "aaa_lora": self.aaa_lora, "decay_lora": self.decay_lora, - "kernel_initializer": initializers.serialize(self.kernel_initializer), + "kernel_initializer": initializers.serialize( + self.kernel_initializer + ), } base_config = super().get_config() return dict(list(base_config.items()) + list(config.items())) @@ -463,6 +527,20 @@ def __init__( kernel_initializer="glorot_uniform", **kwargs, ): + """Initialize RWKV7 block. + + Args: + hidden_size: Hidden dimension size. + head_size: Attention head size. + intermediate_dim: Intermediate dimension for FFN. + gate_lora: LoRA dimension for gating. + mv_lora: LoRA dimension for value mixing. 
+ aaa_lora: LoRA dimension for alpha parameters. + decay_lora: LoRA dimension for decay parameters. + use_initial_norm: Whether to use initial normalization. + kernel_initializer: Weight initializer. + **kwargs: Additional layer arguments. + """ super().__init__(**kwargs) self.head_size = head_size self.hidden_size = hidden_size @@ -477,13 +555,19 @@ def __init__( def build(self, input_shape): super().build(input_shape) if self.use_initial_norm: - self.ln0 = keras.layers.LayerNormalization(epsilon=1e-5, name="init_norm") + self.ln0 = keras.layers.LayerNormalization( + epsilon=1e-5, name="init_norm" + ) self.ln0.build(input_shape) - self.ln1 = keras.layers.LayerNormalization(epsilon=1e-5, name="att_norm") + self.ln1 = keras.layers.LayerNormalization( + epsilon=1e-5, name="att_norm" + ) self.ln1.build(input_shape) - self.ln2 = keras.layers.LayerNormalization(epsilon=1e-5, name="ffn_norm") + self.ln2 = keras.layers.LayerNormalization( + epsilon=1e-5, name="ffn_norm" + ) self.ln2.build(input_shape) self.att = RWKV7_TimeMix( @@ -516,6 +600,21 @@ def call( rnn_mode=False, train_mode=True, ): + """Process input through RWKV block. + + Args: + x: Input tensor. + v_first: First value for mixing. + padding_mask: Mask for padding tokens. + cache_state: Cached recurrent state. + cache_tmix_x: Cached time mixer values. + cache_cmix_x: Cached channel mixer values. + rnn_mode: Whether to use RNN mode. + train_mode: Whether in training mode. + + Returns: + Processed output tensor and cache information. + """ if padding_mask is not None: padding_mask = ops.cast(padding_mask, x.dtype) padding_mask = ops.expand_dims(padding_mask, axis=-1) @@ -569,7 +668,9 @@ def get_config(self): "decay_lora": self.decay_lora, "intermediate_dim": self.intermediate_dim, "use_initial_norm": self.use_initial_norm, - "kernel_initializer": initializers.serialize(self.kernel_initializer), + "kernel_initializer": initializers.serialize( + self.kernel_initializer + ), } base_config = super().get_config() return dict(list(base_config.items()) + list(config.items())) diff --git a/keras_hub/src/models/rwkv7/rwkv7_tokenizer.py b/keras_hub/src/models/rwkv7/rwkv7_tokenizer.py index e2ca50e418..bc2069a604 100644 --- a/keras_hub/src/models/rwkv7/rwkv7_tokenizer.py +++ b/keras_hub/src/models/rwkv7/rwkv7_tokenizer.py @@ -1,6 +1,7 @@ import os import keras + from keras_hub.src.api_export import keras_hub_export from keras_hub.src.tokenizers import tokenizer from keras_hub.src.utils.tensor_utils import is_int_dtype @@ -13,17 +14,18 @@ class TRIE: """Byte-level Trie structure for longest prefix matching. - + This class implements a trie data structure that stores byte sequences and allows efficient longest prefix matching. """ + __slots__ = tuple("ch,to,values,front".split(",")) to: list values: set def __init__(self, front=None, ch=None): """Initialize a TRIE node. - + Args: front: Parent node reference. ch: Byte value for this node. @@ -45,12 +47,12 @@ def __repr__(self): def add(self, key: bytes, idx: int = 0, val=None): """Add a key-value pair to the trie. - + Args: key: Byte sequence to add. idx: Current index in key processing. val: Value to store (defaults to key). - + Returns: Final node where key was inserted. """ @@ -66,11 +68,11 @@ def add(self, key: bytes, idx: int = 0, val=None): def find_longest(self, key: bytes, idx: int = 0): """Find longest match in trie for given key. - + Args: key: Byte sequence to search for. idx: Starting index for search. - + Returns: Tuple of (end_index, node, values) for match. 
""" @@ -90,13 +92,14 @@ def find_longest(self, key: bytes, idx: int = 0): class RWKV_TOKENIZER: """RWKV tokenizer implementation using byte-level trie. - + Implements tokenization using a fixed vocabulary and greedy longest-match algorithm on byte sequences. """ + def __init__(self, vocabs): """Initialize tokenizer with vocabulary. - + Args: vocabs: List of vocabulary entries in format " ". @@ -122,10 +125,10 @@ def __init__(self, vocabs): def encodeBytes(self, src: bytes): """Encode byte sequence to token IDs. - + Args: src: Byte sequence to encode. - + Returns: List of token IDs. """ @@ -141,10 +144,10 @@ def encodeBytes(self, src: bytes): def decodeBytes(self, tokens): """Decode token IDs to byte sequence. - + Args: tokens: List of token IDs. - + Returns: Decoded byte sequence. """ @@ -152,10 +155,10 @@ def decodeBytes(self, tokens): def encode(self, src): """Encode text to token IDs. - + Args: src: Text string or list of strings. - + Returns: Token IDs or list of token ID lists. """ @@ -166,10 +169,10 @@ def encode(self, src): def decode(self, tokens): """Decode token IDs to text. - + Args: tokens: Token IDs or list of token ID lists. - + Returns: List of decoded text strings. """ @@ -181,7 +184,7 @@ def decode(self, tokens): def printTokens(self, tokens): """Print tokens with their string representations. - + Args: tokens: List of token IDs to print. """ @@ -215,6 +218,7 @@ class RWKVTokenizer(tokenizer.Tokenizer): >>> tok("hello the") [3, 0, 2] """ + def __init__( self, vocabulary=None, @@ -222,7 +226,7 @@ def __init__( **kwargs, ) -> None: """Initialize RWKV tokenizer. - + Args: vocabulary: Vocabulary list. dtype: Output data type. @@ -243,7 +247,7 @@ def __init__( def set_vocabulary(self, vocabulary): """Set the tokenizer vocabulary. - + Args: vocabulary: Vocabulary list to set. """ @@ -255,7 +259,7 @@ def set_vocabulary(self, vocabulary): def save_assets(self, dir_path): """Save vocabulary to directory. - + Args: dir_path: Directory path to save to. """ @@ -265,7 +269,7 @@ def save_assets(self, dir_path): def load_assets(self, dir_path=""): """Load vocabulary from directory. - + Args: dir_path: Directory path to load from. """ @@ -284,7 +288,7 @@ def _check_vocabulary(self): def vocabulary_size(self): """Get the size of the vocabulary. - + Returns: Number of tokens in vocabulary. """ @@ -293,7 +297,7 @@ def vocabulary_size(self): def get_vocabulary(self): """Get the current vocabulary. - + Returns: Current vocabulary list. """ @@ -302,10 +306,10 @@ def get_vocabulary(self): def id_to_token(self, id): """Convert token ID to string representation. - + Args: id: Token ID to convert. - + Returns: String representation of token. """ @@ -324,7 +328,7 @@ def token_to_id(self, token): def get_config(self): """Get tokenizer configuration. - + Returns: Configuration dictionary. """ @@ -338,10 +342,10 @@ def get_config(self): def tokenize(self, inputs): """Tokenize input text. - + Args: inputs: Text to tokenize. - + Returns: Tokenized representation. """ @@ -359,10 +363,10 @@ def tokens2ids(x): def detokenize(self, inputs): """Convert tokens back to text. - + Args: inputs: Tokens to convert. - + Returns: Detokenized text. """ @@ -375,10 +379,10 @@ def detokenize(self, inputs): def compute_output_spec(self, input_spec): """Compute output specification. - + Args: input_spec: Input specification. - + Returns: Output tensor specification. """ @@ -388,11 +392,11 @@ def compute_output_spec(self, input_spec): def call(self, inputs): """Call the tokenizer on inputs. - + Args: inputs: Input text. 
From 4201a7f54277dad1c1af50d55e6d2bbd62cccaa9 Mon Sep 17 00:00:00 2001
From: pass_lin <935499957@qq.com>
Date: Wed, 8 Oct 2025 01:47:13 +0800
Subject: [PATCH 08/10] add test case

---
 keras_hub/api/__init__.py                     |  16 +-
 keras_hub/api/layers/__init__.py              | 144 ++--
 keras_hub/api/metrics/__init__.py             |  10 +-
 keras_hub/api/models/__init__.py              | 749 +++++++++++-------
 keras_hub/api/samplers/__init__.py            |  22 +-
 keras_hub/api/tokenizers/__init__.py          | 146 +++-
 keras_hub/api/utils/__init__.py               |  18 +-
 .../src/models/rwkv7/rwkv7_backbone_test.py   |  37 +
 keras_hub/src/models/rwkv7/rwkv7_causal_lm.py |  10 +-
 .../rwkv7/rwkv7_causal_lm_preprocessor.py     |  45 +-
 .../rwkv7_causal_lm_preprocessor_test.py      |  98 +++
 .../src/models/rwkv7/rwkv7_causal_lm_test.py  |  92 +++
 .../src/models/rwkv7/rwkv7_tokenizer_test.py  |  25 +
 13 files changed, 1004 insertions(+), 408 deletions(-)
 create mode 100644 keras_hub/src/models/rwkv7/rwkv7_backbone_test.py
 create mode 100644 keras_hub/src/models/rwkv7/rwkv7_causal_lm_preprocessor_test.py
 create mode 100644 keras_hub/src/models/rwkv7/rwkv7_causal_lm_test.py
 create mode 100644 keras_hub/src/models/rwkv7/rwkv7_tokenizer_test.py

diff --git a/keras_hub/api/__init__.py b/keras_hub/api/__init__.py
index 3796e4c7f4..2aa98bf3f9 100644
--- a/keras_hub/api/__init__.py
+++ b/keras_hub/api/__init__.py
@@ -4,12 +4,12 @@
 since your modifications would be overwritten.
 """

-from keras_hub import layers
-from keras_hub import metrics
-from keras_hub import models
-from keras_hub import samplers
-from keras_hub import tokenizers
-from keras_hub import utils
-from keras_hub.src.utils.preset_utils import upload_preset
+from keras_hub import layers as layers
+from keras_hub import metrics as metrics
+from keras_hub import models as models
+from keras_hub import samplers as samplers
+from keras_hub import tokenizers as tokenizers
+from keras_hub import utils as utils
+from keras_hub.src.utils.preset_utils import upload_preset as upload_preset
 from keras_hub.src.version import __version__ as __version__
-from keras_hub.src.version import version
+from keras_hub.src.version import version as version
diff --git a/keras_hub/api/layers/__init__.py b/keras_hub/api/layers/__init__.py
index c4411ba889..4550cf8689 100644
--- a/keras_hub/api/layers/__init__.py
+++ b/keras_hub/api/layers/__init__.py
@@ -4,105 +4,149 @@
 since your modifications would be overwritten.
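A note on the import style this patch restores across the generated api files, visible in the api/__init__.py hunk above and throughout the hunks that follow: under PEP 484 re-export semantics (enforced by type checkers such as mypy when implicit re-exports are disabled), a bare `from m import X` inside an `__init__.py` is treated as module-private, while the redundant-looking alias form marks the name as an intentionally public re-export. A minimal illustration, with hypothetical `mypkg` names:

# mypkg/__init__.py
from mypkg.impl import Thing            # implicit: checkers may treat
                                        # mypkg.Thing as not re-exported
from mypkg.impl import Thing as Thing   # explicit public re-export

That is presumably why this patch reverts the de-aliasing from patch 01: the generated api files deliberately spell every export as `X as X`.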
""" -from keras_hub.src.layers.modeling.alibi_bias import AlibiBias -from keras_hub.src.layers.modeling.anchor_generator import AnchorGenerator -from keras_hub.src.layers.modeling.box_matcher import BoxMatcher +from keras_hub.src.layers.modeling.alibi_bias import AlibiBias as AlibiBias +from keras_hub.src.layers.modeling.anchor_generator import ( + AnchorGenerator as AnchorGenerator, +) +from keras_hub.src.layers.modeling.box_matcher import BoxMatcher as BoxMatcher from keras_hub.src.layers.modeling.cached_multi_head_attention import ( - CachedMultiHeadAttention, + CachedMultiHeadAttention as CachedMultiHeadAttention, +) +from keras_hub.src.layers.modeling.f_net_encoder import ( + FNetEncoder as FNetEncoder, +) +from keras_hub.src.layers.modeling.masked_lm_head import ( + MaskedLMHead as MaskedLMHead, +) +from keras_hub.src.layers.modeling.non_max_supression import ( + NonMaxSuppression as NonMaxSuppression, +) +from keras_hub.src.layers.modeling.position_embedding import ( + PositionEmbedding as PositionEmbedding, ) -from keras_hub.src.layers.modeling.f_net_encoder import FNetEncoder -from keras_hub.src.layers.modeling.masked_lm_head import MaskedLMHead -from keras_hub.src.layers.modeling.non_max_supression import NonMaxSuppression -from keras_hub.src.layers.modeling.position_embedding import PositionEmbedding from keras_hub.src.layers.modeling.reversible_embedding import ( - ReversibleEmbedding, + ReversibleEmbedding as ReversibleEmbedding, +) +from keras_hub.src.layers.modeling.rms_normalization import ( + RMSNormalization as RMSNormalization, +) +from keras_hub.src.layers.modeling.rotary_embedding import ( + RotaryEmbedding as RotaryEmbedding, ) -from keras_hub.src.layers.modeling.rms_normalization import RMSNormalization -from keras_hub.src.layers.modeling.rotary_embedding import RotaryEmbedding from keras_hub.src.layers.modeling.sine_position_encoding import ( - SinePositionEncoding, + SinePositionEncoding as SinePositionEncoding, ) from keras_hub.src.layers.modeling.token_and_position_embedding import ( - TokenAndPositionEmbedding, + TokenAndPositionEmbedding as TokenAndPositionEmbedding, +) +from keras_hub.src.layers.modeling.transformer_decoder import ( + TransformerDecoder as TransformerDecoder, +) +from keras_hub.src.layers.modeling.transformer_encoder import ( + TransformerEncoder as TransformerEncoder, +) +from keras_hub.src.layers.preprocessing.audio_converter import ( + AudioConverter as AudioConverter, +) +from keras_hub.src.layers.preprocessing.image_converter import ( + ImageConverter as ImageConverter, ) -from keras_hub.src.layers.modeling.transformer_decoder import TransformerDecoder -from keras_hub.src.layers.modeling.transformer_encoder import TransformerEncoder -from keras_hub.src.layers.preprocessing.audio_converter import AudioConverter -from keras_hub.src.layers.preprocessing.image_converter import ImageConverter from keras_hub.src.layers.preprocessing.masked_lm_mask_generator import ( - MaskedLMMaskGenerator, + MaskedLMMaskGenerator as MaskedLMMaskGenerator, ) from keras_hub.src.layers.preprocessing.multi_segment_packer import ( - MultiSegmentPacker, + MultiSegmentPacker as MultiSegmentPacker, +) +from keras_hub.src.layers.preprocessing.random_deletion import ( + RandomDeletion as RandomDeletion, +) +from keras_hub.src.layers.preprocessing.random_swap import ( + RandomSwap as RandomSwap, +) +from keras_hub.src.layers.preprocessing.start_end_packer import ( + StartEndPacker as StartEndPacker, ) -from keras_hub.src.layers.preprocessing.random_deletion import 
RandomDeletion -from keras_hub.src.layers.preprocessing.random_swap import RandomSwap -from keras_hub.src.layers.preprocessing.start_end_packer import StartEndPacker from keras_hub.src.models.basnet.basnet_image_converter import ( - BASNetImageConverter, + BASNetImageConverter as BASNetImageConverter, +) +from keras_hub.src.models.clip.clip_image_converter import ( + CLIPImageConverter as CLIPImageConverter, ) -from keras_hub.src.models.clip.clip_image_converter import CLIPImageConverter from keras_hub.src.models.cspnet.cspnet_image_converter import ( - CSPNetImageConverter, + CSPNetImageConverter as CSPNetImageConverter, ) from keras_hub.src.models.d_fine.d_fine_image_converter import ( - DFineImageConverter, + DFineImageConverter as DFineImageConverter, ) from keras_hub.src.models.deeplab_v3.deeplab_v3_image_converter import ( - DeepLabV3ImageConverter, + DeepLabV3ImageConverter as DeepLabV3ImageConverter, +) +from keras_hub.src.models.deit.deit_image_converter import ( + DeiTImageConverter as DeiTImageConverter, ) -from keras_hub.src.models.deit.deit_image_converter import DeiTImageConverter from keras_hub.src.models.densenet.densenet_image_converter import ( - DenseNetImageConverter, + DenseNetImageConverter as DenseNetImageConverter, ) from keras_hub.src.models.depth_anything.depth_anything_image_converter import ( - DepthAnythingImageConverter, + DepthAnythingImageConverter as DepthAnythingImageConverter, ) from keras_hub.src.models.dinov2.dinov2_image_converter import ( - DINOV2ImageConverter, + DINOV2ImageConverter as DINOV2ImageConverter, ) from keras_hub.src.models.efficientnet.efficientnet_image_converter import ( - EfficientNetImageConverter, + EfficientNetImageConverter as EfficientNetImageConverter, ) from keras_hub.src.models.gemma3.gemma3_image_converter import ( - Gemma3ImageConverter, + Gemma3ImageConverter as Gemma3ImageConverter, ) from keras_hub.src.models.hgnetv2.hgnetv2_image_converter import ( - HGNetV2ImageConverter, + HGNetV2ImageConverter as HGNetV2ImageConverter, +) +from keras_hub.src.models.mit.mit_image_converter import ( + MiTImageConverter as MiTImageConverter, ) -from keras_hub.src.models.mit.mit_image_converter import MiTImageConverter from keras_hub.src.models.mobilenet.mobilenet_image_converter import ( - MobileNetImageConverter, + MobileNetImageConverter as MobileNetImageConverter, ) from keras_hub.src.models.moonshine.moonshine_audio_converter import ( - MoonshineAudioConverter, + MoonshineAudioConverter as MoonshineAudioConverter, ) from keras_hub.src.models.pali_gemma.pali_gemma_image_converter import ( - PaliGemmaImageConverter, + PaliGemmaImageConverter as PaliGemmaImageConverter, ) from keras_hub.src.models.parseq.parseq_image_converter import ( - PARSeqImageConverter, + PARSeqImageConverter as PARSeqImageConverter, ) from keras_hub.src.models.resnet.resnet_image_converter import ( - ResNetImageConverter, + ResNetImageConverter as ResNetImageConverter, ) from keras_hub.src.models.retinanet.retinanet_image_converter import ( - RetinaNetImageConverter, + RetinaNetImageConverter as RetinaNetImageConverter, +) +from keras_hub.src.models.sam.sam_image_converter import ( + SAMImageConverter as SAMImageConverter, +) +from keras_hub.src.models.sam.sam_mask_decoder import ( + SAMMaskDecoder as SAMMaskDecoder, +) +from keras_hub.src.models.sam.sam_prompt_encoder import ( + SAMPromptEncoder as SAMPromptEncoder, ) -from keras_hub.src.models.sam.sam_image_converter import SAMImageConverter -from keras_hub.src.models.sam.sam_mask_decoder import SAMMaskDecoder 
-from keras_hub.src.models.sam.sam_prompt_encoder import SAMPromptEncoder from keras_hub.src.models.segformer.segformer_image_converter import ( - SegFormerImageConverter, + SegFormerImageConverter as SegFormerImageConverter, ) from keras_hub.src.models.siglip.siglip_image_converter import ( - SigLIPImageConverter, + SigLIPImageConverter as SigLIPImageConverter, +) +from keras_hub.src.models.vgg.vgg_image_converter import ( + VGGImageConverter as VGGImageConverter, +) +from keras_hub.src.models.vit.vit_image_converter import ( + ViTImageConverter as ViTImageConverter, ) -from keras_hub.src.models.vgg.vgg_image_converter import VGGImageConverter -from keras_hub.src.models.vit.vit_image_converter import ViTImageConverter from keras_hub.src.models.whisper.whisper_audio_converter import ( - WhisperAudioConverter, + WhisperAudioConverter as WhisperAudioConverter, ) from keras_hub.src.models.xception.xception_image_converter import ( - XceptionImageConverter, + XceptionImageConverter as XceptionImageConverter, ) diff --git a/keras_hub/api/metrics/__init__.py b/keras_hub/api/metrics/__init__.py index 88a0a7df2b..100c2c66fb 100644 --- a/keras_hub/api/metrics/__init__.py +++ b/keras_hub/api/metrics/__init__.py @@ -4,8 +4,8 @@ since your modifications would be overwritten. """ -from keras_hub.src.metrics.bleu import Bleu -from keras_hub.src.metrics.edit_distance import EditDistance -from keras_hub.src.metrics.perplexity import Perplexity -from keras_hub.src.metrics.rouge_l import RougeL -from keras_hub.src.metrics.rouge_n import RougeN +from keras_hub.src.metrics.bleu import Bleu as Bleu +from keras_hub.src.metrics.edit_distance import EditDistance as EditDistance +from keras_hub.src.metrics.perplexity import Perplexity as Perplexity +from keras_hub.src.metrics.rouge_l import RougeL as RougeL +from keras_hub.src.metrics.rouge_n import RougeN as RougeN diff --git a/keras_hub/api/models/__init__.py b/keras_hub/api/models/__init__.py index 714b0e64ad..03fe5c0418 100644 --- a/keras_hub/api/models/__init__.py +++ b/keras_hub/api/models/__init__.py @@ -4,535 +4,752 @@ since your modifications would be overwritten. 
""" -from keras_hub.src.models.albert.albert_backbone import AlbertBackbone -from keras_hub.src.models.albert.albert_masked_lm import AlbertMaskedLM +from keras_hub.src.models.albert.albert_backbone import ( + AlbertBackbone as AlbertBackbone, +) +from keras_hub.src.models.albert.albert_masked_lm import ( + AlbertMaskedLM as AlbertMaskedLM, +) from keras_hub.src.models.albert.albert_masked_lm_preprocessor import ( - AlbertMaskedLMPreprocessor, + AlbertMaskedLMPreprocessor as AlbertMaskedLMPreprocessor, ) from keras_hub.src.models.albert.albert_text_classifier import ( - AlbertTextClassifier, + AlbertTextClassifier as AlbertClassifier, ) from keras_hub.src.models.albert.albert_text_classifier import ( - AlbertTextClassifier as AlbertClassifier, + AlbertTextClassifier as AlbertTextClassifier, ) from keras_hub.src.models.albert.albert_text_classifier_preprocessor import ( - AlbertTextClassifierPreprocessor, + AlbertTextClassifierPreprocessor as AlbertPreprocessor, ) from keras_hub.src.models.albert.albert_text_classifier_preprocessor import ( - AlbertTextClassifierPreprocessor as AlbertPreprocessor, + AlbertTextClassifierPreprocessor as AlbertTextClassifierPreprocessor, +) +from keras_hub.src.models.albert.albert_tokenizer import ( + AlbertTokenizer as AlbertTokenizer, +) +from keras_hub.src.models.backbone import Backbone as Backbone +from keras_hub.src.models.bart.bart_backbone import BartBackbone as BartBackbone +from keras_hub.src.models.bart.bart_seq_2_seq_lm import ( + BartSeq2SeqLM as BartSeq2SeqLM, ) -from keras_hub.src.models.albert.albert_tokenizer import AlbertTokenizer -from keras_hub.src.models.backbone import Backbone -from keras_hub.src.models.bart.bart_backbone import BartBackbone -from keras_hub.src.models.bart.bart_seq_2_seq_lm import BartSeq2SeqLM from keras_hub.src.models.bart.bart_seq_2_seq_lm_preprocessor import ( - BartSeq2SeqLMPreprocessor, -) -from keras_hub.src.models.bart.bart_tokenizer import BartTokenizer -from keras_hub.src.models.basnet.basnet import BASNetImageSegmenter -from keras_hub.src.models.basnet.basnet_backbone import BASNetBackbone -from keras_hub.src.models.basnet.basnet_preprocessor import BASNetPreprocessor -from keras_hub.src.models.bert.bert_backbone import BertBackbone -from keras_hub.src.models.bert.bert_masked_lm import BertMaskedLM + BartSeq2SeqLMPreprocessor as BartSeq2SeqLMPreprocessor, +) +from keras_hub.src.models.bart.bart_tokenizer import ( + BartTokenizer as BartTokenizer, +) +from keras_hub.src.models.basnet.basnet import ( + BASNetImageSegmenter as BASNetImageSegmenter, +) +from keras_hub.src.models.basnet.basnet_backbone import ( + BASNetBackbone as BASNetBackbone, +) +from keras_hub.src.models.basnet.basnet_preprocessor import ( + BASNetPreprocessor as BASNetPreprocessor, +) +from keras_hub.src.models.bert.bert_backbone import BertBackbone as BertBackbone +from keras_hub.src.models.bert.bert_masked_lm import ( + BertMaskedLM as BertMaskedLM, +) from keras_hub.src.models.bert.bert_masked_lm_preprocessor import ( - BertMaskedLMPreprocessor, + BertMaskedLMPreprocessor as BertMaskedLMPreprocessor, ) -from keras_hub.src.models.bert.bert_text_classifier import BertTextClassifier from keras_hub.src.models.bert.bert_text_classifier import ( BertTextClassifier as BertClassifier, ) -from keras_hub.src.models.bert.bert_text_classifier_preprocessor import ( - BertTextClassifierPreprocessor, +from keras_hub.src.models.bert.bert_text_classifier import ( + BertTextClassifier as BertTextClassifier, ) from 
keras_hub.src.models.bert.bert_text_classifier_preprocessor import ( BertTextClassifierPreprocessor as BertPreprocessor, ) -from keras_hub.src.models.bert.bert_tokenizer import BertTokenizer -from keras_hub.src.models.bloom.bloom_backbone import BloomBackbone -from keras_hub.src.models.bloom.bloom_causal_lm import BloomCausalLM +from keras_hub.src.models.bert.bert_text_classifier_preprocessor import ( + BertTextClassifierPreprocessor as BertTextClassifierPreprocessor, +) +from keras_hub.src.models.bert.bert_tokenizer import ( + BertTokenizer as BertTokenizer, +) +from keras_hub.src.models.bloom.bloom_backbone import ( + BloomBackbone as BloomBackbone, +) +from keras_hub.src.models.bloom.bloom_causal_lm import ( + BloomCausalLM as BloomCausalLM, +) from keras_hub.src.models.bloom.bloom_causal_lm_preprocessor import ( - BloomCausalLMPreprocessor, -) -from keras_hub.src.models.bloom.bloom_tokenizer import BloomTokenizer -from keras_hub.src.models.causal_lm import CausalLM -from keras_hub.src.models.causal_lm_preprocessor import CausalLMPreprocessor -from keras_hub.src.models.clip.clip_backbone import CLIPBackbone -from keras_hub.src.models.clip.clip_preprocessor import CLIPPreprocessor -from keras_hub.src.models.clip.clip_text_encoder import CLIPTextEncoder -from keras_hub.src.models.clip.clip_tokenizer import CLIPTokenizer -from keras_hub.src.models.clip.clip_vision_encoder import CLIPVisionEncoder -from keras_hub.src.models.cspnet.cspnet_backbone import CSPNetBackbone + BloomCausalLMPreprocessor as BloomCausalLMPreprocessor, +) +from keras_hub.src.models.bloom.bloom_tokenizer import ( + BloomTokenizer as BloomTokenizer, +) +from keras_hub.src.models.causal_lm import CausalLM as CausalLM +from keras_hub.src.models.causal_lm_preprocessor import ( + CausalLMPreprocessor as CausalLMPreprocessor, +) +from keras_hub.src.models.clip.clip_backbone import CLIPBackbone as CLIPBackbone +from keras_hub.src.models.clip.clip_preprocessor import ( + CLIPPreprocessor as CLIPPreprocessor, +) +from keras_hub.src.models.clip.clip_text_encoder import ( + CLIPTextEncoder as CLIPTextEncoder, +) +from keras_hub.src.models.clip.clip_tokenizer import ( + CLIPTokenizer as CLIPTokenizer, +) +from keras_hub.src.models.clip.clip_vision_encoder import ( + CLIPVisionEncoder as CLIPVisionEncoder, +) +from keras_hub.src.models.cspnet.cspnet_backbone import ( + CSPNetBackbone as CSPNetBackbone, +) from keras_hub.src.models.cspnet.cspnet_image_classifier import ( - CSPNetImageClassifier, + CSPNetImageClassifier as CSPNetImageClassifier, ) from keras_hub.src.models.cspnet.cspnet_image_classifier_preprocessor import ( - CSPNetImageClassifierPreprocessor, + CSPNetImageClassifierPreprocessor as CSPNetImageClassifierPreprocessor, +) +from keras_hub.src.models.d_fine.d_fine_backbone import ( + DFineBackbone as DFineBackbone, ) -from keras_hub.src.models.d_fine.d_fine_backbone import DFineBackbone from keras_hub.src.models.d_fine.d_fine_object_detector import ( - DFineObjectDetector, + DFineObjectDetector as DFineObjectDetector, ) from keras_hub.src.models.d_fine.d_fine_object_detector_preprocessor import ( - DFineObjectDetectorPreprocessor, + DFineObjectDetectorPreprocessor as DFineObjectDetectorPreprocessor, ) from keras_hub.src.models.deberta_v3.deberta_v3_backbone import ( - DebertaV3Backbone, + DebertaV3Backbone as DebertaV3Backbone, ) from keras_hub.src.models.deberta_v3.deberta_v3_masked_lm import ( - DebertaV3MaskedLM, + DebertaV3MaskedLM as DebertaV3MaskedLM, ) from 
keras_hub.src.models.deberta_v3.deberta_v3_masked_lm_preprocessor import ( - DebertaV3MaskedLMPreprocessor, + DebertaV3MaskedLMPreprocessor as DebertaV3MaskedLMPreprocessor, ) from keras_hub.src.models.deberta_v3.deberta_v3_text_classifier import ( - DebertaV3TextClassifier, + DebertaV3TextClassifier as DebertaV3Classifier, ) from keras_hub.src.models.deberta_v3.deberta_v3_text_classifier import ( - DebertaV3TextClassifier as DebertaV3Classifier, + DebertaV3TextClassifier as DebertaV3TextClassifier, ) from keras_hub.src.models.deberta_v3.deberta_v3_text_classifier_preprocessor import ( - DebertaV3TextClassifierPreprocessor, + DebertaV3TextClassifierPreprocessor as DebertaV3Preprocessor, ) from keras_hub.src.models.deberta_v3.deberta_v3_text_classifier_preprocessor import ( - DebertaV3TextClassifierPreprocessor as DebertaV3Preprocessor, + DebertaV3TextClassifierPreprocessor as DebertaV3TextClassifierPreprocessor, ) from keras_hub.src.models.deberta_v3.deberta_v3_tokenizer import ( - DebertaV3Tokenizer, + DebertaV3Tokenizer as DebertaV3Tokenizer, ) from keras_hub.src.models.deeplab_v3.deeplab_v3_backbone import ( - DeepLabV3Backbone, + DeepLabV3Backbone as DeepLabV3Backbone, ) from keras_hub.src.models.deeplab_v3.deeplab_v3_image_segmeter_preprocessor import ( - DeepLabV3ImageSegmenterPreprocessor, + DeepLabV3ImageSegmenterPreprocessor as DeepLabV3ImageSegmenterPreprocessor, ) from keras_hub.src.models.deeplab_v3.deeplab_v3_segmenter import ( - DeepLabV3ImageSegmenter, + DeepLabV3ImageSegmenter as DeepLabV3ImageSegmenter, +) +from keras_hub.src.models.deit.deit_backbone import DeiTBackbone as DeiTBackbone +from keras_hub.src.models.deit.deit_image_classifier import ( + DeiTImageClassifier as DeiTImageClassifier, ) -from keras_hub.src.models.deit.deit_backbone import DeiTBackbone -from keras_hub.src.models.deit.deit_image_classifier import DeiTImageClassifier from keras_hub.src.models.deit.deit_image_classifier_preprocessor import ( - DeiTImageClassifierPreprocessor, + DeiTImageClassifierPreprocessor as DeiTImageClassifierPreprocessor, +) +from keras_hub.src.models.densenet.densenet_backbone import ( + DenseNetBackbone as DenseNetBackbone, ) -from keras_hub.src.models.densenet.densenet_backbone import DenseNetBackbone from keras_hub.src.models.densenet.densenet_image_classifier import ( - DenseNetImageClassifier, + DenseNetImageClassifier as DenseNetImageClassifier, ) from keras_hub.src.models.densenet.densenet_image_classifier_preprocessor import ( - DenseNetImageClassifierPreprocessor, + DenseNetImageClassifierPreprocessor as DenseNetImageClassifierPreprocessor, ) from keras_hub.src.models.depth_anything.depth_anything_backbone import ( - DepthAnythingBackbone, + DepthAnythingBackbone as DepthAnythingBackbone, ) from keras_hub.src.models.depth_anything.depth_anything_depth_estimator import ( - DepthAnythingDepthEstimator, + DepthAnythingDepthEstimator as DepthAnythingDepthEstimator, ) from keras_hub.src.models.depth_anything.depth_anything_depth_estimator_preprocessor import ( - DepthAnythingDepthEstimatorPreprocessor, + DepthAnythingDepthEstimatorPreprocessor as DepthAnythingDepthEstimatorPreprocessor, +) +from keras_hub.src.models.depth_estimator import ( + DepthEstimator as DepthEstimator, ) -from keras_hub.src.models.depth_estimator import DepthEstimator from keras_hub.src.models.depth_estimator_preprocessor import ( - DepthEstimatorPreprocessor, + DepthEstimatorPreprocessor as DepthEstimatorPreprocessor, +) +from keras_hub.src.models.dinov2.dinov2_backbone import ( + DINOV2Backbone as 
DINOV2Backbone, ) -from keras_hub.src.models.dinov2.dinov2_backbone import DINOV2Backbone from keras_hub.src.models.distil_bert.distil_bert_backbone import ( - DistilBertBackbone, + DistilBertBackbone as DistilBertBackbone, ) from keras_hub.src.models.distil_bert.distil_bert_masked_lm import ( - DistilBertMaskedLM, + DistilBertMaskedLM as DistilBertMaskedLM, ) from keras_hub.src.models.distil_bert.distil_bert_masked_lm_preprocessor import ( - DistilBertMaskedLMPreprocessor, + DistilBertMaskedLMPreprocessor as DistilBertMaskedLMPreprocessor, ) from keras_hub.src.models.distil_bert.distil_bert_text_classifier import ( - DistilBertTextClassifier, + DistilBertTextClassifier as DistilBertClassifier, ) from keras_hub.src.models.distil_bert.distil_bert_text_classifier import ( - DistilBertTextClassifier as DistilBertClassifier, + DistilBertTextClassifier as DistilBertTextClassifier, ) from keras_hub.src.models.distil_bert.distil_bert_text_classifier_preprocessor import ( - DistilBertTextClassifierPreprocessor, + DistilBertTextClassifierPreprocessor as DistilBertPreprocessor, ) from keras_hub.src.models.distil_bert.distil_bert_text_classifier_preprocessor import ( - DistilBertTextClassifierPreprocessor as DistilBertPreprocessor, + DistilBertTextClassifierPreprocessor as DistilBertTextClassifierPreprocessor, ) from keras_hub.src.models.distil_bert.distil_bert_tokenizer import ( - DistilBertTokenizer, + DistilBertTokenizer as DistilBertTokenizer, ) from keras_hub.src.models.efficientnet.efficientnet_backbone import ( - EfficientNetBackbone, + EfficientNetBackbone as EfficientNetBackbone, ) from keras_hub.src.models.efficientnet.efficientnet_image_classifier import ( - EfficientNetImageClassifier, + EfficientNetImageClassifier as EfficientNetImageClassifier, ) from keras_hub.src.models.efficientnet.efficientnet_image_classifier_preprocessor import ( - EfficientNetImageClassifierPreprocessor, + EfficientNetImageClassifierPreprocessor as EfficientNetImageClassifierPreprocessor, +) +from keras_hub.src.models.electra.electra_backbone import ( + ElectraBackbone as ElectraBackbone, +) +from keras_hub.src.models.electra.electra_tokenizer import ( + ElectraTokenizer as ElectraTokenizer, ) -from keras_hub.src.models.electra.electra_backbone import ElectraBackbone -from keras_hub.src.models.electra.electra_tokenizer import ElectraTokenizer -from keras_hub.src.models.esm.esm_backbone import ESMBackbone from keras_hub.src.models.esm.esm_backbone import ESMBackbone as ESM2Backbone -from keras_hub.src.models.esm.esm_classifier import ESMProteinClassifier +from keras_hub.src.models.esm.esm_backbone import ESMBackbone as ESMBackbone +from keras_hub.src.models.esm.esm_classifier import ( + ESMProteinClassifier as ESMProteinClassifier, +) from keras_hub.src.models.esm.esm_classifier_preprocessor import ( - ESMProteinClassifierPreprocessor, + ESMProteinClassifierPreprocessor as ESMProteinClassifierPreprocessor, ) -from keras_hub.src.models.esm.esm_masked_plm import ESMMaskedPLM from keras_hub.src.models.esm.esm_masked_plm import ( ESMMaskedPLM as ESM2MaskedPLM, ) +from keras_hub.src.models.esm.esm_masked_plm import ESMMaskedPLM as ESMMaskedPLM from keras_hub.src.models.esm.esm_masked_plm_preprocessor import ( - ESMMaskedPLMPreprocessor, + ESMMaskedPLMPreprocessor as ESMMaskedPLMPreprocessor, +) +from keras_hub.src.models.esm.esm_tokenizer import ESMTokenizer as ESMTokenizer +from keras_hub.src.models.f_net.f_net_backbone import ( + FNetBackbone as FNetBackbone, +) +from keras_hub.src.models.f_net.f_net_masked_lm import 
( + FNetMaskedLM as FNetMaskedLM, ) -from keras_hub.src.models.esm.esm_tokenizer import ESMTokenizer -from keras_hub.src.models.f_net.f_net_backbone import FNetBackbone -from keras_hub.src.models.f_net.f_net_masked_lm import FNetMaskedLM from keras_hub.src.models.f_net.f_net_masked_lm_preprocessor import ( - FNetMaskedLMPreprocessor, + FNetMaskedLMPreprocessor as FNetMaskedLMPreprocessor, ) -from keras_hub.src.models.f_net.f_net_text_classifier import FNetTextClassifier from keras_hub.src.models.f_net.f_net_text_classifier import ( FNetTextClassifier as FNetClassifier, ) -from keras_hub.src.models.f_net.f_net_text_classifier_preprocessor import ( - FNetTextClassifierPreprocessor, +from keras_hub.src.models.f_net.f_net_text_classifier import ( + FNetTextClassifier as FNetTextClassifier, ) from keras_hub.src.models.f_net.f_net_text_classifier_preprocessor import ( FNetTextClassifierPreprocessor as FNetPreprocessor, ) -from keras_hub.src.models.f_net.f_net_tokenizer import FNetTokenizer -from keras_hub.src.models.falcon.falcon_backbone import FalconBackbone -from keras_hub.src.models.falcon.falcon_causal_lm import FalconCausalLM +from keras_hub.src.models.f_net.f_net_text_classifier_preprocessor import ( + FNetTextClassifierPreprocessor as FNetTextClassifierPreprocessor, +) +from keras_hub.src.models.f_net.f_net_tokenizer import ( + FNetTokenizer as FNetTokenizer, +) +from keras_hub.src.models.falcon.falcon_backbone import ( + FalconBackbone as FalconBackbone, +) +from keras_hub.src.models.falcon.falcon_causal_lm import ( + FalconCausalLM as FalconCausalLM, +) from keras_hub.src.models.falcon.falcon_causal_lm_preprocessor import ( - FalconCausalLMPreprocessor, + FalconCausalLMPreprocessor as FalconCausalLMPreprocessor, +) +from keras_hub.src.models.falcon.falcon_tokenizer import ( + FalconTokenizer as FalconTokenizer, +) +from keras_hub.src.models.feature_pyramid_backbone import ( + FeaturePyramidBackbone as FeaturePyramidBackbone, +) +from keras_hub.src.models.flux.flux_model import FluxBackbone as FluxBackbone +from keras_hub.src.models.flux.flux_text_to_image import ( + FluxTextToImage as FluxTextToImage, ) -from keras_hub.src.models.falcon.falcon_tokenizer import FalconTokenizer -from keras_hub.src.models.feature_pyramid_backbone import FeaturePyramidBackbone -from keras_hub.src.models.flux.flux_model import FluxBackbone -from keras_hub.src.models.flux.flux_text_to_image import FluxTextToImage from keras_hub.src.models.flux.flux_text_to_image_preprocessor import ( - FluxTextToImagePreprocessor, + FluxTextToImagePreprocessor as FluxTextToImagePreprocessor, +) +from keras_hub.src.models.gemma.gemma_backbone import ( + GemmaBackbone as GemmaBackbone, +) +from keras_hub.src.models.gemma.gemma_causal_lm import ( + GemmaCausalLM as GemmaCausalLM, ) -from keras_hub.src.models.gemma.gemma_backbone import GemmaBackbone -from keras_hub.src.models.gemma.gemma_causal_lm import GemmaCausalLM from keras_hub.src.models.gemma.gemma_causal_lm_preprocessor import ( - GemmaCausalLMPreprocessor, + GemmaCausalLMPreprocessor as GemmaCausalLMPreprocessor, +) +from keras_hub.src.models.gemma.gemma_tokenizer import ( + GemmaTokenizer as GemmaTokenizer, +) +from keras_hub.src.models.gemma3.gemma3_backbone import ( + Gemma3Backbone as Gemma3Backbone, +) +from keras_hub.src.models.gemma3.gemma3_causal_lm import ( + Gemma3CausalLM as Gemma3CausalLM, ) -from keras_hub.src.models.gemma.gemma_tokenizer import GemmaTokenizer -from keras_hub.src.models.gemma3.gemma3_backbone import Gemma3Backbone -from 
keras_hub.src.models.gemma3.gemma3_causal_lm import Gemma3CausalLM from keras_hub.src.models.gemma3.gemma3_causal_lm_preprocessor import ( - Gemma3CausalLMPreprocessor, + Gemma3CausalLMPreprocessor as Gemma3CausalLMPreprocessor, +) +from keras_hub.src.models.gemma3.gemma3_tokenizer import ( + Gemma3Tokenizer as Gemma3Tokenizer, ) -from keras_hub.src.models.gemma3.gemma3_tokenizer import Gemma3Tokenizer from keras_hub.src.models.gemma3.gemma3_vision_encoder import ( - Gemma3VisionEncoder, + Gemma3VisionEncoder as Gemma3VisionEncoder, +) +from keras_hub.src.models.gpt2.gpt2_backbone import GPT2Backbone as GPT2Backbone +from keras_hub.src.models.gpt2.gpt2_causal_lm import ( + GPT2CausalLM as GPT2CausalLM, ) -from keras_hub.src.models.gpt2.gpt2_backbone import GPT2Backbone -from keras_hub.src.models.gpt2.gpt2_causal_lm import GPT2CausalLM from keras_hub.src.models.gpt2.gpt2_causal_lm_preprocessor import ( - GPT2CausalLMPreprocessor, + GPT2CausalLMPreprocessor as GPT2CausalLMPreprocessor, +) +from keras_hub.src.models.gpt2.gpt2_preprocessor import ( + GPT2Preprocessor as GPT2Preprocessor, +) +from keras_hub.src.models.gpt2.gpt2_tokenizer import ( + GPT2Tokenizer as GPT2Tokenizer, +) +from keras_hub.src.models.gpt_neo_x.gpt_neo_x_backbone import ( + GPTNeoXBackbone as GPTNeoXBackbone, +) +from keras_hub.src.models.gpt_neo_x.gpt_neo_x_causal_lm import ( + GPTNeoXCausalLM as GPTNeoXCausalLM, ) -from keras_hub.src.models.gpt2.gpt2_preprocessor import GPT2Preprocessor -from keras_hub.src.models.gpt2.gpt2_tokenizer import GPT2Tokenizer -from keras_hub.src.models.gpt_neo_x.gpt_neo_x_backbone import GPTNeoXBackbone -from keras_hub.src.models.gpt_neo_x.gpt_neo_x_causal_lm import GPTNeoXCausalLM from keras_hub.src.models.gpt_neo_x.gpt_neo_x_causal_lm_preprocessor import ( - GPTNeoXCausalLMPreprocessor, + GPTNeoXCausalLMPreprocessor as GPTNeoXCausalLMPreprocessor, +) +from keras_hub.src.models.gpt_neo_x.gpt_neo_x_tokenizer import ( + GPTNeoXTokenizer as GPTNeoXTokenizer, +) +from keras_hub.src.models.hgnetv2.hgnetv2_backbone import ( + HGNetV2Backbone as HGNetV2Backbone, ) -from keras_hub.src.models.gpt_neo_x.gpt_neo_x_tokenizer import GPTNeoXTokenizer -from keras_hub.src.models.hgnetv2.hgnetv2_backbone import HGNetV2Backbone from keras_hub.src.models.hgnetv2.hgnetv2_image_classifier import ( - HGNetV2ImageClassifier, + HGNetV2ImageClassifier as HGNetV2ImageClassifier, ) from keras_hub.src.models.hgnetv2.hgnetv2_image_classifier_preprocessor import ( - HGNetV2ImageClassifierPreprocessor, + HGNetV2ImageClassifierPreprocessor as HGNetV2ImageClassifierPreprocessor, +) +from keras_hub.src.models.image_classifier import ( + ImageClassifier as ImageClassifier, ) -from keras_hub.src.models.image_classifier import ImageClassifier from keras_hub.src.models.image_classifier_preprocessor import ( - ImageClassifierPreprocessor, + ImageClassifierPreprocessor as ImageClassifierPreprocessor, +) +from keras_hub.src.models.image_segmenter import ( + ImageSegmenter as ImageSegmenter, ) -from keras_hub.src.models.image_segmenter import ImageSegmenter from keras_hub.src.models.image_segmenter_preprocessor import ( - ImageSegmenterPreprocessor, + ImageSegmenterPreprocessor as ImageSegmenterPreprocessor, +) +from keras_hub.src.models.image_to_image import ImageToImage as ImageToImage +from keras_hub.src.models.inpaint import Inpaint as Inpaint +from keras_hub.src.models.llama.llama_backbone import ( + LlamaBackbone as LlamaBackbone, +) +from keras_hub.src.models.llama.llama_causal_lm import ( + LlamaCausalLM as LlamaCausalLM, 
) -from keras_hub.src.models.image_to_image import ImageToImage -from keras_hub.src.models.inpaint import Inpaint -from keras_hub.src.models.llama.llama_backbone import LlamaBackbone -from keras_hub.src.models.llama.llama_causal_lm import LlamaCausalLM from keras_hub.src.models.llama.llama_causal_lm_preprocessor import ( - LlamaCausalLMPreprocessor, + LlamaCausalLMPreprocessor as LlamaCausalLMPreprocessor, +) +from keras_hub.src.models.llama.llama_tokenizer import ( + LlamaTokenizer as LlamaTokenizer, +) +from keras_hub.src.models.llama3.llama3_backbone import ( + Llama3Backbone as Llama3Backbone, +) +from keras_hub.src.models.llama3.llama3_causal_lm import ( + Llama3CausalLM as Llama3CausalLM, ) -from keras_hub.src.models.llama.llama_tokenizer import LlamaTokenizer -from keras_hub.src.models.llama3.llama3_backbone import Llama3Backbone -from keras_hub.src.models.llama3.llama3_causal_lm import Llama3CausalLM from keras_hub.src.models.llama3.llama3_causal_lm_preprocessor import ( - Llama3CausalLMPreprocessor, + Llama3CausalLMPreprocessor as Llama3CausalLMPreprocessor, +) +from keras_hub.src.models.llama3.llama3_tokenizer import ( + Llama3Tokenizer as Llama3Tokenizer, +) +from keras_hub.src.models.masked_lm import MaskedLM as MaskedLM +from keras_hub.src.models.masked_lm_preprocessor import ( + MaskedLMPreprocessor as MaskedLMPreprocessor, +) +from keras_hub.src.models.mistral.mistral_backbone import ( + MistralBackbone as MistralBackbone, +) +from keras_hub.src.models.mistral.mistral_causal_lm import ( + MistralCausalLM as MistralCausalLM, ) -from keras_hub.src.models.llama3.llama3_tokenizer import Llama3Tokenizer -from keras_hub.src.models.masked_lm import MaskedLM -from keras_hub.src.models.masked_lm_preprocessor import MaskedLMPreprocessor -from keras_hub.src.models.mistral.mistral_backbone import MistralBackbone -from keras_hub.src.models.mistral.mistral_causal_lm import MistralCausalLM from keras_hub.src.models.mistral.mistral_causal_lm_preprocessor import ( - MistralCausalLMPreprocessor, + MistralCausalLMPreprocessor as MistralCausalLMPreprocessor, +) +from keras_hub.src.models.mistral.mistral_tokenizer import ( + MistralTokenizer as MistralTokenizer, +) +from keras_hub.src.models.mit.mit_backbone import MiTBackbone as MiTBackbone +from keras_hub.src.models.mit.mit_image_classifier import ( + MiTImageClassifier as MiTImageClassifier, ) -from keras_hub.src.models.mistral.mistral_tokenizer import MistralTokenizer -from keras_hub.src.models.mit.mit_backbone import MiTBackbone -from keras_hub.src.models.mit.mit_image_classifier import MiTImageClassifier from keras_hub.src.models.mit.mit_image_classifier_preprocessor import ( - MiTImageClassifierPreprocessor, + MiTImageClassifierPreprocessor as MiTImageClassifierPreprocessor, +) +from keras_hub.src.models.mixtral.mixtral_backbone import ( + MixtralBackbone as MixtralBackbone, +) +from keras_hub.src.models.mixtral.mixtral_causal_lm import ( + MixtralCausalLM as MixtralCausalLM, ) -from keras_hub.src.models.mixtral.mixtral_backbone import MixtralBackbone -from keras_hub.src.models.mixtral.mixtral_causal_lm import MixtralCausalLM from keras_hub.src.models.mixtral.mixtral_causal_lm_preprocessor import ( - MixtralCausalLMPreprocessor, + MixtralCausalLMPreprocessor as MixtralCausalLMPreprocessor, +) +from keras_hub.src.models.mixtral.mixtral_tokenizer import ( + MixtralTokenizer as MixtralTokenizer, +) +from keras_hub.src.models.mobilenet.mobilenet_backbone import ( + MobileNetBackbone as MobileNetBackbone, ) -from 
keras_hub.src.models.mixtral.mixtral_tokenizer import MixtralTokenizer -from keras_hub.src.models.mobilenet.mobilenet_backbone import MobileNetBackbone from keras_hub.src.models.mobilenet.mobilenet_image_classifier import ( - MobileNetImageClassifier, + MobileNetImageClassifier as MobileNetImageClassifier, ) from keras_hub.src.models.mobilenet.mobilenet_image_classifier_preprocessor import ( - MobileNetImageClassifierPreprocessor, + MobileNetImageClassifierPreprocessor as MobileNetImageClassifierPreprocessor, ) from keras_hub.src.models.moonshine.moonshine_audio_to_text import ( - MoonshineAudioToText, + MoonshineAudioToText as MoonshineAudioToText, ) from keras_hub.src.models.moonshine.moonshine_audio_to_text_preprocessor import ( - MoonshineAudioToTextPreprocessor, + MoonshineAudioToTextPreprocessor as MoonshineAudioToTextPreprocessor, +) +from keras_hub.src.models.moonshine.moonshine_backbone import ( + MoonshineBackbone as MoonshineBackbone, ) -from keras_hub.src.models.moonshine.moonshine_backbone import MoonshineBackbone from keras_hub.src.models.moonshine.moonshine_tokenizer import ( - MoonshineTokenizer, + MoonshineTokenizer as MoonshineTokenizer, ) -from keras_hub.src.models.object_detector import ObjectDetector from keras_hub.src.models.object_detector import ( ObjectDetector as ImageObjectDetector, ) -from keras_hub.src.models.object_detector_preprocessor import ( - ObjectDetectorPreprocessor, +from keras_hub.src.models.object_detector import ( + ObjectDetector as ObjectDetector, ) from keras_hub.src.models.object_detector_preprocessor import ( ObjectDetectorPreprocessor as ImageObjectDetectorPreprocessor, ) -from keras_hub.src.models.opt.opt_backbone import OPTBackbone -from keras_hub.src.models.opt.opt_causal_lm import OPTCausalLM +from keras_hub.src.models.object_detector_preprocessor import ( + ObjectDetectorPreprocessor as ObjectDetectorPreprocessor, +) +from keras_hub.src.models.opt.opt_backbone import OPTBackbone as OPTBackbone +from keras_hub.src.models.opt.opt_causal_lm import OPTCausalLM as OPTCausalLM from keras_hub.src.models.opt.opt_causal_lm_preprocessor import ( - OPTCausalLMPreprocessor, + OPTCausalLMPreprocessor as OPTCausalLMPreprocessor, ) -from keras_hub.src.models.opt.opt_tokenizer import OPTTokenizer +from keras_hub.src.models.opt.opt_tokenizer import OPTTokenizer as OPTTokenizer from keras_hub.src.models.pali_gemma.pali_gemma_backbone import ( - PaliGemmaBackbone, + PaliGemmaBackbone as PaliGemmaBackbone, ) from keras_hub.src.models.pali_gemma.pali_gemma_causal_lm import ( - PaliGemmaCausalLM, + PaliGemmaCausalLM as PaliGemmaCausalLM, ) from keras_hub.src.models.pali_gemma.pali_gemma_causal_lm_preprocessor import ( - PaliGemmaCausalLMPreprocessor, + PaliGemmaCausalLMPreprocessor as PaliGemmaCausalLMPreprocessor, ) from keras_hub.src.models.pali_gemma.pali_gemma_tokenizer import ( - PaliGemmaTokenizer, + PaliGemmaTokenizer as PaliGemmaTokenizer, +) +from keras_hub.src.models.parseq.parseq_backbone import ( + PARSeqBackbone as PARSeqBackbone, +) +from keras_hub.src.models.parseq.parseq_causal_lm import ( + PARSeqCausalLM as PARSeqCausalLM, ) -from keras_hub.src.models.parseq.parseq_backbone import PARSeqBackbone -from keras_hub.src.models.parseq.parseq_causal_lm import PARSeqCausalLM from keras_hub.src.models.parseq.parseq_causal_lm_preprocessor import ( - PARSeqCausalLMPreprocessor, + PARSeqCausalLMPreprocessor as PARSeqCausalLMPreprocessor, +) +from keras_hub.src.models.parseq.parseq_tokenizer import ( + PARSeqTokenizer as PARSeqTokenizer, +) +from 
keras_hub.src.models.phi3.phi3_backbone import Phi3Backbone as Phi3Backbone +from keras_hub.src.models.phi3.phi3_causal_lm import ( + Phi3CausalLM as Phi3CausalLM, ) -from keras_hub.src.models.parseq.parseq_tokenizer import PARSeqTokenizer -from keras_hub.src.models.phi3.phi3_backbone import Phi3Backbone -from keras_hub.src.models.phi3.phi3_causal_lm import Phi3CausalLM from keras_hub.src.models.phi3.phi3_causal_lm_preprocessor import ( - Phi3CausalLMPreprocessor, + Phi3CausalLMPreprocessor as Phi3CausalLMPreprocessor, ) -from keras_hub.src.models.phi3.phi3_tokenizer import Phi3Tokenizer -from keras_hub.src.models.preprocessor import Preprocessor -from keras_hub.src.models.qwen.qwen_backbone import QwenBackbone +from keras_hub.src.models.phi3.phi3_tokenizer import ( + Phi3Tokenizer as Phi3Tokenizer, +) +from keras_hub.src.models.preprocessor import Preprocessor as Preprocessor from keras_hub.src.models.qwen.qwen_backbone import ( QwenBackbone as Qwen2Backbone, ) -from keras_hub.src.models.qwen.qwen_causal_lm import QwenCausalLM +from keras_hub.src.models.qwen.qwen_backbone import QwenBackbone as QwenBackbone from keras_hub.src.models.qwen.qwen_causal_lm import ( QwenCausalLM as Qwen2CausalLM, ) -from keras_hub.src.models.qwen.qwen_causal_lm_preprocessor import ( - QwenCausalLMPreprocessor, +from keras_hub.src.models.qwen.qwen_causal_lm import ( + QwenCausalLM as QwenCausalLM, ) from keras_hub.src.models.qwen.qwen_causal_lm_preprocessor import ( QwenCausalLMPreprocessor as Qwen2CausalLMPreprocessor, ) -from keras_hub.src.models.qwen.qwen_tokenizer import QwenTokenizer +from keras_hub.src.models.qwen.qwen_causal_lm_preprocessor import ( + QwenCausalLMPreprocessor as QwenCausalLMPreprocessor, +) from keras_hub.src.models.qwen.qwen_tokenizer import ( QwenTokenizer as Qwen2Tokenizer, ) -from keras_hub.src.models.qwen3.qwen3_backbone import Qwen3Backbone -from keras_hub.src.models.qwen3.qwen3_causal_lm import Qwen3CausalLM +from keras_hub.src.models.qwen.qwen_tokenizer import ( + QwenTokenizer as QwenTokenizer, +) +from keras_hub.src.models.qwen3.qwen3_backbone import ( + Qwen3Backbone as Qwen3Backbone, +) +from keras_hub.src.models.qwen3.qwen3_causal_lm import ( + Qwen3CausalLM as Qwen3CausalLM, +) from keras_hub.src.models.qwen3.qwen3_causal_lm_preprocessor import ( - Qwen3CausalLMPreprocessor, + Qwen3CausalLMPreprocessor as Qwen3CausalLMPreprocessor, +) +from keras_hub.src.models.qwen3.qwen3_tokenizer import ( + Qwen3Tokenizer as Qwen3Tokenizer, +) +from keras_hub.src.models.qwen3_moe.qwen3_moe_backbone import ( + Qwen3MoeBackbone as Qwen3MoeBackbone, +) +from keras_hub.src.models.qwen3_moe.qwen3_moe_causal_lm import ( + Qwen3MoeCausalLM as Qwen3MoeCausalLM, ) -from keras_hub.src.models.qwen3.qwen3_tokenizer import Qwen3Tokenizer -from keras_hub.src.models.qwen3_moe.qwen3_moe_backbone import Qwen3MoeBackbone -from keras_hub.src.models.qwen3_moe.qwen3_moe_causal_lm import Qwen3MoeCausalLM from keras_hub.src.models.qwen3_moe.qwen3_moe_causal_lm_preprocessor import ( - Qwen3MoeCausalLMPreprocessor, + Qwen3MoeCausalLMPreprocessor as Qwen3MoeCausalLMPreprocessor, +) +from keras_hub.src.models.qwen_moe.qwen_moe_backbone import ( + QwenMoeBackbone as QwenMoeBackbone, +) +from keras_hub.src.models.qwen_moe.qwen_moe_causal_lm import ( + QwenMoeCausalLM as QwenMoeCausalLM, ) -from keras_hub.src.models.qwen_moe.qwen_moe_backbone import QwenMoeBackbone -from keras_hub.src.models.qwen_moe.qwen_moe_causal_lm import QwenMoeCausalLM from keras_hub.src.models.qwen_moe.qwen_moe_causal_lm_preprocessor import ( 
- QwenMoeCausalLMPreprocessor, + QwenMoeCausalLMPreprocessor as QwenMoeCausalLMPreprocessor, +) +from keras_hub.src.models.resnet.resnet_backbone import ( + ResNetBackbone as ResNetBackbone, ) -from keras_hub.src.models.resnet.resnet_backbone import ResNetBackbone from keras_hub.src.models.resnet.resnet_image_classifier import ( - ResNetImageClassifier, + ResNetImageClassifier as ResNetImageClassifier, ) from keras_hub.src.models.resnet.resnet_image_classifier_preprocessor import ( - ResNetImageClassifierPreprocessor, + ResNetImageClassifierPreprocessor as ResNetImageClassifierPreprocessor, +) +from keras_hub.src.models.retinanet.retinanet_backbone import ( + RetinaNetBackbone as RetinaNetBackbone, ) -from keras_hub.src.models.retinanet.retinanet_backbone import RetinaNetBackbone from keras_hub.src.models.retinanet.retinanet_object_detector import ( - RetinaNetObjectDetector, + RetinaNetObjectDetector as RetinaNetObjectDetector, ) from keras_hub.src.models.retinanet.retinanet_object_detector_preprocessor import ( - RetinaNetObjectDetectorPreprocessor, + RetinaNetObjectDetectorPreprocessor as RetinaNetObjectDetectorPreprocessor, +) +from keras_hub.src.models.roberta.roberta_backbone import ( + RobertaBackbone as RobertaBackbone, +) +from keras_hub.src.models.roberta.roberta_masked_lm import ( + RobertaMaskedLM as RobertaMaskedLM, ) -from keras_hub.src.models.roberta.roberta_backbone import RobertaBackbone -from keras_hub.src.models.roberta.roberta_masked_lm import RobertaMaskedLM from keras_hub.src.models.roberta.roberta_masked_lm_preprocessor import ( - RobertaMaskedLMPreprocessor, + RobertaMaskedLMPreprocessor as RobertaMaskedLMPreprocessor, ) from keras_hub.src.models.roberta.roberta_text_classifier import ( - RobertaTextClassifier, + RobertaTextClassifier as RobertaClassifier, ) from keras_hub.src.models.roberta.roberta_text_classifier import ( - RobertaTextClassifier as RobertaClassifier, + RobertaTextClassifier as RobertaTextClassifier, ) from keras_hub.src.models.roberta.roberta_text_classifier_preprocessor import ( - RobertaTextClassifierPreprocessor, + RobertaTextClassifierPreprocessor as RobertaPreprocessor, ) from keras_hub.src.models.roberta.roberta_text_classifier_preprocessor import ( - RobertaTextClassifierPreprocessor as RobertaPreprocessor, + RobertaTextClassifierPreprocessor as RobertaTextClassifierPreprocessor, +) +from keras_hub.src.models.roberta.roberta_tokenizer import ( + RobertaTokenizer as RobertaTokenizer, ) -from keras_hub.src.models.roberta.roberta_tokenizer import RobertaTokenizer from keras_hub.src.models.roformer_v2.roformer_v2_backbone import ( - RoformerV2Backbone, + RoformerV2Backbone as RoformerV2Backbone, ) from keras_hub.src.models.roformer_v2.roformer_v2_masked_lm import ( - RoformerV2MaskedLM, + RoformerV2MaskedLM as RoformerV2MaskedLM, ) from keras_hub.src.models.roformer_v2.roformer_v2_masked_lm_preprocessor import ( - RoformerV2MaskedLMPreprocessor, + RoformerV2MaskedLMPreprocessor as RoformerV2MaskedLMPreprocessor, ) from keras_hub.src.models.roformer_v2.roformer_v2_text_classifier import ( - RoformerV2TextClassifier, + RoformerV2TextClassifier as RoformerV2TextClassifier, ) from keras_hub.src.models.roformer_v2.roformer_v2_text_classifier_preprocessor import ( - RoformerV2TextClassifierPreprocessor, + RoformerV2TextClassifierPreprocessor as RoformerV2TextClassifierPreprocessor, ) from keras_hub.src.models.roformer_v2.roformer_v2_tokenizer import ( - RoformerV2Tokenizer, + RoformerV2Tokenizer as RoformerV2Tokenizer, +) +from 
keras_hub.src.models.rwkv7.rwkv7_backbone import ( + RWKV7Backbone as RWKV7Backbone, +) +from keras_hub.src.models.rwkv7.rwkv7_causal_lm import ( + RWKV7CausalLM as RWKV7CausalLM, ) -from keras_hub.src.models.rwkv7.rwkv7_backbone import RWKV7Backbone -from keras_hub.src.models.rwkv7.rwkv7_causal_lm import RWKV7CausalLM from keras_hub.src.models.rwkv7.rwkv7_causal_lm_preprocessor import ( - RWKV7CausalLMPreprocessor, + RWKV7CausalLMPreprocessor as RWKV7CausalLMPreprocessor, +) +from keras_hub.src.models.sam.sam_backbone import SAMBackbone as SAMBackbone +from keras_hub.src.models.sam.sam_image_segmenter import ( + SAMImageSegmenter as SAMImageSegmenter, ) -from keras_hub.src.models.rwkv7.rwkv7_tokenizer import RWKVTokenizer -from keras_hub.src.models.sam.sam_backbone import SAMBackbone -from keras_hub.src.models.sam.sam_image_segmenter import SAMImageSegmenter from keras_hub.src.models.sam.sam_image_segmenter_preprocessor import ( - SAMImageSegmenterPreprocessor, + SAMImageSegmenterPreprocessor as SAMImageSegmenterPreprocessor, +) +from keras_hub.src.models.segformer.segformer_backbone import ( + SegFormerBackbone as SegFormerBackbone, ) -from keras_hub.src.models.segformer.segformer_backbone import SegFormerBackbone from keras_hub.src.models.segformer.segformer_image_segmenter import ( - SegFormerImageSegmenter, + SegFormerImageSegmenter as SegFormerImageSegmenter, ) from keras_hub.src.models.segformer.segformer_image_segmenter_preprocessor import ( - SegFormerImageSegmenterPreprocessor, -) -from keras_hub.src.models.seq_2_seq_lm import Seq2SeqLM -from keras_hub.src.models.seq_2_seq_lm_preprocessor import Seq2SeqLMPreprocessor -from keras_hub.src.models.siglip.siglip_backbone import SigLIPBackbone -from keras_hub.src.models.siglip.siglip_preprocessor import SigLIPPreprocessor -from keras_hub.src.models.siglip.siglip_text_encoder import SigLIPTextEncoder -from keras_hub.src.models.siglip.siglip_tokenizer import SigLIPTokenizer + SegFormerImageSegmenterPreprocessor as SegFormerImageSegmenterPreprocessor, +) +from keras_hub.src.models.seq_2_seq_lm import Seq2SeqLM as Seq2SeqLM +from keras_hub.src.models.seq_2_seq_lm_preprocessor import ( + Seq2SeqLMPreprocessor as Seq2SeqLMPreprocessor, +) +from keras_hub.src.models.siglip.siglip_backbone import ( + SigLIPBackbone as SigLIPBackbone, +) +from keras_hub.src.models.siglip.siglip_preprocessor import ( + SigLIPPreprocessor as SigLIPPreprocessor, +) +from keras_hub.src.models.siglip.siglip_text_encoder import ( + SigLIPTextEncoder as SigLIPTextEncoder, +) +from keras_hub.src.models.siglip.siglip_tokenizer import ( + SigLIPTokenizer as SigLIPTokenizer, +) from keras_hub.src.models.siglip.siglip_vision_encoder import ( - SigLIPVisionEncoder, + SigLIPVisionEncoder as SigLIPVisionEncoder, ) from keras_hub.src.models.stable_diffusion_3.stable_diffusion_3_backbone import ( - StableDiffusion3Backbone, + StableDiffusion3Backbone as StableDiffusion3Backbone, ) from keras_hub.src.models.stable_diffusion_3.stable_diffusion_3_image_to_image import ( - StableDiffusion3ImageToImage, + StableDiffusion3ImageToImage as StableDiffusion3ImageToImage, ) from keras_hub.src.models.stable_diffusion_3.stable_diffusion_3_inpaint import ( - StableDiffusion3Inpaint, + StableDiffusion3Inpaint as StableDiffusion3Inpaint, ) from keras_hub.src.models.stable_diffusion_3.stable_diffusion_3_text_to_image import ( - StableDiffusion3TextToImage, + StableDiffusion3TextToImage as StableDiffusion3TextToImage, ) from 
keras_hub.src.models.stable_diffusion_3.stable_diffusion_3_text_to_image_preprocessor import ( - StableDiffusion3TextToImagePreprocessor, + StableDiffusion3TextToImagePreprocessor as StableDiffusion3TextToImagePreprocessor, +) +from keras_hub.src.models.t5.t5_backbone import T5Backbone as T5Backbone +from keras_hub.src.models.t5.t5_preprocessor import ( + T5Preprocessor as T5Preprocessor, +) +from keras_hub.src.models.t5.t5_tokenizer import T5Tokenizer as T5Tokenizer +from keras_hub.src.models.t5gemma.t5gemma_backbone import ( + T5GemmaBackbone as T5GemmaBackbone, +) +from keras_hub.src.models.t5gemma.t5gemma_seq_2_seq_lm import ( + T5GemmaSeq2SeqLM as T5GemmaSeq2SeqLM, ) -from keras_hub.src.models.t5.t5_backbone import T5Backbone -from keras_hub.src.models.t5.t5_preprocessor import T5Preprocessor -from keras_hub.src.models.t5.t5_tokenizer import T5Tokenizer -from keras_hub.src.models.t5gemma.t5gemma_backbone import T5GemmaBackbone -from keras_hub.src.models.t5gemma.t5gemma_seq_2_seq_lm import T5GemmaSeq2SeqLM from keras_hub.src.models.t5gemma.t5gemma_seq_2_seq_lm_preprocessor import ( - T5GemmaSeq2SeqLMPreprocessor, + T5GemmaSeq2SeqLMPreprocessor as T5GemmaSeq2SeqLMPreprocessor, ) -from keras_hub.src.models.t5gemma.t5gemma_tokenizer import T5GemmaTokenizer -from keras_hub.src.models.task import Task -from keras_hub.src.models.text_classifier import TextClassifier +from keras_hub.src.models.t5gemma.t5gemma_tokenizer import ( + T5GemmaTokenizer as T5GemmaTokenizer, +) +from keras_hub.src.models.task import Task as Task from keras_hub.src.models.text_classifier import TextClassifier as Classifier +from keras_hub.src.models.text_classifier import ( + TextClassifier as TextClassifier, +) from keras_hub.src.models.text_classifier_preprocessor import ( - TextClassifierPreprocessor, + TextClassifierPreprocessor as TextClassifierPreprocessor, ) -from keras_hub.src.models.text_to_image import TextToImage +from keras_hub.src.models.text_to_image import TextToImage as TextToImage from keras_hub.src.models.text_to_image_preprocessor import ( - TextToImagePreprocessor, + TextToImagePreprocessor as TextToImagePreprocessor, +) +from keras_hub.src.models.vgg.vgg_backbone import VGGBackbone as VGGBackbone +from keras_hub.src.models.vgg.vgg_image_classifier import ( + VGGImageClassifier as VGGImageClassifier, ) -from keras_hub.src.models.vgg.vgg_backbone import VGGBackbone -from keras_hub.src.models.vgg.vgg_image_classifier import VGGImageClassifier from keras_hub.src.models.vgg.vgg_image_classifier_preprocessor import ( - VGGImageClassifierPreprocessor, + VGGImageClassifierPreprocessor as VGGImageClassifierPreprocessor, +) +from keras_hub.src.models.vit.vit_backbone import ViTBackbone as ViTBackbone +from keras_hub.src.models.vit.vit_image_classifier import ( + ViTImageClassifier as ViTImageClassifier, ) -from keras_hub.src.models.vit.vit_backbone import ViTBackbone -from keras_hub.src.models.vit.vit_image_classifier import ViTImageClassifier from keras_hub.src.models.vit.vit_image_classifier_preprocessor import ( - ViTImageClassifierPreprocessor, + ViTImageClassifierPreprocessor as ViTImageClassifierPreprocessor, +) +from keras_hub.src.models.vit_det.vit_det_backbone import ( + ViTDetBackbone as ViTDetBackbone, +) +from keras_hub.src.models.whisper.whisper_backbone import ( + WhisperBackbone as WhisperBackbone, +) +from keras_hub.src.models.whisper.whisper_tokenizer import ( + WhisperTokenizer as WhisperTokenizer, +) +from keras_hub.src.models.xception.xception_backbone import ( + XceptionBackbone as 
XceptionBackbone, ) -from keras_hub.src.models.vit_det.vit_det_backbone import ViTDetBackbone -from keras_hub.src.models.whisper.whisper_backbone import WhisperBackbone -from keras_hub.src.models.whisper.whisper_tokenizer import WhisperTokenizer -from keras_hub.src.models.xception.xception_backbone import XceptionBackbone from keras_hub.src.models.xception.xception_image_classifier import ( - XceptionImageClassifier, + XceptionImageClassifier as XceptionImageClassifier, ) from keras_hub.src.models.xception.xception_image_classifier_preprocessor import ( - XceptionImageClassifierPreprocessor, + XceptionImageClassifierPreprocessor as XceptionImageClassifierPreprocessor, ) from keras_hub.src.models.xlm_roberta.xlm_roberta_backbone import ( - XLMRobertaBackbone, + XLMRobertaBackbone as XLMRobertaBackbone, ) from keras_hub.src.models.xlm_roberta.xlm_roberta_masked_lm import ( - XLMRobertaMaskedLM, + XLMRobertaMaskedLM as XLMRobertaMaskedLM, ) from keras_hub.src.models.xlm_roberta.xlm_roberta_masked_lm_preprocessor import ( - XLMRobertaMaskedLMPreprocessor, + XLMRobertaMaskedLMPreprocessor as XLMRobertaMaskedLMPreprocessor, ) from keras_hub.src.models.xlm_roberta.xlm_roberta_text_classifier import ( - XLMRobertaTextClassifier, + XLMRobertaTextClassifier as XLMRobertaClassifier, ) from keras_hub.src.models.xlm_roberta.xlm_roberta_text_classifier import ( - XLMRobertaTextClassifier as XLMRobertaClassifier, + XLMRobertaTextClassifier as XLMRobertaTextClassifier, ) from keras_hub.src.models.xlm_roberta.xlm_roberta_text_classifier_preprocessor import ( - XLMRobertaTextClassifierPreprocessor, + XLMRobertaTextClassifierPreprocessor as XLMRobertaPreprocessor, ) from keras_hub.src.models.xlm_roberta.xlm_roberta_text_classifier_preprocessor import ( - XLMRobertaTextClassifierPreprocessor as XLMRobertaPreprocessor, + XLMRobertaTextClassifierPreprocessor as XLMRobertaTextClassifierPreprocessor, ) from keras_hub.src.models.xlm_roberta.xlm_roberta_tokenizer import ( - XLMRobertaTokenizer, + XLMRobertaTokenizer as XLMRobertaTokenizer, +) +from keras_hub.src.models.xlnet.xlnet_backbone import ( + XLNetBackbone as XLNetBackbone, ) -from keras_hub.src.models.xlnet.xlnet_backbone import XLNetBackbone -from keras_hub.src.tokenizers.tokenizer import Tokenizer +from keras_hub.src.tokenizers.tokenizer import Tokenizer as Tokenizer diff --git a/keras_hub/api/samplers/__init__.py b/keras_hub/api/samplers/__init__.py index 9feb76c669..29bfef00fc 100644 --- a/keras_hub/api/samplers/__init__.py +++ b/keras_hub/api/samplers/__init__.py @@ -4,13 +4,15 @@ since your modifications would be overwritten. 
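Taken together, the model registrations above and the tokenizer registration below expose the new RWKV-7 symbols on the public API. A sketch of the resulting import surface, assuming the package wires the generated `keras_hub.api.*` modules to the top-level namespaces as usual; no released presets are implied:

import keras_hub

# Symbols registered by this patch (see the api diffs above and below):
backbone_cls = keras_hub.models.RWKV7Backbone
lm_cls = keras_hub.models.RWKV7CausalLM
preprocessor_cls = keras_hub.models.RWKV7CausalLMPreprocessor
tokenizer_cls = keras_hub.tokenizers.RWKVTokenizer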
""" -from keras_hub.src.samplers.beam_sampler import BeamSampler -from keras_hub.src.samplers.contrastive_sampler import ContrastiveSampler -from keras_hub.src.samplers.greedy_sampler import GreedySampler -from keras_hub.src.samplers.random_sampler import RandomSampler -from keras_hub.src.samplers.sampler import Sampler -from keras_hub.src.samplers.serialization import deserialize -from keras_hub.src.samplers.serialization import get -from keras_hub.src.samplers.serialization import serialize -from keras_hub.src.samplers.top_k_sampler import TopKSampler -from keras_hub.src.samplers.top_p_sampler import TopPSampler +from keras_hub.src.samplers.beam_sampler import BeamSampler as BeamSampler +from keras_hub.src.samplers.contrastive_sampler import ( + ContrastiveSampler as ContrastiveSampler, +) +from keras_hub.src.samplers.greedy_sampler import GreedySampler as GreedySampler +from keras_hub.src.samplers.random_sampler import RandomSampler as RandomSampler +from keras_hub.src.samplers.sampler import Sampler as Sampler +from keras_hub.src.samplers.serialization import deserialize as deserialize +from keras_hub.src.samplers.serialization import get as get +from keras_hub.src.samplers.serialization import serialize as serialize +from keras_hub.src.samplers.top_k_sampler import TopKSampler as TopKSampler +from keras_hub.src.samplers.top_p_sampler import TopPSampler as TopPSampler diff --git a/keras_hub/api/tokenizers/__init__.py b/keras_hub/api/tokenizers/__init__.py index b13023ef3e..264bc8bdd4 100644 --- a/keras_hub/api/tokenizers/__init__.py +++ b/keras_hub/api/tokenizers/__init__.py @@ -4,69 +4,127 @@ since your modifications would be overwritten. """ -from keras_hub.src.models.albert.albert_tokenizer import AlbertTokenizer -from keras_hub.src.models.bart.bart_tokenizer import BartTokenizer -from keras_hub.src.models.bert.bert_tokenizer import BertTokenizer -from keras_hub.src.models.bloom.bloom_tokenizer import BloomTokenizer -from keras_hub.src.models.clip.clip_tokenizer import CLIPTokenizer +from keras_hub.src.models.albert.albert_tokenizer import ( + AlbertTokenizer as AlbertTokenizer, +) +from keras_hub.src.models.bart.bart_tokenizer import ( + BartTokenizer as BartTokenizer, +) +from keras_hub.src.models.bert.bert_tokenizer import ( + BertTokenizer as BertTokenizer, +) +from keras_hub.src.models.bloom.bloom_tokenizer import ( + BloomTokenizer as BloomTokenizer, +) +from keras_hub.src.models.clip.clip_tokenizer import ( + CLIPTokenizer as CLIPTokenizer, +) from keras_hub.src.models.deberta_v3.deberta_v3_tokenizer import ( - DebertaV3Tokenizer, + DebertaV3Tokenizer as DebertaV3Tokenizer, ) from keras_hub.src.models.distil_bert.distil_bert_tokenizer import ( - DistilBertTokenizer, -) -from keras_hub.src.models.electra.electra_tokenizer import ElectraTokenizer -from keras_hub.src.models.esm.esm_tokenizer import ESMTokenizer -from keras_hub.src.models.f_net.f_net_tokenizer import FNetTokenizer -from keras_hub.src.models.falcon.falcon_tokenizer import FalconTokenizer -from keras_hub.src.models.gemma.gemma_tokenizer import GemmaTokenizer -from keras_hub.src.models.gemma3.gemma3_tokenizer import Gemma3Tokenizer -from keras_hub.src.models.gpt2.gpt2_tokenizer import GPT2Tokenizer -from keras_hub.src.models.gpt_neo_x.gpt_neo_x_tokenizer import GPTNeoXTokenizer -from keras_hub.src.models.llama.llama_tokenizer import LlamaTokenizer -from keras_hub.src.models.llama3.llama3_tokenizer import Llama3Tokenizer -from keras_hub.src.models.mistral.mistral_tokenizer import MistralTokenizer -from 
keras_hub.src.models.mixtral.mixtral_tokenizer import MixtralTokenizer + DistilBertTokenizer as DistilBertTokenizer, +) +from keras_hub.src.models.electra.electra_tokenizer import ( + ElectraTokenizer as ElectraTokenizer, +) +from keras_hub.src.models.esm.esm_tokenizer import ESMTokenizer as ESMTokenizer +from keras_hub.src.models.f_net.f_net_tokenizer import ( + FNetTokenizer as FNetTokenizer, +) +from keras_hub.src.models.falcon.falcon_tokenizer import ( + FalconTokenizer as FalconTokenizer, +) +from keras_hub.src.models.gemma.gemma_tokenizer import ( + GemmaTokenizer as GemmaTokenizer, +) +from keras_hub.src.models.gemma3.gemma3_tokenizer import ( + Gemma3Tokenizer as Gemma3Tokenizer, +) +from keras_hub.src.models.gpt2.gpt2_tokenizer import ( + GPT2Tokenizer as GPT2Tokenizer, +) +from keras_hub.src.models.gpt_neo_x.gpt_neo_x_tokenizer import ( + GPTNeoXTokenizer as GPTNeoXTokenizer, +) +from keras_hub.src.models.llama.llama_tokenizer import ( + LlamaTokenizer as LlamaTokenizer, +) +from keras_hub.src.models.llama3.llama3_tokenizer import ( + Llama3Tokenizer as Llama3Tokenizer, +) +from keras_hub.src.models.mistral.mistral_tokenizer import ( + MistralTokenizer as MistralTokenizer, +) +from keras_hub.src.models.mixtral.mixtral_tokenizer import ( + MixtralTokenizer as MixtralTokenizer, +) from keras_hub.src.models.moonshine.moonshine_tokenizer import ( - MoonshineTokenizer, + MoonshineTokenizer as MoonshineTokenizer, ) -from keras_hub.src.models.opt.opt_tokenizer import OPTTokenizer +from keras_hub.src.models.opt.opt_tokenizer import OPTTokenizer as OPTTokenizer from keras_hub.src.models.pali_gemma.pali_gemma_tokenizer import ( - PaliGemmaTokenizer, + PaliGemmaTokenizer as PaliGemmaTokenizer, +) +from keras_hub.src.models.parseq.parseq_tokenizer import ( + PARSeqTokenizer as PARSeqTokenizer, +) +from keras_hub.src.models.phi3.phi3_tokenizer import ( + Phi3Tokenizer as Phi3Tokenizer, ) -from keras_hub.src.models.parseq.parseq_tokenizer import PARSeqTokenizer -from keras_hub.src.models.phi3.phi3_tokenizer import Phi3Tokenizer -from keras_hub.src.models.qwen.qwen_tokenizer import QwenTokenizer from keras_hub.src.models.qwen.qwen_tokenizer import ( QwenTokenizer as Qwen2Tokenizer, ) -from keras_hub.src.models.qwen3_moe.qwen3_moe_tokenizer import Qwen3MoeTokenizer -from keras_hub.src.models.qwen_moe.qwen_moe_tokenizer import QwenMoeTokenizer -from keras_hub.src.models.roberta.roberta_tokenizer import RobertaTokenizer +from keras_hub.src.models.qwen.qwen_tokenizer import ( + QwenTokenizer as QwenTokenizer, +) +from keras_hub.src.models.qwen3_moe.qwen3_moe_tokenizer import ( + Qwen3MoeTokenizer as Qwen3MoeTokenizer, +) +from keras_hub.src.models.qwen_moe.qwen_moe_tokenizer import ( + QwenMoeTokenizer as QwenMoeTokenizer, +) +from keras_hub.src.models.roberta.roberta_tokenizer import ( + RobertaTokenizer as RobertaTokenizer, +) from keras_hub.src.models.roformer_v2.roformer_v2_tokenizer import ( - RoformerV2Tokenizer, + RoformerV2Tokenizer as RoformerV2Tokenizer, +) +from keras_hub.src.models.rwkv7.rwkv7_tokenizer import ( + RWKVTokenizer as RWKVTokenizer, +) +from keras_hub.src.models.siglip.siglip_tokenizer import ( + SigLIPTokenizer as SigLIPTokenizer, +) +from keras_hub.src.models.t5.t5_tokenizer import T5Tokenizer as T5Tokenizer +from keras_hub.src.models.t5gemma.t5gemma_tokenizer import ( + T5GemmaTokenizer as T5GemmaTokenizer, +) +from keras_hub.src.models.whisper.whisper_tokenizer import ( + WhisperTokenizer as WhisperTokenizer, ) -from keras_hub.src.models.rwkv7.rwkv7_tokenizer import 
RWKVTokenizer -from keras_hub.src.models.siglip.siglip_tokenizer import SigLIPTokenizer -from keras_hub.src.models.t5.t5_tokenizer import T5Tokenizer -from keras_hub.src.models.t5gemma.t5gemma_tokenizer import T5GemmaTokenizer -from keras_hub.src.models.whisper.whisper_tokenizer import WhisperTokenizer from keras_hub.src.models.xlm_roberta.xlm_roberta_tokenizer import ( - XLMRobertaTokenizer, + XLMRobertaTokenizer as XLMRobertaTokenizer, +) +from keras_hub.src.tokenizers.byte_pair_tokenizer import ( + BytePairTokenizer as BytePairTokenizer, +) +from keras_hub.src.tokenizers.byte_tokenizer import ( + ByteTokenizer as ByteTokenizer, ) -from keras_hub.src.tokenizers.byte_pair_tokenizer import BytePairTokenizer -from keras_hub.src.tokenizers.byte_tokenizer import ByteTokenizer from keras_hub.src.tokenizers.sentence_piece_tokenizer import ( - SentencePieceTokenizer, + SentencePieceTokenizer as SentencePieceTokenizer, ) from keras_hub.src.tokenizers.sentence_piece_tokenizer_trainer import ( - compute_sentence_piece_proto, + compute_sentence_piece_proto as compute_sentence_piece_proto, ) -from keras_hub.src.tokenizers.tokenizer import Tokenizer +from keras_hub.src.tokenizers.tokenizer import Tokenizer as Tokenizer from keras_hub.src.tokenizers.unicode_codepoint_tokenizer import ( - UnicodeCodepointTokenizer, + UnicodeCodepointTokenizer as UnicodeCodepointTokenizer, +) +from keras_hub.src.tokenizers.word_piece_tokenizer import ( + WordPieceTokenizer as WordPieceTokenizer, ) -from keras_hub.src.tokenizers.word_piece_tokenizer import WordPieceTokenizer from keras_hub.src.tokenizers.word_piece_tokenizer_trainer import ( - compute_word_piece_vocabulary, + compute_word_piece_vocabulary as compute_word_piece_vocabulary, ) diff --git a/keras_hub/api/utils/__init__.py b/keras_hub/api/utils/__init__.py index 8ce47790b0..0bd8cb642e 100644 --- a/keras_hub/api/utils/__init__.py +++ b/keras_hub/api/utils/__init__.py @@ -4,10 +4,18 @@ since your modifications would be overwritten. """ -from keras_hub.src.utils.coco.coco_utils import coco_id_to_name -from keras_hub.src.utils.coco.coco_utils import coco_name_to_id +from keras_hub.src.utils.coco.coco_utils import ( + coco_id_to_name as coco_id_to_name, +) +from keras_hub.src.utils.coco.coco_utils import ( + coco_name_to_id as coco_name_to_id, +) +from keras_hub.src.utils.imagenet.imagenet_utils import ( + decode_imagenet_predictions as decode_imagenet_predictions, +) +from keras_hub.src.utils.imagenet.imagenet_utils import ( + imagenet_id_to_name as imagenet_id_to_name, +) from keras_hub.src.utils.imagenet.imagenet_utils import ( - decode_imagenet_predictions, + imagenet_name_to_id as imagenet_name_to_id, ) -from keras_hub.src.utils.imagenet.imagenet_utils import imagenet_id_to_name -from keras_hub.src.utils.imagenet.imagenet_utils import imagenet_name_to_id diff --git a/keras_hub/src/models/rwkv7/rwkv7_backbone_test.py b/keras_hub/src/models/rwkv7/rwkv7_backbone_test.py new file mode 100644 index 0000000000..e061c0e3e6 --- /dev/null +++ b/keras_hub/src/models/rwkv7/rwkv7_backbone_test.py @@ -0,0 +1,37 @@ +from keras import ops + +from keras_hub.src.models.rwkv7.rwkv7_backbone import RWKV7Backbone +from keras_hub.src.tests.test_case import TestCase + + +class RWKV7BackboneTest(TestCase): + def setUp(self): + """ + Set up the test case with default arguments and input data. 
+ """ + self.init_kwargs = { + "vocabulary_size": 10, + "hidden_size": 16, + "num_layers": 2, + "head_size": 4, + "intermediate_dim": 32, + "gate_lora": 32, + "mv_lora": 16, + "aaa_lora": 16, + "decay_lora": 16, + } + self.input_data = ops.ones((2, 5), dtype="int32") + self.backbone = RWKV7Backbone(**self.init_kwargs) + + def test_backbone_basics(self): + """ + Test basic functionality of the RWKV7 backbone. + """ + y = self.backbone(self.input_data) + self.assertEqual(y.shape, (2, 5, 10)) + + def test_num_parameters(self): + """ + Test that the model has the expected number of parameters. + """ + self.assertEqual(self.backbone.count_params(), 10208) diff --git a/keras_hub/src/models/rwkv7/rwkv7_causal_lm.py b/keras_hub/src/models/rwkv7/rwkv7_causal_lm.py index a57d1d1773..c7a33c1fb9 100644 --- a/keras_hub/src/models/rwkv7/rwkv7_causal_lm.py +++ b/keras_hub/src/models/rwkv7/rwkv7_causal_lm.py @@ -36,10 +36,10 @@ class RWKV7CausalLM(CausalLM): # Initialize the tokenizer and load assets from a local path. tokenizer = RWKVTokenizer() tokenizer.load_assets(rwkv_path) - + # Create a preprocessor with a sequence length of 8. preprocessor = RWKV7CausalLMPreprocessor(tokenizer, sequence_length=8) - + # Initialize the model with a backbone and preprocessor. causal_lm = RWKV7CausalLM(backbone, preprocessor) @@ -59,7 +59,7 @@ class RWKV7CausalLM(CausalLM): def __init__(self, backbone, preprocessor=None, **kwargs): """Initialize the RWKV-7 causal language model. - + Args: backbone: The backbone model. preprocessor: The preprocessor for tokenization. @@ -87,7 +87,7 @@ def call_with_cache( `call_with_cache` adds an additional forward pass for the model for autoregressive inference. Unlike calling the model directly, this method - allows caching previous state Tensors in RWKV layers, and avoids + allows caching previous state Tensors in RWKV layers, and avoids recomputing the outputs of seen tokens. Args: @@ -249,4 +249,4 @@ def next(prompt, cache, index): return { "token_ids": token_ids, "padding_mask": padding_mask, - } \ No newline at end of file + } diff --git a/keras_hub/src/models/rwkv7/rwkv7_causal_lm_preprocessor.py b/keras_hub/src/models/rwkv7/rwkv7_causal_lm_preprocessor.py index 6187a07f35..0071cda60b 100644 --- a/keras_hub/src/models/rwkv7/rwkv7_causal_lm_preprocessor.py +++ b/keras_hub/src/models/rwkv7/rwkv7_causal_lm_preprocessor.py @@ -44,13 +44,13 @@ class RWKV7CausalLMPreprocessor(CausalLMPreprocessor): # Initialize the tokenizer and load assets from a local path. tokenizer = RWKVTokenizer() tokenizer.load_assets(rwkv_path) - + # Create a preprocessor with a sequence length of 8. preprocessor = RWKV7CausalLMPreprocessor(tokenizer, sequence_length=8) - + # Tokenize and pack a batch of sentences. preprocessor(["Bubble sort\n```python", "Hello World\n```python\n"]) - + # Preprocess inputs for generation with a maximum generation length of 16. preprocessor.generate_preprocess( ["Bubble sort\n```python", "Hello World\n```python\n"], 16 @@ -91,6 +91,7 @@ class RWKV7CausalLMPreprocessor(CausalLMPreprocessor): 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=torch.int32)} """ + backbone_cls = RWKV7Backbone tokenizer_cls = RWKVTokenizer @@ -101,7 +102,7 @@ def __init__( **kwargs, ): """Initialize the preprocessor. - + Args: tokenizer: The tokenizer to use. add_start_token: Whether to add start token. @@ -119,19 +120,26 @@ def call( sequence_length=None, ): """Preprocess the input for training. - + Args: x: Input text data. y: Target data (optional). sample_weight: Sample weights (optional). 
sequence_length: Desired sequence length. - + Returns: Preprocessed data tuple (x, y, sample_weight). """ + if isinstance(x, str): + x = [x] sequence_length = sequence_length or self.sequence_length # Pad length to multiples of 16 to meet kernel requirements - sequence_length = sequence_length + (16 - sequence_length % 16) + if sequence_length is None: + raise (ValueError("`sequence_length` must be specified.")) + if (sequence_length - 1) % 16 != 0: + sequence_length = sequence_length + ( + 16 - (sequence_length - 1) % 16 + ) x = self.tokenizer(x) token_ids, padding_mask = self.packer( @@ -158,28 +166,35 @@ def build(self, input_shape): def generate_preprocess( self, x, - sequence_length=None, + sequence_length, ): """Preprocess input for generation. - + Args: x: Input text data. sequence_length: Maximum generation length. - + Returns: Dictionary with preprocessed inputs for generation. """ + if isinstance(x, str): + x = [x] + if not self.built: self.build(None) # Align with Keras API # Input sequence_length is the maximum generation length # While self.sequence_length corresponds to the prefill max length generate_length = sequence_length + if sequence_length is None: + raise (ValueError("`sequence_length` must be specified.")) sequence_length = self.sequence_length # Pad length to multiples of 16 to meet kernel requirements - sequence_length = sequence_length + (16 - sequence_length % 16) - generate_length = generate_length + (16 - generate_length % 16) + if sequence_length % 16 != 0: + sequence_length = sequence_length + (16 - sequence_length % 16) + if generate_length % 16 != 0: + generate_length = generate_length + (16 - generate_length % 16) x = [t[-sequence_length:] for t in self.tokenizer(x)] y = ops.zeros((len(x), generate_length), "int32") @@ -206,14 +221,14 @@ def generate_postprocess( x, ): """Convert integer token output to strings for generation. - + This method reverses `generate_preprocess()`, by first removing all padding and start/end tokens, and then converting the integer sequence back to a string. - + Args: x: Dictionary containing token_ids and padding_mask. - + Returns: Detokenized string output. 
""" diff --git a/keras_hub/src/models/rwkv7/rwkv7_causal_lm_preprocessor_test.py b/keras_hub/src/models/rwkv7/rwkv7_causal_lm_preprocessor_test.py new file mode 100644 index 0000000000..a2648b9c4a --- /dev/null +++ b/keras_hub/src/models/rwkv7/rwkv7_causal_lm_preprocessor_test.py @@ -0,0 +1,98 @@ +import numpy as np + +from keras_hub.src.models.rwkv7.rwkv7_causal_lm_preprocessor import ( + RWKV7CausalLMPreprocessor, +) +from keras_hub.src.models.rwkv7.rwkv7_tokenizer import RWKVTokenizer +from keras_hub.src.tests.test_case import TestCase + + +class RWKV7CausalLMPreprocessorTest(TestCase): + def setUp(self): + self.tokenizer = RWKVTokenizer( + ["1 ' ' 1", "2 '\\n' 1", "3 'the' 3", "4 'hello' 5", "5 'world' 5"] + ) + self.preprocessor = RWKV7CausalLMPreprocessor( + tokenizer=self.tokenizer, + sequence_length=15, + ) + + def test_preprocessor_basics(self): + result = self.preprocessor(x=["hello world hello world hello world"]) + self.assertAllEqual( + result[0], [[0, 0, 0, 0, 0, 0, 4, 1, 5, 1, 4, 1, 5, 1, 4, 1]] + ) + self.assertAllEqual( + result[1], [[0, 0, 0, 0, 0, 4, 1, 5, 1, 4, 1, 5, 1, 4, 1, 5]] + ) + self.assertAllEqual( + result[2], + [ + [ + False, + False, + False, + False, + False, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + ] + ], + ) + + def test_generate_preprocess(self): + result = self.preprocessor.generate_preprocess( + ["hello world hello world hello world"], 16 + ) + self.assertAllEqual( + result["token_ids"], + [[0, 0, 0, 0, 0, 0, 4, 1, 5, 1, 4, 1, 5, 1, 4, 1]], + ) + self.assertAllEqual( + result["padding_mask"], + [ + [ + True, + False, + False, + False, + False, + False, + False, + False, + False, + False, + False, + False, + False, + False, + False, + False, + ] + ], + ) + self.assertAllEqual( + result["predict_token_ids"], + [[5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], + ) + + def test_generate_postprocess(self): + input_data = { + "token_ids": np.array( + [[3, 2, 4, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]] + ), + "padding_mask": np.array( + [[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]] + ), + } + result = self.preprocessor.generate_postprocess(input_data) + self.assertEqual(result, ["the\nhellothe"]) diff --git a/keras_hub/src/models/rwkv7/rwkv7_causal_lm_test.py b/keras_hub/src/models/rwkv7/rwkv7_causal_lm_test.py new file mode 100644 index 0000000000..ed84ef8205 --- /dev/null +++ b/keras_hub/src/models/rwkv7/rwkv7_causal_lm_test.py @@ -0,0 +1,92 @@ +from keras_hub.src.models.rwkv7.rwkv7_backbone import RWKV7Backbone +from keras_hub.src.models.rwkv7.rwkv7_causal_lm import RWKV7CausalLM +from keras_hub.src.models.rwkv7.rwkv7_causal_lm_preprocessor import ( + RWKV7CausalLMPreprocessor, +) +from keras_hub.src.models.rwkv7.rwkv7_tokenizer import RWKVTokenizer +from keras_hub.src.tests.test_case import TestCase + + +class RWKV7CausalLMTest(TestCase): + def setUp(self): + """ + Set up the test case with vocabulary, merges, preprocessor, backbone, + and other initialization parameters. 
+        """
+        # Create a small vocabulary for testing
+        self.vocab = [
+            "0 ' ' 1",
+            "1 '\\n' 1",
+            "2 'the' 3",
+            "3 'hello' 5",
+            "4 'world' 5",
+            "5 'python' 6",
+        ]
+
+        # Initialize tokenizer with test vocabulary
+        self.tokenizer = RWKVTokenizer(vocabulary=self.vocab)
+
+        # Create preprocessor with sequence length of 16
+        self.preprocessor = RWKV7CausalLMPreprocessor(
+            tokenizer=self.tokenizer,
+            sequence_length=16,
+        )
+
+        # Create a small backbone for testing
+        self.backbone = RWKV7Backbone(
+            vocabulary_size=self.preprocessor.tokenizer.vocabulary_size() + 1,
+            hidden_size=16,
+            num_layers=2,
+            head_size=4,
+            intermediate_dim=32,
+            gate_lora=8,
+            mv_lora=4,
+            aaa_lora=4,
+            decay_lora=4,
+        )
+
+        # Initialize parameters for the causal LM
+        self.init_kwargs = {
+            "preprocessor": self.preprocessor,
+            "backbone": self.backbone,
+        }
+
+        self.causal_lm = RWKV7CausalLM(self.backbone, self.preprocessor)
+        self.causal_lm.compile(sampler="greedy")
+
+    def test_generate(self):
+        """
+        Test text generation functionality.
+        """
+
+        prompt = ["hello world"]
+        output = self.causal_lm.generate(prompt, 16)
+        self.assertTrue(isinstance(output[0], str))
+        self.assertTrue(isinstance(output, list))
+
+        prompt = "hello world"
+        output = self.causal_lm.generate(prompt, 16)
+        self.assertTrue(isinstance(output, str))
+
+    def test_generate_strip_prompt(self):
+        """
+        Test that generated text can strip the prompt from output.
+        """
+        prompt = ["hello world"]
+        output = self.causal_lm.generate(prompt, 16, strip_prompt=True)
+        self.assertFalse(output[0].startswith(prompt[0]))
+
+    def test_generate_compilation(self):
+        """
+        Test that the generate function compiles correctly and
+        reuses compiled functions.
+        """
+
+        self.causal_lm.generate(["hello world"], 16)
+        first_fn = self.causal_lm.generate_function
+        self.causal_lm.generate(["hello world"], 16)
+        second_fn = self.causal_lm.generate_function
+        self.assertEqual(first_fn, second_fn)
+
+        self.causal_lm.compile(sampler="greedy")
+        self.assertIsNone(self.causal_lm.generate_function)
diff --git a/keras_hub/src/models/rwkv7/rwkv7_tokenizer_test.py b/keras_hub/src/models/rwkv7/rwkv7_tokenizer_test.py
new file mode 100644
index 0000000000..f49b39ccf5
--- /dev/null
+++ b/keras_hub/src/models/rwkv7/rwkv7_tokenizer_test.py
@@ -0,0 +1,25 @@
+from keras_hub.src.models.rwkv7.rwkv7_tokenizer import RWKVTokenizer
+from keras_hub.src.tests.test_case import TestCase
+
+
+class RWKV7TokenizerTest(TestCase):
+    def setUp(self):
+        self.tokenizer = RWKVTokenizer(
+            ["1 ' ' 1", "2 '\\n' 1", "3 'the' 3", "4 'hello' 5", "5 'world' 5"]
+        )
+
+    def test_tokenizer_basics(self):
+        result = self.tokenizer("hello world")
+        self.assertAllEqual(result, [[4, 1, 5]])
+
+    def test_vocabulary_size(self):
+        self.assertEqual(self.tokenizer.vocabulary_size(), 5)
+
+    def test_tokenize_and_detokenize(self):
+        # Test detokenization
+        text = self.tokenizer.detokenize([[4, 1, 5]])
+        self.assertEqual(text[0], "hello world")
+
+    def test_special_tokens(self):
+        self.assertEqual(self.tokenizer.pad_token_id, 0)
+        self.assertEqual(self.tokenizer.end_token_id, 2)

From 897a64b56e69319b4dad5ea8f83fa6d0c07e5286 Mon Sep 17 00:00:00 2001
From: pass_lin <935499957@qq.com>
Date: Wed, 8 Oct 2025 11:55:39 +0800
Subject: [PATCH 09/10] fix test

---
 .../src/models/rwkv7/rwkv7_causal_lm_test.py  | 28 +++++++++----------
 .../src/models/rwkv7/rwkv7_tokenizer_test.py  |  2 +-
 2 files changed, 14 insertions(+), 16 deletions(-)

diff --git a/keras_hub/src/models/rwkv7/rwkv7_causal_lm_test.py 
b/keras_hub/src/models/rwkv7/rwkv7_causal_lm_test.py index ed84ef8205..215fda095d 100644 --- a/keras_hub/src/models/rwkv7/rwkv7_causal_lm_test.py +++ b/keras_hub/src/models/rwkv7/rwkv7_causal_lm_test.py @@ -34,7 +34,7 @@ def setUp(self): # Create a small backbone for testing self.backbone = RWKV7Backbone( - vocabulary_size=self.preprocessor.tokenizer.vocabulary_size() + 1, + vocabulary_size=5, hidden_size=16, num_layers=2, head_size=4, @@ -51,21 +51,18 @@ def setUp(self): "backbone": self.backbone, } - self.causal_lm = RWKV7CausalLM(self.backbone, self.preprocessor) - self.causal_lm.compile(sampler="greedy") - def test_generate(self): """ Test text generation functionality. """ - + causal_lm = RWKV7CausalLM(self.backbone, self.preprocessor) prompt = ["hello world"] - output = self.causal_lm.generate(prompt, 16) + output = causal_lm.generate(prompt, 16) self.assertTrue(isinstance(output[0], str)) self.assertTrue(isinstance(output, list)) prompt = "hello world" - output = self.causal_lm.generate(prompt, 16) + output = causal_lm.generate(prompt, 16) self.assertTrue(isinstance(output, str)) def test_generate_strip_prompt(self): @@ -73,7 +70,8 @@ def test_generate_strip_prompt(self): Test that generated text can strip the prompt from output. """ prompt = ["hello world"] - output = self.causal_lm.generate(prompt, 16, strip_prompt=True) + causal_lm = RWKV7CausalLM(self.backbone, self.preprocessor) + output = causal_lm.generate(prompt, 16, strip_prompt=True) self.assertFalse(output[0].startswith(prompt[0])) def test_generate_compilation(self): @@ -81,12 +79,12 @@ def test_generate_compilation(self): Test that the generate function compiles correctly and reuses compiled functions. """ - - self.causal_lm.generate(["hello world"], 16) - first_fn = self.causal_lm.generate_function - self.causal_lm.generate(["hello world"], 16) - second_fn = self.causal_lm.generate_function + causal_lm = RWKV7CausalLM(self.backbone, self.preprocessor) + causal_lm.generate(["hello world"], 16) + first_fn = causal_lm.generate_function + causal_lm.generate(["hello world"], 16) + second_fn = causal_lm.generate_function self.assertEqual(first_fn, second_fn) - self.causal_lm.compile(sampler="greedy") - self.assertIsNone(self.causal_lm.generate_function) + causal_lm.compile(sampler="greedy") + self.assertIsNone(causal_lm.generate_function) diff --git a/keras_hub/src/models/rwkv7/rwkv7_tokenizer_test.py b/keras_hub/src/models/rwkv7/rwkv7_tokenizer_test.py index f49b39ccf5..69f76a2366 100644 --- a/keras_hub/src/models/rwkv7/rwkv7_tokenizer_test.py +++ b/keras_hub/src/models/rwkv7/rwkv7_tokenizer_test.py @@ -10,7 +10,7 @@ def setUp(self): def test_tokenizer_basics(self): result = self.tokenizer("hello world") - self.assertAllEqual(result, [[4, 1, 5]]) + self.assertAllEqual(result, [4, 1, 5]) def test_vocabulary_size(self): self.assertEqual(self.tokenizer.vocabulary_size(), 5) From ff11f946cfbf9d1883f575155f5f347d002b7da3 Mon Sep 17 00:00:00 2001 From: pass_lin <935499957@qq.com> Date: Wed, 8 Oct 2025 12:35:08 +0800 Subject: [PATCH 10/10] fix doc --- keras_hub/src/models/rwkv7/rwkv7_tokenizer.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/keras_hub/src/models/rwkv7/rwkv7_tokenizer.py b/keras_hub/src/models/rwkv7/rwkv7_tokenizer.py index bc2069a604..ef11a059e8 100644 --- a/keras_hub/src/models/rwkv7/rwkv7_tokenizer.py +++ b/keras_hub/src/models/rwkv7/rwkv7_tokenizer.py @@ -212,10 +212,13 @@ class RWKVTokenizer(tokenizer.Tokenizer): or string type. 
Examples: + ```python + vocab = ["0 ' ' 1", "1 '\\n' 1", "2 'the' 3", "3 'hello' 5"] + tok = RWKVTokenizer(vocabulary=vocab) + tok("hello the") + ``` - >>> vocab = ["0 ' ' 1", "1 '\\n' 1", "2 'the' 3", "3 'hello' 5"] - >>> tok = RWKVTokenizer(vocabulary=vocab) - >>> tok("hello the") + Output: [3, 0, 2] """
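
Usage sketch: the snippet below chains the pieces this series adds (tokenizer, preprocessor, backbone, causal LM), mirroring the toy fixtures in the test files above. The vocabulary, layer sizes, and the `vocabulary_size` sizing are illustrative assumptions copied from the tests, not a recommended configuration.

```python
from keras_hub.src.models.rwkv7.rwkv7_backbone import RWKV7Backbone
from keras_hub.src.models.rwkv7.rwkv7_causal_lm import RWKV7CausalLM
from keras_hub.src.models.rwkv7.rwkv7_causal_lm_preprocessor import (
    RWKV7CausalLMPreprocessor,
)
from keras_hub.src.models.rwkv7.rwkv7_tokenizer import RWKVTokenizer

# Toy vocabulary in the RWKV format: "<id> '<token>' <encoded length>".
vocab = ["0 ' ' 1", "1 '\\n' 1", "2 'the' 3", "3 'hello' 5", "4 'world' 5"]
tokenizer = RWKVTokenizer(vocabulary=vocab)

# Sequence lengths are padded up to a multiple of 16 internally (a kernel
# requirement), so sequence_length need not be a multiple of 16 here.
preprocessor = RWKV7CausalLMPreprocessor(tokenizer, sequence_length=16)

backbone = RWKV7Backbone(
    # Illustrative sizing, following the test fixtures.
    vocabulary_size=tokenizer.vocabulary_size() + 1,
    hidden_size=16,
    num_layers=2,
    head_size=4,
    intermediate_dim=32,
    gate_lora=8,
    mv_lora=4,
    aaa_lora=4,
    decay_lora=4,
)

causal_lm = RWKV7CausalLM(backbone, preprocessor)
causal_lm.compile(sampler="greedy")

# The second argument is the maximum generation length. A string prompt
# returns a string; a list of prompts returns a list.
print(causal_lm.generate("hello world", 16))
```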