diff --git a/QEfficient/cloud/execute.py b/QEfficient/cloud/execute.py index 4ce1cd747..27ea529cd 100644 --- a/QEfficient/cloud/execute.py +++ b/QEfficient/cloud/execute.py @@ -25,24 +25,57 @@ def main( full_batch_size: Optional[int] = None, ): """ - Helper function used by execute CLI app to run the Model on ``Cloud AI 100`` Platform. - - ``Mandatory`` Args: - :model_name (str): Hugging Face Model Card name, Example: ``gpt2``. - :qpc_path (str): Path to the generated binary after compilation. - ``Optional`` Args: - :device_group (List[int]): Device Ids to be used for compilation. if len(device_group) > 1. Multiple Card setup is enabled.``Defaults to None.`` - :local_model_dir (str): Path to custom model weights and config files. ``Defaults to None.`` - :prompt (str): Sample prompt for the model text generation. ``Defaults to None.`` - :prompts_txt_file_path (str): Path to txt file for multiple input prompts. ``Defaults to None.`` - :generation_len (int): Number of tokens to be generated. ``Defaults to None.`` - :cache_dir (str): Cache dir where downloaded HuggingFace files are stored. ``Defaults to Constants.CACHE_DIR.`` - :hf_token (str): HuggingFace login token to access private repos. ``Defaults to None.`` - :full_batch_size (int): Set full batch size to enable continuous batching mode. ``Defaults to None.`` + Main function for the QEfficient execution CLI application. + + This function serves as the entry point for running a compiled model + (QPC package) on the Cloud AI 100 Platform. It loads the necessary + tokenizer and then orchestrates the text generation inference. + + Parameters + ---------- + model_name : str + Hugging Face Model Card name (e.g., ``gpt2``) for loading the tokenizer. + qpc_path : str + Path to the generated binary (QPC package) after compilation. + + Other Parameters + ---------------- + device_group : List[int], optional + List of device IDs to be used for inference. If `len(device_group) > 1`, + a multi-card setup is enabled. Default is None. + local_model_dir : str, optional + Path to custom model weights and config files, used if not loading tokenizer + from Hugging Face Hub. Default is None. + prompt : str, optional + Sample prompt(s) for the model text generation. For batch size > 1, + pass multiple prompts separated by a pipe (``|``) symbol. Default is None. + prompts_txt_file_path : str, optional + Path to a text file containing multiple input prompts, one per line. Default is None. + generation_len : int, optional + Maximum number of tokens to be generated during inference. Default is None. + cache_dir : str, optional + Cache directory where downloaded HuggingFace files (like tokenizer) are stored. + Default is None. + hf_token : str, optional + HuggingFace login token to access private repositories. Default is None. + full_batch_size : int, optional + Ignored in this context as continuous batching is managed by the compiled QPC. + However, it might be passed through from CLI arguments. Default is None. + + Example + ------- + To execute a compiled model from the command line: .. code-block:: bash - python -m QEfficient.cloud.execute OPTIONS + python -m QEfficient.cloud.execute --model-name gpt2 --qpc-path /path/to/qpc/binaries --prompt "Hello world" + + For multi-device inference: + + .. 
code-block:: bash + + python -m QEfficient.cloud.execute --model-name gpt2 --qpc-path /path/to/qpc/binaries --device-group "[0,1]" --prompt "Hello | Hi" + """ tokenizer = load_hf_tokenizer( pretrained_model_name_or_path=(local_model_dir if local_model_dir else model_name), diff --git a/QEfficient/cloud/export.py b/QEfficient/cloud/export.py index 849325c9d..d2e3f66fc 100644 --- a/QEfficient/cloud/export.py +++ b/QEfficient/cloud/export.py @@ -25,16 +25,32 @@ def get_onnx_model_path( local_model_dir: Optional[str] = None, ): """ - exports the model to onnx if pre-exported file is not found and returns onnx_model_path - - ``Mandatory`` Args: - :model_name (str): Hugging Face Model Card name, Example: ``gpt2``. - ``Optional`` Args: - :cache_dir (str): Cache dir where downloaded HuggingFace files are stored. ``Defaults to None.`` - :tokenizer (Union[PreTrainedTokenizer, PreTrainedTokenizerFast]): Pass model tokenizer. ``Defaults to None.`` - :hf_token (str): HuggingFace login token to access private repos. ``Defaults to None.`` - :local_model_dir (str): Path to custom model weights and config files. ``Defaults to None.`` - :full_batch_size (int): Set full batch size to enable continuous batching mode. ``Defaults to None.`` + Exports the PyTorch model to ONNX format if a pre-exported file is not found, + and returns the path to the ONNX model. + + This function loads a Hugging Face model via QEFFCommonLoader, then calls + its export method to generate the ONNX graph. + + Parameters + ---------- + model_name : str + Hugging Face Model Card name (e.g., ``gpt2``). + + Other Parameters + ---------------- + cache_dir : str, optional + Cache directory where downloaded HuggingFace files are stored. Default is None. + hf_token : str, optional + HuggingFace login token to access private repositories. Default is None. + full_batch_size : int, optional + Sets the full batch size to enable continuous batching mode. Default is None. + local_model_dir : str, optional + Path to custom model weights and config files. Default is None. + + Returns + ------- + str + Path of the generated ONNX graph file. """ logger.info(f"Exporting Pytorch {model_name} model to ONNX...") @@ -58,20 +74,35 @@ def main( full_batch_size: Optional[int] = None, ) -> None: """ - Helper function used by export CLI app for exporting to ONNX Model. - - ``Mandatory`` Args: - :model_name (str): Hugging Face Model Card name, Example: ``gpt2``. - - ``Optional`` Args: - :cache_dir (str): Cache dir where downloaded HuggingFace files are stored. ``Defaults to None.`` - :hf_token (str): HuggingFace login token to access private repos. ``Defaults to None.`` - :local_model_dir (str): Path to custom model weights and config files. ``Defaults to None.`` - :full_batch_size (int): Set full batch size to enable continuous batching mode. ``Defaults to None.`` + Main function for the QEfficient ONNX export CLI application. + + This function serves as the entry point for exporting a PyTorch model, loaded + via QEFFCommonLoader, to the ONNX format. It prepares the necessary + paths and calls `get_onnx_model_path`. + + Parameters + ---------- + model_name : str + Hugging Face Model Card name (e.g., ``gpt2``). + + Other Parameters + ---------------- + cache_dir : str, optional + Cache directory where downloaded HuggingFace files are stored. Default is None. + hf_token : str, optional + HuggingFace login token to access private repositories. Default is None. + local_model_dir : str, optional + Path to custom model weights and config files. Default is None. 
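A minimal sketch of calling the ``get_onnx_model_path`` helper documented above directly from Python rather than through the CLI; the import path follows the file being patched here, and the keyword names are assumed from the documented parameters.

.. code-block:: python

    # Illustrative sketch -- keyword names assumed from the docstring above.
    from QEfficient.cloud.export import get_onnx_model_path

    onnx_path = get_onnx_model_path(
        model_name="gpt2",
        hf_token=None,          # set for private repositories
        full_batch_size=None,   # set to enable continuous batching
    )
    print(onnx_path)            # path of the generated ONNX graph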
+ full_batch_size : int, optional + Sets the full batch size to enable continuous batching mode. Default is None. + + Example + ------- + To export a model from the command line: .. code-block:: bash - python -m QEfficient.cloud.export OPTIONS + python -m QEfficient.cloud.export --model-name gpt2 --cache-dir /path/to/cache """ cache_dir = check_and_assign_cache_dir(local_model_dir, cache_dir) diff --git a/QEfficient/cloud/finetune.py b/QEfficient/cloud/finetune.py index b8138e8b0..35ebbde32 100644 --- a/QEfficient/cloud/finetune.py +++ b/QEfficient/cloud/finetune.py @@ -49,18 +49,29 @@ def setup_distributed_training(train_config: TrainConfig) -> None: - """Initialize distributed training environment if enabled. - - Args: - train_config (TrainConfig): Training configuration object. - - Notes: - - If distributed data parallel (DDP) is disabled, this function does nothing. - - Ensures the device is not CPU and does not specify an index for DDP compatibility. - - Initializes the process group using the specified distributed backend. - - Raises: - AssertionError: If device is CPU or includes an index with DDP enabled. + """ + Initialize the distributed training environment if Distributed Data Parallel (DDP) is enabled. + + This function configures the PyTorch distributed backend based on the device type + and initializes the process group. It also validates device availability and + pipeline parallelism settings. + + Parameters + ---------- + train_config : TrainConfig + Training configuration object containing settings for distributed training. + + Raises + ------ + AssertionError + If the number of required devices exceeds the total available devices. + If pipeline parallelism (`num_pp_stages`) is enabled but set to 1. + If DDP is enabled with a CPU device or with a specific device index (DDP requires device type only). + + Notes + ----- + - If `train_config.enable_ddp` is False, this function performs no action. + - Sets the appropriate device for each process in a distributed setup. """ torch_device = torch.device(train_config.device) @@ -85,13 +96,16 @@ def setup_distributed_training(train_config: TrainConfig) -> None: def setup_seeds(seed: int) -> None: - """Set random seeds across libraries for reproducibility. + """ + Set random seeds across multiple libraries for reproducibility. - Args: - seed (int): Seed value to set for random number generators. + This function ensures that random number generation is deterministic across PyTorch, + Python's built-in `random` module, and NumPy for consistent experiment results. - Notes: - - Sets seeds for PyTorch, Python's random module, and NumPy. + Parameters + ---------- + seed : int + The seed value to set for all random number generators. """ torch.use_deterministic_algorithms(True) # With this flag, PP+DDP works only for meta-llama/Llama-3.2-1B and mistralai/Mistral-7B-Instruct-v0.3 @@ -105,23 +119,35 @@ def setup_seeds(seed: int) -> None: def load_model_and_tokenizer( train_config: TrainConfig, dataset_config: Any, **kwargs ) -> tuple[AutoModelForCausalLM, AutoTokenizer]: - """Load the pre-trained model and tokenizer from Hugging Face. - - Args: - train_config (TrainConfig): Training configuration object containing model and tokenizer names. - dataset_config (Any): A dataclass object representing dataset configuration. - kwargs: Additional arguments to override PEFT config. - - Returns: - tuple: A tuple of two values. - - Model with pretrained weights loaded. - - Model's tokenizer (AutoTokenizer). 
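The seeding behaviour described for ``setup_seeds`` reduces to the calls named in its docstring and the surrounding code; a standalone sketch, not the finetuning module itself:

.. code-block:: python

    import random

    import numpy as np
    import torch

    def set_all_seeds(seed: int) -> None:
        # Force deterministic kernels, then seed every RNG the run touches.
        torch.use_deterministic_algorithms(True)
        torch.manual_seed(seed)
        random.seed(seed)
        np.random.seed(seed)

    set_all_seeds(42)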
- - Notes: - - Downloads the model if not already cached using login_and_download_hf_lm. - - Configures the model with FP16 precision and disables caching for training. - - Resizes model embeddings if tokenizer vocab size exceeds model embedding size. - - Sets pad_token_id to eos_token_id if not defined in the tokenizer. + """ + Load the pre-trained Hugging Face model and its corresponding tokenizer. + + This function handles model download, configuration (e.g., precision, caching), + and tokenizer setup. It also applies PEFT if enabled in the training configuration. + + Parameters + ---------- + train_config : TrainConfig + Training configuration object containing model and tokenizer names, task mode, etc. + dataset_config : Any + A dataclass object representing the dataset configuration, used for task-specific + model setup (e.g., number of labels for sequence classification). + **kwargs : + Additional arguments to override PEFT configuration parameters. + + Returns + ------- + tuple[Union[AutoModelForCausalLM, AutoModelForSequenceClassification], AutoTokenizer] + A tuple containing: + - The loaded model (either `AutoModelForCausalLM` or `AutoModelForSequenceClassification`). + - The model's tokenizer (`AutoTokenizer`). + + Raises + ------ + RuntimeError + If the Hugging Face model for sequence classification does not have + a `base_model_prefix` attribute when `task_mode` is `SEQ_CLASSIFICATION`. + If gradient checkpointing is enabled but the model does not support it. """ logger.log_rank_zero(f"Loading HuggingFace model for {train_config.model_name}") pretrained_model_path = hf_download( @@ -188,17 +214,26 @@ def load_model_and_tokenizer( def apply_peft(model: AutoModel, train_config: TrainConfig, **kwargs) -> Union[AutoModel, PeftModel]: - """Apply Parameter-Efficient Fine-Tuning (PEFT) to the model if enabled. - - Args: - model (AutoModel): Huggingface model. - train_config (TrainConfig): Training configuration object. - kwargs: Additional arguments to override PEFT config params. - - Returns: - Union[AutoModel, PeftModel]: If use_peft in train_config is True - then PeftModel object is returned else original model object - (AutoModel) is returned. + """ + Apply Parameter-Efficient Fine-Tuning (PEFT) to the model if enabled in the training configuration. + + This function configures and applies PEFT methods (e.g., LoRA) to the base model, + either from a pre-trained PEFT checkpoint or by generating a new PEFT configuration. + + Parameters + ---------- + model : AutoModel + The Hugging Face model to which PEFT will be applied. + train_config : TrainConfig + Training configuration object, specifying whether to use PEFT and if a checkpoint exists. + **kwargs : + Additional arguments to override PEFT configuration parameters. + + Returns + ------- + Union[AutoModel, PeftModel] + If `train_config.use_peft` is True, a `PeftModel` object is returned. + Otherwise, the original `AutoModel` object is returned. """ if not train_config.use_peft: return model @@ -221,26 +256,35 @@ def setup_dataloaders( dataset_config: Any, tokenizer: AutoTokenizer, ) -> tuple[torch.utils.data.DataLoader, Optional[torch.utils.data.DataLoader], int]: - """Set up training and validation DataLoaders. - - Args: - train_config (TrainConfig): Training configuration object. - dataset_config (Any): Configuration for the dataset (generated from train_config). - tokenizer (AutoTokenizer): Tokenizer for preprocessing data. - - Returns: - tuple: A tuple of three values. 
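For reference, the kind of LoRA setup that ``apply_peft`` wraps can be written directly against the public ``peft`` API as below; the rank, alpha, and target modules are illustrative values for a Llama-style model, not the defaults taken from ``TrainConfig``.

.. code-block:: python

    # Illustrative sketch using the public peft API; values are placeholders
    # and `model` is assumed to be the already-loaded Hugging Face model.
    from peft import LoraConfig, get_peft_model

    lora_config = LoraConfig(
        r=8,
        lora_alpha=32,
        target_modules=["q_proj", "v_proj"],
        lora_dropout=0.05,
        task_type="CAUSAL_LM",
    )
    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()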
- - First value represents train_dataloader - - Second value represents eval_dataloader. It is None if - validation is disabled. - - Length of longest sequence in the dataset. - - Raises: - RuntimeError: If validation is enabled but the validation set is too small. - - Notes: - - Applies a custom data collator if provided by get_custom_data_collator. - - Configures DataLoader kwargs using get_dataloader_kwargs for train and val splits. + """ + Set up training and optional validation DataLoaders based on the provided configurations. + + This function prepares `DataLoader` instances for both training and validation datasets, + applying necessary preprocessing and batching. It also determines the longest sequence + length in the combined dataset. + + Parameters + ---------- + train_config : TrainConfig + Training configuration object containing DataLoader settings (batch size, etc.) + and validation preferences. + dataset_config : Any + Configuration for the dataset, used to fetch and prepare splits. + tokenizer : AutoTokenizer + Tokenizer for preprocessing and tokenizing the dataset samples. + + Returns + ------- + tuple[torch.utils.data.DataLoader, Optional[torch.utils.data.DataLoader], int] + A tuple containing: + - `train_dataloader`: The DataLoader for the training dataset. + - `eval_dataloader`: The DataLoader for the validation dataset, or `None` if validation is disabled. + - `longest_seq_length`: The length of the longest sequence found in the dataset(s). + + Raises + ------ + ValueError + If validation is enabled but the resulting validation DataLoader is empty. """ train_dataloader = get_dataloader(tokenizer, dataset_config, train_config, split="train") @@ -268,24 +312,37 @@ def setup_dataloaders( def main(**kwargs) -> None: """ - Fine-tune a model on QAIC hardware with configurable training and LoRA parameters. + Fine-tune a Hugging Face model on Qualcomm AI 100 hardware with configurable training + and Parameter-Efficient Fine-Tuning (PEFT) parameters. + + This is the main entry point for the fine-tuning script. It orchestrates the + setup of distributed training, model and tokenizer loading, DataLoader creation, + optimizer and scheduler initialization, and the training loop. + + Parameters + ---------- + **kwargs : + Additional arguments used to override default parameters in `TrainConfig` + and PEFT configuration. These are typically parsed from command-line arguments. + + Example + ------- + To fine-tune a model using a YAML configuration file for PEFT: + + .. code-block:: bash - Args: - kwargs: Additional arguments to override TrainConfig. + python -m QEfficient.cloud.finetune \\ + --model_name "meta-llama/Llama-3.2-1B" \\ + --lr 5e-4 \\ + --peft_config_file "lora_config.yaml" - Example: - .. code-block:: bash + To fine-tune a model using a default LoRA configuration: - # Using a YAML config file for PEFT - python -m QEfficient.cloud.finetune \\ - --model_name "meta-llama/Llama-3.2-1B" \\ - --lr 5e-4 \\ - --peft_config_file "lora_config.yaml" + .. 
code-block:: bash - # Using default LoRA config - python -m QEfficient.cloud.finetune \\ - --model_name "meta-llama/Llama-3.2-1B" \\ - --lr 5e-4 + python -m QEfficient.cloud.finetune \\ + --model_name "meta-llama/Llama-3.2-1B" \\ + --lr 5e-4 """ train_config = TrainConfig() update_config(train_config, **kwargs) diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py index 1c620ad7d..c0274d8b5 100644 --- a/QEfficient/cloud/infer.py +++ b/QEfficient/cloud/infer.py @@ -34,21 +34,47 @@ def execute_vlm_model( generation_len: Optional[int] = None, ): """ - This method generates output by executing the compiled ``qpc`` on ``Cloud AI 100`` Hardware cards. - ``Mandatory`` Args: - :qeff_model (PreTrainedModel): QEfficient model object. - :model_name (str): Hugging Face Model Card name, Example: ``llava-hf/llava-1.5-7b-hf`` - :image_url (str): Image URL to be used for inference. ``Defaults to None.`` - :image_path (str): Image path to be used for inference. ``Defaults to None.`` - ``Optional`` Args: - :prompt (str): Sample prompt for the model text generation. ``Defaults to None.`` - :device_group (List[int]): Device Ids to be used for compilation. If ``len(device_group) > 1``, multiple Card setup is enabled. ``Defaults to None.`` - :local_model_dir (str): Path to custom model weights and config files. ``Defaults to None.`` - :cache_dir (str): Cache dir where downloaded HuggingFace files are stored. ``Defaults to None.`` - :hf_token (str): HuggingFace login token to access private repos. ``Defaults to None.`` - :generation_len (int): Number of tokens to be generated. ``Defaults to None.`` - Returns: - :dict: Output from the ``AI_100`` runtime. + Generate output from a compiled Vision-Language Model (VLM) on Cloud AI 100 hardware. + + This method takes a QEfficient VLM model, processes image and text inputs, and generates + text outputs using the compiled QPC. + + Parameters + ---------- + qeff_model : PreTrainedModel + QEfficient model object, expected to be an instance capable of VLM operations. + model_name : str + Hugging Face Model Card name (e.g., ``llava-hf/llava-1.5-7b-hf``) used for loading processor. + image_url : str + URL of the image to be used for inference. + image_path : str + Local file path to the image to be used for inference. + + Other Parameters + ---------------- + prompt : str, optional + Sample prompt for the model text generation. Default is None. + device_group : List[int], optional + List of device IDs to be used for inference. If ``len(device_group) > 1``, + multiple card setup is enabled. Default is None. + local_model_dir : str, optional + Path to custom model weights and config files, used if not loading from Hugging Face Hub. Default is None. + cache_dir : str, optional + Cache directory where downloaded HuggingFace files are stored. Default is None. + hf_token : str, optional + HuggingFace login token to access private repositories. Default is None. + generation_len : int, optional + Maximum number of tokens to be generated. Default is None. + + Returns + ------- + dict + Output from the ``AI_100`` runtime, typically containing generated text and performance metrics. + + Raises + ------ + ValueError + If neither ``image_url`` nor ``image_path`` is provided. """ if not (image_url or image_path): raise ValueError('Neither Image URL nor Image Path is found, either provide "image_url" or "image_path"') @@ -115,40 +141,91 @@ def main( **kwargs, ) -> None: """ - 1. Check if compiled qpc for given config already exists, if it does jump to execute, else - 2. 
Check if exported ONNX file already exists, if true, jump to compilation -> execution, else - 3. Check if HF model exists in cache, if true, start transform -> export -> compilation -> execution, else, - 4. Download HF model -> transform -> export -> compile -> execute - ``Mandatory`` Args: - :model_name (str): Hugging Face Model Card name, Example: ``gpt2`` - :num_cores (int): Number of cores to compile model on. - ``Optional`` Args: - :device_group (List[int]): Device Ids to be used for compilation. If ``len(device_group) > 1``, multiple Card setup is enabled. ``Defaults to None.`` - :prompt (str): Sample prompt for the model text generation. ``Defaults to None.`` - :prompts_txt_file_path (str): Path to txt file for multiple input prompts. ``Defaults to None.`` - :aic_enable_depth_first (bool): Enables ``DFS`` with default memory size. ``Defaults to False.`` - :mos (int): Effort level to reduce the on-chip memory. ``Defaults to 1.`` - :batch_size (int): Batch size to compile the model for. ``Defaults to 1.`` - :full_batch_size (int): Set full batch size to enable continuous batching mode. ``Default to None`` - :prompt_len (int): Prompt length for the model to compile. ``Defaults to 32.`` - :ctx_len (int): Maximum context length to compile the model. ``Defaults to 128.`` - :generation_len (int): Number of tokens to be generated. ``Defaults to False.`` - :mxfp6 (bool): Enable compilation for MXFP6 precision. ``Defaults to False.`` - :mxint8 (bool): Compress Present/Past KV to ``MXINT8`` using ``CustomIO`` config. ``Defaults to False.`` - :local_model_dir (str): Path to custom model weights and config files. ``Defaults to None.`` - :cache_dir (str): Cache dir where downloaded HuggingFace files are stored. ``Defaults to None.`` - :hf_token (str): HuggingFace login token to access private repos. ``Defaults to None.`` - :allow_mxint8_mdp_io (bool): Allows MXINT8 compression of MDP IO traffic. ``Defaults to False.`` - :enable_qnn (bool): Enables QNN Compilation. ``Defaults to False.`` - :qnn_config (str): Path of QNN Config parameters file. ``Defaults to None.`` - :trust_remote_code (bool): Trust remote code execution. ``Defaults to False.`` - :kwargs: Pass any compiler option as input. Any flag that is supported by `qaic-exec` can be passed. Params are converted to flags as below: - -allocator_dealloc_delay=1 -> -allocator-dealloc-delay=1 - -qpc_crc=True -> -qpc-crc + Main entry point for the QEfficient inference script. + + This function handles the end-to-end process of downloading, optimizing, + compiling, and executing a HuggingFace model on Cloud AI 100 hardware. + The process follows these steps: + + 1. Checks for an existing compiled QPC package. If found, it jumps directly to execution. + 2. Checks for an existing exported ONNX file. If true, it proceeds to compilation then execution. + 3. Checks if the HuggingFace model exists in the cache. If true, it performs model transformation, ONNX export, compilation, and then execution. + 4. If none of the above, it downloads the HuggingFace model, then performs transformation, ONNX export, compilation, and execution. + + Parameters + ---------- + model_name : str + Hugging Face Model Card name (e.g., ``gpt2``) or path to a local model. + num_cores : int + Number of cores to compile the model on. + + Other Parameters + ---------------- + device_group : List[int], optional + List of device IDs to be used for compilation and inference. If ``len(device_group) > 1``, + a multiple card setup is enabled. Default is None. 
+ prompt : str, optional + Sample prompt(s) for the model text generation. For batch size > 1, + pass multiple prompts separated by a pipe (``|``) symbol. Default is None. + prompts_txt_file_path : str, optional + Path to a text file containing multiple input prompts, one per line. Default is None. + aic_enable_depth_first : bool, optional + Enables Depth-First Search (DFS) with default memory size during compilation. Default is False. + mos : int, optional + Effort level to reduce on-chip memory. Default is 1. + batch_size : int, optional + Batch size to compile the model for. Default is 1. + full_batch_size : int, optional + Sets the full batch size to enable continuous batching mode. Default is None. + prompt_len : int, optional + Prompt length for the model to compile. Default is 32. + ctx_len : int, optional + Maximum context length to compile the model for. Default is 128. + generation_len : int, optional + Maximum number of tokens to be generated during inference. Default is None. + mxfp6 : bool, optional + Enables compilation for MXFP6 precision for constant MatMul weights. Default is False. + A warning is issued as ``--mxfp6`` is deprecated; use ``--mxfp6-matmul`` instead. + mxint8 : bool, optional + Compresses Present/Past KV to ``MXINT8`` using ``CustomIO`` config. Default is False. + A warning is issued as ``--mxint8`` is deprecated; use ``--mxint8-kv-cache`` instead. + local_model_dir : str, optional + Path to custom model weights and config files. Default is None. + cache_dir : str, optional + Cache directory where downloaded HuggingFace files are stored. Default is None. + hf_token : str, optional + HuggingFace login token to access private repositories. Default is None. + allow_mxint8_mdp_io : bool, optional + Allows MXINT8 compression of MDP IO traffic during compilation. Default is False. + enable_qnn : bool or str, optional + Enables QNN compilation. Can be passed as a flag (True) or with a configuration file path (str). + If a string path is provided, it's treated as ``qnn_config``. Default is False. + qnn_config : str, optional + Path of the QNN Config parameters file. Default is None. + trust_remote_code : bool, optional + If True, trusts remote code when loading models from HuggingFace. Default is False. + **kwargs : + Additional compiler options passed directly to `qaic-exec`. Any flag supported by + `qaic-exec` can be passed. Parameters are converted to flags as follows: + + - ``-allocator_dealloc_delay=1`` -> ``-allocator-dealloc-delay=1`` + - ``-qpc_crc=True`` -> ``-qpc-crc`` + + Example + ------- + To run inference from the command line: + + .. code-block:: bash + + python -m QEfficient.cloud.infer --model-name gpt2 --num-cores 16 --prompt "Hello world" + + For advanced compilation options: .. code-block:: bash - python -m QEfficient.cloud.infer OPTIONS + python -m QEfficient.cloud.infer --model-name meta-llama/Llama-3.2-11B-Vision-Instruct \\ + --num-cores 16 --prompt "Describe this image." 
--image-url "https://example.com/image.jpg" \\ + --prefill-seq-len 32 --ctx-len 512 --img-size 560 --mxfp6-matmul """ cache_dir = check_and_assign_cache_dir(local_model_dir, cache_dir) diff --git a/QEfficient/compile/compile_helper.py b/QEfficient/compile/compile_helper.py index 56177cce9..c0e71b5f7 100644 --- a/QEfficient/compile/compile_helper.py +++ b/QEfficient/compile/compile_helper.py @@ -59,6 +59,61 @@ def compile_kv_model_on_cloud_ai_100( device_group: Optional[List[int]] = None, **kwargs, ) -> Tuple[bool, str]: + """ + Compiles an ONNX Key-Value (KV) model for Cloud AI 100 hardware using `qaic-exec`. + + This function sets up and executes the Qualcomm AI 100 compiler with various options + to generate a QPC package. + + Parameters + ---------- + onnx_path : str + Path to the ONNX model file to be compiled. + specializations_json : str + Path to the JSON file defining compilation specializations (batch size, sequence length, etc.). + num_cores : int + Number of cores to use for compilation on Cloud AI 100. + base_path : str + Base directory where QPC binaries will be stored (a `qpcs` subdirectory will be created). + mxfp6 : bool + If True, enables MXFP6 precision for MatMul weights. + custom_io_path : str + Path to the Custom IO list file (e.g., YAML format) specifying input/output data types. + aic_enable_depth_first : bool + If True, enables Depth-First Search (DFS) optimization with default memory size. + allow_mxint8_mdp_io : bool + If True, allows MXINT8 compression of MDP IO traffic. + + Other Parameters + ---------------- + mos : int, optional + Effort level to reduce on-chip memory. A value greater than 0 applies this effort. Default is -1 (no effort). + device_group : List[int], optional + List of device IDs for multi-device compilation (tensor slicing). If `len(device_group) > 1`, + a multi-device partition configuration is generated. Default is None. + **kwargs : + Additional compiler options passed directly to `qaic-exec`. These are formatted as + `-key=value` or `-key` for boolean flags. + + Returns + ------- + Tuple[bool, str] + A tuple containing: + - bool: True if compilation was successful, False otherwise. + - str: Path to the generated QPC binary directory. + + Raises + ------ + FileNotFoundError + If the `specializations_json` or `custom_io_path` files are not found. + RuntimeError + If the `qaic-exec` compilation process fails. + + Warnings + -------- + DeprecationWarning + This method will be removed soon; use `QEFFAutoModelForCausalLM.compile` instead. + """ warnings.warn( "\033[93mUse `QEFFAutoModelForCausalLM.compile` instead, this method will be removed soon.\033[0m", DeprecationWarning, @@ -144,32 +199,75 @@ def compile( **kwargs, ) -> str: """ - Compiles the given ``ONNX`` model using Cloud AI 100 platform SDK compiler and saves the compiled ``qpc`` package at ``qpc_path``. - Generates tensor-slicing configuration if multiple devices are passed in ``device_group``. - - This function will be deprecated soon and will be replaced by ``QEFFAutoModelForCausalLM.compile``. - - ``Mandatory`` Args: - :onnx_path (str): Generated ``ONNX`` Model Path. - :qpc_path (str): Path for saving compiled qpc binaries. - :num_cores (int): Number of cores to compile the model on. - ``Optional`` Args: - :device_group (List[int]): Used for finding the number of devices to compile for. ``Defaults to None.`` - :aic_enable_depth_first (bool): Enables ``DFS`` with default memory size. ``Defaults to False.`` - :mos (int): Effort level to reduce the on-chip memory. 
``Defaults to -1.`` - :batch_size (int): Batch size to compile the model for. ``Defaults to 1.`` - :full_batch_size (int): Set full batch size to enable continuous batching mode. ``Default to None`` - :prompt_len (int): Prompt length for the model to compile. ``Defaults to 32`` - :ctx_len (int): Maximum context length to compile the model. ``Defaults to 128`` - :mxfp6 (bool): Enable compilation for ``MXFP6`` precision. ``Defaults to True.`` - :mxint8 (bool): Compress Present/Past KV to ``MXINT8`` using ``CustomIO`` config. ``Defaults to False.`` - :custom_io_file_path (str): Path to ``customIO`` file (formatted as a string). ``Defaults to None.`` - :allow_mxint8_mdp_io (bool): Allows MXINT8 compression of MDP IO traffic ``Defaults to False.`` - :enable_qnn (bool): Enables QNN Compilation. ``Defaults to False.`` - :qnn_config (str): Path of QNN Config parameters file. ``Defaults to None.`` - - Returns: - :str: Path to compiled ``qpc`` package. + Compiles the given ONNX model using either the Cloud AI 100 platform SDK compiler + or the QNN compiler, and saves the compiled QPC package. + + This function handles the creation of specialization files, selection of custom IO + configurations, and execution of the appropriate compiler (QAIC or QNN). + It supports multi-device compilation for tensor slicing. + + Parameters + ---------- + onnx_path : str + Path to the generated ONNX model file. + qpc_path : str + Target directory path for saving the compiled QPC binaries. + num_cores : int + Number of cores to use for compilation. + + Other Parameters + ---------------- + device_group : List[int], optional + List of device IDs. Used to determine the number of devices for multi-device compilation. + Default is None. + aic_enable_depth_first : bool, optional + If True, enables Depth-First Search (DFS) optimization with default memory size during QAIC compilation. + Default is False. + mos : int, optional + Effort level to reduce on-chip memory during QAIC compilation. A value greater than 0 applies this effort. + Default is -1 (no effort). + batch_size : int, optional + Batch size to compile the model for. Default is 1. + full_batch_size : int, optional + Sets the full batch size to enable continuous batching mode. If provided, `batch_size` must be 1. + Default is None. + prompt_len : int, optional + Prompt length for the model to compile. Default is 32. + ctx_len : int, optional + Maximum context length to compile the model for. Default is 128. + mxfp6 : bool, optional + If True, enables MXFP6 precision for MatMul weights during compilation. Default is True. + mxint8 : bool, optional + If True, compresses Present/Past KV to MXINT8 using a CustomIO configuration. Default is False. + custom_io_file_path : str, optional + Explicit path to a Custom IO file (e.g., YAML format). If None, it's inferred based on `mxint8`. + Default is None. + allow_mxint8_mdp_io : bool, optional + If True, allows MXINT8 compression of MDP IO traffic during QAIC compilation. Default is False. + enable_qnn : bool, optional + If True, enables compilation using the QNN compiler instead of QAIC. Default is False. + qnn_config : str, optional + Path to the QNN Config parameters file, used if `enable_qnn` is True. Default is None. + **kwargs : + Additional compiler options passed directly to the chosen compiler. + + Returns + ------- + str + Path to the compiled QPC package directory. + + Raises + ------ + ValueError + If both `batch_size` and `full_batch_size` are greater than one (mutually exclusive in some contexts). 
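A hedged sketch of calling the ``compile`` helper documented above; the argument names follow the documented parameters, the ONNX and QPC paths are placeholders, and, as the deprecation note says, ``QEFFAutoModelForCausalLM.compile`` is the preferred route.

.. code-block:: python

    # Illustrative only -- paths are placeholders; prefer
    # QEFFAutoModelForCausalLM.compile for new code.
    from QEfficient.compile.compile_helper import compile as qeff_compile

    qpc_dir = qeff_compile(
        onnx_path="onnx/model.onnx",
        qpc_path="qpc",
        num_cores=16,
        batch_size=1,
        prompt_len=32,
        ctx_len=128,
        mxfp6=True,
        aic_enable_depth_first=True,
    )
    print(qpc_dir)  # directory holding the compiled QPC package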
+ FileNotFoundError + If required Custom IO files are not found. + + Warnings + -------- + DeprecationWarning + This method will be removed soon; use `QEFFAutoModelForCausalLM.compile` instead. + """ if full_batch_size and batch_size != 1: raise ValueError("Only either batch_size or full_batch_size should be greater than one") diff --git a/QEfficient/peft/auto.py b/QEfficient/peft/auto.py index 820372561..d854e4ad6 100644 --- a/QEfficient/peft/auto.py +++ b/QEfficient/peft/auto.py @@ -34,27 +34,29 @@ class QEffAutoPeftModelForCausalLM(QEFFBaseModel): """ - QEff class for loading models with PEFT adapters (Only LoRA is supported currently). - Once exported and compiled for an adapter, the same can be utilized for another adapter with same base model and adapter config. + QEfficient class for loading and running Causal Language Models with PEFT adapters (currently only LoRA is supported). - Args: - :model (nn.Module): PyTorch model + This class enables efficient inference and deployment of PEFT-adapted models on Cloud AI 100 hardware. + Once exported and compiled for an adapter, the same base model can be reused with other compatible adapters. - .. code-block:: python + Example: + .. code-block:: python - from QEfficient import QEffAutoPeftModelForCausalLM + from QEfficient import QEffAutoPeftModelForCausalLM - m = QEffAutoPeftModelForCausalLM.from_pretrained("predibase/magicoder", "magicoder") - m.export() - m.compile(prefill_seq_len=32, ctx_len=1024) + # Load a model with a LoRA adapter + m = QEffAutoPeftModelForCausalLM.from_pretrained("predibase/magicoder", "magicoder") + m.export() + m.compile(prefill_seq_len=32, ctx_len=1024) - inputs = ... # A coding prompt - outputs = m.generate(**inputs) + # Generate with the current adapter + inputs = ... # A coding prompt + outputs = m.generate(**inputs) - inputs = ... # A math prompt - m.load_adapter("predibase/gsm8k", "gsm8k") - m.set_adapter("gsm8k") - outputs = m.generate(**inputs) + # Switch to another adapter + m.load_adapter("predibase/gsm8k", "gsm8k") + m.set_adapter("gsm8k") + outputs = m.generate(**inputs) """ _pytorch_transforms: List[PytorchTransform] = [CustomOpsTransform, KVCacheTransform, PeftModelInputsTransform] @@ -62,6 +64,15 @@ class QEffAutoPeftModelForCausalLM(QEFFBaseModel): _hf_auto_class = AutoPeftModelForCausalLM def __init__(self, model: nn.Module): + """ + Initialize the QEffAutoPeftModelForCausalLM instance. + + Args: + model (nn.Module): A PyTorch model of type PeftModelForCausalLM with a LoRA adapter. + Raises: + TypeError: If the provided model is not a PeftModelForCausalLM. + NotImplementedError: If the adapter type is not LoRA. + """ if not isinstance(model, PeftModelForCausalLM): raise TypeError(f"Required pytorch module of type PeftModel, got {type(model)}") @@ -86,6 +97,12 @@ def __repr__(self) -> str: @property def model_name(self) -> str: + """ + Get the model name with "-lora" suffix. + + Returns: + str: The base model class name with "-lora" appended. + """ mname = self.model.get_base_model().__class__.__name__ + "-lora" if mname.startswith("QEff"): mname = mname[4:] @@ -93,6 +110,12 @@ def model_name(self) -> str: @property def model_hash(self) -> str: + """ + Compute a unique hash for the model configuration and adapter. + + Returns: + str: A 16-character SHA256 hash string. + """ # NOTE: model_config.to_diff_dict() has "_name_or_path" attribute which is the model card name or path. # Using same card name will result in same hash. 
But, using a relative path for one run and # absolute path for another run will result in different hash. @@ -109,27 +132,47 @@ def model_hash(self) -> str: @property def get_model_config(self) -> dict: + """ + Get the configuration dictionary of the underlying base model. + + Returns: + dict: The configuration dictionary. + """ return self.model.get_base_model().config.__dict__ + @property + def active_adapter(self) -> str: + """ + Get the currently active adapter name. + + Returns: + str: Name of the active adapter. + """ + return self.model.active_adapter + def load_adapter(self, model_id: str, adapter_name: str): - """Loads a new adapter from huggingface hub or local path + """ + Load a new adapter from the HuggingFace Hub or a local path. Args: - :model_id (str): Adapter model ID from huggingface hub or local path - :adapter_name (str): Adapter name to be used to set this adapter as current + model_id (str): Adapter model ID from HuggingFace Hub or local path. + adapter_name (str): Name to assign to the loaded adapter. """ self.model.load_adapter(model_id, adapter_name) self.adapter_weights[adapter_name] = { k: v.numpy().astype("float16") for k, v in load_peft_weights(model_id).items() } - @property - def active_adapter(self) -> str: - "Currently active adapter to be used for inference" - return self.model.active_adapter - def set_adapter(self, adapter_name: str): - "Sets active adapter from one of the loaded adapters" + """ + Set the active adapter from the loaded adapters. + + Args: + adapter_name (str): Name of the adapter to activate. + + Raises: + ValueError: If the adapter is incompatible with the export-time adapter. + """ if self.exported_peft_config is not None and self.exported_peft_config != self.model.peft_config[adapter_name]: raise ValueError( "Unable to activate incompatible adapter. " @@ -139,6 +182,12 @@ def set_adapter(self, adapter_name: str): self.model.set_adapter(adapter_name) def disable_adapter(self): + """ + Disable the currently active adapter. + + Raises: + NotImplementedError: Disabling adapters is not currently supported. + """ # TODO: Set zero tensors as adapter weights raise NotImplementedError("Disabling adapters not supported currently") @@ -151,11 +200,21 @@ def _from_pretrained(cls, pretrained_name_or_path: str, *args, **kwargs): @classmethod def from_pretrained(cls, pretrained_name_or_path: str, *args, **kwargs): """ + Load a QEffAutoPeftModelForCausalLM from a pretrained model and adapter. + Args: - :pretrained_name_or_path (str): Model card name from huggingface or local path to model directory. - :finite_adapters (bool): set True to enable finite adapter mode with QEffAutoLoraModelForCausalLM class. Please refer to QEffAutoLoraModelForCausalLM for API specification. - :adapter_name (str): Name used to identify loaded adapter. - :args, kwargs: Additional arguments to pass to peft.AutoPeftModelForCausalLM. + pretrained_name_or_path (str): Model card name from HuggingFace or local path to model directory. + finite_adapters (bool, optional): Set True to enable finite adapter mode with QEffAutoLoraModelForCausalLM class. + adapter_name (str, optional): Name used to identify the loaded adapter. + *args: Additional positional arguments for peft.AutoPeftModelForCausalLM. + **kwargs: Additional keyword arguments for peft.AutoPeftModelForCausalLM. + + Returns: + QEffAutoPeftModelForCausalLM: An instance initialized with the pretrained weights and adapter. + + Raises: + NotImplementedError: If continuous batching is requested (not supported). 
+ TypeError: If adapter name is missing in finite adapter mode. """ if kwargs.get("full_batch_size"): raise NotImplementedError("Continuous batching currently not supported for PEFT models") @@ -181,6 +240,19 @@ def from_pretrained(cls, pretrained_name_or_path: str, *args, **kwargs): return obj def export(self, export_dir: Optional[str] = None) -> str: + """ + Export the model with the active adapter to ONNX format. + + This method prepares example inputs and dynamic axes based on the model and adapter configuration, + then exports the model to an ONNX graph suitable for compilation and deployment on Cloud AI 100 hardware. + + Args: + export_dir (str, optional): Directory path where the exported ONNX graph will be saved. + If not provided, the default export directory is used. + + Returns: + str: Path to the generated ONNX graph file. + """ self.exported_peft_config = self.model.active_peft_config example_shape = (constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN) @@ -224,6 +296,27 @@ def compile( mxint8_kv_cache: bool = False, **compiler_options, ) -> str: + """ + Compile the exported ONNX model for Cloud AI 100 hardware. + + This method generates a QPC package. If the model has not been exported yet, this method will handle the export process. + Additional arguments for the QAIC compiler can be passed as keyword arguments. + + Args: + onnx_path (str, optional): Path to a pre-exported ONNX model. + compile_dir (str, optional): Directory to save the generated QPC package. + batch_size (int, optional): Batch size for compilation. Default is 1. + prefill_seq_len (int): Length of the prefill prompt. + ctx_len (int): Maximum context length the compiled model can remember. + num_devices (int, optional): Number of devices to compile for. Default is 1. + num_cores (int, optional): Number of cores to use for compilation. Default is 16. + mxfp6_matmul (bool, optional): Use MXFP6 compression for weights. Default is False. + mxint8_kv_cache (bool, optional): Use MXINT8 compression for KV cache. Default is False. + **compiler_options: Additional compiler options for QAIC. + + Returns: + str: Path to the compiled QPC package. + """ # Specializations specializations = [ {"batch_size": batch_size, "seq_len": prefill_seq_len, "ctx_len": ctx_len}, @@ -265,14 +358,20 @@ def generate( **kwargs, ) -> np.ndarray: """ - Generate tokens from compiled binary. This method takes same parameters as HuggingFace transformers model.generate() method. + Generate tokens from the compiled binary using the active adapter. + + This method takes similar parameters as HuggingFace's ``model.generate()`` method. Args: - :inputs: input_ids - :generation_config: Merge this generation_config with model-specific for the current generation. - :stopping_criteria: Pass custom stopping_criteria to stop at a specific point in generation. - :streamer: Streamer to put the generated tokens into. - :kwargs: Additional parameters for generation_config or to be passed to the model while generating. + inputs (torch.Tensor or np.ndarray, optional): Input IDs for generation. + device_ids (List[int], optional): Device IDs for running inference. + generation_config (GenerationConfig, optional): Generation configuration to merge with model-specific config. + stopping_criteria (StoppingCriteria, optional): Custom stopping criteria for generation. + streamer (BaseStreamer, optional): Streamer to receive generated tokens. + **kwargs: Additional parameters for generation_config or to be passed to the model. 
+ + Returns: + np.ndarray: Generated token IDs. """ # Initialize session if self.qpc_session is None: diff --git a/QEfficient/peft/lora/auto.py b/QEfficient/peft/lora/auto.py index 14cadf997..19afa2144 100644 --- a/QEfficient/peft/lora/auto.py +++ b/QEfficient/peft/lora/auto.py @@ -24,28 +24,37 @@ class QEffAutoLoraModelForCausalLM(QEFFAutoModelForCausalLM): """ - QEff class for loading models with multiple LoRA adapters. Currently only Mistral and Llama model are supported. - Once exported and compiled, the qpc can perform mixed batch inference with provided `prompt_to_adapter_mapping`. + QEfficient class for loading models with multiple LoRA adapters for causal language modeling. - Args: - :model (nn.Module): PyTorch model - :continuous_batching (bool): Weather this model will be used for continuous batching in future. If this is not set True here, the model can not be exported/compiled for continuous batching later. + This class enables mixed batch inference with different adapters on Cloud AI 100 hardware. + Currently, only Mistral and Llama models are supported. Once exported and compiled, the QPC can perform + mixed batch inference using the `prompt_to_adapter_mapping` argument. - .. code-block:: python + Example: + .. code-block:: python - from QEfficient.peft.lora import QEffAutoLoraModelForCausalLM + from QEfficient.peft.lora import QEffAutoLoraModelForCausalLM - m = QEffAutoPeftModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1") - m.load_adapter("predibase/gsm8k", "gsm8k") - m.load_adapter("predibase/magicoder", "magicoder") - m.compile(num_cores=16, device_group=[0]) - - prompts=["code prompt", "math prompt", "generic"] - m.generate(prompts, device_group=[0], prompt_to_adapter_mapping=["magicoder","gsm8k_id","base"]) + m = QEffAutoLoraModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1") + m.load_adapter("predibase/gsm8k", "gsm8k") + m.load_adapter("predibase/magicoder", "magicoder") + m.compile(num_cores=16, device_group=[0]) + prompts = ["code prompt", "math prompt", "generic"] + m.generate(prompts, device_group=[0], prompt_to_adapter_mapping=["magicoder", "gsm8k", "base"]) """ def __init__(self, model: nn.Module, continuous_batching: bool = False, **kwargs) -> None: + """ + Initialize a QEffAutoLoraModelForCausalLM instance. + + Args: + model (nn.Module): The underlying PyTorch model. + continuous_batching (bool, optional): Whether to enable continuous batching support. Default is False. + + Raises: + NotImplementedError: If the model is not a supported type (Mistral or Llama). + """ super().__init__(model, continuous_batching) if self.model.__class__.__name__ not in ["QEffMistralForCausalLM", "QEffLlamaForCausalLM"]: raise NotImplementedError( @@ -64,6 +73,12 @@ def __repr__(self) -> str: @property def model_hash(self) -> str: + """ + Compute a unique hash for the model configuration and all loaded adapters. + + Returns: + str: A 16-character SHA256 hash string representing the model and adapter state. + """ mhash = hashlib.sha256() # should use model config here @@ -92,6 +107,12 @@ def model_hash(self) -> str: @property def get_model_config(self) -> dict: + """ + Get the configuration dictionary of the underlying base model. + + Returns: + dict: The configuration dictionary. 
+ """ return self.model.model.config.__dict__ def download_adapter( @@ -102,14 +123,16 @@ def download_adapter( adapter_config: Optional[PeftConfig] = None, ): """ - Loads a new adapter from huggingface hub or local path into CPU cache - - ``Mandatory`` Args: - :adapter_model_id (str): Adapter model ID from huggingface hub or local path - :adapter_name (str): Adapter name to be used to downloaded this adapter - ``Optional`` Args: - :adapter_weight (dict): Adapter weight tensors in dictionary format - :adapter_config (PeftConfig): Adapter config in the format of PeftConfig + Download a new adapter from the HuggingFace Hub or a local path into CPU cache. + + Args: + adapter_model_id (str): Adapter model ID from HuggingFace Hub or local path. + adapter_name (str): Name to assign to the downloaded adapter. + adapter_weight (dict, optional): Adapter weight tensors in dictionary format. + adapter_config (PeftConfig, optional): Adapter configuration object. + + Notes: + If both `adapter_weight` and `adapter_config` are provided, downloading from the Hub is skipped. """ # check if adapter name already loaded @@ -133,14 +156,19 @@ def load_adapter( adapter_config: Optional[PeftConfig] = None, ): """ - Load adapter into CPU cache and set it as active - - ``Mandatory`` Args: - :adapter_model_id (str): Adapter model ID from huggingface hub or local path - :adapter_name (str): Adapter name to be used to load this adapter - ``Optional`` Args: - :adapter_weight (dict): Adapter weight tensors in dictionary format - :adapter_config (PeftConfig): Adapter config in the format of PeftConfig + Load an adapter into CPU cache and set it as active. + + Args: + adapter_model_id (str): Adapter model ID from HuggingFace Hub or local path. + adapter_name (str): Name to assign to the loaded adapter. + adapter_weight (dict, optional): Adapter weight tensors in dictionary format. + adapter_config (PeftConfig, optional): Adapter configuration object. + + Returns: + int: The adapter ID assigned to the loaded adapter. + + Raises: + ValueError: If the adapter's target modules or rank do not match existing adapters. """ # check if adapter name already exist and activated @@ -170,10 +198,17 @@ def load_adapter( def unload_adapter(self, adapter_name: str): """ - Deactivate adpater and remove it from CPU cache + Deactivate and remove an adapter from CPU cache. + + Args: + adapter_name (str): Name of the adapter to unload. - ``Mandatory`` Args: - :adapter_name (str): Adapter name to be unloaded + Returns: + bool: True if the adapter was unloaded, False otherwise. + + Notes: + If the adapter is active, it will be deactivated and removed from cache. + You must re-export and re-compile the model after unloading adapters. """ # step1: remove from active list if it's there @@ -202,6 +237,12 @@ def unload_adapter(self, adapter_name: str): return True def set_adapter(self, adapter_name: str): + """ + Not supported in finite_adapters mode. + + Raises: + NotImplementedError: Always raised, as this operation is not supported. + """ raise NotImplementedError("Set adapter is not supported in finite_adapters mode") def _load_adapter_weights_to_model(self): @@ -286,14 +327,18 @@ def _init_adapter_model(self): def export(self, export_dir: Optional[str] = None) -> str: """ - Exports the model to ``ONNX`` format using ``torch.onnx.export``. - We currently don't support exporting non-transformed models. Please refer to the ``convert_to_cloud_bertstyle`` function in the **Low-Level API** for a legacy function that supports this." 
+ Export the model with all loaded adapters to ONNX format using ``torch.onnx.export``. + + The exported ONNX graph will support mixed batch inference with multiple adapters. - ``Optional`` Args: - does not any arguments. + Args: + export_dir (str, optional): Directory to save the exported ONNX graph. If not provided, the default export directory is used. Returns: - :str: Path of the generated ``ONNX`` graph. + str: Path to the generated ONNX graph. + + Raises: + ValueError: If no adapters are loaded. """ # initialize the adapter model @@ -352,18 +397,27 @@ def generate( **kwargs, ): """ - This method generates output until ``eos`` or ``generation_len`` by executing the compiled ``qpc`` on ``Cloud AI 100`` Hardware cards. - This is a sequential execution based on the ``batch_size`` of the compiled model and the number of prompts passed. - If the number of prompts cannot be divided by the ``batch_size``, the last unfulfilled batch will be dropped. - - ``Mandatory`` Args: - :tokenizer (PreTrainedTokenizerFast or PreTrainedTokenizer): The tokenizer used in the inference - :prompts (List[str]): List of prompts to run the execution. - :prompt_to_adapter_mapping (List[str]): The sequence of the adapter names will be matched with sequence of prompts and corresponding adapters will be used for the prompts."base" for base model (no adapter). - ``optional`` Args: - :device_id (List[int]): Device IDs to be used for execution. If ``len(device_id) > 1``, it enables multiple card setup. If ``None``, auto-device-picker will be used. ``Defaults to None``. - :runtime (str, optional): Only ``AI_100`` runtime is supported as of now; ``ONNXRT`` and ``PyTorch`` coming soon. Defaults to "AI_100". + Generate output for a batch of prompts using the compiled QPC on Cloud AI 100 hardware. + + This method supports mixed batch inference, where each prompt can use a different adapter as specified + by `prompt_to_adapter_mapping`. If the number of prompts is not divisible by the compiled batch size, + the last incomplete batch will be dropped. + + Args: + tokenizer (PreTrainedTokenizerFast or PreTrainedTokenizer): Tokenizer used for inference. + prompts (List[str]): List of prompts to generate outputs for. + prompt_to_adapter_mapping (List[str]): List of adapter names to use for each prompt. Use "base" for the base model (no adapter). + device_id (List[int], optional): Device IDs to use for execution. If `None`, auto-device-picker is used. + runtime (str, optional): Runtime to use. Only "AI_100" is currently supported. Default is "AI_100". + **kwargs: Additional generation parameters. + + Returns: + Model outputs for each prompt. + Raises: + ValueError: If runtime is not "AI_100". + TypeError: If the model has not been compiled. + RuntimeError: If the number of prompts does not match the number of adapter mappings. """ if runtime != "AI_100": raise ValueError("Only AI_100 runtime is supported right now via generate API") diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index b3d27f3a5..4b76d514a 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -60,7 +60,11 @@ class QEFFTransformersBase(QEFFBaseModel): """ - Parent class for models QEFF provides from transformers i.e. (AutoModel, AutoModelForCausalLM, AutoModelForAudioClassification etc.) from transformers/models/modeling_auto.py file. + Base class for QEfficient wrappers around HuggingFace transformer models. 
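A sketch of the mixed-batch ``generate`` call whose parameters are documented above for ``QEffAutoLoraModelForCausalLM``; it assumes ``m`` is the model already exported and compiled in the class example, and the tokenizer simply mirrors the base model card.

.. code-block:: python

    # Illustrative only -- `m` is the compiled model from the class example.
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
    m.generate(
        tokenizer=tokenizer,
        prompts=["code prompt", "math prompt", "generic"],
        prompt_to_adapter_mapping=["magicoder", "gsm8k", "base"],
    )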
+ + This class provides common functionality for loading, representing, and managing + HuggingFace models within the QEfficient framework. It serves as a parent + for specific model types like `AutoModel`, `AutoModelForCausalLM`, etc. """ _hf_auto_class: type @@ -81,6 +85,28 @@ def __repr__(self) -> str: @classmethod @with_replaced_quantizers def from_pretrained(cls, pretrained_model_name_or_path: str, *args, **kwargs): + """ + Load a QEfficient transformer model from a pretrained HuggingFace model or local path. + + This is the recommended way to initialize any QEfficient transformer model. + The interface is similar to ``transformers.AutoModel.from_pretrained``. + + Parameters + ---------- + pretrained_model_name_or_path : str + Model card name from HuggingFace or local path to model directory. + *args : + Positional arguments passed directly to `cls._hf_auto_class.from_pretrained`. + **kwargs : + Keyword arguments passed directly to `cls._hf_auto_class.from_pretrained`. + Note: `attn_implementation` and `low_cpu_mem_usage` are automatically + set to "eager" and False respectively to ensure compatibility. + + Returns + ------- + QEFFTransformersBase + An instance of the specific QEFFAutoModel subclass, initialized with the pretrained weights. + """ if kwargs.get("attn_implementation", None) not in {None, "eager"}: logger.warning('Updating attn_implementation="eager"') @@ -94,6 +120,14 @@ def from_pretrained(cls, pretrained_model_name_or_path: str, *args, **kwargs): @property def model_name(self) -> str: + """ + Get the name of the underlying HuggingFace model. + + Returns + ------- + str + The model's class name, with "QEff" or "QEFF" prefix removed if present. + """ mname = self.model.__class__.__name__ if mname.startswith("QEff") or mname.startswith("QEFF"): mname = mname[4:] @@ -101,12 +135,41 @@ def model_name(self) -> str: class MultimodalUtilityMixin: + """ + Mixin for multimodal models providing utilities like input auto-correction. + + This mixin ensures that inputs to multimodal models conform to the expected + names, shapes, and dtypes defined by the model's `get_inputs_info` method. + """ + def __new__(cls, *args, **kwargs): if cls is MultimodalUtilityMixin: raise TypeError(f"only children of '{cls.__name__}' may be instantiated") return object.__new__(cls) def auto_correct_inputs(self, inputs): + """ + Validates and corrects model inputs to match expected specifications. + + Checks if the provided inputs dictionary contains all required keys and + if the data types of the tensors match the model's specifications. + It then filters the input dictionary to only include expected inputs. + + Parameters + ---------- + inputs : Dict[str, torch.Tensor] + A dictionary of input tensors, where keys are input names and values are `torch.Tensor` objects. + + Returns + ------- + Dict[str, torch.Tensor] + A filtered dictionary of input tensors that match the model's expected inputs. + + Raises + ------ + RuntimeError + If any expected input is missing or has a mismatched data type. + """ checked = True inputs_info = self.model.get_inputs_info() for valid_input_info in inputs_info: @@ -132,29 +195,25 @@ def auto_correct_inputs(self, inputs): class QEFFAutoModel(QEFFTransformersBase): """ - The QEFFAutoModel class is designed for manipulating any transformer model from the HuggingFace hub. - Although it is possible to initialize the class directly, we highly recommend using the ``from_pretrained`` method for initialization. 
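To make the ``auto_correct_inputs`` contract above concrete, a sketch with placeholder tensors; the input names, shapes, and the ``qeff_vlm`` handle are all hypothetical and depend on the particular multimodal model.

.. code-block:: python

    # Illustrative only -- names, shapes, and `qeff_vlm` are hypothetical.
    import torch

    inputs = {
        "input_ids": torch.ones((1, 32), dtype=torch.int64),
        "pixel_values": torch.rand((1, 3, 336, 336), dtype=torch.float32),
        "debug_flag": torch.tensor(1),  # unexpected keys are filtered out
    }
    # Raises RuntimeError on missing keys or dtype mismatches; otherwise returns
    # only the inputs the model declares via get_inputs_info().
    filtered = qeff_vlm.auto_correct_inputs(inputs)
    print(list(filtered))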
+ QEfficient class for general transformer models from the HuggingFace hub (e.g., BERT, Sentence Transformers). - ``Mandatory`` Args: - :model (nn.Module): PyTorch model + This class provides a unified interface for loading, exporting, compiling, and running + various encoder-only transformer models on Cloud AI 100 hardware. It supports pooling + for embedding extraction. + Example + ------- .. code-block:: python from QEfficient import QEFFAutoModel from transformers import AutoTokenizer - # Initialize the model using from_pretrained similar to transformers.AutoModel. - model = QEFFAutoModel.from_pretrained("model_name") - - # Now you can directly compile the model for Cloud AI 100 - model.compile(num_cores=16) # Considering you have a Cloud AI 100 SKU - - #prepare input - tokenizer = AutoTokenizer.from_pretrained(model_name) + model = QEFFAutoModel.from_pretrained("bert-base-uncased", pooling="mean") + model.compile(num_cores=16) + tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") inputs = tokenizer("My name is", return_tensors="pt") - - # You can now execute the model - model.generate(inputs) + output = model.generate(inputs) + print(output) # Output will be a dictionary containing extracted features. """ _hf_auto_class = AutoModel @@ -162,6 +221,20 @@ class QEFFAutoModel(QEFFTransformersBase): _onnx_transforms = [FP16ClipTransform, SplitTensorsTransform] def __init__(self, model: nn.Module, pooling=None, **kwargs): + """ + Initializes a QEFFAutoModel instance. + + Parameters + ---------- + model : nn.Module + The underlying HuggingFace PyTorch model. + pooling : str or Callable, optional + The pooling method to use for feature extraction. + Options include: "mean", "max", "cls", "avg", or a custom Callable. + Default is None (no pooling applied). + **kwargs : + Additional keyword arguments passed to the base class constructor. + """ super().__init__(model, **kwargs) # Make Embedding specific transforms like appending pooling @@ -176,38 +249,34 @@ def __init__(self, model: nn.Module, pooling=None, **kwargs): @with_replaced_quantizers def from_pretrained(cls, pretrained_model_name_or_path, pooling=None, *args, **kwargs): """ - This method serves as the easiest entry point into using QEfficient. The interface is designed to be similar to transformers.AutoModel. - Once the model is initialized, you can use other methods such as export, compile, and generate on the same object. - - This API can also be used as exception for VLM model since transformers support loading InternChatVL models via AutoModel API we support it via AutoModelForCausalLM API - Args: - pretrained_model_name_or_path (str): The name or path of the pre-trained model. - pooling (Optional[Union[str, Callable]], optional): The pooling method to use. Defaults to None. - Options: - - "mean": Mean pooling - - "max": Max pooling - - "cls": CLS token pooling - - "avg": Average pooling - - Callable: A custom pooling function - - None: No pooling applied - - .. code-block:: python - - from QEfficient import QEFFAutoModel - from transformers import AutoTokenizer - - # Initialize the model using from_pretrained similar to transformers.AutoModel. 
- model = QEFFAutoModel.from_pretrained("model_name", pooling="mean") - - # Now you can directly compile the model for Cloud AI 100 - model.compile(num_cores=16) # Considering you have a Cloud AI 100 SKU - - #prepare input - tokenizer = AutoTokenizer.from_pretrained(model_name) - inputs = tokenizer("My name is", return_tensors="pt") - - # You can now execute the model - model.generate(inputs) + Load a QEfficient transformer model from a pretrained HuggingFace model or local path. + + This is the recommended way to initialize a QEfficient transformer model. The interface is similar to + ``transformers.AutoModel.from_pretrained``. Once initialized, you can use methods such as ``export``, ``compile``, and ``generate``. + + Parameters + ---------- + pretrained_model_name_or_path : str + Model card name from HuggingFace or local path to model directory. + pooling : str or Callable, optional + The pooling method to use. Options include: + - "mean": Mean pooling + - "max": Max pooling + - "cls": CLS token pooling + - "avg": Average pooling + - Callable: A custom pooling function + - None: No pooling applied. Default is None. + *args : + Positional arguments passed directly to `cls._hf_auto_class.from_pretrained`. + **kwargs : + Additional keyword arguments passed directly to `cls._hf_auto_class.from_pretrained`. + Note: `attn_implementation` and `low_cpu_mem_usage` are automatically + set to "eager" and False respectively to ensure compatibility. + + Returns + ------- + QEFFAutoModel + An instance initialized with the pretrained weights. """ if kwargs.get("attn_implementation", None) not in {None, "eager"}: logger.warning('Updating attn_implementation="eager"') @@ -230,17 +299,33 @@ def from_pretrained(cls, pretrained_model_name_or_path, pooling=None, *args, **k @property def get_model_config(self) -> dict: + """ + Get the model configuration as a dictionary. + + Returns + ------- + dict + The configuration dictionary of the underlying HuggingFace model. + """ return self.model.config.__dict__ def export(self, export_dir: Optional[str] = None) -> str: """ - Exports the model to ``ONNX`` format using ``torch.onnx.export``. + Export the model to ONNX format using ``torch.onnx.export``. + + This method prepares example inputs and dynamic axes based on the model configuration, + then exports the model to an ONNX graph suitable for compilation and deployment on Cloud AI 100 hardware. - ``Optional`` Args: - :export_dir (str, optional): The directory path to store ONNX-graph. + Parameters + ---------- + export_dir : str, optional + Directory path where the exported ONNX graph will be saved. If not provided, + the default export directory is used. - Returns: - :str: Path of the generated ``ONNX`` graph. + Returns + ------- + str + Path to the generated ONNX graph file. """ bs = constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE seq_len = constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN @@ -274,27 +359,37 @@ def compile( **compiler_options, ) -> str: """ - This method compiles the exported ``ONNX`` model using the Cloud AI 100 Platform SDK compiler binary found at ``/opt/qti-aic/exec/qaic-exec`` and generates a ``qpc`` package. - If the model has not been exported yet, this method will handle the export process. - You can pass any other arguments that the `qaic-exec` takes as extra kwargs. - - ``Optional`` Args: - :onnx_path (str, optional): Path to pre-exported onnx model. - :compile_dir (str, optional): Path for saving the qpc generated. 
- :seq_len (Union[int, List[int]]): The length of the prompt should be less that ``seq_len``. ``Defaults to 32``. - :batch_size (int, optional): Batch size. ``Defaults to 1``. - :num_devices (int): Number of devices the model needs to be compiled for. Defaults to 1. - :num_cores (int): Number of cores used to compile the model. - :mxfp6_matmul (bool, optional): Whether to use ``mxfp6`` compression for weights. ``Defaults to False``. - :compiler_options (dict, optional): Additional compiler options. - For QAIC Compiler: Extra arguments for qaic-exec can be passed. - :aic_enable_depth_first (bool, optional): Enables DFS with default memory size. ``Defaults to False``. - :allow_mxint8_mdp_io (bool, optional): Allows MXINT8 compression of MDP IO traffic. ``Defaults to False.`` - For QNN Compiler: Following arguments can be passed. - :enable_qnn (bool): Enables QNN Compilation. - :qnn_config (str): Path of QNN Config parameters file. Any extra parameters for QNN compilation can be passed via this file. - Returns: - :str: Path of the compiled ``qpc`` package. + Compile the exported ONNX model using the Cloud AI 100 Platform SDK compiler. + + This method generates a ``qpc`` package. If the model has not been exported yet, + this method will handle the export process. Additional arguments for the `qaic-exec` + compiler can be passed as keyword arguments. + + Parameters + ---------- + onnx_path : str, optional + Path to a pre-exported ONNX model. If not provided, the model will be exported first. + compile_dir : str, optional + Directory to save the generated QPC package. If not provided, a default directory is used. + seq_len : int or list of int, optional + The length(s) of the prompt(s) to compile for. Can be a single integer or a list of integers + to create multiple specializations. Default is 32. + batch_size : int, optional + Batch size. Default is 1. + num_devices : int, optional + Number of devices to compile for. Default is 1. + num_cores : int, optional + Number of cores to use for compilation. + mxfp6_matmul : bool, optional + Use MXFP6 compression for weights. Default is False. + **compiler_options : dict + Additional compiler options for QAIC or QNN compilers. These are passed directly + to the underlying compilation command. + + Returns + ------- + str + Path to the compiled QPC package. """ if isinstance(seq_len, list) and len(seq_len) >= 15: @@ -323,14 +418,26 @@ def generate( runtime_ai100: bool = True, ) -> Union[torch.Tensor, np.ndarray]: """ - This method generates output by executing PyTorch runtime or the compiled ``qpc`` on ``Cloud AI 100`` Hardware cards. - ``Mandatory`` Args: - :inputs (Union[torch.Tensor, np.ndarray]): inputs to run the execution. - ``optional`` Args: - :device_id (List[int]): Ids of devices for running the qpc pass as [0] in case of normal model / [0, 1, 2, 3] in case of tensor slicing model - :runtime_ai100 (bool, optional): ``AI_100`` and ``PyTorch`` runtime is supported as of now. Defaults to ``True`` for ``AI_100`` runtime. - Returns: - :dict: Output from the ``AI_100`` or ``PyTorch`` runtime. + Generate output by executing the compiled QPC on Cloud AI 100 hardware or using PyTorch runtime. + + This method runs sequential execution based on the compiled model's batch size and the number of prompts. + If the number of prompts is not divisible by the batch size, the last batch will be dropped. + + Parameters + ---------- + inputs : torch.Tensor or np.ndarray + Input data for the model. 
For AI 100 runtime, this typically includes + `input_ids` and `attention_mask`. + device_ids : list of int, optional + Device IDs for running the QPC. Defaults to `[0]` if not specified and `runtime_ai100` is True. + runtime_ai100 : bool, optional + Whether to use the AI 100 runtime for inference. If False, the PyTorch + runtime will be used. Default is True. + + Returns + ------- + torch.Tensor or np.ndarray + Output from the AI 100 or PyTorch runtime. The type depends on the runtime and model. """ # AI_100 runtime if runtime_ai100: @@ -348,15 +455,23 @@ def cloud_ai_100_feature_generate( device_ids: List[int] = [0], ) -> np.ndarray: """ - Generates features with list of prompts using AI 100 runtime. - - ``Mandatory`` Args: - :inputs (Union[torch.Tensor, np.ndarray]): inputs to run the execution. - ``Optional`` Args: - device_ids (List[int], optional): A list of device IDs to use for the session. Defaults to [0]. - - Returns: - np.ndarray: A list of dictionaries containing the generated output features. + Generate features for a batch of inputs using the Cloud AI 100 hardware runtime. + + This method runs inference on the compiled QPC using the Cloud AI 100 accelerator. + It automatically pads input tensors to match the compiled sequence length and handles session setup. + + Parameters + ---------- + inputs : torch.Tensor or np.ndarray + Input tensors for feature extraction. Must be a dictionary-like object + including `input_ids` and `attention_mask`. + device_ids : List[int], optional + List of device IDs to use for inference. Defaults to [0]. + + Returns + ------- + np.ndarray + Array containing the generated output features for each input in the batch. """ if self.qpc_session is None: @@ -406,19 +521,33 @@ def cloud_ai_100_feature_generate( def pytorch_feature_generate(self, model, inputs: Union[torch.Tensor, np.ndarray]) -> List[torch.Tensor]: """ - Generates features from a list of text prompts using a PyTorch model. + Generate features from a batch of inputs using the PyTorch model. + + This method runs the model in PyTorch (CPU/GPU) mode for feature extraction. - ``Mandatory`` Args: - :model: The transformed PyTorch model used for generating features. - :inputs (Union[torch.Tensor, np.ndarray]): inputs to run the execution. + Parameters + ---------- + model : nn.Module + The PyTorch model to use for inference. + inputs : torch.Tensor or np.ndarray + Input tensors for feature extraction. Expected to be a dictionary-like object. - Returns: - torch.Tensor: A list of output features generated by the model for each prompt. + Returns + ------- + List[torch.Tensor] + List of output features generated by the model for each input. """ return model(**inputs) class QEffVisionEncoderForTextImageToTextModel(QEFFBaseModel): + """ + QEfficient wrapper for the Vision Encoder component of a Text-to-Image-to-Text model. + + This class handles the export and compilation of the vision encoder part + of multimodal models for optimal performance on Cloud AI 100 hardware. + """ + _pytorch_transforms = [ AwqToMatmulNbitsTransform, GPTQToMatmulNbitsTransform, @@ -429,11 +558,42 @@ class QEffVisionEncoderForTextImageToTextModel(QEFFBaseModel): _onnx_transforms = [FP16ClipTransform, SplitTensorsTransform] def __init__(self, model: nn.modules, **kwargs): + """ + Initializes the vision encoder component for multimodal models. + + Parameters + ---------- + model : nn.Module + The full HuggingFace multimodal model from which the vision encoder is extracted. 
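``cloud_ai_100_feature_generate`` pads the tokenized inputs up to the sequence length the QPC was compiled for before binding them to the session. A hedged, stand-alone sketch of that padding step (the compiled length, pad value, and padding side here are illustrative, not the library's exact choices):

.. code-block:: python

    # Illustrative right-padding of input_ids/attention_mask to a compiled seq_len.
    import numpy as np

    def pad_to_compiled_seq_len(input_ids, attention_mask, compiled_seq_len, pad_token_id=0):
        batch, seq_len = input_ids.shape
        if seq_len > compiled_seq_len:
            raise ValueError("Prompt is longer than the sequence length the QPC was compiled for")
        pad = compiled_seq_len - seq_len
        input_ids = np.pad(input_ids, ((0, 0), (0, pad)), constant_values=pad_token_id)
        attention_mask = np.pad(attention_mask, ((0, 0), (0, pad)), constant_values=0)
        return input_ids, attention_mask

    ids = np.array([[101, 2023, 2003, 102]])
    mask = np.ones_like(ids)
    padded_ids, padded_mask = pad_to_compiled_seq_len(ids, mask, compiled_seq_len=32)
    print(padded_ids.shape, padded_mask.sum())   # (1, 32) 4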
+ **kwargs : + Additional keyword arguments passed to the base class constructor. + """ super().__init__(model, **kwargs) self.model = model.get_qeff_vision_encoder() self.hash_params["qeff_auto_class"] = self.__class__.__name__ def export(self, inputs, output_names, dynamic_axes, export_dir=None, offload_pt_weights=True): + """ + Exports the vision encoder component to ONNX format. + + Parameters + ---------- + inputs : Dict[str, torch.Tensor] + Example inputs for the ONNX export. + output_names : List[str] + List of output names for the ONNX graph. + dynamic_axes : Dict[str, Dict[int, str]] + Dynamic axes configuration for the ONNX graph. + export_dir : str, optional + Directory path where the exported ONNX graph will be saved. Default is None. + offload_pt_weights : bool, optional + If True, PyTorch weights will be offloaded after export. Default is True. + + Returns + ------- + str + Path to the generated ONNX graph file for the vision encoder. + """ return self._export( inputs, output_names, dynamic_axes, export_dir=export_dir, offload_pt_weights=offload_pt_weights ) @@ -450,6 +610,35 @@ def compile( custom_io, **compiler_options, ) -> str: + """ + Compiles the vision encoder component to a QPC package. + + Parameters + ---------- + compile_dir : str + Directory to save the generated QPC package. + compile_only : bool + If True, only compilation occurs without running inference. + specializations : List[Dict[str, Union[int, str]]] + List of dictionaries, each specifying a compilation specialization. + convert_to_fp16 : bool + If True, converts model to FP16 precision during compilation. + mxfp6_matmul : bool + If True, uses MXFP6 compression for MatMul weights. + mdp_ts_num_devices : int + Number of devices for multi-device (tensor slicing) compilation. + aic_num_cores : int + Number of cores to use for compilation. + custom_io : Dict[str, str] + Custom I/O configurations for the compiler. + **compiler_options : + Additional compiler options passed to the underlying compilation command. + + Returns + ------- + str + Path to the compiled QPC package for the vision encoder. + """ return self._compile( compile_dir=compile_dir, compile_only=compile_only, @@ -464,6 +653,14 @@ def compile( @property def model_name(self) -> str: + """ + Get the name of the underlying vision encoder model. + + Returns + ------- + str + The model's class name, with "QEff" or "QEFF" prefix removed if present. + """ mname = self.model.__class__.__name__ if mname.startswith("QEff") or mname.startswith("QEFF"): mname = mname[4:] @@ -471,10 +668,25 @@ def model_name(self) -> str: @property def get_model_config(self) -> dict: + """ + Get the configuration dictionary of the underlying HuggingFace vision model. + + Returns + ------- + dict + The configuration dictionary. + """ return self.model.model.vision_model.config.__dict__ class QEffCausalLMForTextImageToTextModel(QEFFBaseModel): + """ + QEfficient wrapper for the Causal Language Model (decoder) component of a Text-to-Image-to-Text model. + + This class handles the export and compilation of the language decoder part + of multimodal models for optimal performance on Cloud AI 100 hardware. + """ + _pytorch_transforms = [ AwqToMatmulNbitsTransform, GPTQToMatmulNbitsTransform, @@ -486,11 +698,42 @@ class QEffCausalLMForTextImageToTextModel(QEFFBaseModel): _onnx_transforms = [FP16ClipTransform, SplitTensorsTransform] def __init__(self, model, **kwargs): + """ + Initializes the language decoder component for multimodal models. 
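The component-level ``export`` above follows the standard ``torch.onnx.export`` convention for example inputs, output names, and dynamic axes. A sketch of what such arguments could look like for a decoder-style component; the tensor names, shapes, and axis labels are illustrative, since the real values come from the model's ``get_dummy_inputs`` / ``get_output_names`` / ``get_onnx_dynamic_axes`` helpers.

.. code-block:: python

    # Illustrative export arguments only; not the exact tensors the library builds.
    import torch

    example_inputs = {
        "input_ids": torch.zeros(1, 32, dtype=torch.int64),
        "position_ids": torch.arange(32, dtype=torch.int64).view(1, 32),
    }
    output_names = ["logits"]
    dynamic_axes = {
        "input_ids": {0: "batch_size", 1: "seq_len"},
        "position_ids": {0: "batch_size", 1: "seq_len"},
    }
    # lang_component.export(example_inputs, output_names, dynamic_axes, export_dir="onnx/")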
+ + Parameters + ---------- + model : nn.Module + The full HuggingFace multimodal model from which the language decoder is extracted. + **kwargs : + Additional keyword arguments passed to the base class constructor. + """ super().__init__(model, **kwargs) self.model = model.get_qeff_language_decoder() self.hash_params["qeff_auto_class"] = self.__class__.__name__ def export(self, inputs, output_names, dynamic_axes, export_dir=None, offload_pt_weights=True): + """ + Exports the language decoder component to ONNX format. + + Parameters + ---------- + inputs : Dict[str, torch.Tensor] + Example inputs for the ONNX export. + output_names : List[str] + List of output names for the ONNX graph. + dynamic_axes : Dict[str, Dict[int, str]] + Dynamic axes configuration for the ONNX graph. + export_dir : str, optional + Directory path where the exported ONNX graph will be saved. Default is None. + offload_pt_weights : bool, optional + If True, PyTorch weights will be offloaded after export. Default is True. + + Returns + ------- + str + Path to the generated ONNX graph file for the language decoder. + """ return self._export( inputs, output_names, dynamic_axes, export_dir=export_dir, offload_pt_weights=offload_pt_weights ) @@ -507,6 +750,35 @@ def compile( custom_io, **compiler_options, ) -> str: + """ + Compiles the language decoder component to a QPC package. + + Parameters + ---------- + compile_dir : str + Directory to save the generated QPC package. + compile_only : bool + If True, only compilation occurs without running inference. + specializations : List[Dict[str, Union[int, str]]] + List of dictionaries, each specifying a compilation specialization. + convert_to_fp16 : bool + If True, converts model to FP16 precision during compilation. + mxfp6_matmul : bool + If True, uses MXFP6 compression for MatMul weights. + mdp_ts_num_devices : int + Number of devices for multi-device (tensor slicing) compilation. + aic_num_cores : int + Number of cores to use for compilation. + custom_io : Dict[str, str] + Custom I/O configurations for the compiler. + **compiler_options : + Additional compiler options passed to the underlying compilation command. + + Returns + ------- + str + Path to the compiled QPC package for the language decoder. + """ return self._compile( compile_dir=compile_dir, compile_only=compile_only, @@ -521,6 +793,14 @@ def compile( @property def model_name(self) -> str: + """ + Get the name of the underlying language decoder model. + + Returns + ------- + str + The model's class name, with "QEff" or "QEFF" prefix removed if present. + """ mname = self.model.__class__.__name__ if mname.startswith("QEff") or mname.startswith("QEFF"): mname = mname[4:] @@ -528,10 +808,26 @@ def model_name(self) -> str: @property def get_model_config(self) -> dict: + """ + Get the configuration dictionary of the underlying HuggingFace language model. + + Returns + ------- + dict + The configuration dictionary. + """ return self.model.language_model.config.__dict__ class _QEffAutoModelForImageTextToTextDualQPC: + """ + Internal class handling multimodal image-text-to-text models using a dual QPC approach. + + In this approach, the vision encoder and language model decoder are compiled + into separate QPC packages. The vision encoder's KV cache might be offloaded + to CPU or managed differently from the language model's KV cache. 
+ """ + _hf_auto_class = AutoModelForImageTextToText def __init__( @@ -539,6 +835,21 @@ def __init__( model: nn.Module, **kwargs, ): + """ + Initializes the dual QPC multimodal model wrapper. + + Parameters + ---------- + model : nn.Module + The full HuggingFace multimodal model. + **kwargs : + Additional keyword arguments. `full_batch_size` is not supported here. + + Raises + ------ + NotImplementedError + If `full_batch_size` is provided. + """ if kwargs.pop("full_batch_size", None): raise NotImplementedError("Continuous batching is not supported for image-text-to-text models yet.") self.model = model @@ -549,6 +860,14 @@ def __init__( @property def model_name(self) -> str: + """ + Get the name of the underlying multimodal model. + + Returns + ------- + str + The model's class name, with "QEff" or "QEFF" prefix removed if present. + """ mname = self.model.__class__.__name__ if mname.startswith("QEff") or mname.startswith("QEFF"): mname = mname[4:] @@ -556,6 +875,23 @@ def model_name(self) -> str: @classmethod def from_pretrained(cls, pretrained_model_name_or_path: str, **kwargs): + """ + Load a QEfficient multimodal model for dual QPC from a pretrained HuggingFace model or local path. + + Parameters + ---------- + pretrained_model_name_or_path : str + Model card name from HuggingFace or local path to model directory. + **kwargs : + Additional keyword arguments passed directly to `cls._hf_auto_class.from_pretrained`. + Note: `attn_implementation` and `low_cpu_mem_usage` are automatically + set to "eager" and False respectively to ensure compatibility. + + Returns + ------- + _QEffAutoModelForImageTextToTextDualQPC + An instance initialized with the pretrained weights. + """ if kwargs.get("attn_implementation", None) not in {None, "eager"}: logger.warning('Updating attn_implementation="eager"') @@ -568,10 +904,27 @@ def from_pretrained(cls, pretrained_model_name_or_path: str, **kwargs): @property def onnx_path(self): + """ + Get the ONNX paths for the vision and language model components. + + Returns + ------- + List[str] + A list containing the ONNX paths of the vision model and the language model. + """ return [self.vision_model.onnx_path, self.lang_model.onnx_path] @property def qpc_path(self): + """ + Get the QPC paths for the vision and language model components. + + Returns + ------- + Union[List[str], str, None] + A list containing both QPC paths if both are compiled, or just one if only one is, + or None if neither is compiled. + """ if self.vision_model.qpc_path and self.lang_model.qpc_path: return [self.vision_model.qpc_path, self.lang_model.qpc_path] elif self.vision_model.qpc_path: @@ -584,6 +937,24 @@ def export( export_dir: Optional[str] = None, **kwargs, ) -> str: + """ + Exports both the vision encoder and language decoder components to ONNX format. + + This method exports the vision component (optionally without offloading PyTorch weights) + and the language component (with offloading PyTorch weights). + + Parameters + ---------- + export_dir : str, optional + Directory path where the exported ONNX graphs will be saved. Default is None. + **kwargs : + Additional keyword arguments. + + Returns + ------- + List[str] + A list containing the paths to the generated ONNX graph files for both components. 
+ """ inputs = self.model.get_dummy_inputs(kv_offload=True) dynamic_axes = self.model.get_onnx_dynamic_axes(kv_offload=True) output_names = self.model.get_output_names(kv_offload=True) @@ -622,6 +993,58 @@ def compile( skip_lang: Optional[bool] = False, **compiler_options, ) -> str: + """ + Compiles both the vision encoder and language decoder components into QPC packages. + + Parameters + ---------- + img_size : int, optional + The image size to compile the vision model for. Default is None. + vision_onnx_path : str, optional + Path to a pre-exported ONNX file for the vision encoder. If None, it will be exported. + lang_onnx_path : str, optional + Path to a pre-exported ONNX file for the language decoder. If None, it will be exported. + compile_dir : str, optional + Directory to save the generated QPC packages. + prefill_seq_len : int, optional + Length of the prefill prompt for the language model. Default is None. + ctx_len : int, optional + Maximum context length for the language model. Default is None. + batch_size : int, optional + Batch size. Default is 1. + full_batch_size : int, optional + Not supported for this model; must be None. + kv_cache_batch_size : int, optional + Not supported for this model; must be None. + num_devices : int, optional + Number of devices to compile for. Default is 1. + num_cores : int, optional + Number of cores to use for compilation. + mxfp6_matmul : bool, optional + Use MXFP6 compression for weights in the language model. Default is False. + mxint8_kv_cache : bool, optional + Use MXINT8 compression for KV cache. Default is False. + num_speculative_tokens : int, optional + Not supported for this model; must be None. + skip_vision : bool, optional + If True, skips compilation of the vision encoder. Default is False. + skip_lang : bool, optional + If True, skips compilation of the language decoder. Default is False. + **compiler_options : dict + Additional compiler options for QAIC or QNN compilers. + + Returns + ------- + Union[List[str], str, None] + A list of paths to the compiled QPC packages, or a single path if only + one component is compiled, or None if neither is compiled. + + Raises + ------ + ValueError + If `full_batch_size`, `kv_cache_batch_size`, or `num_speculative_tokens` are not None. + If both `skip_lang` and `skip_vision` are True. + """ if any(param is not None for param in [full_batch_size, kv_cache_batch_size, num_speculative_tokens]): raise ValueError( f"Expected 'full_batch_size', 'kv_cache_batch_size', 'num_speculative_tokens' to be None but got: " @@ -713,14 +1136,35 @@ def generate( generation_len: Optional[int] = None, ) -> Union[torch.Tensor, np.ndarray]: """ - This method generates output by executing PyTorch runtime or the compiled ``qpc`` on ``Cloud AI 100`` Hardware cards. - ``Mandatory`` Args: - :inputs (Union[torch.Tensor, np.ndarray]): inputs to run the execution. - ``optional`` Args: - :device_id (List[int]): Ids of devices for running the qpc pass as [0] in case of normal model / [0, 1, 2, 3] in case of tensor slicing model - :runtime_ai100 (bool, optional): ``AI_100`` and ``PyTorch`` runtime is supported as of now. Defaults to ``True`` for ``AI_100`` runtime. - Returns: - :dict: Output from the ``AI_100`` or ``PyTorch`` runtime. + Generates output by executing the compiled QPC(s) on Cloud AI 100 Hardware cards. + + This method coordinates inference between the vision encoder and language model decoder. 
+ + Parameters + ---------- + inputs : Dict[str, Union[torch.Tensor, np.ndarray]] + Inputs to run the execution, typically includes `pixel_values`, `input_ids`, + `attention_mask`, etc. + streamer : TextStreamer, optional + A streamer object to display generated tokens in real-time. Default is None. + device_ids : List[int], optional + IDs of devices for running the QPC. E.g., `[0]` for a single device or + `[0, 1, 2, 3]` for tensor slicing. Defaults to `[0]` if not specified. + runtime_ai100 : bool, optional + If True, uses the AI 100 runtime. PyTorch runtime is not supported for this model. + Default is True. + generation_len : int, optional + The maximum number of tokens to generate. If None, it's inferred from `ctx_len`. + + Returns + ------- + CloudAI100ExecInfoNew or np.ndarray + Output from the AI 100 runtime, including generated IDs and performance metrics. + + Raises + ------ + NotImplementedError + If `runtime_ai100` is False. """ if not runtime_ai100: raise NotImplementedError("PyTorch execution is not supported yet for this model!") @@ -736,6 +1180,35 @@ def kv_offload_generate( device_ids: List[int] = None, generation_len: int = None, ): + """ + Performs generation for multimodal models with KV offloading to CPU. + + This method orchestrates the inference by running the vision encoder (if compiled) + and then iteratively running the language decoder, managing KV cache states. + + Parameters + ---------- + inputs : Dict[str, Union[torch.Tensor, np.ndarray]] + Input tensors for the multimodal model. + streamer : TextStreamer, optional + A streamer object to display generated tokens in real-time. Default is None. + device_ids : List[int], optional + IDs of devices for running the QPC. Defaults to `[0]` if not specified. + generation_len : int, optional + The maximum number of tokens to generate. If None, it's inferred from `ctx_len`. + + Returns + ------- + CloudAI100ExecInfoNew + Execution information including generated IDs and performance metrics. + + Raises + ------ + TypeError + If the language model QPC is not compiled. + AssertionError + If `generation_len` is not greater than zero. + """ if not self.lang_model.qpc_path: raise TypeError("Please run compile API for language model first!") @@ -888,6 +1361,13 @@ def kv_offload_generate( class _QEFFAutoModelForImageTextToTextSingleQPC(QEFFTransformersBase, MultimodalUtilityMixin): + """ + Internal class handling multimodal image-text-to-text models using a single QPC approach. + + In this approach, the entire multimodal model (vision encoder + language model decoder) + is compiled into a single QPC package. + """ + _hf_auto_class = AutoModelForImageTextToText _pytorch_transforms = [ AwqToMatmulNbitsTransform, @@ -905,6 +1385,21 @@ def __init__( model: nn.Module, **kwargs, ): + """ + Initializes the single QPC multimodal model wrapper. + + Parameters + ---------- + model : nn.Module + The full HuggingFace multimodal model. + **kwargs : + Additional keyword arguments. `full_batch_size` is not supported here. + + Raises + ------ + NotImplementedError + If `full_batch_size` is provided. + """ if kwargs.pop("full_batch_size", None): raise NotImplementedError("Continuous batching is not supported for image-text-to-text models yet.") super().__init__(model, **kwargs) @@ -925,6 +1420,26 @@ def from_pretrained( *args, **kwargs, ): + """ + Load a QEfficient multimodal model for single QPC from a pretrained HuggingFace model or local path. 
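``kv_offload_generate`` (documented earlier in this hunk) runs the vision QPC once per request and then steps the language QPC token by token until EOS or ``generation_len``. The loop skeleton can be sketched in isolation; the toy ``fake_step`` function below merely stands in for one decoder invocation and is not part of the library.

.. code-block:: python

    # Skeleton of the iterative decode loop described above; not the library's implementation.
    def greedy_decode_sketch(next_token_fn, first_token, eos_token_id, generation_len):
        generated = [first_token]                 # first token comes from the prefill pass
        for _ in range(generation_len - 1):
            token = next_token_fn(generated[-1])  # one language-QPC decode step
            generated.append(token)
            if token == eos_token_id:
                break
        return generated

    fake_step = lambda token: (token + 3) % 7     # toy stand-in for a decode step
    print(greedy_decode_sketch(fake_step, first_token=5, eos_token_id=0, generation_len=10))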
+ + Parameters + ---------- + pretrained_model_name_or_path : str + Model card name from HuggingFace or local path to model directory. + *args : + Positional arguments passed directly to `cls._hf_auto_class.from_pretrained`. + **kwargs : + Additional keyword arguments passed directly to `cls._hf_auto_class.from_pretrained`. + Note: `attn_implementation` and `low_cpu_mem_usage` are automatically + set to "eager" and False respectively to ensure compatibility. + Also, `_attn_implementation` and `use_flash_attn` are configured for VLM models. + + Returns + ------- + _QEFFAutoModelForImageTextToTextSingleQPC + An instance initialized with the pretrained weights. + """ if kwargs.get("attn_implementation", None) not in {None, "eager"}: logger.warning('Updating attn_implementation="eager"') @@ -946,6 +1461,21 @@ def export( export_dir: Optional[str] = None, **kwargs, ) -> str: + """ + Exports the entire multimodal model to ONNX format. + + Parameters + ---------- + export_dir : str, optional + Directory path where the exported ONNX graph will be saved. Default is None. + **kwargs : + Additional keyword arguments. + + Returns + ------- + str + Path to the generated ONNX graph file. + """ inputs = self.model.get_dummy_inputs() dynamic_axes = self.model.get_onnx_dynamic_axes() output_names = self.model.get_output_names() @@ -969,6 +1499,52 @@ def compile( num_speculative_tokens: Optional[int] = None, **compiler_options, ) -> str: + """ + Compiles the exported ONNX model (single QPC) using the Cloud AI 100 Platform SDK compiler. + + This method generates a single ``qpc`` package for the entire multimodal model. + + Parameters + ---------- + onnx_path : str, optional + Path to a pre-exported ONNX model. If not provided, the model will be exported first. + img_size : int, optional + The image size to compile the vision part of the model for. Default is None. + compile_dir : str, optional + Directory to save the generated QPC package. + prefill_seq_len : int, optional + Length of the prefill prompt. Default is None. + ctx_len : int, optional + Maximum context length the compiled model can remember. Default is None. + batch_size : int, optional + Batch size. Default is 1. + full_batch_size : int, optional + Not supported for this model; must be None. + kv_cache_batch_size : int, optional + Not supported for this model; must be None. + num_devices : int, optional + Number of devices to compile for. Default is 1. + num_cores : int, optional + Number of cores to use for compilation. + mxfp6_matmul : bool, optional + Use MXFP6 compression for weights. Default is False. + mxint8_kv_cache : bool, optional + Use MXINT8 compression for KV cache. Default is False. + num_speculative_tokens : int, optional + Not supported for this model; must be None. + **compiler_options : dict + Additional compiler options for QAIC or QNN compilers. + + Returns + ------- + str + Path to the compiled QPC package. + + Raises + ------ + ValueError + If `full_batch_size`, `kv_cache_batch_size`, or `num_speculative_tokens` are not None. + """ if any(param is not None for param in [full_batch_size, kv_cache_batch_size, num_speculative_tokens]): raise ValueError( f"Expected 'full_batch_size', 'kv_cache_batch_size', 'num_speculative_tokens' to be None but got: " @@ -1018,6 +1594,14 @@ def compile( return self.qpc_path def get_onnx_dynamic_axes(self): + """ + Retrieves the dynamic axes configuration for ONNX export for this model. + + Returns + ------- + Dict[str, Dict[int, str]] + A dictionary specifying the dynamic axes for inputs. 
+ """ return self.model.get_onnx_dynamic_axes() def generate( @@ -1029,14 +1613,33 @@ def generate( generation_len: Optional[int] = None, ) -> Union[torch.Tensor, np.ndarray]: """ - This method generates output by executing PyTorch runtime or the compiled ``qpc`` on ``Cloud AI 100`` Hardware cards. - ``Mandatory`` Args: - :inputs (Union[torch.Tensor, np.ndarray]): inputs to run the execution. - ``optional`` Args: - :device_id (List[int]): Ids of devices for running the qpc pass as [0] in case of normal model / [0, 1, 2, 3] in case of tensor slicing model - :runtime_ai100 (bool, optional): ``AI_100`` and ``PyTorch`` runtime is supported as of now. Defaults to ``True`` for ``AI_100`` runtime. - Returns: - :dict: Output from the ``AI_100`` or ``PyTorch`` runtime. + Generates output by executing the compiled single QPC on Cloud AI 100 Hardware cards. + + Parameters + ---------- + inputs : Dict[str, Union[torch.Tensor, np.ndarray]] + Inputs to run the execution, typically includes `pixel_values`, `input_ids`, + `attention_mask`, etc. + streamer : TextStreamer, optional + A streamer object to display generated tokens in real-time. Default is None. + device_ids : List[int], optional + IDs of devices for running the QPC. E.g., `[0]` for a single device or + `[0, 1, 2, 3]` for tensor slicing. Defaults to `[0]` if not specified. + runtime_ai100 : bool, optional + If True, uses the AI 100 runtime. PyTorch runtime is not supported for this model. + Default is True. + generation_len : int, optional + The maximum number of tokens to generate. If None, it's inferred from `ctx_len`. + + Returns + ------- + CloudAI100ExecInfoNew or np.ndarray + Output from the AI 100 runtime, including generated IDs and performance metrics. + + Raises + ------ + NotImplementedError + If `runtime_ai100` is False. """ if not runtime_ai100: raise NotImplementedError("PyTorch execution is not supported yet for this model!") @@ -1053,6 +1656,32 @@ def cloud_ai_100_generate( generation_len: int = None, streamer: Optional[TextStreamer] = None, ) -> np.ndarray: + """ + Performs generation for multimodal models using a single QPC on Cloud AI 100 hardware. + + Parameters + ---------- + inputs : Dict[str, Union[torch.Tensor, np.ndarray]] + Input tensors for the multimodal model. + device_ids : List[int] + IDs of devices for running the QPC. + enable_debug_logs : bool, optional + If True, enables debug logging for the QAIC inference session. Default is False. + generation_len : int, optional + The maximum number of tokens to generate. If None, it's inferred from `ctx_len`. + streamer : TextStreamer, optional + A streamer object to display generated tokens in real-time. Default is None. + + Returns + ------- + CloudAI100ExecInfoNew + Execution information including generated IDs and performance metrics. + + Raises + ------ + AssertionError + If `generation_len` is not greater than zero. + """ inputs = self.auto_correct_inputs(inputs) qpc_session = QAICInferenceSession( self.qpc_path, device_ids, enable_debug_logs=enable_debug_logs, activate=False @@ -1171,6 +1800,14 @@ def cloud_ai_100_generate( @property def model_name(self) -> str: + """ + Get the name of the underlying multimodal model. + + Returns + ------- + str + The model's class name, with "QEff" or "QEFF" prefix removed if present. 
+ """ mname = self.model.__class__.__name__ if mname.startswith("QEff") or mname.startswith("QEFF"): mname = mname[4:] @@ -1178,41 +1815,45 @@ def model_name(self) -> str: @property def get_model_config(self) -> dict: + """ + Get the configuration dictionary of the underlying HuggingFace model. + + Returns + ------- + dict + The configuration dictionary. + """ return self.model.config.__dict__ class QEFFAutoModelForImageTextToText: """ - The QEFFAutoModelForImageTextToText class is used to work with multimodal language models from the HuggingFace hub. - While you can initialize the class directly, it's best to use the ``from_pretrained`` method for this purpose. This class supports both single and dual QPC approaches. - Attributes: - _hf_auto_class (class): The Hugging Face AutoModel class for ImageTextToText models. + QEfficient class for multimodal (image-text-to-text) models from the HuggingFace hub. - ``Mandatory`` Args: - :pretrained_model_name_or_path (str): Model card name from HuggingFace or local path to model directory. - - ``Optional`` Args: - :kv_offload (bool): Flag to toggle between single and dual QPC approaches. If set to False, the Single QPC approach will be used; otherwise, the dual QPC approach will be applied. Defaults to True. + This class supports both single and dual QPC (Quantized Package Compilation) approaches for efficient deployment on Cloud AI 100 hardware. + It is recommended to use the ``from_pretrained`` method for initialization. + Example + ------- .. code-block:: python import requests from PIL import Image from transformers import AutoProcessor, TextStreamer - from QEfficient import QEFFAutoModelForImageTextToText - # Add HuggingFace Token to access the model - HF_TOKEN = "" + HF_TOKEN = "" # Your HuggingFace token if needed model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct" query = "Describe this image." image_url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg" - ## STEP - 1 Load the Processor and Model, and kv_offload=True/False for dual and single qpc + # STEP 1: Load processor and model processor = AutoProcessor.from_pretrained(model_name, token=HF_TOKEN) - model = QEFFAutoModelForImageTextToText.from_pretrained(model_name, token=HF_TOKEN, attn_implementation="eager", kv_offload=False) + model = QEFFAutoModelForImageTextToText.from_pretrained( + model_name, token=HF_TOKEN, attn_implementation="eager", kv_offload=False # kv_offload=False for single QPC + ) - ## STEP - 2 Export & Compile the Model + # STEP 2: Export & Compile model.compile( prefill_seq_len=32, ctx_len=512, @@ -1222,7 +1863,7 @@ class QEFFAutoModelForImageTextToText: mxfp6_matmul=False, ) - ## STEP - 3 Load and process the inputs for Inference + # STEP 3: Prepare inputs image = Image.open(requests.get(image_url, stream=True).raw) messages = [ { @@ -1239,19 +1880,37 @@ class QEFFAutoModelForImageTextToText: images=image, return_tensors="pt", add_special_tokens=False, - padding="max_length", + padding="max_length", # Consider padding strategy if max_length is crucial max_length=32, ) - ## STEP - 4 Run Inference on the compiled model + # STEP 4: Run inference streamer = TextStreamer(processor.tokenizer) model.generate(inputs=inputs, streamer=streamer, generation_len=512) - """ _hf_auto_class = AutoModelForImageTextToText def __new__(self, model: nn.Module, kv_offload: Optional[bool] = True, **kwargs): + """ + Instantiate the appropriate internal class for single or dual QPC mode. 
+ + Parameters + ---------- + model : nn.Module + The loaded HuggingFace multimodal model. + kv_offload : bool, optional + If True, uses the dual QPC approach (vision encoder KV offloaded). + If False, uses the single QPC approach (entire model in one QPC). + Default is True. + **kwargs : + Additional keyword arguments passed to the constructor of the selected internal class. + + Returns + ------- + Union[_QEffAutoModelForImageTextToTextDualQPC, _QEFFAutoModelForImageTextToTextSingleQPC] + The wrapped model instance, configured for either dual or single QPC. + """ if kv_offload: return _QEffAutoModelForImageTextToTextDualQPC(model, **kwargs) else: @@ -1260,14 +1919,32 @@ def __new__(self, model: nn.Module, kv_offload: Optional[bool] = True, **kwargs) @classmethod @with_replaced_quantizers def from_pretrained(cls, pretrained_model_name_or_path: str, kv_offload: Optional[bool] = None, **kwargs): - """Used to load models supported by transformers.AutoModelForImageTextToText for Cloud AI 100. - - Args: - pretrained_model_name_or_path (str): Path or model card name on HuggingFace - kv_offload (Optional[bool], optional): Should the KV of vision encoder be offloaded to CPU and use Two QPC. Defaults to None. - - Returns: - _type_: _description_ + """ + Load a QEfficient image-text-to-text model from a pretrained HuggingFace model or local path. + + Parameters + ---------- + pretrained_model_name_or_path : str + Model card name from HuggingFace or local path to model directory. + kv_offload : bool, optional + If True, uses the dual QPC approach (vision encoder KV offloaded). + If False, uses the single QPC approach (entire model in one QPC). + If None, the default behavior of the internal classes is used (typically dual QPC). + **kwargs : + Additional arguments passed to HuggingFace's ``from_pretrained``. + Note: `attn_implementation` and `low_cpu_mem_usage` are automatically + set to "eager" and False respectively to ensure compatibility. + `continuous_batching` is not supported for image-text-to-text models. + + Returns + ------- + QEFFAutoModelForImageTextToText + An instance initialized with the pretrained weights, wrapped for QEfficient. + + Raises + ------ + NotImplementedError + If `continuous_batching` is provided as True. """ # TODO: add a check to see if kv_offload is allowed for given model by loading the config and checking architecture or type of config here. if kwargs.get("attn_implementation", None) not in {None, "eager"}: @@ -1289,34 +1966,22 @@ def from_pretrained(cls, pretrained_model_name_or_path: str, kv_offload: Optiona class QEFFAutoModelForCausalLM(QEFFBaseModel): """ - The QEFF class is designed for manipulating any causal language model from the HuggingFace hub. - Although it is possible to initialize the class directly, we highly recommend using the ``from_pretrained`` method for initialization. - - ``Mandatory`` Args: - :model (nn.Module): PyTorch model - :continuous_batching (bool): Weather this model will be used for continuous batching in future. If this is not set True here, the model can not be exported/compiled for continuous batching later. - ``Optional`` Args: - :qaic_config (dict): QAIC config dictionary with the following supported keys: - :speculative_model_type (str): To specify Speculative Decoding Target Language Models. - :include_sampler (bool): Enable/Disable sampling of next tokens. - :return_pdfs (bool): Return probability distributions along with sampled - next tokens. For Speculative Decoding Target Language Model, - `return_pdfs`=True always. 
Otherwise, `return_pdfs`=True for Speculative - Decoding Draft Language Model and `return_pdfs`=False for regular model. - :max_top_k_ids (int): Specify the maximum number of top K tokens - (<= vocab size) to consider during sampling. The values provided in - `top_ks` tensor must be less than this maximum limit. + QEfficient class for Causal Language Models from the HuggingFace hub (e.g., GPT-2, Llama). + + This class provides a unified interface for loading, exporting, compiling, and generating + text with causal language models on Cloud AI 100 hardware. It supports features like + continuous batching, speculative decoding (TLM), and on-device sampling. + Example + ------- .. code-block:: python from QEfficient import QEFFAutoModelForCausalLM from transformers import AutoTokenizer - model_name = "gpt2" - model = QEFFAutoModelForCausalLM.from_pretrained(model_name, num_hidden_layers=2) - model.compile(prefill_seq_len=128, ctx_len=256, num_cores=16, num_devices=1) - - tokenizer = AutoTokenizer.from_pretrained(model_name) + model = QEFFAutoModelForCausalLM.from_pretrained("gpt2") + model.compile(num_cores=16) + tokenizer = AutoTokenizer.from_pretrained("gpt2") model.generate(prompts=["Hi there!!"], tokenizer=tokenizer) """ @@ -1339,6 +2004,31 @@ def __init__( qaic_config: Optional[dict] = None, **kwargs, ): + """ + Initializes a QEFFAutoModelForCausalLM instance. + + Parameters + ---------- + model : nn.Module + The underlying HuggingFace PyTorch Causal Language Model. + continuous_batching : bool, optional + If True, enables continuous batching mode for future compilation and execution. + This setting must be consistent across `from_pretrained` and `compile` calls. Default is False. + qaic_config : dict, optional + A dictionary for QAIC-specific configurations. Supported keys include: + - **speculative_model_type** (str): Specifies the type of Speculative Decoding model (e.g., "target"). + - **include_sampler** (bool): If True, enables on-device sampling of next tokens. + - **return_pdfs** (bool): If True, returns probability distributions along with sampled tokens. + For Speculative Decoding Target Language Models, this is always True. + - **max_top_k_ids** (int): Maximum number of top K tokens (<= vocab size) to consider during sampling. + **kwargs : + Additional keyword arguments passed to the base class constructor. + + Raises + ------ + TypeError + If the provided `model` is not a CausalLM or LMHeadModel type. + """ model_class_name = model.__class__.__name__ if not (model_class_name.endswith("ForCausalLM") or model_class_name.endswith("LMHeadModel")): raise TypeError(f"Required pytorch module for CausalLM or LMHeadModel, got {model_class_name}") @@ -1376,6 +2066,14 @@ def __init__( @property def model_name(self) -> str: + """ + Get the name of the underlying Causal Language Model. + + Returns + ------- + str + The model's class name, with "QEff" or "QEFF" prefix removed if present. + """ mname = self.model.__class__.__name__ if mname.startswith("QEff") or mname.startswith("QEFF"): mname = mname[4:] @@ -1395,40 +2093,44 @@ def from_pretrained( **kwargs, ): """ - This method serves as the easiest entry point into using QEfficient. The interface is designed to be similar to transformers.AutoModelForCausalLM. - Once the model is initialized, you can use other methods such as export, compile, and generate on the same object. 
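The ``qaic_config`` keys listed above can be combined into a plain dictionary at load time; the values chosen here are illustrative.

.. code-block:: python

    from QEfficient import QEFFAutoModelForCausalLM

    # Keys follow the qaic_config options documented above; values are examples only.
    qaic_config = {
        "include_sampler": True,   # sample next tokens on device
        "return_pdfs": False,      # regular (non speculative-decoding) model
        "max_top_k_ids": 512,      # upper bound for values passed via the top_ks tensor
    }
    model = QEFFAutoModelForCausalLM.from_pretrained("gpt2", qaic_config=qaic_config)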
- - This API can also be used as exception for VLM model since transformers support loading InternChatVL models via AutoModel API we support it via AutoModelForCausalLM API - Args: - :pretrained_name_or_path (str): Model card name from HuggingFace or local path to model directory. - :continuous_batching (bool): Whether this model will be used for continuous batching in future. If this is not set True here, the model can not be exported/compiled for continuous batching later. - ``Optional`` Args: - :qaic_config (dict): QAIC config dictionary with the following supported keys: - :speculative_model_type (str): To specify Speculative Decoding Target Language Models. - :include_sampler (bool): Enable/Disable sampling of next tokens. - :return_pdfs (bool): Return probability distributions along with sampled - next tokens. For Speculative Decoding Target Language Model, - `return_pdfs`=True always. Otherwise, `return_pdfs`=True for Speculative - Decoding Draft Language Model and `return_pdfs`=False for regular model. - :max_top_k_ids (int): Specify the maximum number of top K tokens - (<= vocab size) to consider during sampling. The values provided in - `top_ks` tensor must be less than this maximum limit. - - .. code-block:: python - - from QEfficient import QEFFAutoModelForCausalLM - from transformers import AutoTokenizer - - # Initialize the model using from_pretrained similar to transformers.AutoModelForCausalLM - model_name = "gpt2" - model = QEFFAutoModelForCausalLM.from_pretrained(model_name) - - # Now you can directly compile the model for Cloud AI 100 - model.compile(num_cores=16) # Considering you have a Cloud AI 100 Standard SKU - - # You can now execute the model - tokenizer = AutoTokenizer.from_pretrained(model_name) - model.generate(prompts=["Hi there!!"], tokenizer=tokenizer) + Load a QEfficient Causal Language Model from a pretrained HuggingFace model or local path. + + This is the recommended way to initialize a QEfficient Causal Language Model. + The interface is similar to ``transformers.AutoModelForCausalLM.from_pretrained``. + Once initialized, you can use methods such as ``export``, ``compile``, and ``generate``. + + Parameters + ---------- + pretrained_model_name_or_path : str + Model card name from HuggingFace or local path to model directory. + continuous_batching : bool, optional + Whether this model will be used for continuous batching in the future. + If not set to True here, the model cannot be exported/compiled for + continuous batching later. Default is False. + qaic_config : dict, optional + QAIC config dictionary. Supported keys include: + + - **speculative_model_type** (str): Specify Speculative Decoding Target Language Models. + - **include_sampler** (bool): Enable/Disable sampling of next tokens. + - **return_pdfs** (bool): Return probability distributions along with sampled next tokens. + For Speculative Decoding Target Language Model, ``return_pdfs=True`` always. + Otherwise, ``return_pdfs=True`` for Speculative Decoding Draft Language Model + and ``return_pdfs=False`` for regular model. + - **max_top_k_ids** (int): Maximum number of top K tokens (<= vocab size) to consider during sampling. + The values provided in ``top_ks`` tensor must be less than this maximum limit. + + *args : + Positional arguments passed directly to `cls._hf_auto_class.from_pretrained`. + **kwargs : + Additional keyword arguments passed directly to `cls._hf_auto_class.from_pretrained`. 
+ Note: `attn_implementation` and `low_cpu_mem_usage` are automatically + set to "eager" and False respectively to ensure compatibility. + `kv_offload` is explicitly handled. + + Returns + ------- + QEFFAutoModelForCausalLM + An instance initialized with the pretrained weights. """ if kwargs.pop("full_batch_size", None): continuous_batching = True @@ -1465,17 +2167,34 @@ def from_pretrained( @property def get_model_config(self) -> dict: + """ + Get the model configuration as a dictionary. + + Returns + ------- + dict + The configuration dictionary of the underlying HuggingFace model. + """ return self.model.config.__dict__ def export(self, export_dir: Optional[str] = None) -> str: """ - Exports the model to ``ONNX`` format using ``torch.onnx.export``. - - ``Optional`` Args: - :export_dir (str, optional): The directory path to store ONNX-graph. - - Returns: - :str: Path of the generated ``ONNX`` graph. + Export the model to ONNX format using ``torch.onnx.export``. + + This method prepares example inputs and dynamic axes based on the model configuration, + then exports the model to an ONNX graph suitable for compilation and deployment + on Cloud AI 100 hardware. It handles KV cache inputs/outputs and sampler-related inputs. + + Parameters + ---------- + export_dir : str, optional + Directory path where the exported ONNX graph will be saved. + If not provided, the default export directory is used. + + Returns + ------- + str + Path to the generated ONNX graph file. """ bs: int = constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE seq_len: int = constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN @@ -1561,8 +2280,23 @@ def get_sampling_inputs_and_outputs( dynamic_axes: Dict[str, Dict[int, str]], ): """ - Update the example inputs and outputs with respect to the On Device Sampler - for the ONNX export. + Updates the example inputs, output names, and dynamic axes to include + parameters relevant for on-device sampling during ONNX export. + + Parameters + ---------- + example_inputs : Dict[str, torch.Tensor] + Current dictionary of example inputs. + output_names : List[str] + Current list of output names. + dynamic_axes : Dict[str, Dict[int, str]] + Current dictionary of dynamic axes configurations. + + Returns + ------- + Tuple[Dict[str, torch.Tensor], List[str], Dict[str, Dict[int, str]]] + Updated example inputs, output names, and dynamic axes including + sampling-related parameters. """ bs: int = constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE fbs: int = constants.ONNX_EXPORT_EXAMPLE_FBS @@ -1626,6 +2360,27 @@ def build_prefill_specialization( kv_cache_batch_size: Optional[int] = None, full_batch_size: Optional[int] = None, ): + """ + Builds a dictionary representing a compilation specialization for the prefill phase. + + Parameters + ---------- + prefill_seq_len : int, optional + Length of the prefill prompt. Default is 32. + ctx_len : int, optional + Maximum context length the compiled model can remember. Default is 128. + batch_size : int, optional + Batch size for the prefill. Default is 1. + kv_cache_batch_size : int, optional + Batch size for KV cache. If not provided, it defaults based on `full_batch_size` or `batch_size`. + full_batch_size : int, optional + Continuous batching batch size. Used if `continuous_batching` is enabled. Default is None. + + Returns + ------- + Dict[str, Union[int, str]] + A dictionary defining the prefill specialization. 
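The specialization dictionaries built by these helpers drive the compiler: one entry for the prefill phase (``seq_len`` equal to the prefill prompt length) and typically one for decode (``seq_len`` of 1). Only the ``batch_size`` and ``seq_len`` keys are visible in this hunk; the ``ctx_len`` key below is an assumption used for illustration.

.. code-block:: python

    # Illustrative prefill/decode specialization pair; key names beyond batch_size/seq_len are assumed.
    prefill_spec = {"batch_size": 1, "seq_len": 32, "ctx_len": 128}
    decode_spec = {"batch_size": 1, "seq_len": 1, "ctx_len": 128}
    specializations = [prefill_spec, decode_spec]
    print(specializations)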
+ """ spec = { "batch_size": 1 if self.continuous_batching else batch_size, "seq_len": prefill_seq_len, @@ -1649,6 +2404,30 @@ def build_decode_specialization( full_batch_size: Optional[int] = None, num_speculative_tokens: Optional[int] = None, ): + """ + Builds a dictionary representing a compilation specialization for the decode phase. + + Parameters + ---------- + prefill_seq_len : int, optional + Length of the prefill prompt. Used to avoid duplicate specializations. Default is 32. + ctx_len : int, optional + Maximum context length the compiled model can remember. Default is 128. + batch_size : int, optional + Batch size for the decode phase. Default is 1. + kv_cache_batch_size : int, optional + Batch size for KV cache. If not provided, it defaults based on `full_batch_size` or `batch_size`. + full_batch_size : int, optional + Continuous batching batch size. Used if `continuous_batching` is enabled. Default is None. + num_speculative_tokens : int, optional + Number of speculative tokens for Speculative Decoding Target Language Model. Default is None. + + Returns + ------- + Optional[Dict[str, Union[int, str]]] + A dictionary defining the decode specialization, or None if it would be a duplicate + of the prefill specialization (e.g., if prefill_seq_len is 1 and not continuous batching). + """ if prefill_seq_len == 1 and not self.continuous_batching: return None # Avoid duplication with prefill spec = { @@ -1683,37 +2462,63 @@ def compile( **compiler_options, ) -> str: """ - This method compiles the exported ``ONNX`` model using the Cloud AI 100 Platform SDK compiler binary found at ``/opt/qti-aic/exec/qaic-exec`` and generates a ``qpc`` package. - If the model has not been exported yet, this method will handle the export process. - You can pass any other arguments that the `qaic-exec` takes as extra kwargs. - - ``Optional`` Args: - :onnx_path (str, optional): Path to pre-exported onnx model. - :compile_dir (str, optional): Path for saving the qpc generated. - :num_cores (int): Number of cores used to compile the model. - :num_devices (int): Number of devices the model needs to be compiled for. Defaults to 1. - :batch_size (int, optional): Batch size. ``Defaults to 1``. - :prefill_seq_len (int, optional): The length of the Prefill prompt should be less that ``prefill_seq_len``. ``Defaults to 32``. - :ctx_len (int, optional): Maximum ``ctx`` that the compiled model can remember. ``Defaults to 128``. - :full_batch_size (int, optional): Continuous batching batch size. - :mxfp6_matmul (bool, optional): Whether to use ``mxfp6`` compression for weights. ``Defaults to False``. - :mxint8_kv_cache (bool, optional): Whether to use ``mxint8`` compression for KV cache. ``Defaults to False``. - :num_speculative_tokens (int, optional): Number of speculative tokens to take as input for Speculative Decoding Target Language Model. - :prefill_only (bool): if ``True`` compile for prefill only and if ``False`` compile for decode only. Defaults to None, which compiles for both ``prefill and ``decode``. - :compiler_options (dict, optional): Additional compiler options. ``Defaults to None``. - For QAIC Compiler: Extra arguments for qaic-exec can be passed. - :mos (int, optional): Effort level to reduce on-chip memory. Defaults to -1, meaning no effort. ``Defaults to -1``. - :aic_enable_depth_first (bool, optional): Enables DFS with default memory size. ``Defaults to False``. - :allow_mxint8_mdp_io (bool, optional): Allows MXINT8 compression of MDP IO traffic. 
``Defaults to False.`` - Params are converted to flags as below: - - aic_num_cores=16 -> -aic-num-cores=16 - - convert_to_fp16=True -> -convert-to-fp16 - For QNN Compiler: Following arguments can be passed. - :enable_qnn (bool): Enables QNN Compilation. - :qnn_config (str): Path of QNN Config parameters file. Any extra parameters for QNN compilation can be passed via this file. - - Returns: - :str: Path of the compiled ``qpc`` package. + Compile the exported ONNX model using the Cloud AI 100 Platform SDK compiler. + + This method generates a ``qpc`` package. If the model has not been exported yet, + this method will handle the export process. Additional arguments for the `qaic-exec` + compiler can be passed as keyword arguments. + + Parameters + ---------- + onnx_path : str, optional + Path to a pre-exported ONNX model. If not provided, the model will be exported first. + compile_dir : str, optional + Directory to save the generated QPC package. If not provided, a default directory is used. + prefill_seq_len : int, optional + Length of the prefill prompt. Default is 32. + ctx_len : int, optional + Maximum context length the compiled model can remember. Default is 128. + batch_size : int, optional + Batch size. Default is 1. + full_batch_size : int, optional + Continuous batching batch size. Required if `continuous_batching=True` was + set during `from_pretrained`. + kv_cache_batch_size : int, optional + Batch size for KV cache. If not provided, it defaults to `full_batch_size` (if + continuous batching) or `batch_size`. + num_devices : int, optional + Number of devices to compile for. Default is 1. + num_cores : int, optional + Number of cores to use for compilation. + mxfp6_matmul : bool, optional + Use MXFP6 compression for weights. Default is False. + mxint8_kv_cache : bool, optional + Use MXINT8 compression for KV cache. Default is False. + num_speculative_tokens : int, optional + Number of speculative tokens for Speculative Decoding Target Language Model. + Required if the model is configured as a Target Language Model (`is_tlm=True`). + prefill_only : bool, optional + If True, compiles only for the prefill stage. If False, compiles only for + the decode stage. If None, compiles for both stages. Default is None. + **compiler_options : dict + Additional compiler options for QAIC or QNN compilers. + + Returns + ------- + str + Path to the compiled QPC package. + + Raises + ------ + TypeError + If `prefill_only` is not a boolean. + If `full_batch_size` is None when `continuous_batching` is True. + If `num_speculative_tokens` is None when the model is a TLM. + ValueError + If KV caching is requested without continuous batching (`full_batch_size`). + If `include_sampler` is True and `num_speculative_tokens` is greater than 0. + If `num_speculative_tokens` is not an integer greater than 1. + If `prefill_seq_len` is less than `num_speculative_tokens + 1` for TLM models. """ # --- Validation --- if prefill_only is not None and not isinstance(prefill_only, bool): @@ -1803,18 +2608,36 @@ def generate( **kwargs, ): """ - This method generates output until ``eos`` or ``generation_len`` by executing the compiled ``qpc`` on ``Cloud AI 100`` Hardware cards. - This is a sequential execution based on the ``batch_size`` of the compiled model and the number of prompts passed. - If the number of prompts cannot be divided by the ``batch_size``, the last unfulfilled batch will be dropped. 
- - ``Mandatory`` Args: - :tokenizer (Union[PreTrainedTokenizerFast, PreTrainedTokenizer]): Pass tokenizer of the model. - :prompts (List[str]): List of prompts to run the execution. - - ``optional`` Args: - :device_id (List[int]): Ids of devices for running the qpc pass as [0] in case of normal model / [0, 1, 2, 3] in case of tensor slicing model - :runtime_ai100 (bool, optional): ``AI_100`` and ``PyTorch`` runtime is supported as of now. Defaults to ``True`` for ``AI_100`` runtime. - + Generate output by executing the compiled QPC on Cloud AI 100 hardware. + + This method runs sequential execution based on the compiled model's batch size and the number of prompts. + If the number of prompts is not divisible by the batch size, the last batch will be dropped. + + Parameters + ---------- + tokenizer : PreTrainedTokenizer or PreTrainedTokenizerFast + Tokenizer for the model. + prompts : list of str + List of prompts to generate output for. + device_id : list of int, optional + Device IDs for running the QPC. Defaults to `[0]` if not specified. + runtime_ai100 : bool, optional + Whether to use AI 100 runtime. Default is True. + **kwargs : + Additional keyword arguments. Currently supports: + - `generation_len (int, optional)`: The maximum number of tokens to generate. + + Returns + ------- + CloudAI100ExecInfoNew + Output from the AI 100 runtime, containing generated IDs and performance metrics. + + Raises + ------ + TypeError + If the QPC path is not set (i.e., `compile` was not run). + NotImplementedError + If `runtime_ai100` is False. """ if runtime_ai100: if not isinstance(self.qpc_path, Path): @@ -1832,6 +2655,29 @@ def generate( raise NotImplementedError("Only AI_100 runtime is supported right now via generate API") def check_and_get_num_speculative_tokens(self, num_speculative_tokens: Optional[int], prefill_seq_len: int): + """ + Validates and retrieves the number of speculative tokens for TLM models. + + Parameters + ---------- + num_speculative_tokens : int, optional + The number of speculative tokens provided by the user. + prefill_seq_len : int + The prefill sequence length. + + Returns + ------- + int + The determined number of speculative tokens. + + Raises + ------ + TypeError + If `num_speculative_tokens` is None when `is_tlm` is True. + ValueError + If `num_speculative_tokens` is not an integer greater than 1. + If `prefill_seq_len` is less than `num_speculative_tokens + 1`. + """ if hasattr(self.model.config, "speculative_config"): num_speculative_tokens_ = self.model.config.speculative_config["num_speculative_tokens"] if num_speculative_tokens is not None: @@ -1857,41 +2703,51 @@ def check_and_get_num_speculative_tokens(self, num_speculative_tokens: Optional[ class QEFFAutoModelForSpeechSeq2Seq(QEFFTransformersBase, MultimodalUtilityMixin): """ - The QEFFAutoModelForSpeechSeq2Seq class is designed for transformers models with a sequence-to-sequence speech-to-text modeling head, including Whisper and other Encoder-Decoder speech models. - Although it is possible to initialize the class directly, we highly recommend using the ``from_pretrained`` method for initialization. + QEfficient class for sequence-to-sequence speech-to-text models (e.g., Whisper, Encoder-Decoder speech models). - ``Mandatory`` Args: - :model (nn.Module): PyTorch model + This class enables efficient export, compilation, and inference of speech models on Cloud AI 100 hardware. + It is recommended to use the ``from_pretrained`` method for initialization. + Example + ------- .. 
code-block:: python from QEfficient import QEFFAutoModelForSpeechSeq2Seq - from processors import AutoProcessor + from transformers import AutoProcessor + import torch + import numpy as np - # Initialize the model using from_pretrained similar to transformers.AutoModelForSpeechSeq2Seq. - model = QEFFAutoModelForSpeechSeq2Seq.from_pretrained("model_name") + model = QEFFAutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-tiny") + model.compile(num_cores=16, num_devices=1) # Compile for Cloud AI 100 - # Now you can directly compile the model for Cloud AI 100 - model.compile(num_cores=16, device_group=[0]) # Considering you have a Cloud AI 100 SKU + processor = AutoProcessor.from_pretrained("openai/whisper-tiny") + # In practice, load input_audio and sample_rate via an audio package such as librosa or soundfile + input_audio = np.random.rand(16000 * 5).astype(np.float32) # 5 seconds of audio + sample_rate = 16000 - #prepare inputs - processor = AutoProcessor.from_pretrained(model_name) - input_audio, sample_rate = [...] # audio data loaded in via some external audio package, such as librosa or soundfile input_features = ( - processor(data, sampling_rate=sample_rate, return_tensors="pt").input_features.numpy().astype(np.float32) + processor(input_audio, sampling_rate=sample_rate, return_tensors="pt") + .input_features.numpy() + .astype(np.float16) # Should match custom_io dtype ) + batch_size = input_features.shape[0] + decoder_start_token_id = model.model.config.decoder_start_token_id + + # Initial decoder inputs for generation decoder_input_ids = ( - torch.ones((batch_size, 1), dtype=torch.int64) * model.model.config.decoder_start_token_id + torch.ones((batch_size, 1), dtype=torch.int64) * decoder_start_token_id ).numpy() decoder_position_ids = torch.arange(1, dtype=torch.int64).view(1, 1).repeat(batch_size, 1).numpy() + inputs = dict( input_features=input_features, decoder_input_ids=decoder_input_ids, decoder_position_ids=decoder_position_ids, ) - # You can now execute the model - model.generate(inputs, generation_len=150) + output_info = model.generate(inputs, generation_len=150) + # You can now process output_info.generated_ids to get the decoded text + print(output_info) """ _hf_auto_class = AutoModelForSpeechSeq2Seq @@ -1899,6 +2755,21 @@ class QEFFAutoModelForSpeechSeq2Seq(QEFFTransformersBase, MultimodalUtilityMixin _onnx_transforms = [FP16ClipTransform, SplitTensorsTransform] def __init__(self, model: nn.Module, **kwargs): + """ + Initialize a QEFFAutoModelForSpeechSeq2Seq instance. + + Parameters + ---------- + model : nn.Module + A PyTorch model with a sequence-to-sequence speech-to-text head (e.g., Whisper). + **kwargs : + Additional keyword arguments passed to the base class constructor. + + Raises + ------ + TypeError + If the model is not a supported speech-to-text model (i.e., not a `ForConditionalGeneration` model). + """ model_class_name = model.__class__.__name__ if not (model_class_name.endswith("ForConditionalGeneration")): raise TypeError(f"Required pytorch module with ForConditionalGeneration, got {model_class_name}") @@ -1910,17 +2781,33 @@ def __init__(self, model: nn.Module, **kwargs): @property def get_model_config(self) -> dict: + """ + Get the configuration dictionary of the underlying HuggingFace model. + + Returns + ------- + dict + The configuration dictionary. + """ return self.model.config.__dict__ def export(self, export_dir: Optional[str] = None) -> str: """ - Exports the model to ``ONNX`` format using ``torch.onnx.export``.
+ Export the model to ONNX format using ``torch.onnx.export``. - ``Optional`` Args: - :export_dir (str, optional): The directory path to store ONNX-graph. + This method prepares example inputs and dynamic axes based on the model configuration, + then exports the model to an ONNX graph suitable for compilation and deployment on Cloud AI 100 hardware. - Returns: - :str: Path of the generated ``ONNX`` graph. + Parameters + ---------- + export_dir : str, optional + Directory path where the exported ONNX graph will be saved. + If not provided, the default export directory is used. + + Returns + ------- + str + Path to the generated ONNX graph file. """ inputs = self.model.get_dummy_inputs() dynamic_axes = self.model.get_onnx_dynamic_axes() @@ -1946,24 +2833,51 @@ def compile( **compiler_options, ) -> str: """ - This method compiles the exported ``ONNX`` model using the Cloud AI 100 Platform SDK compiler binary found at ``/opt/qti-aic/exec/qaic-exec`` and generates a ``qpc`` package. - If the model has not been exported yet, this method will handle the export process. - You can pass any other arguments that the `qaic-exec` takes as extra kwargs. - - ``Optional`` Args: - :onnx_path (str, optional): Path to pre-exported onnx model. - :compile_dir (str, optional): Path for saving the qpc generated. - :encoder_ctx_len (int, optional): The maximum length of context for encoder, based on the AutoProcessor output. ``Defaults to checking config, if None in config then 1500`` - :ctx_len (int, optional): The maximum length of context to keep for decoding. ``Defaults to 150``. - :batch_size (int, optional): Batch size. ``Defaults to 1``. - :num_devices (int): Number of devices the model needs to be compiled for. Defaults to 1. - :num_cores (int): Number of cores used to compile the model. - :mxfp6_matmul (bool, optional): Whether to use ``mxfp6`` compression for weights. ``Defaults to False``. - :aic_enable_depth_first (bool, optional): Enables DFS with default memory size. ``Defaults to False``. - - Other args are not yet implemented for AutoModelForSpeechSeq2Seq - Returns: - :str: Path of the compiled ``qpc`` package. + Compile the exported ONNX model using the Cloud AI 100 Platform SDK compiler. + + This method generates a ``qpc`` package. If the model has not been exported yet, + this method will handle the export process. Additional arguments for the `qaic-exec` + compiler can be passed as keyword arguments. + + Parameters + ---------- + onnx_path : str, optional + Path to a pre-exported ONNX model. If not provided, the model will be exported first. + compile_dir : str, optional + Directory to save the generated QPC package. + prefill_seq_len : int, optional + Prefill sequence length. This parameter is typically not critically used for + SpeechSeq2Seq models' decoder compilation as the first decoder input is `seq_len=1`. + Default is 1. + encoder_ctx_len : int, optional + Maximum context length for the encoder part of the model. If None, it's inferred + from the model configuration or defaults (e.g., 1500 for Whisper). + ctx_len : int, optional + Maximum decoder context length. This defines the maximum output sequence length + the compiled model can handle. Default is 150. + batch_size : int, optional + Batch size. Default is 1. + num_devices : int, optional + Number of devices to compile for. Default is 1. + num_cores : int, optional + Number of cores to use for compilation. + mxfp6_matmul : bool, optional + Use MXFP6 compression for weights. Default is False. 
+ mxint8_kv_cache : bool, optional + Use MXINT8 compression for KV cache. Default is False. + full_batch_size : int, optional + Not yet supported for this model. + kv_cache_batch_size : int, optional + Not yet supported for this model. + num_speculative_tokens : int, optional + Not yet supported for this model. + **compiler_options : dict + Additional compiler options for QAIC. + + Returns + ------- + str + Path to the compiled QPC package. """ specializations, compiler_options = self.model.get_specializations( batch_size, @@ -2023,16 +2937,37 @@ def generate( device_ids: List[int] = None, ) -> Union[torch.Tensor, np.ndarray]: """ - This method generates output until ``endoftranscript`` or ``generation_len`` by executing the compiled ``qpc`` on ``Cloud AI 100`` Hardware cards. - This is a sequential execution based on the ``batch_size`` of the compiled model and the number of audio tensor passed. - - ``Mandatory`` Args: - :processor: autoprocessor to process inputs and decode logits - :inputs (torch.Tensor): inputs to run the execution. - :generation_len (int): length upto which to generate - :device_id (List[int]): Ids of devices for running the qpc pass as [0] in case of normal model / [0, 1, 2, 3] in case of tensor slicing model - Returns: - :dict: Output from the ``AI_100`` or ``PyTorch`` runtime. + Generate output until ``<|endoftext|>`` token or `generation_len` is reached, + by executing the compiled QPC on Cloud AI 100 hardware. + + This method performs sequential execution based on the compiled model's batch size + and the provided audio tensors. It manages the iterative decoding process and KV cache. + + Parameters + ---------- + inputs : Dict[str, np.ndarray] + Model inputs for inference, typically a dictionary containing: + - `input_features` (np.ndarray): Preprocessed audio features. + - `decoder_input_ids` (np.ndarray): Initial decoder input IDs (e.g., start token). + - `decoder_position_ids` (np.ndarray): Initial decoder position IDs. + These should be prepared to match the compiled model's expectations. + generation_len : int + Maximum number of tokens to generate. The generation stops if this limit is reached + or the model generates an end-of-sequence token. + streamer : TextStreamer, optional + Streamer to receive generated tokens in real-time. Default is None. + device_ids : List[int], optional + Device IDs for running the QPC. Defaults to `[0]` if not specified. + + Returns + ------- + CloudAI100ExecInfoNew + Output from the AI 100 runtime, including generated IDs and performance metrics. + + Raises + ------ + TypeError + If the QPC path is not set (i.e., `compile` was not run). 
""" if not isinstance(self.qpc_path, Path): raise TypeError("Please run compile API first!") diff --git a/QEfficient/transformers/quantizers/auto.py b/QEfficient/transformers/quantizers/auto.py index 5b11dd060..ba204e419 100644 --- a/QEfficient/transformers/quantizers/auto.py +++ b/QEfficient/transformers/quantizers/auto.py @@ -5,6 +5,8 @@ # # ---------------------------------------------------------------------------- +from functools import wraps + from transformers.quantizers.auto import AUTO_QUANTIZATION_CONFIG_MAPPING, AUTO_QUANTIZER_MAPPING from transformers.quantizers.quantizer_awq import AwqQuantizer from transformers.quantizers.quantizer_compressed_tensors import CompressedTensorsHfQuantizer @@ -47,6 +49,7 @@ def with_replaced_quantizers(func): + @wraps(func) def wrapper(*args, **kwargs): transformers_replaced_quantization_config_mapping = dict() transformers_replaced_quantizer_mapping = dict() diff --git a/docs/_static/my_theme.css b/docs/_static/my_theme.css index 00a18c905..24ac21296 100644 --- a/docs/_static/my_theme.css +++ b/docs/_static/my_theme.css @@ -1,3 +1,26 @@ .wy-nav-content { max-width: 1200px !important; +} + +/* Make Parameters, Returns, and Example sections align with page content */ +.wy-nav-content .rst-content .field-list, +.wy-nav-content .rst-content .field-list .field-name, +.wy-nav-content .rst-content .field-list .field-body, +.wy-nav-content .rst-content .field-list p, +.wy-nav-content .rst-content .field-list ul, +.wy-nav-content .rst-content .field-list li { + margin-left: 0 !important; + padding-left: 0 !important; + max-width: 100% !important; + white-space: normal !important; + word-break: break-word !important; + overflow-wrap: break-word !important; +} + +/* Make code blocks (examples) use full width and wrap if needed */ +.wy-nav-content .rst-content pre { + white-space: pre-wrap !important; + word-break: break-word !important; + max-width: 100% !important; + overflow-x: auto !important; } \ No newline at end of file diff --git a/docs/conf.py b/docs/conf.py index 2cc71c8a7..bd8c90f8a 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -37,7 +37,14 @@ # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. -extensions = ["myst_parser", "sphinx.ext.todo", "sphinx.ext.viewcode", "sphinx.ext.autodoc", "sphinx_multiversion"] +extensions = [ + "myst_parser", + "sphinx.ext.todo", + "sphinx.ext.viewcode", + "sphinx.ext.autodoc", + "sphinx_multiversion", + "sphinx.ext.napoleon", +] # Add any paths that contain templates here, relative to this directory. templates_path = ["_templates"] @@ -55,18 +62,17 @@ # html_theme = "sphinx_rtd_theme" - -def setup(app): - app.add_css_file("my_theme.css") - - # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". 
html_static_path = ["_static"] -source = [".md"] +# source = [".md"] todo_include_todos = True suppress_warnings = [ "ref.rst_pilog", # Suppress warnings about excluded toctree entries ] + + +def setup(app): + app.add_css_file("my_theme.css") diff --git a/docs/index.md b/docs/index.md index e04a22829..7504ed3b7 100644 --- a/docs/index.md +++ b/docs/index.md @@ -19,9 +19,10 @@ source/release_docs ```{toctree} :caption: 'Getting Started' -:maxdepth: 4 +:maxdepth: 2 source/introduction +source/supported_features.rst source/validate ``` @@ -33,20 +34,15 @@ source/validate source/installation ``` -```{toctree} -:caption: 'Upgrade Efficient-Transformers' -:maxdepth: 2 - -source/upgrade -``` ```{toctree} :caption: 'Inference on Cloud AI 100' -:maxdepth: 4 +:maxdepth: 2 source/quick_start +source/features_enablement +source/autoclass source/cli_api -source/python_api ``` diff --git a/docs/source/autoclass.md b/docs/source/autoclass.md new file mode 100644 index 000000000..2bc1b4e26 --- /dev/null +++ b/docs/source/autoclass.md @@ -0,0 +1,122 @@ +**This page gives you an overview of all the Auto Classes that you might need to integrate `QEfficient` into your Python applications.** + +# Auto Classes + +## `QEFFAutoModelForCausalLM` + +### HL API + +```{eval-rst} +.. autoclass:: QEfficient.transformers.models.modeling_auto.QEFFAutoModelForCausalLM + :members: from_pretrained, export, compile, generate + :member-order: bysource +``` + +### LL API + +```{eval-rst} +.. autoclass:: QEfficient.transformers.models.modeling_auto.QEFFAutoModelForCausalLM + :member-order: bysource + :members: + :exclude-members: from_pretrained, generate, export, compile +``` + +(QEFFAutoModel)= +## `QEFFAutoModel` + +### HL API + +```{eval-rst} +.. autoclass:: QEfficient.transformers.models.modeling_auto.QEFFAutoModel + :members: from_pretrained, export, compile, generate + :member-order: bysource +``` + +### LL API + +```{eval-rst} +.. autoclass:: QEfficient.transformers.models.modeling_auto.QEFFAutoModel + :member-order: bysource + :members: + :exclude-members: from_pretrained, generate, export, compile +``` + +(QEffAutoPeftModelForCausalLM)= +## `QEffAutoPeftModelForCausalLM` + +### HL API + +```{eval-rst} +.. autoclass:: QEfficient.peft.auto.QEffAutoPeftModelForCausalLM + :members: from_pretrained, export, compile, generate + :member-order: bysource +``` + +### LL API + +```{eval-rst} +.. autoclass:: QEfficient.peft.auto.QEffAutoPeftModelForCausalLM + :member-order: bysource + :members: + :exclude-members: from_pretrained, generate, export, compile +``` + +(QEffAutoLoraModelForCausalLM)= +## `QEffAutoLoraModelForCausalLM` + +### HL API + +```{eval-rst} +.. autoclass:: QEfficient.peft.lora.auto.QEffAutoLoraModelForCausalLM + :members: from_pretrained, export, compile, generate + :member-order: bysource +``` + +### LL API + +```{eval-rst} +.. autoclass:: QEfficient.peft.lora.auto.QEffAutoLoraModelForCausalLM + :member-order: bysource + :members: + :exclude-members: from_pretrained, generate, export, compile +``` + +(QEFFAutoModelForImageTextToText)= +## `QEFFAutoModelForImageTextToText` + +### HL API + +```{eval-rst} +.. autoclass:: QEfficient.transformers.models.modeling_auto.QEFFAutoModelForImageTextToText + :members: from_pretrained, export, compile, generate + :member-order: bysource +``` + +### LL API + +```{eval-rst} +.. autoclass:: QEfficient.transformers.models.modeling_auto.QEFFAutoModelForImageTextToText
+ :member-order: bysource + :members: + :exclude-members: from_pretrained, generate, export, compile +``` + +(QEFFAutoModelForSpeechSeq2Seq)= +## `QEFFAutoModelForSpeechSeq2Seq` + +### HL API + +```{eval-rst} +.. autoclass:: QEfficient.transformers.models.modeling_auto.QEFFAutoModelForSpeechSeq2Seq + :members: from_pretrained, export, compile, generate + :member-order: bysource +``` + +### LL API + +```{eval-rst} +.. autoclass:: QEfficient.transformers.models.modeling_auto.QEFFAutoModelForSpeechSeq2Seq + :member-order: bysource + :members: + :exclude-members: from_pretrained, generate, export, compile +``` \ No newline at end of file diff --git a/docs/source/cli_api.md b/docs/source/cli_api.md index a6ec86554..18587d480 100644 --- a/docs/source/cli_api.md +++ b/docs/source/cli_api.md @@ -1,33 +1,40 @@ - -# Command Line Interface Use (CLI) +# CLI API Reference ```{NOTE} -Use ``bash terminal``, else if using ``ZSH terminal`` then ``device_group``should be in single quotes e.g. ``'--device_group [0]'`` +Use ``bash terminal``, else if using ``ZSH terminal`` then ``device_group`` should be in single quotes e.g. ``'--device_group [0]'`` ``` (infer_api)= ## `QEfficient.cloud.infer` ```{eval-rst} -.. automodule:: QEfficient.cloud.infer.main -``` +.. autofunction:: QEfficient.cloud.infer.main + :noindex: +``` + +(execute_api)= ## `QEfficient.cloud.execute` ```{eval-rst} -.. automodule:: QEfficient.cloud.execute.main +.. autofunction:: QEfficient.cloud.execute.main + :noindex: ``` + +(compile_api)= ## `QEfficient.cloud.compile` ```{eval-rst} - .. automodule:: QEfficient.compile.compile_helper.compile - .. code-block:: bash - - python -m QEfficient.cloud.compile OPTIONS +.. autofunction:: QEfficient.compile.compile_helper.compile + :noindex: ``` + +(export_api)= ## `QEfficient.cloud.export` ```{eval-rst} - .. automodule:: QEfficient.cloud.export.main - +.. autofunction:: QEfficient.cloud.export.main + :noindex: ``` + +(finetune_api)= ## `QEfficient.cloud.finetune` ```{eval-rst} - .. automodule:: QEfficient.cloud.finetune.main - +.. autofunction:: QEfficient.cloud.finetune.main + :noindex: ``` \ No newline at end of file diff --git a/docs/source/features_enablement.md b/docs/source/features_enablement.md new file mode 100644 index 000000000..81bd6e814 --- /dev/null +++ b/docs/source/features_enablement.md @@ -0,0 +1,85 @@ +# Features Enablement Guide +The guide below highlights the steps to enable supported features in QEfficient. + +## QNN Compilation via Command Line Interface + +QEfficient provides a command-line utility that can be used to export, compile, and execute ONNX models using the QNN SDK. + +(id-continuous-batching)= +## Continuous Batching + +Users can compile a model utilizing the continuous batching feature by specifying full_batch_size in the infer and compiler APIs. If full_batch_size is not provided, the model will be compiled in the regular way. + +When enabling continuous batching, batch size should not be specified. + +Users can leverage multi-Qranium and other supported features along with continuous batching.
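+
+In addition to the CLI example below, continuous batching can also be enabled through the Python API. The following is a minimal, illustrative sketch (the model card, prompts, and compile parameters are placeholders); it assumes `continuous_batching=True` at load time and `full_batch_size` at compile time, as documented for `QEFFAutoModelForCausalLM`:
+
+```python
+from transformers import AutoTokenizer
+
+from QEfficient import QEFFAutoModelForCausalLM
+
+model_name = "TinyLlama/TinyLlama_v1.1"
+
+# Enable continuous batching at load time; note that batch_size is not passed anywhere.
+model = QEFFAutoModelForCausalLM.from_pretrained(model_name, continuous_batching=True)
+
+# full_batch_size selects the continuous-batching specialization at compile time.
+model.compile(num_cores=16, prefill_seq_len=32, ctx_len=128, full_batch_size=3)
+
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model.generate(prompts=["My name is", "The flat earth theory is the belief that", "The sun rises from"], tokenizer=tokenizer)
+```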
+ +```bash +python -m QEfficient.cloud.infer --model_name TinyLlama/TinyLlama_v1.1 --prompt_len 32 --ctx_len 128 --num_cores 16 --device_group [0] --prompt "My name is|The flat earth +theory is the belief that|The sun rises from" --mxfp6 --mos 1 --aic_enable_depth_first --full_batch_size 3 +``` +(id-multi-qranium-inference)= +## Multi-Qranium Inference + +You can also enable MQ, just based on the number of devices. Based on the `--device-group` as input it will create TS config on the fly. If `--device-group [0,1]` it will create TS config for 2 devices and use it for compilation, if `--device-group [0]` then TS compilation is skipped and single soc execution is enabled. + +```bash +python -m QEfficient.cloud.infer --model_name Salesforce/codegen-2B-mono --batch_size 1 --prompt_len 32 --ctx_len 128 --mxfp6 --num_cores 16 --device-group [0,1] --prompt "def fibonacci(n):" --mos 2 --aic_enable_depth_first +``` + +Above step will save the `qpc` files under `efficient-transformers/qeff_models/{model_card_name}`, you can use the execute API to run for different prompts. This will automatically pick the pre-compiled `qpc` files. + +```bash +python -m QEfficient.cloud.execute --model_name Salesforce/codegen-2B-mono --qpc-path qeff_models/Salesforce/codegen-2B-mono/qpc_16cores_1BS_32PL_128CL_2devices_mxfp6/qpcs --prompt "def binary_search(array: np.array, k: int):" --device-group [0,1] +``` + +To disable MQ, just pass single soc like below, below step will compile the model again and reuse the `ONNX` file as only compilation argument are different from above commands. + +```bash +python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 1 --prompt_len 32 --ctx_len 128 --mxfp6 --num_cores 16 --device-group [0] --prompt "My name is" --mos 1 --aic_enable_depth_first +``` + +(id-qnn-compilation-via-python-api)= +## QNN Compilation via Python API + +Users can also use python API to export, compile and execute onnx models using QNN SDK. + +```Python +# We can now export the modified models to ONNX framework +# This will generate single ONNX Model for both Prefill and Decode Variations which are optimized for +# Cloud AI 100 Platform. +from QEfficient import QEFFAutoModelForCausalLM as AutoModelForCausalLM + +# Model-Card name (This is HF Model Card name) : https://huggingface.co/gpt2-xl +model_name = "gpt2" # Similar, we can change model name and generate corresponding models, if we have added the support in the lib. + +qeff_model = AutoModelForCausalLM.from_pretrained(model_name) + +generated_qpc_path = qeff_model.compile( + num_cores=14, + mxfp6=True, + enable_qnn=True, + qnn_config = qnn_config_file_path # QNN compilation configuration is passed. +) + +qeff_model.generate(prompts=["My name is"]) +``` + +(id-draft-based-speculative-decoding)= +## Draft-Based Speculative Decoding +Draft-based speculative decoding is a technique where a small Draft Language Model (DLM) makes `num_speculative_tokens` autoregressive speculations ahead of the Target Language Model (TLM). The objective is to predict what the TLM would have predicted if it would have been used instead of the DLM. This approach is beneficial when the autoregressive decode phase of the TLM is memory bound and thus, we can leverage the extra computing resources of our hardware by batching the speculations of the DLM as an input to TLM to validate the speculations. 
+ +To export and compile both DLM/TLM, add corresponding `qaic_config` and `num_speculative_tokens` for TLM and export DLM as you would any other QEfficient LLM model: + +```Python +tlm_name = "meta-llama/Llama-2-70b-chat-hf" +dlm_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" +k = 3 # DLM will make `k` speculations +qaic_config = dict(speculative_model_type="target") +tlm = AutoModelForCausalLM.from_pretrained(tlm_name, qaic_config=qaic_config) +dlm = AutoModelForCausalLM.from_pretrained(dlm_name) +tlm.compile(num_speculative_tokens=k) +dlm.compile() +``` + +The `qaic_config` dictionary is fed during the instantiation of the model because slight changes to the ONNX graph are required. Once complete, the user can specify `num_speculative_tokens` to define the actual number of speculations that the TLM will take as input during the decode phase. As for the DLM, no new changes are required at the ONNX or compile level. \ No newline at end of file diff --git a/docs/source/finetune.md b/docs/source/finetune.md index 311605709..490f0affb 100644 --- a/docs/source/finetune.md +++ b/docs/source/finetune.md @@ -153,4 +153,3 @@ tensorboard --logdir runs/ --bind_all # from transformers import DataCollatorForLanguageModeling # return DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False) ``` ---- diff --git a/docs/source/installation.md b/docs/source/installation.md index ae9742cea..cbf040f74 100644 --- a/docs/source/installation.md +++ b/docs/source/installation.md @@ -7,10 +7,9 @@ System Requirements: # Installation -### 1. Download Apps SDK +## 1. Download Apps SDK * [Cloud AI 100 Apps SDK install](https://quic.github.io/cloud-ai-sdk-pages/latest/Getting-Started/Installation/Cloud-AI-SDK/Cloud-AI-SDK/) -### 2. Install Efficient-Transformers Uninstall existing Apps SDK ``` sudo ./uninstall.sh @@ -33,7 +32,23 @@ Apply chmod commands sudo chmod a+x /opt/qti-aic/dev/hexagon_tools/bin/* sudo chmod a+x /opt/qti-aic/exec/* ``` +## 2. Install Efficient-Transformers +### Using GitHub Repository + +``Warning: Efficient Transformers have been validated to work with the same compatible SDK. Upgrading this may result in certain models becoming incompatible.`` + +```bash +# Create Python virtual env and activate it. (Required Python 3.10) + +python3.10 -m venv qeff_env +source qeff_env/bin/activate +pip install -U pip + +# Clone and Install the QEfficient Repo. +pip install git+https://github.com/quic/efficient-transformers + +``` # Sanity Check After above installation methods, you can check if ``QEfficient`` is installed correctly by using diff --git a/docs/source/python_api.md b/docs/source/python_api.md deleted file mode 100644 index 668861373..000000000 --- a/docs/source/python_api.md +++ /dev/null @@ -1,134 +0,0 @@ -# Python API - -**This page give you an overview about the all the APIs that you might need to integrate the `QEfficient` into your python applications.** - -## High Level API - -### `QEFFAutoModelForCausalLM` - -```{eval-rst} -.. autoclass:: QEfficient.transformers.models.modeling_auto.QEFFAutoModelForCausalLM - :member-order: bysource - :members: -``` - -(QEFFAutoModel)= -### `QEFFAutoModel` - -```{eval-rst} -.. autoclass:: QEfficient.transformers.models.modeling_auto.QEFFAutoModel - :member-order: bysource - :members: -``` - -(QEffAutoPeftModelForCausalLM)= -### `QEffAutoPeftModelForCausalLM` - -```{eval-rst} -.. 
autoclass:: QEfficient.peft.auto.QEffAutoPeftModelForCausalLM - :member-order: bysource - :members: -``` - -(QEffAutoLoraModelForCausalLM)= -### `QEffAutoLoraModelForCausalLM` - -```{eval-rst} -.. autoclass:: QEfficient.peft.lora.auto.QEffAutoLoraModelForCausalLM - :member-order: bysource - :members: -``` - -(QEFFAutoModelForImageTextToText)= -### `QEFFAutoModelForImageTextToText` - -```{eval-rst} -.. autoclass:: QEfficient.transformers.models.modeling_auto.QEFFAutoModelForImageTextToText - :member-order: bysource - :members: -``` - -(QEFFAutoModelForSpeechSeq2Seq)= -### `QEFFAutoModelForSpeechSeq2Seq` - -```{eval-rst} -.. autoclass:: QEfficient.transformers.models.modeling_auto.QEFFAutoModelForSpeechSeq2Seq - :member-order: bysource - :members: -``` - -### `export` - -```{eval-rst} -.. automodule:: QEfficient.exporter.export_hf_to_cloud_ai_100 - :members: - :show-inheritance: - :exclude-members: convert_to_cloud_kvstyle, convert_to_cloud_bertstyle -.. deprecated:: - This function will be deprecated in version 1.19, please use QEFFAutoModelForCausalLM.export instead -``` - -### `compile` - -```{eval-rst} -.. automodule:: QEfficient.compile.compile_helper - :members: - :show-inheritance: -.. code-block:: python - - import QEfficient - base_path, onnx_model_path = QEfficient.export(model_name="gpt2") - qpc_path = QEfficient.compile(onnx_path=onnx_model_path, qpc_path=os.path.join(base_path, "qpc"), num_cores=14, device_group=[0]) -.. deprecated:: - This function will be deprecated in version 1.19, please use QEFFAutoModelForCausalLM.compile instead -``` - -### `Execute` - -```{eval-rst} -.. automodule:: QEfficient.generation.text_generation_inference - :members: - :show-inheritance: - :exclude-members: latency_stats_bertstyle,cloud_ai_100_exec_kv_helper -``` -## Low Level API - -### `convert_to_cloud_kvstyle` - -```{eval-rst} -.. automodule:: QEfficient.exporter.export_hf_to_cloud_ai_100 - :members: - :show-inheritance: - :exclude-members: qualcomm_efficient_converter, convert_to_cloud_bertstyle -``` - -### `convert_to_cloud_bertstyle` - -```{eval-rst} -.. automodule:: QEfficient.exporter.export_hf_to_cloud_ai_100 - :members: - :show-inheritance: - :exclude-members: qualcomm_efficient_converter, convert_to_cloud_kvstyle -``` - -### `utils` - -```{eval-rst} -.. automodule:: QEfficient.utils.device_utils - :members: - :show-inheritance: -``` - -```{eval-rst} -.. automodule:: QEfficient.utils.generate_inputs - :members: - :undoc-members: - :show-inheritance: -``` - -```{eval-rst} -.. automodule:: QEfficient.utils.run_utils - :members: - :undoc-members: - :show-inheritance: -``` \ No newline at end of file diff --git a/docs/source/quick_start.md b/docs/source/quick_start.md index 233fb491a..3d7e4f23c 100644 --- a/docs/source/quick_start.md +++ b/docs/source/quick_start.md @@ -9,37 +9,6 @@ To achieve this, we have 2 levels of APIs, with different levels of abstraction. 2. Python high level APIs offer more granular control, ideal for when customization is necessary. -## Supported Features - -| Feature | Impact | -| --- | --- | -| Context Length Specializations (upcoming) | Increases the maximum context length that models can handle, allowing for better performance on tasks requiring long sequences of text. | -| Block Attention (in progress) | Reduces inference latency and computational cost by dividing context into blocks and reusing key-value states, particularly useful in RAG. 
| -| Sentence embedding, Flexible Pooling configuration and compilation with multiple sequence lengths| Supports standard/custom pooling with AI 100 acceleration and sentence embedding. Enables efficient sentence embeddings via Efficient-Transformers. Compile with one or multiple seq_len; optimal graph auto-selected at runtime. Refer [sample script](https://github.com/quic/efficient-transformers/blob/main/examples/embedding_model.py) for more **details**.| -| [SpD, multiprojection heads](https://quic.github.io/efficient-transformers/source/quick_start.html#draft-based-speculative-decoding) | Implemented post-attention hidden size projections to speculate tokens ahead of the base model. Refer [sample script](https://github.com/quic/efficient-transformers/blob/main/examples/multiprojs_spd_inference.py) for more **details**.| -| [QNN Compilation support](https://github.com/quic/efficient-transformers/pull/374) | Enabled for AutoModel classes QNN compilation capabilities for multi-models, embedding models and causal models.| -| [Disaggregated serving](https://github.com/quic/efficient-transformers/pull/365) | It support for separate prefill and decode compilation for encoder (vision) and language models.| -| [GGUF model execution](https://github.com/quic/efficient-transformers/pull/368) | Supported GGUF model execution (without quantized weights). Refer [sample script](https://github.com/quic/efficient-transformers/blob/main/examples/basic_gguf_models.py) for more **details**. | -| Replication of KV | Enabled FP8 model support on [replicate_kv_heads script](https://github.com/quic/efficient-transformers/tree/main/scripts/replicate_kv_head).| -| [gradient checkpointing](https://github.com/quic/efficient-transformers/pull/338) | Supports gradient checkpointing in the finetuning script| -| Swift KV [Snowflake/Llama-3.1-SwiftKV-8B-Instruct](https://huggingface.co/Snowflake/Llama-3.1-SwiftKV-8B-Instruct) | Reduces computational overhead during inference by optimizing key-value pair processing, leading to improved throughput. Support for both [continuous and non-continuous batching execution](https://github.com/quic/efficient-transformers/pull/367) in SwiftKV | -| [Vision Language Model](QEFFAutoModelForImageTextToText) | Provides support for the AutoModelForImageTextToText class from the transformers library, enabling advanced vision-language tasks. Refer [sample script](https://github.com/quic/efficient-transformers/blob/main/examples/image_text_to_text_inference.py) for more **details**. | -| [Speech Sequence to Sequence Model](QEFFAutoModelForSpeechSeq2Seq) | Provides support for the QEFFAutoModelForSpeechSeq2Seq Facilitates speech-to-text sequence models. Refer [sample script](https://github.com/quic/efficient-transformers/blob/main/examples/speech_to_text/run_whisper_speech_to_text.py) for more **details**. | -| Support for FP8 Execution | Enables execution with FP8 precision, significantly improving performance and reducing memory usage for computational tasks. | -| Prefill caching | Enhances inference speed by caching key-value pairs for shared prefixes, reducing redundant computations and improving efficiency. | -|Prompt-Lookup Decoding | Speeds up text generation by using overlapping parts of the input prompt and the generated text, making the process faster without losing quality. 
Refer [sample script](https://github.com/quic/efficient-transformers/blob/main/examples/pld_spd_inference.py) for more **details**.| -| [PEFT LoRA support](QEffAutoPeftModelForCausalLM) | Enables parameter-efficient fine-tuning using low-rank adaptation techniques, reducing the computational and memory requirements for fine-tuning large models. Refer [sample script](https://github.com/quic/efficient-transformers/blob/main/examples/peft_models.py) for more **details**. | -| [QNN support](#qnn-compilation) | Enables compilation using QNN SDK, making Qeff adaptable for various backends in the future. | -| [Embedding model support](QEFFAutoModel) | Facilitates the generation of vector embeddings for retrieval tasks. | -| [Speculative Decoding](#draft-based-speculative-decoding) | Accelerates text generation by using a draft model to generate preliminary predictions, which are then verified by the target model, reducing latency and improving efficiency. Refer [sample script](https://github.com/quic/efficient-transformers/blob/main/examples/draft_spd_inference.py) for more **details**. | -| [Finite lorax](QEffAutoLoraModelForCausalLM) | Users can activate multiple LoRA adapters and compile them with the base model. At runtime, they can specify which prompt should use which adapter, enabling mixed adapter usage within the same batch. Refer [sample script](https://github.com/quic/efficient-transformers/blob/main/examples/lora_models.py) for more **details**. | -| Python and CPP Inferencing API support | Provides flexibility while running inference with Qeff and enabling integration with various applications and improving accessibility for developers. Refer [sample script](https://github.com/quic/efficient-transformers/blob/main/examples/cpp_execution/text_inference_using_cpp.py) for more **details**.| -| [Continuous batching](#continuous-batching) | Optimizes throughput and latency by dynamically batching requests, ensuring efficient use of computational resources. | -| AWQ and GPTQ support | Supports advanced quantization techniques, improving model efficiency and performance on AI 100. | -| Support serving successive requests in same session | An API that yields tokens as they are generated, facilitating seamless integration with various applications and enhancing accessibility for developers. | -| Perplexity calculation | A script for computing the perplexity of a model, allowing for the evaluation of model performance and comparison across different models and datasets. Refer [sample script](https://github.com/quic/efficient-transformers/blob/main/scripts/perplexity_computation/calculate_perplexity.py) for more **details**. | -| KV Heads Replication Script| A sample script for replicating key-value (KV) heads for the Llama-3-8B-Instruct model, running inference with the original model, replicating KV heads, validating changes, and exporting the modified model to ONNX format. Refer [sample script](https://github.com/quic/efficient-transformers/blob/main/scripts/replicate_kv_head/replicate_kv_heads.py) for more **details**.| - ## Transformed models and QPC storage By default, the library exported models and Qaic Program Container (QPC) files, which are compiled and inference-ready model binaries generated by the compiler, are stored in `~/.cache/qeff_cache`. You can customize this storage path using the following environment variables: @@ -49,88 +18,34 @@ By default, the library exported models and Qaic Program Container (QPC) files, 3. 
**Default**: If neither `QEFF_HOME` nor `XDG_CACHE_HOME` are set, the default path `~/.cache/qeff_cache` will be used. -## Command Line Interface - +## Command Line Interface Execution ```{NOTE} Use ``bash terminal``, else if using ``ZSH terminal`` then ``device_group``should be in single quotes e.g. ``'--device_group [0]'`` ``` -### QEfficient.cloud.infer - -This is the single e2e CLI API, which takes `model_card` name as input along with other compilation arguments. Check [Infer API doc](infer_api) for more details. - -* HuggingFace model files Download → Optimize for Cloud AI 100 → Export to `ONNX` → Compile on Cloud AI 100 → [Execute](#qefficientcloudexecute) -* It skips the export/compile stage based if `ONNX` or `qpc` files are found. If you use infer second time with different compilation arguments, it will automatically skip `ONNX` model creation and directly jump to compile stage. - - -```bash -# Check out the options using the help -python -m QEfficient.cloud.infer --help -python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 1 --prompt_len 32 --ctx_len 128 --mxfp6 --num_cores 16 --device_group [0] --prompt "My name is" --mos 1 --aic_enable_depth_first -``` -If executing for batch size>1, -You can pass input prompts in single string but separate with pipe (|) symbol". Example below - -```bash -python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 3 --prompt_len 32 --ctx_len 128 --num_cores 16 --device_group [0] --prompt "My name is|The flat earth -theory is the belief that|The sun rises from" --mxfp6 --mos 1 --aic_enable_depth_first -``` - -You can also pass path of txt file with input prompts when you want to run inference on lot of prompts, Example below, sample txt file(prompts.txt) is present in examples folder. +### Export +**QEfficient.cloud.export** ```bash -python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 3 --prompt_len 32 --ctx_len 128 --num_cores 16 --device_group [0] --prompts_txt_file_path examples/prompts.txt --mxfp6 --mos 1 --aic_enable_depth_first +python -m QEfficient.cloud.export --model_name gpt2 ``` -### QEfficient.cloud.execute -You can first run `infer` API and then use `execute` to run the pre-compiled model on Cloud AI 100 cards. -Once we have compiled the QPC, we can now use the precompiled QPC in execute API to run for different prompts. Make sure to pass same `--device_group` as used during infer. Refer [Execute API doc](execute_api) for more details. - -```bash -python -m QEfficient.cloud.execute --model_name gpt2 --qpc_path qeff_models/gpt2/qpc_16cores_1BS_32PL_128CL_1devices_mxfp6/qpcs --prompt "Once upon a time in" --device_group [0] -``` - -### QEfficient.cloud.finetune -You can run the finetune with set of predefined existing datasets on QAIC using the eager pipeline - -```bash -python -m QEfficient.cloud.finetune --device qaic:0 --use-peft --output_dir ./meta-sam --num_epochs 2 --context_length 256 -``` -For more details on finetune, checkout the subsection. - -### Multi-Qranium Inference -You can also enable MQ, just based on the number of devices. Based on the `--device-group` as input it will create TS config on the fly. If `--device-group [0,1]` it will create TS config for 2 devices and use it for compilation, if `--device-group [0]` then TS compilation is skipped and single soc execution is enabled. 
- -```bash -python -m QEfficient.cloud.infer --model_name Salesforce/codegen-2B-mono --batch_size 1 --prompt_len 32 --ctx_len 128 --mxfp6 --num_cores 16 --device-group [0,1] --prompt "def fibonacci(n):" --mos 2 --aic_enable_depth_first -``` - -Above step will save the `qpc` files under `efficient-transformers/qeff_models/{model_card_name}`, you can use the execute API to run for different prompts. This will automatically pick the pre-compiled `qpc` files. - -```bash -python -m QEfficient.cloud.execute --model_name Salesforce/codegen-2B-mono --qpc-path qeff_models/Salesforce/codegen-2B-mono/qpc_16cores_1BS_32PL_128CL_2devices_mxfp6/qpcs --prompt "def binary_search(array: np.array, k: int):" --device-group [0,1] -``` +### Compile +**CLI Compile Command** -To disable MQ, just pass single soc like below, below step will compile the model again and reuse the `ONNX` file as only compilation argument are different from above commands. +Users can also use `compile` API to compile pre exported onnx models using QNN SDK. +Without QNN Config ```bash -python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 1 --prompt_len 32 --ctx_len 128 --mxfp6 --num_cores 16 --device-group [0] --prompt "My name is" --mos 1 --aic_enable_depth_first +python -m QEfficient.cloud.compile --onnx_path --qpc-path --batch_size 1 --prompt_len 32 --ctx_len 128 --mxfp6 --num_cores 16 --device_group [0] --prompt "My name is" --mos 1 --aic_enable_depth_first --enable_qnn ``` -### Continuous Batching - -Users can compile a model utilizing the continuous batching feature by specifying full_batch_size in the infer and compiler APIs. If full_batch_size is not provided, the model will be compiled in the regular way. - -When enabling continuous batching, batch size should not be specified. - -Users can leverage multi-Qranium and other supported features along with continuous batching. - +With QNN Config ```bash -python -m QEfficient.cloud.infer --model_name TinyLlama/TinyLlama_v1.1 --prompt_len 32 --ctx_len 128 --num_cores 16 --device_group [0] --prompt "My name is|The flat earth -theory is the belief that|The sun rises from" --mxfp6 --mos 1 --aic_enable_depth_first --full_batch_size 3 +python -m QEfficient.cloud.compile --onnx_path --qpc-path --batch_size 1 --prompt_len 32 --ctx_len 128 --mxfp6 --num_cores 16 --device_group [0] --prompt "My name is" --mos 1 --aic_enable_depth_first --enable_qnn QEfficient/compile/qnn_config.json ``` -### QNN Compilation +**QNN Compilation** Users can compile a model with QNN SDK by following the steps below: @@ -170,22 +85,20 @@ python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 1 --prompt_len 3 With QNN Config ```bash python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 1 --prompt_len 32 --ctx_len 128 --mxfp6 --num_cores 16 --device_group [0] --prompt "My name is" --mos 1 --aic_enable_depth_first --enable_qnn QEfficient/compile/qnn_config.json -```` +``` -**CLI Compile Command** +**Users can also take advantage of features like multi-Qranium inference and continuous batching with QNN SDK Compilation.** -Users can also use `compile` API to compile pre exported onnx models using QNN SDK. 
-Without QNN Config -```bash -python -m QEfficient.cloud.compile --onnx_path --qpc-path --batch_size 1 --prompt_len 32 --ctx_len 128 --mxfp6 --num_cores 16 --device_group [0] --prompt "My name is" --mos 1 --aic_enable_depth_first --enable_qnn -``` +### Execute -With QNN Config -```bash -python -m QEfficient.cloud.compile --onnx_path --qpc-path --batch_size 1 --prompt_len 32 --ctx_len 128 --mxfp6 --num_cores 16 --device_group [0] --prompt "My name is" --mos 1 --aic_enable_depth_first --enable_qnn QEfficient/compile/qnn_config.json -```` +**QEfficient.cloud.execute** +You can first run `infer` API and then use `execute` to run the pre-compiled model on Cloud AI 100 cards. +Once we have compiled the QPC, we can now use the precompiled QPC in execute API to run for different prompts. Make sure to pass same `--device_group` as used during infer. Refer [Execute API doc](execute_api) for more details. +```bash +python -m QEfficient.cloud.execute --model_name gpt2 --qpc_path qeff_models/gpt2/qpc_16cores_1BS_32PL_128CL_1devices_mxfp6/qpcs --prompt "Once upon a time in" --device_group [0] +``` **CLI Execute Command** Once we have compiled the QPC using `infer` or `compile` API, we can now use the precompiled QPC in `execute` API to run for different prompts. @@ -196,36 +109,52 @@ Make sure to pass same `--device_group` as used during infer. Refer [Execute API python -m QEfficient.cloud.execute --model_name gpt2 --qpc_path qeff_models/gpt2/qpc_qnn_16cores_1BS_32PL_128CL_1devices_mxfp6/qpcs --prompt "Once upon a time in" --device_group [0] ``` -**QNN Compilation via Python API** +### Infer +**QEfficient.cloud.infer** -Users can also use python API to export, compile and execute onnx models using QNN SDK. +This is the single e2e CLI API, which takes `model_card` name as input along with other compilation arguments. Check [Infer API doc](infer_api) for more details. -```Python -# We can now export the modified models to ONNX framework -# This will generate single ONNX Model for both Prefill and Decode Variations which are optimized for -# Cloud AI 100 Platform. -from QEfficient import QEFFAutoModelForCausalLM as AutoModelForCausalLM +* HuggingFace model files Download → Optimize for Cloud AI 100 → Export to `ONNX` → Compile on Cloud AI 100 → [Execute](#qefficientcloudexecute) +* It skips the export/compile stage based if `ONNX` or `qpc` files are found. If you use infer second time with different compilation arguments, it will automatically skip `ONNX` model creation and directly jump to compile stage. -# Model-Card name (This is HF Model Card name) : https://huggingface.co/gpt2-xl -model_name = "gpt2" # Similar, we can change model name and generate corresponding models, if we have added the support in the lib. -qeff_model = AutoModelForCausalLM.from_pretrained(model_name) +```bash +# Check out the options using the help +python -m QEfficient.cloud.infer --help +python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 1 --prompt_len 32 --ctx_len 128 --mxfp6 --num_cores 16 --device_group [0] --prompt "My name is" --mos 1 --aic_enable_depth_first +``` +If executing for batch size>1, +You can pass input prompts in single string but separate with pipe (|) symbol". Example below -generated_qpc_path = qeff_model.compile( - num_cores=14, - mxfp6=True, - enable_qnn=True, - qnn_config = qnn_config_file_path # QNN compilation configuration is passed. 
-) +```bash +python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 3 --prompt_len 32 --ctx_len 128 --num_cores 16 --device_group [0] --prompt "My name is|The flat earth +theory is the belief that|The sun rises from" --mxfp6 --mos 1 --aic_enable_depth_first +``` -qeff_model.generate(prompts=["My name is"]) +You can also pass path of txt file with input prompts when you want to run inference on lot of prompts, Example below, sample txt file(prompts.txt) is present in examples folder. + +```bash +python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 3 --prompt_len 32 --ctx_len 128 --num_cores 16 --device_group [0] --prompts_txt_file_path examples/prompts.txt --mxfp6 --mos 1 --aic_enable_depth_first ``` -**Users can also take advantage of features like multi-Qranium inference and continuous batching with QNN SDK Compilation.** +### Finetune +**QEfficient.cloud.finetune** +You can run the finetune with set of predefined existing datasets on QAIC using the eager pipeline + +```bash +python -m QEfficient.cloud.finetune --device qaic:0 --use-peft --output_dir ./meta-sam --num_epochs 2 --context_length 256 +``` +For more details on finetune, checkout the subsection. + + + + +## High Level Execution +Here is the high level API to compile and run the model on Cloud AI 100 via Python. -## Python API -### 1. Model download and Optimize for Cloud AI 100 +### 1. Model download and Optimize for Cloud AI 100 + If your models falls into the model architectures that are [already supported](validated_models), Below steps should work fine. Please raise an [issue](https://github.com/quic/efficient-transformers/issues), in case of trouble. @@ -283,21 +212,3 @@ tokenizer = AutoTokenizer.from_pretrained(model_name) qeff_model.generate(prompts=["My name is"],tokenizer=tokenizer) ``` End to End demo examples for various models are available in **notebooks** directory. Please check them out. - -### Draft-Based Speculative Decoding -Draft-based speculative decoding is a technique where a small Draft Language Model (DLM) makes `num_speculative_tokens` autoregressive speculations ahead of the Target Language Model (TLM). The objective is to predict what the TLM would have predicted if it would have been used instead of the DLM. This approach is beneficial when the autoregressive decode phase of the TLM is memory bound and thus, we can leverage the extra computing resources of our hardware by batching the speculations of the DLM as an input to TLM to validate the speculations. - -To export and compile both DLM/TLM, add corresponding `qaic_config` and `num_speculative_tokens` for TLM and export DLM as you would any other QEfficient LLM model: - -```Python -tlm_name = "meta-llama/Llama-2-70b-chat-hf" -dlm_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" -k = 3 # DLM will make `k` speculations -qaic_config = dict(speculative_model_type="target") -tlm = AutoModelForCausalLM.from_pretrained(tlm_name, qaic_config=qaic_config) -dlm = AutoModelForCausalLM.from_pretrained(dlm_name) -tlm.compile(num_speculative_tokens=k) -dlm.compile() -``` - -The `qaic_config` dictionary is fed during the instantiation of the model because slight changes to the ONNX graph are required. Once complete, the user can specify `num_speculative_tokens` to define the actual number of speculations that the TLM will take as input during the decode phase. As for the DLM, no new changes are required at the ONNX or compile level. 
diff --git a/docs/source/release_docs.md b/docs/source/release_docs.md index 0d9b9c3ef..c78e941ba 100644 --- a/docs/source/release_docs.md +++ b/docs/source/release_docs.md @@ -1,4 +1,4 @@ -# 🚀 Efficient Transformer Library - Release 1.20.0 (Beta) +# Efficient Transformer Library - Release 1.20.0 (Beta) Welcome to the official release of **Efficient Transformer Library v1.20.0**! This release brings a host of new model integrations, performance enhancements, and fine-tuning capabilities to accelerate your AI development. @@ -6,7 +6,7 @@ Welcome to the official release of **Efficient Transformer Library v1.20.0**! Th --- -## 🧠 Newly Supported Models +## Newly Supported Models - **Llama-4-Scout-17B-16E-Instruct** - Text & Image+Text support @@ -36,7 +36,7 @@ Welcome to the official release of **Efficient Transformer Library v1.20.0**! Th --- -## ✨ Key Features & Enhancements +## Key Features & Enhancements - **Transformer Upgrade**: Now using version `4.51.3` - **SpD & Multi-Projection Heads**: Token speculation via post-attention projections @@ -46,7 +46,7 @@ Welcome to the official release of **Efficient Transformer Library v1.20.0**! Th --- -## 🔍 Embedding Model Upgrades +## Embedding Model Upgrades - **Flexible Pooling**: Choose from standard or custom strategies - **Sentence Embedding**: Now runs directly on AI100 @@ -54,9 +54,7 @@ Welcome to the official release of **Efficient Transformer Library v1.20.0**! Th --- -## 🛠️ Fine-Tuning Support +## Fine-Tuning Support - BERT fine-tuning support with templates and documentation - Gradient checkpointing, device-aware `GradScaler`, and CLI `--help` added - ---- \ No newline at end of file diff --git a/docs/source/supported_features.rst b/docs/source/supported_features.rst new file mode 100644 index 000000000..a49e9186b --- /dev/null +++ b/docs/source/supported_features.rst @@ -0,0 +1,60 @@ +Supported Features +=================== +.. list-table:: + :header-rows: 1 + :widths: 30 70 + + * - Feature + - Impact + * - Context Length Specializations (upcoming) + - Increases the maximum context length that models can handle, allowing for better performance on tasks requiring long sequences of text. + * - Block Attention (in progress) + - Reduces inference latency and computational cost by dividing context into blocks and reusing key-value states, particularly useful in RAG. + * - Sentence embedding, Flexible Pooling configuration and compilation with multiple sequence lengths + - Supports standard/custom pooling with AI 100 acceleration and sentence embedding. Enables efficient sentence embeddings via Efficient-Transformers. Compile with one or multiple seq_len; optimal graph auto-selected at runtime. Refer `sample script `_ for more **details**. + * - `SpD, multiprojection heads `_ + - Implemented post-attention hidden size projections to speculate tokens ahead of the base model. Refer `sample script `_ for more **details**. + * - `QNN Compilation support `_ + - Enabled for AutoModel classes QNN compilation capabilities for multi-models, embedding models and causal models. + * - `Disaggregated serving `_ + - It support for separate prefill and decode compilation for encoder (vision) and language models. + * - `GGUF model execution `_ + - Supported GGUF model execution (without quantized weights). Refer `sample script `_ for more **details**. + * - Replication of KV + - Enabled FP8 model support on `replicate_kv_heads script `_. 
+   * - `gradient checkpointing `_
+     - Supports gradient checkpointing in the finetuning script
+   * - Swift KV `Snowflake/Llama-3.1-SwiftKV-8B-Instruct `_
+     - Reduces computational overhead during inference by optimizing key-value pair processing, leading to improved throughput. Supports both `continuous and non-continuous batching execution `_ in SwiftKV.
+   * - :ref:`Vision Language Model `
+     - Provides support for the AutoModelForImageTextToText class from the transformers library, enabling advanced vision-language tasks. Refer to the `sample script `_ for more **details**.
+   * - :ref:`Speech Sequence to Sequence Model `
+     - Provides support for the QEFFAutoModelForSpeechSeq2Seq class, facilitating speech-to-text sequence models. Refer to the `sample script `_ for more **details**.
+   * - Support for FP8 Execution
+     - Enables execution with FP8 precision, significantly improving performance and reducing memory usage for computational tasks.
+   * - Prefill caching
+     - Enhances inference speed by caching key-value pairs for shared prefixes, reducing redundant computations and improving efficiency.
+   * - Prompt-Lookup Decoding
+     - Speeds up text generation by using overlapping parts of the input prompt and the generated text, making the process faster without losing quality. Refer to the `sample script `_ for more **details**.
+   * - :ref:`PEFT LoRA support `
+     - Enables parameter-efficient fine-tuning using low-rank adaptation techniques, reducing the computational and memory requirements for fine-tuning large models. Refer to the `sample script `_ for more **details**.
+   * - :ref:`QNN support `
+     - Enables compilation using the QNN SDK, making Qeff adaptable for various backends in the future.
+   * - :ref:`Embedding model support `
+     - Facilitates the generation of vector embeddings for retrieval tasks.
+   * - :ref:`Speculative Decoding `
+     - Accelerates text generation by using a draft model to generate preliminary predictions, which are then verified by the target model, reducing latency and improving efficiency. Refer to the `sample script `_ for more **details**.
+   * - :ref:`Finite lorax `
+     - Users can activate multiple LoRA adapters and compile them with the base model. At runtime, they can specify which prompt should use which adapter, enabling mixed adapter usage within the same batch. Refer to the `sample script `_ for more **details**.
+   * - Python and CPP Inferencing API support
+     - Provides flexibility while running inference with Qeff, enabling integration with various applications and improving accessibility for developers. Refer to the `sample script `_ for more **details**.
+   * - :ref:`Continuous batching `
+     - Optimizes throughput and latency by dynamically batching requests, ensuring efficient use of computational resources.
+   * - AWQ and GPTQ support
+     - Supports advanced quantization techniques, improving model efficiency and performance on AI 100.
+   * - Support for serving successive requests in the same session
+     - An API that yields tokens as they are generated, facilitating seamless integration with various applications and enhancing accessibility for developers.
+   * - Perplexity calculation
+     - A script for computing the perplexity of a model, allowing for the evaluation of model performance and comparison across different models and datasets. Refer to the `sample script `_ for more **details**.
+   * - KV Heads Replication Script
+     - A sample script for replicating key-value (KV) heads for the Llama-3-8B-Instruct model, running inference with the original model, replicating KV heads, validating changes, and exporting the modified model to ONNX format. Refer to the `sample script `_ for more **details**.
\ No newline at end of file
diff --git a/docs/source/upgrade.md b/docs/source/upgrade.md
deleted file mode 100644
index 2d44219bb..000000000
--- a/docs/source/upgrade.md
+++ /dev/null
@@ -1,16 +0,0 @@
-
-## Using GitHub Repository
-
-``Warning: Efficient Transformers have been validated to work with the same compatible SDK. Upgrading this may result in certain models becoming incompatible.``
-
-```bash
-# Create Python virtual env and activate it. (Required Python 3.10)
-
-python3.10 -m venv qeff_env
-source qeff_env/bin/activate
-pip install -U pip
-
-# Clone and Install the QEfficient Repo.
-pip install git+https://github.com/quic/efficient-transformers
-
-```
\ No newline at end of file
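As a companion to the "Continuous batching" row in the new supported_features.rst table above, here is a minimal hedged sketch of how continuous batching is typically enabled through the high-level Python API. The `continuous_batching` and `full_batch_size` argument names are assumptions based on the library's documented options and are not taken from this patch; check them against the installed QEfficient version.

```python
# Illustrative sketch only -- argument names (continuous_batching, full_batch_size)
# are assumptions and should be verified against the installed QEfficient version.
from transformers import AutoTokenizer
from QEfficient import QEFFAutoModelForCausalLM

model_name = "gpt2"

# Opt in to continuous batching when loading the model...
qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_name, continuous_batching=True)

# ...and set the full batch size at compile time so multiple requests can be
# interleaved dynamically on the device.
qeff_model.compile(num_cores=16, full_batch_size=4)

# Generate for several prompts; the runtime schedules them as device slots free up.
tokenizer = AutoTokenizer.from_pretrained(model_name)
qeff_model.generate(
    prompts=["My name is", "The sun rises from", "The flat earth theory is the belief that"],
    tokenizer=tokenizer,
)
```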