From b1a4d2e63dea832eaf72397039643ebcd32b6de1 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Mon, 16 Feb 2026 18:03:15 +0000 Subject: [PATCH 1/3] adding tutorials from pruna pro --- docs/tutorials/computer_vision.ipynb | 177 +++++++++++++++++++++++ docs/tutorials/index.rst | 18 +++ docs/tutorials/recovery.ipynb | 205 +++++++++++++++++++++++++++ docs/tutorials/ring_attn.ipynb | 176 +++++++++++++++++++++++ 4 files changed, 576 insertions(+) create mode 100644 docs/tutorials/computer_vision.ipynb create mode 100644 docs/tutorials/recovery.ipynb create mode 100644 docs/tutorials/ring_attn.ipynb diff --git a/docs/tutorials/computer_vision.ipynb b/docs/tutorials/computer_vision.ipynb new file mode 100644 index 00000000..0e43690b --- /dev/null +++ b/docs/tutorials/computer_vision.ipynb @@ -0,0 +1,177 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Blazingly fast Computer Vision Models" + ] + }, + { + "cell_type": "raw", + "metadata": { + "vscode": { + "languageId": "raw" + } + }, + "source": [ + "\n", + " \"Open\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This tutorial demonstrates how to use the `pruna` package to optimize any custom computer vision model. We will use the `vit_b_16` model as an example. Any execution times given below are measured on a T4 GPU." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1. Loading the CV Model\n", + "\n", + "First, load your ViT model.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import torchvision\n", + "\n", + "model = torchvision.models.vit_b_16(weights=\"ViT_B_16_Weights.DEFAULT\").cuda()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2. Initializing the Smash Config\n", + "\n", + "Next, initialize the smash_config." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pruna import SmashConfig\n", + "\n", + "# Initialize the SmashConfig\n", + "smash_config = SmashConfig([\"x_fast\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3. Smashing the Model\n", + "\n", + "Now, you can smash the model, which will take around 5 seconds." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pruna import smash\n", + "\n", + "# Smash the model\n", + "smashed_model = smash(\n", + " model=model,\n", + " smash_config=smash_config,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 4. Preparing the Input" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "from torchvision import transforms\n", + "\n", + "# Generating a random image\n", + "image = np.random.randint(0, 256, size=(224, 224, 3), dtype=np.uint8)\n", + "input_tensor = transforms.ToTensor()(image).unsqueeze(0).cuda()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 5. Running the Model\n", + "\n", + "After the model has been compiled, we run inference for a few iterations as warm-up. This will take around 8 seconds." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# run some warm-up iterations\n", + "for _ in range(5):\n", + " smashed_model(input_tensor)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, run the model to transcribe the audio file with accelerated inference." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Display the result\n", + "smashed_model(input_tensor)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Wrap Up\n", + "\n", + "Congratulations! You have successfully smashed a CV model. You can now use the `pruna` package to optimize any custom CV model. The only parts that you should modify are step 1, 4 and 5 to fit your use case" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "pruna", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.10.15" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git a/docs/tutorials/index.rst b/docs/tutorials/index.rst index 68873a6e..02d35834 100644 --- a/docs/tutorials/index.rst +++ b/docs/tutorials/index.rst @@ -87,6 +87,24 @@ These tutorials will guide you through the process of using |pruna| to optimize Learn how to use the ``target_modules`` parameter to target specific modules in your model. + .. grid-item-card:: Blazingly Fast Computer Vision + :text-align: center + :link: ./computer_vision.ipynb + + Optimize any ``computer vision`` model with ``x_fast`` ``compilation``. + + .. grid-item-card:: Recover Quality after Quantization + :text-align: center + :link: ./recovery.ipynb + + Recover quality using ``text_to_image_perp`` after ``diffusers_int8`` ``quantization``. + + .. grid-item-card:: Distribute across GPUs with Ring Attention + :text-align: center + :link: ./ring_attn.ipynb + + Distribute your ``Flux`` model across multiple GPUs with ``ring_attn`` and ``torch_compile``. + .. toctree:: :hidden: :maxdepth: 1 diff --git a/docs/tutorials/recovery.ipynb b/docs/tutorials/recovery.ipynb new file mode 100644 index 00000000..2bfcf6aa --- /dev/null +++ b/docs/tutorials/recovery.ipynb @@ -0,0 +1,205 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Recovering Quality after Quantizing Models to 4 Bits" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + " \"Open\n", + "" + ] + }, + { + "cell_type": "raw", + "metadata": { + "raw_mimetype": "text/restructuredtext", + "vscode": { + "languageId": "raw" + } + }, + "source": [ + "\n", + "\n", + "This tutorial demonstrates how to use the ``pruna`` package to use our experimental \"recovery\" feature to recover the model quality after quantization. This option allows you to push quantization or other compression techniques to the limit without compromising quality.\n", + "\n", + "We will use :doc:`PERP ` on the Sana model as an example, but you can also use Stable Diffusion and Flux models depending on your device. Any execution times given below are measured on a L40S GPU.\n", + "\n", + "Note that recovery is available in the ``pruna`` package." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1. Loading the Sana Model\n", + "\n", + "First, load the Sana model, and generate an image for quality reference." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "from diffusers import SanaPipeline\n", + "\n", + "pipe = SanaPipeline.from_pretrained(\n", + " \"Efficient-Large-Model/Sana_1600M_1024px_BF16_diffusers\",\n", + " torch_dtype=torch.bfloat16,\n", + ").to(\"cuda\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We generate an image to have a reference for quality." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "prompt = \"A crow walking along a river near a foggy cliff, with cute yellow ducklings following it in a line, at sunset.\"\n", + "pipe(prompt).images[0]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2. Initializing the SmashConfig" + ] + }, + { + "cell_type": "raw", + "metadata": { + "raw_mimetype": "text/restructuredtext", + "vscode": { + "languageId": "raw" + } + }, + "source": [ + "Next, initialize the SmashConfig. We'll use :doc:`bitsandbytes' quantization ` to 4 bits, and recover quality by finetuning with PERP on a text-to-image dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pruna import SmashConfig\n", + "\n", + "smash_config = SmashConfig({\n", + " # Quantize the model to 4-bits\n", + " \"diffusers_int8\": {\n", + " \"weight_bits\": 4\n", + " },\n", + " # Recover, allowing you to push quantization to lower bit rates without compromising quality\n", + " \"text_to_image_perp\": {\n", + " # you can increase or reduce 'batch_size' depending on your GPU, or use 'gradient_accumulation_steps' with it\n", + " \"batch_size\": 8,\n", + " \"num_epochs\": 4,\n", + " \"validate_every_n_epoch\": 0.5 # run validation every half epoch\n", + " }\n", + "})\n", + "# Attach a text-to-image dataset, used for recovery\n", + "smash_config.add_data(\"COCO\")\n", + "smash_config.data.limit_datasets((256, 64, 1)) # training on 256 samples and validating on 64" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3. Smashing the Model\n", + "\n", + "Now, smash the model. This takes about 9 minutes on an L40S GPU, but it depends on how many samples are used for recovery.\n", + "Recovery logging is handled though __Weights & Biases__, make sure you have it installed and set up in your environment." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pruna import smash\n", + "\n", + "smashed_model = smash(\n", + " model=pipe,\n", + " smash_config=smash_config,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 4. Running the Model\n", + "Finally, we run the model which has been quantized and recovered. It has a lower memory footprint than the original because of the quantization." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "smashed_model(prompt).images[0]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Wrap up" + ] + }, + { + "cell_type": "raw", + "metadata": { + "raw_mimetype": "text/restructuredtext", + "vscode": { + "languageId": "raw" + } + }, + "source": [ + "Congratulations! You have successfully recovered quality on your compressed Sana model. You can now use the ``pruna`` package to its limit by using aggressive compression alongside recovery. 
{ "cell_type": "markdown", "metadata": {}, "source": [ "### Wrap up" ] },
{ "cell_type": "raw", "metadata": { "raw_mimetype": "text/restructuredtext", "vscode": { "languageId": "raw" } }, "source": [ "Congratulations! You have successfully recovered quality on your compressed Sana model. You can now push the ``pruna`` package to its limit by using aggressive compression alongside recovery. The only parts you should modify are steps 1 and 4 to fit your use case." ] } ], "metadata": { "kernelspec": { "display_name": "prunatree", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.16" } }, "nbformat": 4, "nbformat_minor": 2 } \ No newline at end of file
diff --git a/docs/tutorials/ring_attn.ipynb b/docs/tutorials/ring_attn.ipynb new file mode 100644 index 00000000..e31ad05d --- /dev/null +++ b/docs/tutorials/ring_attn.ipynb @@ -0,0 +1,176 @@ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Distributing Flux on Multiple GPUs" ] },
{ "cell_type": "raw", "metadata": { "vscode": { "languageId": "raw" } }, "source": [ "\n", " \"Open\n", "" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "In this tutorial, we will walk you through how to use the `pruna` package to optimize your Flux model for faster inference on multiple GPUs. Any execution times below are measured on a set of two H100 PCIe GPUs.\n", "Note that the `pruna` distributers are also compatible with `torchrun`; simply convert this tutorial to a script and run it with `torchrun --nproc_per_node=2 flux_tutorial.py`." ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "### 1. Loading the Flux Model\n", "\n", "First, load your Flux model." ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import torch\n", "from diffusers import FluxPipeline\n", "\n", "pipe = FluxPipeline.from_pretrained(\"black-forest-labs/FLUX.1-dev\", torch_dtype=torch.bfloat16)\n", "pipe.to(\"cuda\")" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "### 2. Initializing the Smash Config\n", "\n", "Next, initialize the `smash_config`. For this tutorial, we will select our `ring_attn` distributer and `torch_compile`. If this is not enough for you, you can also activate further compatible algorithms, e.g. a quantizer, factorizer, or pruner, as shown in the commented-out line below!" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from pruna import SmashConfig, smash\n", "\n", "# Initialize the SmashConfig and configure the algorithms\n", "smash_config = SmashConfig([\"ring_attn\", \"torch_compile\"])\n", "# Additionally configure suitable hyperparameters\n", "smash_config.add({\n", " \"torch_compile_target\": \"module_list\"\n", "})\n", "\n", "# You can choose to activate further algorithms compatible with the ring_attn distributer!\n", "# smash_config.add([\"qkv_diffusers\", \"fp8\", \"padding_pruning\"])" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "### 3. Smashing the Model\n", "\n", "Now, you can smash the model, which can take up to one minute." ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "pipe = smash(\n", " model=pipe,\n", " smash_config=smash_config,\n", ")" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "### 4. Running the Model\n", "\n", "After the model has been distributed and compiled, we run inference for a few iterations as warm-up. The initial inference time of 10.4 seconds has now been reduced to around 2.7 seconds!" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "prompt = (\n", " \"An anime illustration of Sydney Opera House sitting next to Eiffel tower, under a blue night sky of \"\n", " \"roiling energy, exploding yellow stars, and radiating swirls of blue.\"\n", ")\n", "\n", "for _ in range(5):\n", " output = pipe(prompt, num_inference_steps=50).images[0]\n", "output" ] },
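{ "cell_type": "markdown", "metadata": {}, "source": [ "If you want to verify the speed-up on your own hardware, you can time a single generation after the warm-up. This is a minimal sketch; the exact numbers depend on your GPUs and configuration." ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import time\n", "\n", "# Time a single generation after warm-up (exact numbers vary with hardware and configuration)\n", "start = time.perf_counter()\n", "pipe(prompt, num_inference_steps=50).images[0]\n", "print(f\"Inference time: {time.perf_counter() - start:.1f} s\")" ] },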
{ "cell_type": "markdown", "metadata": {}, "source": [ "### 5. Clean-Up\n", "\n", "To properly clean up the distributed model, make sure to call the `destroy` method." ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "pipe.destroy()\n" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "### Wrap Up\n", "\n", "Congratulations! You have successfully distributed a Flux model on multiple GPUs and combined it with other `pruna` algorithms. It is that easy." ] } ], "metadata": { "kernelspec": { "display_name": "pruna", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.16" } }, "nbformat": 4, "nbformat_minor": 2 } \ No newline at end of file
From feaf89f47bac05a41123f03a4eb51b0bb2f34644 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Tue, 17 Feb 2026 17:44:38 +0000 Subject: [PATCH 2/3] small changes to ring_attn tutorial --- docs/tutorials/ring_attn.ipynb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/tutorials/ring_attn.ipynb b/docs/tutorials/ring_attn.ipynb index e31ad05d..cc2ffa1d 100644 --- a/docs/tutorials/ring_attn.ipynb +++ b/docs/tutorials/ring_attn.ipynb @@ -75,7 +75,7 @@ "})\n", "\n", "# You can choose to activate further algorithms compatible with the ring_attn distributer!\n", - "# smash_config.add([\"qkv_diffusers\", \"fp8\", \"padding_pruning\"])" + "# smash_config.add([\"qkv_diffusers\", \"padding_pruning\"])" ] }, { @@ -173,4 +173,4 @@ }, "nbformat": 4, "nbformat_minor": 2 -} \ No newline at end of file +}
From c51c063847957c8a0291f8cb2b5484d2ef6c8c03 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 19 Feb 2026 14:15:25 +0000 Subject: [PATCH 3/3] make tutorials toctree explicit --- docs/tutorials/index.rst | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/docs/tutorials/index.rst b/docs/tutorials/index.rst index 02d35834..11f0da6a 100644 --- a/docs/tutorials/index.rst +++ b/docs/tutorials/index.rst @@ -109,6 +109,22 @@ These tutorials will guide you through the process of using |pruna| to optimize
    :hidden:
    :maxdepth: 1
    :caption: Pruna
-   :glob:
-
-   ./*
+
+   image_generation
+   video_generation
+   llms
+   reasoning_llm
+   asr_tutorial
+   cv_cpu
+   diffusion_quantization_acceleration
+   evaluation_agent_cmmd
+   sana_diffusers_int8
+   flux2klein4b_tutorial
+   sd_deepcache
+   deploying_sana_tutorial
+   target_modules_quanto
+   portable_compilation
+   llm_quantization_compilation_acceleration
+   computer_vision
+   recovery
+   ring_attn