From f43862b9224d99da8c4ac4a4912ea828f4cdbc5e Mon Sep 17 00:00:00 2001 From: Jason Antic Date: Wed, 12 Oct 2022 21:11:50 -0700 Subject: [PATCH 1/2] Memory efficiency improvements Removing unnecessary additional memory usage by replacing the separate Dreambooth db_pipe assignment with pipe, and then deleting that pipe before running the "Looking inside the pipeline" section. Models in "Looking inside the pipeline" are set to fp16 to further improve memory efficiency. Combined, these changes allow running the notebook from beginning to end on an 11GB 1080 Ti GPU. --- stable_diffusion.ipynb | 72 +++++++++++++++++++++++++----------------- 1 file changed, 43 insertions(+), 29 deletions(-) diff --git a/stable_diffusion.ipynb b/stable_diffusion.ipynb index f7edf67..2d706de 100644 --- a/stable_diffusion.ipynb +++ b/stable_diffusion.ipynb @@ -70,6 +70,7 @@ "from PIL import Image\n", "from fastcore.all import concat\n", "import torch, logging\n", + "from torch import autocast\n", "from pathlib import Path\n", "from huggingface_hub import notebook_login\n", "from diffusers import StableDiffusionPipeline\n", @@ -363,8 +364,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "models--CompVis--stable-diffusion-v1-4\tmodels--google--ddpm-church-256\r\n", - "models--google--ddpm-celebahq-256\r\n" + "models--CompVis--stable-diffusion-v1-4\tmodels--google--ddpm-church-256\n", + "models--google--ddpm-celebahq-256\n" ] } ], @@ -1323,9 +1324,7 @@ { "cell_type": "code", "execution_count": 24, - "metadata": { - "scrolled": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -1384,7 +1383,11 @@ "cell_type": "code", "execution_count": 4, "metadata": { - "collapsed": true + "collapsed": true, + "jupyter": { + "outputs_hidden": true + }, + "tags": [] }, "outputs": [ { @@ -1613,8 +1616,8 @@ } ], "source": [ - "db_pipe = StableDiffusionPipeline.from_pretrained(\"pcuenq/jh_dreambooth_1000\", torch_dtype=torch.float16)\n", - "db_pipe = db_pipe.to(\"cuda\")" + "pipe = StableDiffusionPipeline.from_pretrained(\"pcuenq/jh_dreambooth_1000\", torch_dtype=torch.float16)\n", + "pipe = pipe.to(\"cuda\")" ] }, { @@ -1652,7 +1655,7 @@ "torch.manual_seed(1000)\n", "\n", "prompt = \"Painting of sks person in the style of Paul Signac\"\n", - "images = db_pipe(prompt, num_images_per_prompt=4).images\n", + "images = pipe(prompt, num_images_per_prompt=4).images\n", "image_grid(images, 1, 4)" ] }, @@ -1806,6 +1809,15 @@ "First, we need the text encoder and the tokenizer. These come from the text portion of a standard CLIP model, so we'll use the weights released by Open AI." 
] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "del pipe" + ] + }, { "cell_type": "code", "execution_count": 25, @@ -1814,8 +1826,8 @@ "source": [ "from transformers import CLIPTextModel, CLIPTokenizer\n", "\n", - "tokenizer = CLIPTokenizer.from_pretrained(\"openai/clip-vit-large-patch14\")\n", - "text_encoder = CLIPTextModel.from_pretrained(\"openai/clip-vit-large-patch14\")" + "tokenizer = CLIPTokenizer.from_pretrained(\"openai/clip-vit-large-patch14\", torch_dtype=torch.float16)\n", + "text_encoder = CLIPTextModel.from_pretrained(\"openai/clip-vit-large-patch14\", torch_dtype=torch.float16)" ] }, { @@ -1833,8 +1845,8 @@ "source": [ "from diffusers import AutoencoderKL, UNet2DConditionModel\n", "\n", - "vae = AutoencoderKL.from_pretrained(\"CompVis/stable-diffusion-v1-4\", subfolder=\"vae\")\n", - "unet = UNet2DConditionModel.from_pretrained(\"CompVis/stable-diffusion-v1-4\", subfolder=\"unet\")" + "vae = AutoencoderKL.from_pretrained(\"CompVis/stable-diffusion-v1-4\", subfolder=\"vae\", torch_dtype=torch.float16)\n", + "unet = UNet2DConditionModel.from_pretrained(\"CompVis/stable-diffusion-v1-4\", subfolder=\"unet\", torch_dtype=torch.float16)" ] }, { @@ -2299,20 +2311,22 @@ "source": [ "from tqdm.auto import tqdm\n", "\n", - "for i, t in enumerate(tqdm(scheduler.timesteps)):\n", - " latent_model_input = torch.cat([latents] * 2)\n", - " latent_model_input = scheduler.scale_model_input(latent_model_input, t)\n", + "with autocast('cuda'):\n", + "\n", + " for i, t in enumerate(tqdm(scheduler.timesteps)):\n", + " latent_model_input = torch.cat([latents] * 2)\n", + " latent_model_input = scheduler.scale_model_input(latent_model_input, t)\n", "\n", - " # predict the noise residual\n", - " with torch.no_grad():\n", - " noise_pred = unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample\n", + " # predict the noise residual\n", + " with torch.no_grad():\n", + " noise_pred = unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample\n", "\n", - " # perform guidance\n", - " noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)\n", - " noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)\n", + " # perform guidance\n", + " noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)\n", + " noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)\n", "\n", - " # compute the previous noisy sample x_t -> x_t-1\n", - " latents = scheduler.step(noise_pred, t, latents).prev_sample" + " # compute the previous noisy sample x_t -> x_t-1\n", + " latents = scheduler.step(noise_pred, t, latents).prev_sample" ] }, { @@ -2335,7 +2349,8 @@ "latents = 1 / 0.18215 * latents\n", "\n", "with torch.no_grad():\n", - " image = vae.decode(latents).sample" + " with autocast('cuda'):\n", + " image = vae.decode(latents).sample" ] }, { @@ -2356,8 +2371,7 @@ "height": 529 }, "id": "AAVZStIokTVv", - "outputId": "7af6a1ea-f20a-4445-d756-8bb0dd6a0747", - "scrolled": false + "outputId": "7af6a1ea-f20a-4445-d756-8bb0dd6a0747" }, "outputs": [ { @@ -2405,7 +2419,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.10" + "version": "3.10.6" }, "toc": { "base_numbering": 1, @@ -16198,5 +16212,5 @@ } }, "nbformat": 4, - "nbformat_minor": 1 + "nbformat_minor": 4 } From 2cb2544d9d95313f82260336c55a319b1fff5db1 Mon Sep 17 00:00:00 2001 From: Jason Antic Date: Thu, 13 Oct 2022 09:10:38 -0700 Subject: [PATCH 2/2] Replacing autocast 
calls with conversions to fp16 on model inputs --- stable_diffusion.ipynb | 34 +++++++++++++++------------------- 1 file changed, 15 insertions(+), 19 deletions(-) diff --git a/stable_diffusion.ipynb b/stable_diffusion.ipynb index 2d706de..63226bd 100644 --- a/stable_diffusion.ipynb +++ b/stable_diffusion.ipynb @@ -70,7 +70,6 @@ "from PIL import Image\n", "from fastcore.all import concat\n", "import torch, logging\n", - "from torch import autocast\n", "from pathlib import Path\n", "from huggingface_hub import notebook_login\n", "from diffusers import StableDiffusionPipeline\n", @@ -2071,7 +2070,7 @@ } ], "source": [ - "text_embeddings = text_encoder(text_input.input_ids.to(\"cuda\"))[0]\n", + "text_embeddings = text_encoder(text_input.input_ids.to(\"cuda\"))[0].half()\n", "text_embeddings.shape" ] }, @@ -2105,7 +2104,7 @@ "uncond_input = tokenizer(\n", " [\"\"] * batch_size, padding=\"max_length\", max_length=max_length, return_tensors=\"pt\"\n", ")\n", - "uncond_embeddings = text_encoder(uncond_input.input_ids.to(\"cuda\"))[0]\n", + "uncond_embeddings = text_encoder(uncond_input.input_ids.to(\"cuda\"))[0].half()\n", "uncond_embeddings.shape" ] }, @@ -2159,7 +2158,7 @@ "source": [ "torch.manual_seed(100)\n", "latents = torch.randn((batch_size, unet.in_channels, height // 8, width // 8))\n", - "latents = latents.to(\"cuda\")\n", + "latents = latents.to(\"cuda\").half()\n", "latents.shape" ] }, @@ -2311,22 +2310,20 @@ "source": [ "from tqdm.auto import tqdm\n", "\n", - "with autocast('cuda'):\n", + "for i, t in enumerate(tqdm(scheduler.timesteps)):\n", + " latent_model_input = torch.cat([latents] * 2)\n", + " latent_model_input = scheduler.scale_model_input(latent_model_input, t)\n", "\n", - " for i, t in enumerate(tqdm(scheduler.timesteps)):\n", - " latent_model_input = torch.cat([latents] * 2)\n", - " latent_model_input = scheduler.scale_model_input(latent_model_input, t)\n", + " # predict the noise residual\n", + " with torch.no_grad():\n", + " noise_pred = unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample\n", "\n", - " # predict the noise residual\n", - " with torch.no_grad():\n", - " noise_pred = unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample\n", + " # perform guidance\n", + " noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)\n", + " noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)\n", "\n", - " # perform guidance\n", - " noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)\n", - " noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)\n", - "\n", - " # compute the previous noisy sample x_t -> x_t-1\n", - " latents = scheduler.step(noise_pred, t, latents).prev_sample" + " # compute the previous noisy sample x_t -> x_t-1\n", + " latents = scheduler.step(noise_pred, t, latents).prev_sample" ] }, { @@ -2349,8 +2346,7 @@ "latents = 1 / 0.18215 * latents\n", "\n", "with torch.no_grad():\n", - " with autocast('cuda'):\n", - " image = vae.decode(latents).sample" + " image = vae.decode(latents).sample" ] }, {