Commit 677a9f2

Make ipynb / activate loss
1 parent 2721635 commit 677a9f2
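
This commit pins the deforum/stable-diffusion clone to its conditioning branch and threads cond_fns into the sampler call, activating the conditioning-loss path. It also splits depth estimation out of transform_image_3d into a standalone predict_depth so a frame's depth map can be computed once and reused, adds a diffusion_cadence setting that diffuses only every Nth frame and fills the gaps with warped, cross-faded in-between frames, and caps random seeds at 2**32 - 1 (random.randint is inclusive at both ends, so the old bound could return a value out of range for a 32-bit seed).

The refactored 3D warp call pattern, copied verbatim from the hunks below:

    depth = predict_depth(prev_img_cv2, adabins_helper, midas_model, midas_transform, anim_args)
    prev_img = anim_frame_warp_3d(prev_img_cv2, depth, anim_args, keys, frame_idx)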

2 files changed: +103, -48 lines

Deforum_Stable_Diffusion.ipynb

Lines changed: 99 additions & 45 deletions
@@ -96,7 +96,7 @@
 " all_process = [\n",
 " ['pip', 'install', 'torch==1.12.1+cu113', 'torchvision==0.13.1+cu113', '--extra-index-url', 'https://download.pytorch.org/whl/cu113'],\n",
 " ['pip', 'install', 'omegaconf==2.2.3', 'einops==0.4.1', 'pytorch-lightning==1.7.4', 'torchmetrics==0.9.3', 'torchtext==0.13.1', 'transformers==4.21.2', 'kornia==0.6.7'],\n",
-" ['git', 'clone', 'https://github.com/deforum/stable-diffusion'],\n",
+" ['git', 'clone', '-b', 'conditioning', 'https://github.com/deforum/stable-diffusion'],\n",
 " ['pip', 'install', '-e', 'git+https://github.com/CompVis/taming-transformers.git@master#egg=taming-transformers'],\n",
 " ['pip', 'install', '-e', 'git+https://github.com/openai/CLIP.git@main#egg=clip'],\n",
 " ['pip', 'install', 'accelerate', 'ftfy', 'jsonmerge', 'matplotlib', 'resize-right', 'timm', 'torchdiffeq'],\n",
@@ -200,7 +200,7 @@
 " borderMode=cv2.BORDER_WRAP if anim_args.border == 'wrap' else cv2.BORDER_REPLICATE\n",
 " )\n",
 "\n",
-"def anim_frame_warp_3d(prev_img_cv2, anim_args, keys, frame_idx, adabins_helper, midas_model, midas_transform):\n",
+"def anim_frame_warp_3d(prev_img_cv2, depth, anim_args, keys, frame_idx):\n",
 " TRANSLATION_SCALE = 1.0/200.0 # matches Disco\n",
 " translate_xyz = [\n",
 " -keys.translation_x_series[frame_idx] * TRANSLATION_SCALE, \n",
@@ -213,7 +213,7 @@
 " math.radians(keys.rotation_3d_z_series[frame_idx])\n",
 " ]\n",
 " rot_mat = p3d.euler_angles_to_matrix(torch.tensor(rotate_xyz, device=device), \"XYZ\").unsqueeze(0)\n",
-" result = transform_image_3d(prev_img_cv2, adabins_helper, midas_model, midas_transform, rot_mat, translate_xyz, anim_args)\n",
+" result = transform_image_3d(prev_img_cv2, depth, rot_mat, translate_xyz, anim_args)\n",
 " torch.cuda.empty_cache()\n",
 " return result\n",
 "\n",
@@ -332,7 +332,7 @@
 " denoised_sample = model.differentiable_decode_first_stage(denoised).requires_grad_()\n",
 " loss = loss_fn(denoised_sample, sigma, **kwargs) * scale\n",
 " grad = -torch.autograd.grad(loss, x)[0]\n",
-" \n",
+" verbose_print()\n",
 " verbose_print('Loss:', loss.item())\n",
 " verbose_print(\"Max cond_grad\", torch.max(grad))\n",
 " verbose_print(\"Min cond_grad\", torch.min(grad))\n",
@@ -486,27 +486,24 @@
 " sample = torch.from_numpy(sample)\n",
 " return sample\n",
 "\n",
-"def sample_to_cv2(sample: torch.Tensor) -> np.ndarray:\n",
+"def sample_to_cv2(sample: torch.Tensor, type=np.uint8) -> np.ndarray:\n",
 " sample_f32 = rearrange(sample.squeeze().cpu().numpy(), \"c h w -> h w c\").astype(np.float32)\n",
 " sample_f32 = ((sample_f32 * 0.5) + 0.5).clip(0, 1)\n",
-" sample_int8 = (sample_f32 * 255).astype(np.uint8)\n",
-" return sample_int8\n",
+" sample_int8 = (sample_f32 * 255)\n",
+" return sample_int8.astype(type)\n",
 "\n",
 "@torch.no_grad()\n",
-"def transform_image_3d(prev_img_cv2, adabins_helper, midas_model, midas_transform, rot_mat, translate, anim_args):\n",
-" # adapted and optimized version of transform_image_3d from Disco Diffusion https://github.com/alembics/disco-diffusion \n",
-"\n",
+"def predict_depth(prev_img_cv2, adabins_helper, midas_model, midas_transform, anim_args) -> torch.Tensor:\n",
 " w, h = prev_img_cv2.shape[1], prev_img_cv2.shape[0]\n",
 "\n",
 " # predict depth with AdaBins \n",
 " use_adabins = anim_args.midas_weight < 1.0 and adabins_helper is not None\n",
 " if use_adabins:\n",
-" print(f\"Estimating depth of {w}x{h} image with AdaBins...\")\n",
 " MAX_ADABINS_AREA = 500000\n",
 " MIN_ADABINS_AREA = 448*448\n",
 "\n",
 " # resize image if too large or too small\n",
-" img_pil = Image.fromarray(cv2.cvtColor(prev_img_cv2, cv2.COLOR_RGB2BGR))\n",
+" img_pil = Image.fromarray(cv2.cvtColor(prev_img_cv2.astype(np.uint8), cv2.COLOR_RGB2BGR))\n",
 " image_pil_area = w*h\n",
 " resized = True\n",
 " if image_pil_area > MAX_ADABINS_AREA:\n",
@@ -525,10 +522,10 @@
 " try:\n",
 " _, adabins_depth = adabins_helper.predict_pil(depth_input)\n",
 " if resized:\n",
-" adabins_depth = torchvision.transforms.functional.resize(\n",
+" adabins_depth = TF.resize(\n",
 " torch.from_numpy(adabins_depth), \n",
 " torch.Size([h, w]),\n",
-" interpolation=torchvision.transforms.functional.InterpolationMode.BICUBIC\n",
+" interpolation=TF.InterpolationMode.BICUBIC\n",
 " )\n",
 " adabins_depth = adabins_depth.squeeze()\n",
 " except:\n",
@@ -542,7 +539,6 @@
 " img_midas_input = midas_transform({\"image\": img_midas})[\"image\"]\n",
 "\n",
 " # MiDaS depth estimation implementation\n",
-" print(f\"Estimating depth of {w}x{h} image with MiDaS...\")\n",
 " sample = torch.from_numpy(img_midas_input).float().to(device).unsqueeze(0)\n",
 " if device == torch.device(\"cuda\"):\n",
 " sample = sample.to(memory_format=torch.channels_last) \n",
@@ -571,6 +567,12 @@
 " depth_tensor = torch.from_numpy(depth_map).squeeze().to(device)\n",
 " else:\n",
 " depth_tensor = torch.ones((h, w), device=device)\n",
+" \n",
+" return depth_tensor\n",
+"\n",
+"def transform_image_3d(prev_img_cv2, depth_tensor, rot_mat, translate, anim_args):\n",
+" # adapted and optimized version of transform_image_3d from Disco Diffusion https://github.com/alembics/disco-diffusion \n",
+" w, h = prev_img_cv2.shape[1], prev_img_cv2.shape[0]\n",
 "\n",
 " pixel_aspect = 1.0 # aspect of an individual pixel (so usually 1.0)\n",
 " near, far, fov_deg = anim_args.near_plane, anim_args.far_plane, anim_args.fov\n",
@@ -592,7 +594,7 @@
 " coords_2d = torch.nn.functional.affine_grid(identity_2d_batch, [1,1,h,w], align_corners=False)\n",
 " offset_coords_2d = coords_2d - torch.reshape(offset_xy, (h,w,2)).unsqueeze(0)\n",
 "\n",
-" image_tensor = torchvision.transforms.functional.to_tensor(Image.fromarray(prev_img_cv2)).to(device)\n",
+" image_tensor = rearrange(torch.from_numpy(prev_img_cv2.astype(np.float32)), 'h w c -> c h w').to(device)\n",
 " new_image = torch.nn.functional.grid_sample(\n",
 " image_tensor.add(1/512 - 0.0001).unsqueeze(0), \n",
 " offset_coords_2d, \n",
@@ -601,11 +603,11 @@
 " align_corners=False\n",
 " )\n",
 "\n",
-" # convert back to cv2 style numpy array 0->255 uint8\n",
+" # convert back to cv2 style numpy array\n",
 " result = rearrange(\n",
-" new_image.squeeze().clamp(0,1) * 255.0, \n",
+" new_image.squeeze().clamp(0,255), \n",
 " 'c h w -> h w c'\n",
-" ).cpu().numpy().astype(np.uint8)\n",
+" ).cpu().numpy().astype(prev_img_cv2.dtype)\n",
 " return result\n",
 "\n",
 "def generate(args, return_latent=False, return_sample=False, return_c=False):\n",
@@ -670,7 +672,8 @@
 " mask=mask, \n",
 " init_latent=init_latent,\n",
 " sigmas=k_sigmas,\n",
-" sampler=sampler) \n",
+" sampler=sampler,\n",
+" cond_fns=cond_fns) \n",
 "\n",
 " \n",
 "\n",
@@ -891,8 +894,9 @@
 "\n",
 " #@markdown ####**Coherence:**\n",
 " color_coherence = 'Match Frame 0 LAB' #@param ['None', 'Match Frame 0 HSV', 'Match Frame 0 LAB', 'Match Frame 0 RGB'] {type:'string'}\n",
+" diffusion_cadence = '3' #@param ['1','2','3','4','5','6','7','8'] {type:'string'}\n",
 "\n",
-" #@markdown #### Depth Warping\n",
+" #@markdown ####**3D Depth Warping:**\n",
 " use_depth_warping = True #@param {type:\"boolean\"}\n",
 " midas_weight = 0.3#@param {type:\"number\"}\n",
 " near_plane = 200\n",
@@ -1017,7 +1021,7 @@
 },
 "source": [
 "def DeforumArgs():\n",
-" \n",
+"\n",
 " #@markdown **Image Settings**\n",
 " W = 512 #@param\n",
 " H = 512 #@param\n",
@@ -1045,7 +1049,7 @@
 " make_grid = False #@param {type:\"boolean\"}\n",
 " grid_rows = 2 #@param \n",
 " outdir = get_output_folder(output_path, batch_name)\n",
-" \n",
+"\n",
 " #@markdown **Init Settings**\n",
 " use_init = False #@param {type:\"boolean\"}\n",
 " strength = 0.0 #@param {type:\"number\"}\n",
@@ -1082,7 +1086,7 @@
 " elif args.seed_behavior == 'fixed':\n",
 " pass # always keep seed the same\n",
 " else:\n",
-" args.seed = random.randint(0, 2**32)\n",
+" args.seed = random.randint(0, 2**32 - 1)\n",
 " return args.seed\n",
 "\n",
 "def render_image_batch(args):\n",
@@ -1126,7 +1130,7 @@
 " args.prompt = prompt\n",
 " print(f\"Prompt {iprompt+1} of {len(prompts)}\")\n",
 " print(f\"{args.prompt}\")\n",
-" \n",
+"\n",
 " all_images = []\n",
 "\n",
 " for batch_index in range(args.n_batch):\n",
@@ -1208,29 +1212,75 @@
 " else:\n",
 " adabins_helper, midas_model, midas_transform = None, None, None\n",
 "\n",
-" args.n_samples = 1\n",
+" # state for interpolating between diffusion steps\n",
+" turbo_steps = 1 if using_vid_init else int(anim_args.diffusion_cadence)\n",
+" turbo_prev_image, turbo_prev_frame_idx = None, 0\n",
+" turbo_next_image, turbo_next_frame_idx = None, 0\n",
+"\n",
+" # resume animation\n",
 " prev_sample = None\n",
 " color_match_sample = None\n",
-" for frame_idx in range(start_frame,anim_args.max_frames):\n",
+" if anim_args.resume_from_timestring:\n",
+" last_frame = start_frame-1\n",
+" if turbo_steps > 1:\n",
+" last_frame -= last_frame%turbo_steps\n",
+" path = os.path.join(args.outdir,f\"{args.timestring}_{last_frame:05}.png\")\n",
+" img = cv2.imread(path)\n",
+" img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)\n",
+" prev_sample = sample_from_cv2(img)\n",
+" if anim_args.color_coherence != 'None':\n",
+" color_match_sample = img\n",
+" if turbo_steps > 1:\n",
+" turbo_next_image, turbo_next_frame_idx = sample_to_cv2(prev_sample, type=np.float32), last_frame\n",
+" turbo_prev_image, turbo_prev_frame_idx = turbo_next_image, turbo_next_frame_idx\n",
+" start_frame = last_frame+turbo_steps\n",
+"\n",
+" args.n_samples = 1\n",
+" frame_idx = start_frame\n",
+" while frame_idx < anim_args.max_frames:\n",
 " print(f\"Rendering animation frame {frame_idx} of {anim_args.max_frames}\")\n",
 " noise = keys.noise_schedule_series[frame_idx]\n",
 " strength = keys.strength_schedule_series[frame_idx]\n",
 " contrast = keys.contrast_schedule_series[frame_idx]\n",
 " \n",
-" # resume animation\n",
-" if anim_args.resume_from_timestring:\n",
-" path = os.path.join(args.outdir,f\"{args.timestring}_{frame_idx-1:05}.png\")\n",
-" img = cv2.imread(path)\n",
-" img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)\n",
-" prev_sample = sample_from_cv2(img)\n",
+" # emit in-between frames\n",
+" if turbo_steps > 1:\n",
+" tween_frame_start_idx = max(0, frame_idx-turbo_steps)\n",
+" for tween_frame_idx in range(tween_frame_start_idx, frame_idx):\n",
+" tween = float(tween_frame_idx - tween_frame_start_idx + 1) / float(frame_idx - tween_frame_start_idx)\n",
+" print(f\" creating in between frame {tween_frame_idx} tween:{tween:0.2f}\")\n",
+" if anim_args.animation_mode == '2D':\n",
+" if turbo_prev_image is not None and tween_frame_idx > turbo_prev_frame_idx:\n",
+" turbo_prev_image = anim_frame_warp_2d(turbo_prev_image, args, anim_args, keys, tween_frame_idx)\n",
+" if tween_frame_idx > turbo_next_frame_idx:\n",
+" turbo_next_image = anim_frame_warp_2d(turbo_next_image, args, anim_args, keys, tween_frame_idx)\n",
+" else: # '3D'\n",
+" if turbo_prev_image is not None and tween_frame_idx > turbo_prev_frame_idx:\n",
+" prev_depth = predict_depth(turbo_prev_image, adabins_helper, midas_model, midas_transform, anim_args)\n",
+" turbo_prev_image = anim_frame_warp_3d(turbo_prev_image, prev_depth, anim_args, keys, tween_frame_idx)\n",
+" if tween_frame_idx > turbo_next_frame_idx:\n",
+" next_depth = predict_depth(turbo_next_image, adabins_helper, midas_model, midas_transform, anim_args)\n",
+" turbo_next_image = anim_frame_warp_3d(turbo_next_image, next_depth, anim_args, keys, tween_frame_idx)\n",
+" turbo_prev_frame_idx = turbo_next_frame_idx = tween_frame_idx\n",
+"\n",
+" if turbo_prev_image is not None and tween < 1.0:\n",
+" img = turbo_prev_image*(1.0-tween) + turbo_next_image*tween\n",
+" else:\n",
+" img = turbo_next_image\n",
+"\n",
+" filename = f\"{args.timestring}_{tween_frame_idx:05}.png\"\n",
+" cv2.imwrite(os.path.join(args.outdir, filename), cv2.cvtColor(img.astype(np.uint8), cv2.COLOR_RGB2BGR))\n",
+" if turbo_next_image is not None:\n",
+" prev_sample = sample_from_cv2(turbo_next_image)\n",
 "\n",
 " # apply transforms to previous frame\n",
 " if prev_sample is not None:\n",
-"\n",
 " if anim_args.animation_mode == '2D':\n",
 " prev_img = anim_frame_warp_2d(sample_to_cv2(prev_sample), args, anim_args, keys, frame_idx)\n",
 " else: # '3D'\n",
-" prev_img = anim_frame_warp_3d(sample_to_cv2(prev_sample), anim_args, keys, frame_idx, adabins_helper, midas_model, midas_transform)\n",
+" prev_img_cv2 = sample_to_cv2(prev_sample)\n",
+" depth = predict_depth(prev_img_cv2, adabins_helper, midas_model, midas_transform, anim_args)\n",
+" prev_img = anim_frame_warp_3d(prev_img_cv2, depth, anim_args, keys, frame_idx)\n",
 "\n",
 " # apply color matching\n",
 " if anim_args.color_coherence != 'None':\n",
@@ -1246,7 +1296,6 @@
 "\n",
 " # use transformed previous frame as init for current\n",
 " args.use_init = True\n",
-" #args.init_sample = noised_sample.half().to(device)\n",
 " if half_precision:\n",
 " args.init_sample = noised_sample.half().to(device)\n",
 " else:\n",
@@ -1264,14 +1313,19 @@
12641313
" args.init_image = init_frame\n",
12651314
"\n",
12661315
" # sample the diffusion model\n",
1267-
" results = generate(args, return_latent=False, return_sample=True)\n",
1268-
" sample, image = results[0], results[1]\n",
1269-
" \n",
1270-
" filename = f\"{args.timestring}_{frame_idx:05}.png\"\n",
1271-
" image.save(os.path.join(args.outdir, filename))\n",
1316+
" sample, image = generate(args, return_latent=False, return_sample=True)\n",
12721317
" if not using_vid_init:\n",
12731318
" prev_sample = sample\n",
1274-
" \n",
1319+
"\n",
1320+
" if turbo_steps > 1:\n",
1321+
" turbo_prev_image, turbo_prev_frame_idx = turbo_next_image, turbo_next_frame_idx\n",
1322+
" turbo_next_image, turbo_next_frame_idx = sample_to_cv2(sample, type=np.float32), frame_idx\n",
1323+
" frame_idx += turbo_steps\n",
1324+
" else: \n",
1325+
" filename = f\"{args.timestring}_{frame_idx:05}.png\"\n",
1326+
" image.save(os.path.join(args.outdir, filename))\n",
1327+
" frame_idx += 1\n",
1328+
"\n",
12751329
" display.clear_output(wait=True)\n",
12761330
" display.display(image)\n",
12771331
"\n",
@@ -1412,7 +1466,7 @@
 "args.strength = max(0.0, min(1.0, args.strength))\n",
 "\n",
 "if args.seed == -1:\n",
-" args.seed = random.randint(0, 2**32)\n",
+" args.seed = random.randint(0, 2**32 - 1)\n",
 "if not args.use_init:\n",
 " args.init_image = None\n",
 "if args.sampler == 'plms' and (args.use_init or anim_args.animation_mode != 'None'):\n",
@@ -1455,10 +1509,10 @@
 "id": "no2jP8HTMBM0"
 },
 "source": [
-"skip_video_for_run_all = False #@param {type: 'boolean'}\n",
+"skip_video_for_run_all = True #@param {type: 'boolean'}\n",
 "fps = 12 #@param {type:\"number\"}\n",
 "#@markdown **Manual Settings**\n",
-"use_manual_settings = True #@param {type:\"boolean\"}\n",
+"use_manual_settings = False #@param {type:\"boolean\"}\n",
 "image_path = \"/content/drive/MyDrive/AI/StableDiffusion/2022-09/20220903000939_%05d.png\" #@param {type:\"string\"}\n",
 "mp4_path = \"/content/drive/MyDrive/AI/StableDiffusion/2022-09/20220903000939.mp4\" #@param {type:\"string\"}\n",
 "\n",

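Note on the diffusion_cadence path above: with cadence N, only every Nth frame is sampled from the diffusion model; each in-between frame warps the two surrounding keyframes forward and cross-fades them with tween = (i - start + 1) / (frame_idx - start), so the last in-between equals the warped next keyframe. A minimal, self-contained Python sketch of that blend (blend_inbetweens is a hypothetical helper, not code from this commit), assuming float32 frames as produced by sample_to_cv2(..., type=np.float32):

    import numpy as np

    def blend_inbetweens(prev_img: np.ndarray, next_img: np.ndarray, cadence: int):
        # tween runs 1/cadence, 2/cadence, ..., 1.0, so the final frame
        # yielded equals the warped next keyframe, as in the loop above.
        for i in range(1, cadence + 1):
            tween = i / cadence
            yield prev_img * (1.0 - tween) + next_img * tween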