|
96 | 96 | " all_process = [\n", |
97 | 97 | " ['pip', 'install', 'torch==1.12.1+cu113', 'torchvision==0.13.1+cu113', '--extra-index-url', 'https://download.pytorch.org/whl/cu113'],\n", |
98 | 98 | " ['pip', 'install', 'omegaconf==2.2.3', 'einops==0.4.1', 'pytorch-lightning==1.7.4', 'torchmetrics==0.9.3', 'torchtext==0.13.1', 'transformers==4.21.2', 'kornia==0.6.7'],\n", |
99 | | - " ['git', 'clone', 'https://github.com/deforum/stable-diffusion'],\n", |
| 99 | + " ['git', 'clone', '-b', 'conditioning', 'https://github.com/deforum/stable-diffusion'],\n", |
100 | 100 | " ['pip', 'install', '-e', 'git+https://github.com/CompVis/taming-transformers.git@master#egg=taming-transformers'],\n", |
101 | 101 | " ['pip', 'install', '-e', 'git+https://github.com/openai/CLIP.git@main#egg=clip'],\n", |
102 | 102 | " ['pip', 'install', 'accelerate', 'ftfy', 'jsonmerge', 'matplotlib', 'resize-right', 'timm', 'torchdiffeq'],\n", |
|
200 | 200 | " borderMode=cv2.BORDER_WRAP if anim_args.border == 'wrap' else cv2.BORDER_REPLICATE\n", |
201 | 201 | " )\n", |
202 | 202 | "\n", |
203 | | - "def anim_frame_warp_3d(prev_img_cv2, anim_args, keys, frame_idx, adabins_helper, midas_model, midas_transform):\n", |
| 203 | + "def anim_frame_warp_3d(prev_img_cv2, depth, anim_args, keys, frame_idx):\n", |
204 | 204 | " TRANSLATION_SCALE = 1.0/200.0 # matches Disco\n", |
205 | 205 | " translate_xyz = [\n", |
206 | 206 | " -keys.translation_x_series[frame_idx] * TRANSLATION_SCALE, \n", |
|
213 | 213 | " math.radians(keys.rotation_3d_z_series[frame_idx])\n", |
214 | 214 | " ]\n", |
215 | 215 | " rot_mat = p3d.euler_angles_to_matrix(torch.tensor(rotate_xyz, device=device), \"XYZ\").unsqueeze(0)\n", |
216 | | - " result = transform_image_3d(prev_img_cv2, adabins_helper, midas_model, midas_transform, rot_mat, translate_xyz, anim_args)\n", |
| 216 | + " result = transform_image_3d(prev_img_cv2, depth, rot_mat, translate_xyz, anim_args)\n", |
217 | 217 | " torch.cuda.empty_cache()\n", |
218 | 218 | " return result\n", |
219 | 219 | "\n", |
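The hunk above changes the 3D warp so it no longer runs depth estimation itself: the caller estimates depth once and passes the tensor in, which lets one depth map be reused when several frames are warped from the same diffused image. A minimal sketch of the new call pattern, using the notebook's own function names and assuming `prev_img_cv2` is an RGB numpy frame and the depth models were loaded as elsewhere in the notebook:

```python
# Hedged sketch of the refactored call pattern: depth is estimated once,
# then handed to the purely geometric warp.
depth = predict_depth(prev_img_cv2, adabins_helper, midas_model, midas_transform, anim_args)
warped = anim_frame_warp_3d(prev_img_cv2, depth, anim_args, keys, frame_idx)
```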
|
332 | 332 | " denoised_sample = model.differentiable_decode_first_stage(denoised).requires_grad_()\n", |
333 | 333 | " loss = loss_fn(denoised_sample, sigma, **kwargs) * scale\n", |
334 | 334 | " grad = -torch.autograd.grad(loss, x)[0]\n", |
335 | | - " \n", |
| 335 | + " verbose_print()\n", |
336 | 336 | " verbose_print('Loss:', loss.item())\n", |
337 | 337 | " verbose_print(\"Max cond_grad\", torch.max(grad))\n", |
338 | 338 | " verbose_print(\"Min cond_grad\", torch.min(grad))\n", |
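This hunk shows the gradient-conditioning path: the denoised latents are decoded with gradients enabled, a loss is evaluated on the decoded sample, and its negative gradient with respect to `x` is used to steer sampling. The notebook does not show `loss_fn` here, so the following is only a hypothetical example of a function with the signature used above (decoded sample roughly in [-1, 1], plus the current `sigma`):

```python
import torch

# Hypothetical conditioning loss: pull the mean pixel value toward a target brightness.
# Any scalar-valued, differentiable function of denoised_sample would fit this slot.
def brightness_loss(denoised_sample: torch.Tensor, sigma, target: float = 0.0, **kwargs):
    return (denoised_sample.mean() - target) ** 2
```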
|
486 | 486 | " sample = torch.from_numpy(sample)\n", |
487 | 487 | " return sample\n", |
488 | 488 | "\n", |
489 | | - "def sample_to_cv2(sample: torch.Tensor) -> np.ndarray:\n", |
| 489 | + "def sample_to_cv2(sample: torch.Tensor, type=np.uint8) -> np.ndarray:\n", |
490 | 490 | " sample_f32 = rearrange(sample.squeeze().cpu().numpy(), \"c h w -> h w c\").astype(np.float32)\n", |
491 | 491 | " sample_f32 = ((sample_f32 * 0.5) + 0.5).clip(0, 1)\n", |
492 | | - " sample_int8 = (sample_f32 * 255).astype(np.uint8)\n", |
493 | | - " return sample_int8\n", |
| 492 | + " sample_int8 = (sample_f32 * 255)\n", |
| 493 | + " return sample_int8.astype(type)\n", |
494 | 494 | "\n", |
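`sample_to_cv2` now accepts an output dtype so the cadence code later in this commit can keep frames as float32 while they are repeatedly warped and blended, quantizing to uint8 only when a frame is written to disk. A small usage sketch, assuming `sample` is a decoded sample tensor as elsewhere in the notebook:

```python
import numpy as np

# float32 copy for repeated warping/cross-fading without per-step quantization loss
turbo_frame = sample_to_cv2(sample, type=np.float32)
# default uint8 copy when the frame is about to be saved with cv2.imwrite
frame_u8 = sample_to_cv2(sample)
```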
495 | 495 | "@torch.no_grad()\n", |
496 | | - "def transform_image_3d(prev_img_cv2, adabins_helper, midas_model, midas_transform, rot_mat, translate, anim_args):\n", |
497 | | - " # adapted and optimized version of transform_image_3d from Disco Diffusion https://github.com/alembics/disco-diffusion \n", |
498 | | - "\n", |
| 496 | + "def predict_depth(prev_img_cv2, adabins_helper, midas_model, midas_transform, anim_args) -> torch.Tensor:\n", |
499 | 497 | " w, h = prev_img_cv2.shape[1], prev_img_cv2.shape[0]\n", |
500 | 498 | "\n", |
501 | 499 | " # predict depth with AdaBins \n", |
502 | 500 | " use_adabins = anim_args.midas_weight < 1.0 and adabins_helper is not None\n", |
503 | 501 | " if use_adabins:\n", |
504 | | - " print(f\"Estimating depth of {w}x{h} image with AdaBins...\")\n", |
505 | 502 | " MAX_ADABINS_AREA = 500000\n", |
506 | 503 | " MIN_ADABINS_AREA = 448*448\n", |
507 | 504 | "\n", |
508 | 505 | " # resize image if too large or too small\n", |
509 | | - " img_pil = Image.fromarray(cv2.cvtColor(prev_img_cv2, cv2.COLOR_RGB2BGR))\n", |
| 506 | + " img_pil = Image.fromarray(cv2.cvtColor(prev_img_cv2.astype(np.uint8), cv2.COLOR_RGB2BGR))\n", |
510 | 507 | " image_pil_area = w*h\n", |
511 | 508 | " resized = True\n", |
512 | 509 | " if image_pil_area > MAX_ADABINS_AREA:\n", |
|
525 | 522 | " try:\n", |
526 | 523 | " _, adabins_depth = adabins_helper.predict_pil(depth_input)\n", |
527 | 524 | " if resized:\n", |
528 | | - " adabins_depth = torchvision.transforms.functional.resize(\n", |
| 525 | + " adabins_depth = TF.resize(\n", |
529 | 526 | " torch.from_numpy(adabins_depth), \n", |
530 | 527 | " torch.Size([h, w]),\n", |
531 | | - " interpolation=torchvision.transforms.functional.InterpolationMode.BICUBIC\n", |
| 528 | + " interpolation=TF.InterpolationMode.BICUBIC\n", |
532 | 529 | " )\n", |
533 | 530 | " adabins_depth = adabins_depth.squeeze()\n", |
534 | 531 | " except:\n", |
|
542 | 539 | " img_midas_input = midas_transform({\"image\": img_midas})[\"image\"]\n", |
543 | 540 | "\n", |
544 | 541 | " # MiDaS depth estimation implementation\n", |
545 | | - " print(f\"Estimating depth of {w}x{h} image with MiDaS...\")\n", |
546 | 542 | " sample = torch.from_numpy(img_midas_input).float().to(device).unsqueeze(0)\n", |
547 | 543 | " if device == torch.device(\"cuda\"):\n", |
548 | 544 | " sample = sample.to(memory_format=torch.channels_last) \n", |
|
571 | 567 | " depth_tensor = torch.from_numpy(depth_map).squeeze().to(device)\n", |
572 | 568 | " else:\n", |
573 | 569 | " depth_tensor = torch.ones((h, w), device=device)\n", |
| 570 | + " \n", |
| 571 | + " return depth_tensor\n", |
| 572 | + "\n", |
| 573 | + "def transform_image_3d(prev_img_cv2, depth_tensor, rot_mat, translate, anim_args):\n", |
| 574 | + " # adapted and optimized version of transform_image_3d from Disco Diffusion https://github.com/alembics/disco-diffusion \n", |
| 575 | + " w, h = prev_img_cv2.shape[1], prev_img_cv2.shape[0]\n", |
574 | 576 | "\n", |
575 | 577 | " pixel_aspect = 1.0 # aspect of an individual pixel (so usually 1.0)\n", |
576 | 578 | " near, far, fov_deg = anim_args.near_plane, anim_args.far_plane, anim_args.fov\n", |
|
592 | 594 | " coords_2d = torch.nn.functional.affine_grid(identity_2d_batch, [1,1,h,w], align_corners=False)\n", |
593 | 595 | " offset_coords_2d = coords_2d - torch.reshape(offset_xy, (h,w,2)).unsqueeze(0)\n", |
594 | 596 | "\n", |
595 | | - " image_tensor = torchvision.transforms.functional.to_tensor(Image.fromarray(prev_img_cv2)).to(device)\n", |
| 597 | + " image_tensor = rearrange(torch.from_numpy(prev_img_cv2.astype(np.float32)), 'h w c -> c h w').to(device)\n", |
596 | 598 | " new_image = torch.nn.functional.grid_sample(\n", |
597 | 599 | " image_tensor.add(1/512 - 0.0001).unsqueeze(0), \n", |
598 | 600 | " offset_coords_2d, \n", |
|
601 | 603 | " align_corners=False\n", |
602 | 604 | " )\n", |
603 | 605 | "\n", |
604 | | - " # convert back to cv2 style numpy array 0->255 uint8\n", |
| 606 | + " # convert back to cv2 style numpy array\n", |
605 | 607 | " result = rearrange(\n", |
606 | | - " new_image.squeeze().clamp(0,1) * 255.0, \n", |
| 608 | + " new_image.squeeze().clamp(0,255), \n", |
607 | 609 | " 'c h w -> h w c'\n", |
608 | | - " ).cpu().numpy().astype(np.uint8)\n", |
| 610 | + " ).cpu().numpy().astype(prev_img_cv2.dtype)\n", |
609 | 611 | " return result\n", |
610 | 612 | "\n", |
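Two things change in `transform_image_3d` above: it consumes the precomputed `depth_tensor`, and the result keeps the input array's dtype instead of forcing uint8, so float32 turbo frames survive repeated warps. The resampling itself still goes through `torch.nn.functional.grid_sample`. A minimal, self-contained sketch of that primitive, with an identity grid standing in for the depth-projected grid the function actually builds:

```python
import torch
import torch.nn.functional as F

# In transform_image_3d the grid comes from projecting per-pixel depth through the
# rotated and translated camera; here an identity affine grid is used just to show
# how an image is pulled through grid_sample.
img = torch.rand(1, 3, 64, 64)                        # N,C,H,W float image
theta = torch.eye(2, 3).unsqueeze(0)                  # identity affine -> identity grid
grid = F.affine_grid(theta, list(img.shape), align_corners=False)
out = F.grid_sample(img, grid, mode='bilinear', padding_mode='border', align_corners=False)
assert torch.allclose(out, img, atol=1e-4)            # identity grid leaves the image unchanged
```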
611 | 613 | "def generate(args, return_latent=False, return_sample=False, return_c=False):\n", |
|
670 | 672 | " mask=mask, \n", |
671 | 673 | " init_latent=init_latent,\n", |
672 | 674 | " sigmas=k_sigmas,\n", |
673 | | - " sampler=sampler) \n", |
| 675 | + " sampler=sampler,\n", |
| 676 | + " cond_fns=cond_fns) \n", |
674 | 677 | "\n", |
675 | 678 | " \n", |
676 | 679 | "\n", |
|
891 | 894 | "\n", |
892 | 895 | " #@markdown ####**Coherence:**\n", |
893 | 896 | " color_coherence = 'Match Frame 0 LAB' #@param ['None', 'Match Frame 0 HSV', 'Match Frame 0 LAB', 'Match Frame 0 RGB'] {type:'string'}\n", |
| 897 | + " diffusion_cadence = '3' #@param ['1','2','3','4','5','6','7','8'] {type:'string'}\n", |
894 | 898 | "\n", |
895 | | - " #@markdown #### Depth Warping\n", |
| 899 | + " #@markdown ####**3D Depth Warping:**\n", |
896 | 900 | " use_depth_warping = True #@param {type:\"boolean\"}\n", |
897 | 901 | " midas_weight = 0.3#@param {type:\"number\"}\n", |
898 | 902 | " near_plane = 200\n", |
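The `diffusion_cadence` option added in this hunk controls how often the diffusion model is actually sampled: with a cadence of 3, only frames 0, 3, 6, … are diffused, and the two frames between each pair are produced by warping and cross-fading the neighbouring diffused frames (see the render-loop hunk and the sketch that follows it below).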
|
1017 | 1021 | }, |
1018 | 1022 | "source": [ |
1019 | 1023 | "def DeforumArgs():\n", |
1020 | | - " \n", |
| 1024 | + "\n", |
1021 | 1025 | " #@markdown **Image Settings**\n", |
1022 | 1026 | " W = 512 #@param\n", |
1023 | 1027 | " H = 512 #@param\n", |
|
1045 | 1049 | " make_grid = False #@param {type:\"boolean\"}\n", |
1046 | 1050 | " grid_rows = 2 #@param \n", |
1047 | 1051 | " outdir = get_output_folder(output_path, batch_name)\n", |
1048 | | - " \n", |
| 1052 | + "\n", |
1049 | 1053 | " #@markdown **Init Settings**\n", |
1050 | 1054 | " use_init = False #@param {type:\"boolean\"}\n", |
1051 | 1055 | " strength = 0.0 #@param {type:\"number\"}\n", |
|
1082 | 1086 | " elif args.seed_behavior == 'fixed':\n", |
1083 | 1087 | " pass # always keep seed the same\n", |
1084 | 1088 | " else:\n", |
1085 | | - " args.seed = random.randint(0, 2**32)\n", |
| 1089 | + " args.seed = random.randint(0, 2**32 - 1)\n", |
1086 | 1090 | " return args.seed\n", |
1087 | 1091 | "\n", |
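Note on the seed fix above: `random.randint` is inclusive at both ends, so the previous upper bound of `2**32` could return a seed one past the 32-bit range; `2**32 - 1` keeps generated seeds within a valid uint32.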
1088 | 1092 | "def render_image_batch(args):\n", |
|
1126 | 1130 | " args.prompt = prompt\n", |
1127 | 1131 | " print(f\"Prompt {iprompt+1} of {len(prompts)}\")\n", |
1128 | 1132 | " print(f\"{args.prompt}\")\n", |
1129 | | - " \n", |
| 1133 | + "\n", |
1130 | 1134 | " all_images = []\n", |
1131 | 1135 | "\n", |
1132 | 1136 | " for batch_index in range(args.n_batch):\n", |
|
1208 | 1212 | " else:\n", |
1209 | 1213 | " adabins_helper, midas_model, midas_transform = None, None, None\n", |
1210 | 1214 | "\n", |
1211 | | - " args.n_samples = 1\n", |
| 1215 | + " # state for interpolating between diffusion steps\n", |
| 1216 | + " turbo_steps = 1 if using_vid_init else int(anim_args.diffusion_cadence)\n", |
| 1217 | + " turbo_prev_image, turbo_prev_frame_idx = None, 0\n", |
| 1218 | + " turbo_next_image, turbo_next_frame_idx = None, 0\n", |
| 1219 | + "\n", |
| 1220 | + " # resume animation\n", |
1212 | 1221 | " prev_sample = None\n", |
1213 | 1222 | " color_match_sample = None\n", |
1214 | | - " for frame_idx in range(start_frame,anim_args.max_frames):\n", |
| 1223 | + " if anim_args.resume_from_timestring:\n", |
| 1224 | + " last_frame = start_frame-1\n", |
| 1225 | + " if turbo_steps > 1:\n", |
| 1226 | + " last_frame -= last_frame%turbo_steps\n", |
| 1227 | + " path = os.path.join(args.outdir,f\"{args.timestring}_{last_frame:05}.png\")\n", |
| 1228 | + " img = cv2.imread(path)\n", |
| 1229 | + " img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)\n", |
| 1230 | + " prev_sample = sample_from_cv2(img)\n", |
| 1231 | + " if anim_args.color_coherence != 'None':\n", |
| 1232 | + " color_match_sample = img\n", |
| 1233 | + " if turbo_steps > 1:\n", |
| 1234 | + " turbo_next_image, turbo_next_frame_idx = sample_to_cv2(prev_sample, type=np.float32), last_frame\n", |
| 1235 | + " turbo_prev_image, turbo_prev_frame_idx = turbo_next_image, turbo_next_frame_idx\n", |
| 1236 | + " start_frame = last_frame+turbo_steps\n", |
| 1237 | + "\n", |
| 1238 | + " args.n_samples = 1\n", |
| 1239 | + " frame_idx = start_frame\n", |
| 1240 | + " while frame_idx < anim_args.max_frames:\n", |
1215 | 1241 | " print(f\"Rendering animation frame {frame_idx} of {anim_args.max_frames}\")\n", |
1216 | 1242 | " noise = keys.noise_schedule_series[frame_idx]\n", |
1217 | 1243 | " strength = keys.strength_schedule_series[frame_idx]\n", |
1218 | 1244 | " contrast = keys.contrast_schedule_series[frame_idx]\n", |
1219 | 1245 | " \n", |
1220 | | - " # resume animation\n", |
1221 | | - " if anim_args.resume_from_timestring:\n", |
1222 | | - " path = os.path.join(args.outdir,f\"{args.timestring}_{frame_idx-1:05}.png\")\n", |
1223 | | - " img = cv2.imread(path)\n", |
1224 | | - " img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)\n", |
1225 | | - " prev_sample = sample_from_cv2(img)\n", |
| 1246 | + " # emit in-between frames\n", |
| 1247 | + " if turbo_steps > 1:\n", |
| 1248 | + " tween_frame_start_idx = max(0, frame_idx-turbo_steps)\n", |
| 1249 | + " for tween_frame_idx in range(tween_frame_start_idx, frame_idx):\n", |
| 1250 | + " tween = float(tween_frame_idx - tween_frame_start_idx + 1) / float(frame_idx - tween_frame_start_idx)\n", |
| 1251 | + " print(f\" creating in between frame {tween_frame_idx} tween:{tween:0.2f}\")\n", |
| 1252 | + " if anim_args.animation_mode == '2D':\n", |
| 1253 | + " if turbo_prev_image is not None and tween_frame_idx > turbo_prev_frame_idx:\n", |
| 1254 | + " turbo_prev_image = anim_frame_warp_2d(turbo_prev_image, args, anim_args, keys, tween_frame_idx)\n", |
| 1255 | + " if tween_frame_idx > turbo_next_frame_idx:\n", |
| 1256 | + " turbo_next_image = anim_frame_warp_2d(turbo_next_image, args, anim_args, keys, tween_frame_idx)\n", |
| 1257 | + " else: # '3D'\n", |
| 1258 | + " if turbo_prev_image is not None and tween_frame_idx > turbo_prev_frame_idx:\n", |
| 1259 | + " prev_depth = predict_depth(turbo_prev_image, adabins_helper, midas_model, midas_transform, anim_args)\n", |
| 1260 | + " turbo_prev_image = anim_frame_warp_3d(turbo_prev_image, prev_depth, anim_args, keys, tween_frame_idx)\n", |
| 1261 | + " if tween_frame_idx > turbo_next_frame_idx:\n", |
| 1262 | + " next_depth = predict_depth(turbo_next_image, adabins_helper, midas_model, midas_transform, anim_args)\n", |
| 1263 | + " turbo_next_image = anim_frame_warp_3d(turbo_next_image, next_depth, anim_args, keys, tween_frame_idx)\n", |
| 1264 | + " turbo_prev_frame_idx = turbo_next_frame_idx = tween_frame_idx\n", |
| 1265 | + "\n", |
| 1266 | + " if turbo_prev_image is not None and tween < 1.0:\n", |
| 1267 | + " img = turbo_prev_image*(1.0-tween) + turbo_next_image*tween\n", |
| 1268 | + " else:\n", |
| 1269 | + " img = turbo_next_image\n", |
| 1270 | + "\n", |
| 1271 | + " filename = f\"{args.timestring}_{tween_frame_idx:05}.png\"\n", |
| 1272 | + " cv2.imwrite(os.path.join(args.outdir, filename), cv2.cvtColor(img.astype(np.uint8), cv2.COLOR_RGB2BGR))\n", |
| 1273 | + " if turbo_next_image is not None:\n", |
| 1274 | + " prev_sample = sample_from_cv2(turbo_next_image)\n", |
1226 | 1275 | "\n", |
1227 | 1276 | " # apply transforms to previous frame\n", |
1228 | 1277 | " if prev_sample is not None:\n", |
1229 | | - "\n", |
1230 | 1278 | " if anim_args.animation_mode == '2D':\n", |
1231 | 1279 | " prev_img = anim_frame_warp_2d(sample_to_cv2(prev_sample), args, anim_args, keys, frame_idx)\n", |
1232 | 1280 | " else: # '3D'\n", |
1233 | | - " prev_img = anim_frame_warp_3d(sample_to_cv2(prev_sample), anim_args, keys, frame_idx, adabins_helper, midas_model, midas_transform)\n", |
| 1281 | + " prev_img_cv2 = sample_to_cv2(prev_sample)\n", |
| 1282 | + " depth = predict_depth(prev_img_cv2, adabins_helper, midas_model, midas_transform, anim_args)\n", |
| 1283 | + " prev_img = anim_frame_warp_3d(prev_img_cv2, depth, anim_args, keys, frame_idx)\n", |
1234 | 1284 | "\n", |
1235 | 1285 | " # apply color matching\n", |
1236 | 1286 | " if anim_args.color_coherence != 'None':\n", |
|
1246 | 1296 | "\n", |
1247 | 1297 | " # use transformed previous frame as init for current\n", |
1248 | 1298 | " args.use_init = True\n", |
1249 | | - " #args.init_sample = noised_sample.half().to(device)\n", |
1250 | 1299 | " if half_precision:\n", |
1251 | 1300 | " args.init_sample = noised_sample.half().to(device)\n", |
1252 | 1301 | " else:\n", |
|
1264 | 1313 | " args.init_image = init_frame\n", |
1265 | 1314 | "\n", |
1266 | 1315 | " # sample the diffusion model\n", |
1267 | | - " results = generate(args, return_latent=False, return_sample=True)\n", |
1268 | | - " sample, image = results[0], results[1]\n", |
1269 | | - " \n", |
1270 | | - " filename = f\"{args.timestring}_{frame_idx:05}.png\"\n", |
1271 | | - " image.save(os.path.join(args.outdir, filename))\n", |
| 1316 | + " sample, image = generate(args, return_latent=False, return_sample=True)\n", |
1272 | 1317 | " if not using_vid_init:\n", |
1273 | 1318 | " prev_sample = sample\n", |
1274 | | - " \n", |
| 1319 | + "\n", |
| 1320 | + " if turbo_steps > 1:\n", |
| 1321 | + " turbo_prev_image, turbo_prev_frame_idx = turbo_next_image, turbo_next_frame_idx\n", |
| 1322 | + " turbo_next_image, turbo_next_frame_idx = sample_to_cv2(sample, type=np.float32), frame_idx\n", |
| 1323 | + " frame_idx += turbo_steps\n", |
| 1324 | + " else: \n", |
| 1325 | + " filename = f\"{args.timestring}_{frame_idx:05}.png\"\n", |
| 1326 | + " image.save(os.path.join(args.outdir, filename))\n", |
| 1327 | + " frame_idx += 1\n", |
| 1328 | + "\n", |
1275 | 1329 | " display.clear_output(wait=True)\n", |
1276 | 1330 | " display.display(image)\n", |
1277 | 1331 | "\n", |
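The rewritten loop above advances `frame_idx` by `turbo_steps` at a time: each newly diffused frame becomes `turbo_next_image`, the previous one becomes `turbo_prev_image`, and the indices in between are emitted by warping both buffers toward the in-between index and cross-fading them. A compact restatement of that blend as a standalone helper (a distilled sketch, not a function defined in the notebook), assuming both buffers are float32 arrays as produced by `sample_to_cv2(..., type=np.float32)`:

```python
import numpy as np

# Cross-fade used for in-between frames. With a cadence of 3 and the loop sitting at
# frame_idx=6, the in-between indices 3, 4, 5 get tween weights 1/3, 2/3 and 1.0.
def blend_tween(turbo_prev_image: np.ndarray, turbo_next_image: np.ndarray, tween: float) -> np.ndarray:
    if turbo_prev_image is not None and tween < 1.0:
        img = turbo_prev_image * (1.0 - tween) + turbo_next_image * tween
    else:
        img = turbo_next_image
    return img.astype(np.uint8)  # quantize only when the frame is written out
```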
|
1412 | 1466 | "args.strength = max(0.0, min(1.0, args.strength))\n", |
1413 | 1467 | "\n", |
1414 | 1468 | "if args.seed == -1:\n", |
1415 | | - " args.seed = random.randint(0, 2**32)\n", |
| 1469 | + " args.seed = random.randint(0, 2**32 - 1)\n", |
1416 | 1470 | "if not args.use_init:\n", |
1417 | 1471 | " args.init_image = None\n", |
1418 | 1472 | "if args.sampler == 'plms' and (args.use_init or anim_args.animation_mode != 'None'):\n", |
|
1455 | 1509 | "id": "no2jP8HTMBM0" |
1456 | 1510 | }, |
1457 | 1511 | "source": [ |
1458 | | - "skip_video_for_run_all = False #@param {type: 'boolean'}\n", |
| 1512 | + "skip_video_for_run_all = True #@param {type: 'boolean'}\n", |
1459 | 1513 | "fps = 12 #@param {type:\"number\"}\n", |
1460 | 1514 | "#@markdown **Manual Settings**\n", |
1461 | | - "use_manual_settings = True #@param {type:\"boolean\"}\n", |
| 1515 | + "use_manual_settings = False #@param {type:\"boolean\"}\n", |
1462 | 1516 | "image_path = \"/content/drive/MyDrive/AI/StableDiffusion/2022-09/20220903000939_%05d.png\" #@param {type:\"string\"}\n", |
1463 | 1517 | "mp4_path = \"/content/drive/MyDrive/AI/StableDiffusion/2022-09/20220903000939.mp4\" #@param {type:\"string\"}\n", |
1464 | 1518 | "\n", |
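With `skip_video_for_run_all` now defaulting to True, the frame-to-video cell has to be opted into; when enabled, it stitches the numbered PNGs into an mp4. A hypothetical sketch of an equivalent invocation (the `%05d` pattern in `image_path` matches the frame filenames written by the render loop; the specific ffmpeg flags here are an assumption, not the notebook's exact command):

```python
import subprocess

subprocess.run([
    'ffmpeg', '-y',
    '-r', '12',                      # matches the fps setting above
    '-i', image_path,                # e.g. .../20220903000939_%05d.png
    '-c:v', 'libx264', '-pix_fmt', 'yuv420p',
    mp4_path,                        # e.g. .../20220903000939.mp4
], check=True)
```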
|