@@ -25,6 +25,11 @@
 from marigold.marigold import MarigoldPipeline
 # pix2pix/merge net imports
 from pix2pix.options.test_options import TestOptions
+# depth_anything_v2
+try:
+    from depth_anything_v2 import DepthAnythingV2
+except ImportError:
+    print('depth_anything_v2 import failed... somehow')
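+# If the import failed, DepthAnythingV2 stays undefined and loading model
+# types 12-14 below will raise a NameError.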
 
 # Our code
 from src.misc import *
@@ -80,6 +85,8 @@ def load_models(self, model_type, device: torch.device, boost: bool, tiling_mode
             model_dir = "./models/leres"
         if model_type == 11:
             model_dir = "./models/depth_anything"
+        if model_type in [12, 13, 14]:
+            model_dir = "./models/depth_anything_v2"
 
         # create paths to model if not present
         os.makedirs(model_dir, exist_ok=True)
@@ -227,6 +234,19 @@ def load_models(self, model_type, device: torch.device, boost: bool, tiling_mode
                 "https://huggingface.co/spaces/LiheYoung/Depth-Anything/resolve/main/checkpoints/depth_anything_vitl14.pth")
 
             model.load_state_dict(torch.load(model_path))
+        elif model_type in [12, 13, 14]:  # depth_anything_v2 small, base, large
+            letter = {12: 's', 13: 'b', 14: 'l'}[model_type]
+            word = {12: 'Small', 13: 'Base', 14: 'Large'}[model_type]
+            model_path = f"{model_dir}/depth_anything_v2_vit{letter}.pth"
+            ensure_file_downloaded(model_path,
+                f"https://huggingface.co/depth-anything/Depth-Anything-V2-{word}/resolve/main/depth_anything_v2_vit{letter}.pth")
+            model_configs = {'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]},
+                             'vitb': {'encoder': 'vitb', 'features': 128, 'out_channels': [96, 192, 384, 768]},
+                             'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]},
+                             'vitg': {'encoder': 'vitg', 'features': 384, 'out_channels': [1536, 1536, 1536, 1536]}}
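+            # These four encoder configs mirror the reference configurations in
+            # the upstream Depth-Anything-V2 repository; 'vitg' is listed for
+            # completeness, but no checkpoint is downloaded for it here.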
+            model = DepthAnythingV2(**model_configs[f'vit{letter}'])
+            model.load_state_dict(torch.load(model_path, map_location='cpu'))
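+            # map_location='cpu' keeps the checkpoint weights on the CPU until
+            # model.to(device) is called further down.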
+        # 15 is reserved for Depth Anything V2 Giant
 
         if tiling_mode:
             def flatten(el):
@@ -250,6 +270,9 @@ def flatten(el):
         # TODO: Fix for zoedepth_n - it completely trips and generates black images
         if model_type in [1, 2, 3, 4, 5, 6, 8, 9, 11] and not boost:
             model = model.half()
+        if model_type in [12, 13, 14]:
+            model.depth_head.half()
+            model.pretrained.half()
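+            # Unlike model.half() above, only the DINOv2 encoder
+            # (model.pretrained) and the DPT head (model.depth_head) are cast
+            # to fp16, both in place.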
         model.to(device)  # to correct device
 
         self.depth_model = model
@@ -291,7 +314,10 @@ def get_default_net_size(model_type):
             8: [384, 768],
             9: [384, 512],
             10: [768, 768],
-            11: [518, 518]
+            11: [518, 518],
+            12: [518, 518],
+            13: [518, 518],
+            14: [518, 518]
         }
         if model_type in sizes:
             return sizes[model_type]
@@ -350,6 +376,8 @@ def get_raw_prediction(self, input, net_width, net_height):
                                                   self.marigold_ensembles, self.marigold_steps)
             elif self.depth_model_type == 11:
                 raw_prediction = estimatedepthanything(img, self.depth_model, net_width, net_height)
+            elif self.depth_model_type in [12, 13, 14]:
+                raw_prediction = estimatedepthanything_v2(img, self.depth_model, net_width, net_height)
             else:
                 raw_prediction = estimateboost(img, self.depth_model, self.depth_model_type, self.pix2pix_model,
                                                self.boost_rmax)
@@ -499,6 +527,20 @@ def estimatedepthanything(image, model, w, h):
     return depth.cpu().numpy()
 
 
+def estimatedepthanything_v2(image, model, w, h):
+    import torch.nn.functional as F
+    # This is an awkward re-conversion, but I believe it should not impact quality
+    img = cv2.cvtColor((image * 255.1).astype('uint8'), cv2.COLOR_BGR2RGB)
+    with torch.no_grad():
+        # Compare to: model.infer_image(img, w)
+        image, (h, w) = model.image2tensor(img, w)
+        # Cast the input to the model's dtype (the reference parameter chosen here is arbitrary)
+        image_casted = image.type_as(model.pretrained.blocks[0].norm1.weight.data)
+        depth = model.forward(image_casted).type_as(image)
+        depth = F.interpolate(depth[:, None], (h, w), mode="bilinear", align_corners=True)[0, 0]
+    return depth.cpu().numpy()
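+# Example call (assuming the 518x518 default net size declared in
+# get_default_net_size above):
+#   raw_prediction = estimatedepthanything_v2(img, model, 518, 518)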
+
+
 class ImageandPatchs:
     def __init__(self, root_dir, name, patchsinfo, rgb_image, scale=1):
         self.root_dir = root_dir
@@ -720,6 +762,8 @@ def estimateboost(img, model, model_type, pix2pixmodel, whole_size_threshold):
         net_receptive_field_size = 512
     elif model_type == 11:  # depth_anything
         net_receptive_field_size = 518
+    elif model_type in [12, 13, 14]:  # depth_anything_v2
+        net_receptive_field_size = 518
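+        # 518 is the native inference size for both Depth Anything versions
+        # (a multiple of the ViT patch size of 14).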
     else:  # other midas  # TODO Marigold support
         net_receptive_field_size = 384
     patch_netsize = 2 * net_receptive_field_size
@@ -995,6 +1039,8 @@ def singleestimate(img, msize, model, net_type):
         return estimatemarigold(img, model, msize, msize)
     elif net_type == 11:
         return estimatedepthanything(img, model, msize, msize)
+    elif net_type in [12, 13, 14]:
+        return estimatedepthanything_v2(img, model, msize, msize)
     elif net_type >= 7:
         # np to PIL
         return estimatezoedepth(Image.fromarray(np.uint8(img * 255)).convert('RGB'), model, msize, msize)