From c4b280469e82b84100f90cffe5623da5a83e2cc8 Mon Sep 17 00:00:00 2001
From: daniel-z-kaplan <48258016+daniel-z-kaplan@users.noreply.github.com>
Date: Fri, 4 Aug 2023 17:51:58 -0400
Subject: [PATCH 1/4] Update vision_encoders.py

Fix needed for shapes
---
 code/decision_transformer/models/vision_encoders.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/code/decision_transformer/models/vision_encoders.py b/code/decision_transformer/models/vision_encoders.py
index 36c6cf1..b6d901b 100644
--- a/code/decision_transformer/models/vision_encoders.py
+++ b/code/decision_transformer/models/vision_encoders.py
@@ -47,8 +47,8 @@ def forward(self, vision_x):
         vision_x = rearrange(vision_x, "b T F c h w -> (b T F) c h w")
 
         with torch.no_grad():
-            vision_x = self.vision_encoder(vision_x)[1]
+            vision_x = self.vision_encoder(vision_x)
 
         vision_x = rearrange(vision_x, "(b T F) v d -> b T F v d", b=b, T=T, F=F)
         vision_x = self.perceiver(vision_x)
-        return vision_x
\ No newline at end of file
+        return vision_x

From 80660a47ceaf9c05beb1a5b5fe3bfb2f41cbc212 Mon Sep 17 00:00:00 2001
From: daniel-z-kaplan <48258016+daniel-z-kaplan@users.noreply.github.com>
Date: Fri, 4 Aug 2023 17:53:38 -0400
Subject: [PATCH 2/4] Update test_perceiver.py

Updates + comments
---
 code/decision_transformer/models/test_perceiver.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/code/decision_transformer/models/test_perceiver.py b/code/decision_transformer/models/test_perceiver.py
index 25878e6..27cff37 100644
--- a/code/decision_transformer/models/test_perceiver.py
+++ b/code/decision_transformer/models/test_perceiver.py
@@ -21,8 +21,11 @@
 batch = 5
 num_images = 27
 channels = 3
-height = 512
-width = 512
+height = 224
+width = 224
+#We need these shapes to be 224, reflecting output from preprocess.
+#We still need to use openclip preprocess technically, for images.
+
 input_data = torch.randn((batch, num_images, 1, channels, height, width))
 # vision_x (torch.Tensor): Vision input
 # shape (B, T_img, F, C, H, W) with F=1
@@ -56,4 +59,4 @@ def encode_vision_x(vision_x: torch.Tensor):
 
     # for layer in lang_encoder._get_decoder_layers():
     #     layer.condition_vis_x(vision_x)
-encode_vision_x(input_data)
\ No newline at end of file
+encode_vision_x(input_data)

From 586fa8832fe22ea65c8f0f752ce096a8415e8200 Mon Sep 17 00:00:00 2001
From: daniel-z-kaplan <48258016+daniel-z-kaplan@users.noreply.github.com>
Date: Fri, 4 Aug 2023 20:46:10 -0400
Subject: [PATCH 3/4] Update test_perceiver.py

---
 .../models/test_perceiver.py | 28 +++++++++++++++----
 1 file changed, 22 insertions(+), 6 deletions(-)

diff --git a/code/decision_transformer/models/test_perceiver.py b/code/decision_transformer/models/test_perceiver.py
index 27cff37..c32d9d9 100644
--- a/code/decision_transformer/models/test_perceiver.py
+++ b/code/decision_transformer/models/test_perceiver.py
@@ -13,6 +13,8 @@
     "ViT-L-14", pretrained="openai")
 vision_encoder = vision_encoder.visual
 
+vision_encoder.output_tokens = True
+
 vis_dim=open_clip.get_model_config("ViT-L-14")["vision_cfg"]["width"]
 
 perceiver = PerceiverResampler(dim=vis_dim)
@@ -23,14 +25,24 @@
 channels = 3
 height = 224
 width = 224
-#We need these shapes to be 224, reflecting output from preprocess.
-#We still need to use openclip preprocess technically, for images.
-
 input_data = torch.randn((batch, num_images, 1, channels, height, width))
 # vision_x (torch.Tensor): Vision input
 # shape (B, T_img, F, C, H, W) with F=1
 
+if False:
+    import torch
+    from PIL import Image
+    import open_clip
+
+    image = image_processor(Image.open("dog.jpg")).unsqueeze(0)
+    print(image.shape)
+
+
+    with torch.no_grad(), torch.cuda.amp.autocast():
+        image_features = vision_encoder.encode_image(image)
+        print(image_features.shape)
+
 
 
 def encode_vision_x(vision_x: torch.Tensor):
     """
@@ -49,12 +61,16 @@ def encode_vision_x(vision_x: torch.Tensor):
     assert F == 1, "Only single frame supported"
 
     vision_x = rearrange(vision_x, "b T F c h w -> (b T F) c h w")
+    print(vision_x.shape)
     with torch.no_grad():
-        vision_x = vision_encoder(vision_x)[1]
-
+        vision_x, tokens = vision_encoder(vision_x)
+        #We might want the -2 instead by the way.
+        print(tokens.shape)#batch x frames x 768..
 
-    vision_x = rearrange(vision_x, "(b T F) v d -> b T F v d", b=b, T=T, F=F)
+    vision_x = rearrange(tokens, "(b T F) v d -> b T F v d", b=b, T=T, F=F)
+    print(vision_x.shape)#Put back in original shape
     vision_x = perceiver(vision_x)
+    print(vision_x.shape)
 
     # for layer in lang_encoder._get_decoder_layers():
     #     layer.condition_vis_x(vision_x)

From b3b2af3a7160c119e062212e1d0f53e8a612777c Mon Sep 17 00:00:00 2001
From: daniel-z-kaplan <48258016+daniel-z-kaplan@users.noreply.github.com>
Date: Fri, 4 Aug 2023 20:49:42 -0400
Subject: [PATCH 4/4] Update vision_encoders.py

---
 code/decision_transformer/models/vision_encoders.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/code/decision_transformer/models/vision_encoders.py b/code/decision_transformer/models/vision_encoders.py
index b6d901b..aa836a8 100644
--- a/code/decision_transformer/models/vision_encoders.py
+++ b/code/decision_transformer/models/vision_encoders.py
@@ -37,6 +37,7 @@ def __init__(self):
         self.vision_encoder, _, self.image_processor = open_clip.create_model_and_transforms(
             "ViT-L-14", pretrained="openai")
         self.vision_encoder = self.vision_encoder.visual
+        self.vision_encoder.output_tokens = True
         self.vis_dim = open_clip.get_model_config("ViT-L-14")["vision_cfg"]["width"]
         self.perceiver = PerceiverResampler(dim=self.vis_dim)
 
@@ -47,7 +48,7 @@ def forward(self, vision_x):
         vision_x = rearrange(vision_x, "b T F c h w -> (b T F) c h w")
 
         with torch.no_grad():
-            vision_x = self.vision_encoder(vision_x)
+            vision_x = self.vision_encoder(vision_x)[1]
 
         vision_x = rearrange(vision_x, "(b T F) v d -> b T F v d", b=b, T=T, F=F)
         vision_x = self.perceiver(vision_x)
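
The net effect of this series: the open_clip visual tower is put into token-output mode
(output_tokens = True), so its forward returns a (pooled, tokens) tuple and indexing [1]
selects the per-patch tokens that feed the PerceiverResampler. Below is a minimal sketch
of that post-patch data flow; the PerceiverResampler import path and the example shapes
are assumptions for illustration (for ViT-L-14, vision width 1024 and patch size 14, a
224x224 image should yield 16*16 = 256 patch tokens), not part of the patches themselves.

import torch
import open_clip
from einops import rearrange

from perceiver import PerceiverResampler  # assumed repo-local import path

model, _, image_processor = open_clip.create_model_and_transforms(
    "ViT-L-14", pretrained="openai")
vision_encoder = model.visual
vision_encoder.output_tokens = True  # forward now returns (pooled, tokens)

vis_dim = open_clip.get_model_config("ViT-L-14")["vision_cfg"]["width"]  # 1024
perceiver = PerceiverResampler(dim=vis_dim)

b, T, F = 2, 4, 1  # batch, images per sequence, frames (F must be 1)
vision_x = torch.randn(b, T, F, 3, 224, 224)  # stands in for preprocessed images

vision_x = rearrange(vision_x, "b T F c h w -> (b T F) c h w")
with torch.no_grad():
    pooled, tokens = vision_encoder(vision_x)  # tokens: ((b T F), 256, vis_dim)
vision_x = rearrange(tokens, "(b T F) v d -> b T F v d", b=b, T=T, F=F)
latents = perceiver(vision_x)  # expected: (b, T, num_latents, vis_dim)

As the patch-2 comments note, real images should still go through the open_clip
image_processor, which is what guarantees the 224x224 input the encoder expects.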