This repository was archived by the owner on Dec 14, 2023. It is now read-only.

Evals #152

Open · wants to merge 31 commits into main

Commits (31)
179e4e0  Initial setup for training a single video. (Nov 14, 2023)
95a58b8  Add tensorboard to the train.py (Nov 14, 2023)
12b2ceb  Fix bug for tensorboard. (Nov 14, 2023)
6c44dde  Merge pull request #1 from xfanac/tensorboard (xfanac, Nov 14, 2023)
24d866d  Use a unique token for training (Nov 15, 2023)
18c064d  Support batch inference with a bash script. (Nov 15, 2023)
ba9b9ac  Merge pull request #2 from xfanac/batch_inference (xfanac, Nov 15, 2023)
fa1ca3b  More frequent checkpoints (Nov 15, 2023)
0d103b3  added to pytest to requirements (masoudcharkhabi, Nov 16, 2023)
aa041ec  Add dog dataset and its config. (Nov 21, 2023)
c363a54  Merge pull request #3 from xfanac/dog_config (xfanac, Nov 21, 2023)
3c16628  Update batch inference (Nov 21, 2023)
dcabede  Merge pull request #4 from xfanac/batch_inference1 (xfanac, Nov 21, 2023)
a480918  Add a jupyter notebook for examining the output (Nov 29, 2023)
cd79f46  Merge pull request #5 from xfanac/check_ouput (xfanac, Nov 29, 2023)
385fab8  Merge branch 'main' of https://github.com/xfanac/Text-To-Video-Finetu… (masoudcharkhabi, Dec 4, 2023)
0e613a6  run evals locally (masoudcharkhabi, Dec 4, 2023)
474ab3d  added clipy (masoudcharkhabi, Dec 4, 2023)
d8cf188  added clip (masoudcharkhabi, Dec 4, 2023)
15497c1  added clip (masoudcharkhabi, Dec 4, 2023)
905330d  working clip (masoudcharkhabi, Dec 4, 2023)
d3423b7  FID and CLIP demo working (masoudcharkhabi, Dec 5, 2023)
e08e26b  extract random frames and run FID (masoudcharkhabi, Dec 5, 2023)
150fcb7  added random and interval frame sampling (masoudcharkhabi, Dec 5, 2023)
74a017d  added 4 experiment vids (masoudcharkhabi, Dec 6, 2023)
ffcc597  added clip (masoudcharkhabi, Dec 6, 2023)
1263456  CLIP, FID, and torchmetrics FID on directory of clips, and new GT vids (masoudcharkhabi, Dec 8, 2023)
a1f2395  corrected order of target and reference directories, and replaced FID… (masoudcharkhabi, Dec 9, 2023)
32a7cdc  added new charlie model results (masoudcharkhabi, Dec 9, 2023)
034d835  added original and new data (masoudcharkhabi, Dec 11, 2023)
0e01732  added compressibility (masoudcharkhabi, Dec 11, 2023)
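
The eval commits above only describe the flow in their messages (sample frames from ground-truth and generated clips, then score them with CLIP and FID via torchmetrics); the eval code itself is not part of this excerpt. The snippet below is a rough sketch of that flow, assuming torchmetrics' FrechetInceptionDistance and CLIPScore are used as the messages suggest; the directory names, the prompt, and the sample_frames helper are placeholders, not names taken from this PR.

```python
# Minimal sketch of the eval flow the commit messages describe: sample frames
# from reference and generated clips, then compute FID and a CLIP score with
# torchmetrics. Paths, the prompt, and the helper name are placeholders.
import glob
import random

import torch
from torchvision.io import read_video
from torchmetrics.image.fid import FrechetInceptionDistance
from torchmetrics.multimodal.clip_score import CLIPScore


def sample_frames(video_path: str, n: int = 8) -> torch.Tensor:
    """Randomly sample up to n frames as a uint8 tensor of shape (n, 3, H, W)."""
    frames, _, _ = read_video(video_path, pts_unit="sec", output_format="TCHW")
    idx = sorted(random.sample(range(frames.shape[0]), k=min(n, frames.shape[0])))
    return frames[idx].to(torch.uint8)


fid = FrechetInceptionDistance(feature=2048)
clip_score = CLIPScore(model_name_or_path="openai/clip-vit-base-patch16")
prompt = "A dog is playing ball"  # placeholder: score each clip against its own prompt

for path in glob.glob("./ground_truth_clips/*.mp4"):  # placeholder reference directory
    fid.update(sample_frames(path), real=True)

for path in glob.glob("./generated_clips/*.mp4"):     # placeholder output directory
    frames = sample_frames(path)
    fid.update(frames, real=False)
    clip_score.update(frames, [prompt] * frames.shape[0])

print("FID:", fid.compute().item())
print("CLIP score:", clip_score.compute().item())
```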
Binary file added .DS_Store
Binary file not shown.
3 changes: 3 additions & 0 deletions .gitignore
@@ -133,3 +133,6 @@ dmypy.json

# Pyre type checker
.pyre/

outputs/*
runs/*
2 changes: 1 addition & 1 deletion README.md
@@ -26,7 +26,7 @@
### Requirements & Installation

```bash
-git clone https://github.com/ExponentialML/Text-To-Video-Finetuning.git
+git clone git@github.com:xfanac/Text-To-Video-Finetuning.git
cd Text-To-Video-Finetuning
git lfs install
git clone https://huggingface.co/damo-vilab/text-to-video-ms-1.7b ./models/model_scope_diffusers/
17 changes: 17 additions & 0 deletions batch_inference.sh
@@ -0,0 +1,17 @@
#!/bin/bash
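# Generate videos for every prompt with the un-finetuned base model and with
# each saved checkpoint, so outputs can be compared across training iterations.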

check_points='./outputs/train_2023-11-20T23-10-32/checkpoint-*'
suffix=unique_token_dog_only_$(date +%s)
prompts=("\"A « dog is playing ball\"" "\"A « dog is running\"" "\"A dog\"" "\"A dog is running\"")
for prompt in "${prompts[@]}"
do
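# Baseline: run the base model on this prompt (outputs tagged with _benchmark).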
per_file_suffix=_"$suffix"_benchmark
eval "python inference.py --prompt $prompt --model ./models/model_scope_diffusers --sdp --suffix $per_file_suffix --output-dir ./output/$suffix"
for checkpoint_file in $check_points
do
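# Keep only what follows the last '-' (e.g. .../checkpoint-1000 -> 1000).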
iteration="${checkpoint_file##*-}"
per_file_suffix=_"$suffix"_"$iteration"

eval "python inference.py --prompt $prompt --model $checkpoint_file --sdp --suffix $per_file_suffix --output-dir ./output/$suffix"
done
done
232 changes: 232 additions & 0 deletions configs/v2/fine_tune_single_video_config.yaml
@@ -0,0 +1,232 @@
# Pretrained diffusers model path.
pretrained_model_path: "./models/model_scope_diffusers/" #https://huggingface.co/damo-vilab/text-to-video-ms-1.7b/tree/main

# The folder where your training outputs will be placed.
output_dir: "./outputs"

# You can train multiple datasets at once. They will be joined together for training.
# Simply remove the line you don't need, or keep them all for mixed training.

# 'image': A folder of images and captions (.txt)
# 'folder': A folder of videos and captions (.txt)
# 'json': The JSON file created with automatic BLIP2 captions using https://github.com/ExponentialML/Video-BLIP2-Preprocessor
# 'single_video': A single video file.mp4 and text prompt
dataset_types:
- 'single_video'

# Adds offset noise to training. See https://www.crosslabs.org/blog/diffusion-with-offset-noise
# If this is enabled, rescale_schedule will be disabled.
offset_noise_strength: 0.1
use_offset_noise: False

# Uses schedule rescale, also known as the "better" offset noise. See https://arxiv.org/pdf/2305.08891.pdf
# If this is enabled, offset noise will be disabled.
rescale_schedule: False

# When True, this extends all items in all enabled datasets to the highest length.
# For example, if you have 200 videos and 10 images, 10 images will be duplicated to the length of 200.
extend_dataset: False

# Caches the latents (Frames-Image -> VAE -> Latent) to an HDD or SSD.
# The latents will be saved under your training folder, and loaded automatically for training.
# This saves memory, speeds up training, and takes very little disk space.
cache_latents: True

# If you have cached latents set to `True` and have a directory of cached latents,
# you can skip the caching process and load previously saved ones.
cached_latent_dir: null #/path/to/cached_latents

# Train the text encoder for the model. LoRA Training overrides this setting.
train_text_encoder: False

# https://github.com/cloneofsimo/lora (NOT Compatible with webui extension)
# This is the first, original implementation of LoRA by cloneofsimo.
# Use this version if you want to maintain compatibility with the original implementation.

# https://github.com/ExponentialML/Stable-LoRA/tree/main (Compatible with webui text2video extension)
# This is an implementation based off of the original LoRA repository by Microsoft, and the default LoRA method here.
# It works differently, using embeddings instead of the intermediate activations (Linear || Conv).
# This means there is no extra function applied when doing low-rank adaptation.
# It solely saves the weight differential between the initialized weights and updates.

# "cloneofsimo" or "stable_lora"
lora_version: "cloneofsimo"

# Use LoRA for the UNET model.
use_unet_lora: True

# Use LoRA for the Text Encoder. If this is set, the text encoder for the model will not be trained.
use_text_lora: True

# LoRA Dropout. This parameter sets the probability of randomly zeroing out elements. Helps prevent overfitting.
# See: https://pytorch.org/docs/stable/generated/torch.nn.Dropout.html
lora_unet_dropout: 0.1

lora_text_dropout: 0.1

# https://github.com/kabachuha/sd-webui-text2video
# This saves a LoRA that is compatible with the text2video webui extension.
# It only works when the lora version is 'stable_lora'.
# This is also a DIFFERENT implementation from Kohya's, so it will NOT work with that implementation.
save_lora_for_webui: True

# The LoRA file will be converted to a different format to be compatible with the webui extension.
# The difference between this and 'save_lora_for_webui' is that you can continue training a Diffusers pipeline model
# when this version is set to False
only_lora_for_webui: False

# Choose whether or not to save the full pretrained model weights for both checkpoints and after training.
# The only time you want this off is if you're doing full LoRA training.
save_pretrained_model: False

# The modules to use for LoRA. Different from 'trainable_modules'.
unet_lora_modules:
- "UNet3DConditionModel"
#- "ResnetBlock2D"
#- "TransformerTemporalModel"
#- "Transformer2DModel"
#- "CrossAttention"
#- "Attention"
#- "GEGLU"
#- "TemporalConvLayer"

# The modules to use for LoRA. Different from `trainable_text_modules`.
text_encoder_lora_modules:
- "CLIPEncoderLayer"
#- "CLIPAttention"

# The rank for LoRA training. With ModelScope, the maximum should be 1024.
# VRAM usage increases with higher rank and decreases with lower rank.
lora_rank: 8

# Training data parameters
train_data:

# The width and height to which your training data will be resized.
width: 256
height: 256

# This will find the closest aspect ratio to your input width and height.
# For example, 512x512 width and height with a video of resolution 1280x720 will be resized to 512x256
use_bucketing: True

# The start frame index where your videos should start (Leave this at one for json and folder based training).
sample_start_idx: 1

# Used for 'folder'. The rate at which your frames are sampled. Does nothing for the 'json' and 'single_video' datasets.
fps: 24

# For 'single_video' and 'json'. The number of frames to "step" (1,2,3,4) (frame_step=2) -> (1,3,5,7, ...).
frame_step: 1

# The number of frames to sample. The higher this number, the higher the VRAM (acts similar to batch size).
n_sample_frames: 8

# 'single_video'
single_video_path: "/home/xinzhi/Text-To-Video-Finetuning/input/v_SoccerJuggling_g16_c01.mp4"

# The prompt when using a single video file
single_video_prompt: "A person is playing soccer."

# Fallback prompt if caption cannot be read. Enabled for 'image' and 'folder'.
fallback_prompt: ''

# 'folder'
path: "path/to/folder/of/videos/"

# 'json'
json_path: 'path/to/train/json/'

# 'image'
image_dir: 'path/to/image/directory'

# The prompt for all image files. Leave blank to use caption files (.txt)
single_img_prompt: ""

# Validation data parameters.
validation_data:

# A custom prompt that is different from your training dataset.
prompt: ""

# Whether or not to sample preview during training (Requires more VRAM).
sample_preview: True

# The number of frames to sample during validation.
num_frames: 16

# Height and width of validation sample.
width: 256
height: 256

# Number of inference steps when generating the video.
num_inference_steps: 25

# CFG scale
guidance_scale: 9

# Learning rate for AdamW
learning_rate: 5e-6

# Weight decay. Higher = more regularization. Lower = closer to dataset.
adam_weight_decay: 0

# Optimizer parameters for the UNET. Overrides base learning rate parameters.
extra_unet_params: null
#learning_rate: 1e-5
#adam_weight_decay: 1e-4

# Optimizer parameters for the Text Encoder. Overrides base learning rate parameters.
extra_text_encoder_params: null
#learning_rate: 5e-6
#adam_weight_decay: 0.2

# How many batches to train. Not to be confused with video frames.
train_batch_size: 1

# Maximum number of train steps. Model is saved after training.
max_train_steps: 5000

# Saves a model every nth step.
checkpointing_steps: 1000

# How many steps to do for validation if sample_preview is enabled.
validation_steps: 200

# Which modules we want to unfreeze for the UNET. Advanced usage.
trainable_modules:
- "all"
# If you want to ignore temporal attention entirely, remove "attn1-2" and replace with ".attentions"
# This is for self attention. Activates for spatial and temporal dimensions if n_sample_frames > 1
#- "attn1"

# This is for cross attention (image & text data). Activates for spatial and temporal dimensions if n_sample_frames > 1
#- "attn2"

# Convolution networks that hold temporal information. Activates for spatial and temporal dimensions if n_sample_frames > 1
#- 'temp_conv'


# Which modules we want to unfreeze for the Text Encoder. Advanced usage.
trainable_text_modules:
- "all"

# Seed for validation.
seed: 64

# Whether or not we want to use mixed precision with accelerate
mixed_precision: "fp16"

# This seems to be incompatible at the moment.
use_8bit_adam: False

# Trades VRAM usage for speed. You lose roughly 20% of training speed, but save a lot of VRAM.
# If you need to save more VRAM, it can also be enabled for the text encoder, but it roughly halves speed.
gradient_checkpointing: True
text_encoder_gradient_checkpointing: True

# Xformers must be installed for best memory savings and performance (< Pytorch 2.0)
enable_xformers_memory_efficient_attention: False

# Use scaled dot product attention (Only available with >= Torch 2.0)
enable_torch_2_attn: True
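
As an aside on two of the settings above, here is a small sketch (plain PyTorch, not the repo's dataset or training code) of what they amount to: the frame indices implied by sample_start_idx, frame_step, and n_sample_frames, and the per-sample, per-channel shift that offset noise adds when use_offset_noise is enabled.

```python
# Unofficial sketch of two settings from the config above; not the repo's code.
import torch

# 1) Frame selection: n_sample_frames frames starting at sample_start_idx,
#    stepping by frame_step (the config's "(frame_step=2) -> (1,3,5,7, ...)").
sample_start_idx, frame_step, n_sample_frames = 1, 2, 8
frame_indices = [sample_start_idx + i * frame_step for i in range(n_sample_frames)]
print(frame_indices)  # [1, 3, 5, 7, 9, 11, 13, 15]

# 2) Offset noise (see the linked crosslabs post), as commonly implemented:
#    on top of the usual Gaussian noise, add a constant shift drawn once per
#    sample and per channel.
latents = torch.randn(1, 4, 8, 32, 32)  # illustrative (batch, channels, frames, h, w)
offset_noise_strength = 0.1             # value from the config
noise = torch.randn_like(latents) + offset_noise_strength * torch.randn(
    latents.shape[0], latents.shape[1], 1, 1, 1, device=latents.device
)
```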
17 changes: 7 additions & 10 deletions configs/v2/low_vram_config_example.yaml
@@ -15,9 +15,6 @@ output_dir: "./outputs"
# 'json': The JSON file created with automatic BLIP2 captions using https://github.com/ExponentialML/Video-BLIP2-Preprocessor
# 'single_video': A single video file.mp4 and text prompt
dataset_types:
-- 'image'
-- 'folder'
-- 'json'
- 'single_video'

# Adds offset noise to training. See https://www.crosslabs.org/blog/diffusion-with-offset-noise
@@ -49,7 +46,7 @@ train_text_encoder: False
use_unet_lora: False

# Use LoRA for the Text Encoder.
-use_text_lora: True
+use_text_lora: False

# The modules to use for LoRA. Different from 'trainable_modules'.
unet_lora_modules:
@@ -84,13 +81,13 @@ train_data:
frame_step: 5

# The number of frames to sample. The higher this number, the higher the VRAM (acts similar to batch size).
-n_sample_frames: 2
+n_sample_frames: 4

# 'single_video'
single_video_path: "path/to/single/video.mp4"
single_video_path: "./input/v_SoccerJuggling_g16_c01.mp4"

# The prompt when using a single video file
-single_video_prompt: ""
+single_video_prompt: "A sks person is playing soccer."

# Fallback prompt if caption cannot be read. Enabled for 'image' and 'folder'.
fallback_prompt: ''
@@ -149,13 +146,13 @@ extra_text_encoder_params: null
train_batch_size: 1

# Maximum number of train steps. Model is saved after training.
-max_train_steps: 10000
+max_train_steps: 8000

# Saves a model every nth step.
-checkpointing_steps: 2500
+checkpointing_steps: 1000

# How many steps to do for validation if sample_preview is enabled.
-validation_steps: 100
+validation_steps: 1000

# Which modules we want to unfreeze for the UNET. Advanced usage.
trainable_modules: