This repository was archived by the owner on Dec 14, 2023. It is now read-only.

Evals #152

Open · wants to merge 31 commits into main

Commits (31)
179e4e0  Initial setup for training a single video. (Nov 14, 2023)
95a58b8  Add tensorboard to the train.py (Nov 14, 2023)
12b2ceb  Fix bug for tensorboard. (Nov 14, 2023)
6c44dde  Merge pull request #1 from xfanac/tensorboard (xfanac, Nov 14, 2023)
24d866d  Use a unique token for training (Nov 15, 2023)
18c064d  Support batch inference with a bash script. (Nov 15, 2023)
ba9b9ac  Merge pull request #2 from xfanac/batch_inference (xfanac, Nov 15, 2023)
fa1ca3b  More frequent checkpoints (Nov 15, 2023)
0d103b3  added to pytest to requirements (masoudcharkhabi, Nov 16, 2023)
aa041ec  Add dog dataset and its config. (Nov 21, 2023)
c363a54  Merge pull request #3 from xfanac/dog_config (xfanac, Nov 21, 2023)
3c16628  Update batch inference (Nov 21, 2023)
dcabede  Merge pull request #4 from xfanac/batch_inference1 (xfanac, Nov 21, 2023)
a480918  Add a jupyter notebook for examining the output (Nov 29, 2023)
cd79f46  Merge pull request #5 from xfanac/check_ouput (xfanac, Nov 29, 2023)
385fab8  Merge branch 'main' of https://github.com/xfanac/Text-To-Video-Finetu… (masoudcharkhabi, Dec 4, 2023)
0e613a6  run evals locally (masoudcharkhabi, Dec 4, 2023)
474ab3d  added clipy (masoudcharkhabi, Dec 4, 2023)
d8cf188  added clip (masoudcharkhabi, Dec 4, 2023)
15497c1  added clip (masoudcharkhabi, Dec 4, 2023)
905330d  working clip (masoudcharkhabi, Dec 4, 2023)
d3423b7  FID and CLIP demo working (masoudcharkhabi, Dec 5, 2023)
e08e26b  extract random frames and run FID (masoudcharkhabi, Dec 5, 2023)
150fcb7  added random and interval frame sampling (masoudcharkhabi, Dec 5, 2023)
74a017d  added 4 experiment vids (masoudcharkhabi, Dec 6, 2023)
ffcc597  added clip (masoudcharkhabi, Dec 6, 2023)
1263456  CLIP, FID, and torchmetrics FID on directory of clips, and new GT vids (masoudcharkhabi, Dec 8, 2023)
a1f2395  corrected order of target and reference directories, and replaced FID… (masoudcharkhabi, Dec 9, 2023)
32a7cdc  added new charlie model results (masoudcharkhabi, Dec 9, 2023)
034d835  added original and new data (masoudcharkhabi, Dec 11, 2023)
0e01732  added compressibility (masoudcharkhabi, Dec 11, 2023)
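
The eval commits above only describe the flow in their messages (sample frames from ground-truth and generated clips, then score them with CLIP and FID via torchmetrics); the eval code itself is not part of this excerpt. The snippet below is a rough sketch of that flow, assuming torchmetrics' FrechetInceptionDistance and CLIPScore are used as the messages suggest; the directory names, the prompt, and the sample_frames helper are placeholders, not names taken from this PR.

```python
# Minimal sketch of the eval flow the commit messages describe: sample frames
# from reference and generated clips, then compute FID and a CLIP score with
# torchmetrics. Paths, the prompt, and the helper name are placeholders.
import glob
import random

import torch
from torchvision.io import read_video
from torchmetrics.image.fid import FrechetInceptionDistance
from torchmetrics.multimodal.clip_score import CLIPScore


def sample_frames(video_path: str, n: int = 8) -> torch.Tensor:
    """Randomly sample up to n frames as a uint8 tensor of shape (n, 3, H, W)."""
    frames, _, _ = read_video(video_path, pts_unit="sec", output_format="TCHW")
    idx = sorted(random.sample(range(frames.shape[0]), k=min(n, frames.shape[0])))
    return frames[idx].to(torch.uint8)


fid = FrechetInceptionDistance(feature=2048)
clip_score = CLIPScore(model_name_or_path="openai/clip-vit-base-patch16")
prompt = "A dog is playing ball"  # placeholder: score each clip against its own prompt

for path in glob.glob("./ground_truth_clips/*.mp4"):  # placeholder reference directory
    fid.update(sample_frames(path), real=True)

for path in glob.glob("./generated_clips/*.mp4"):     # placeholder output directory
    frames = sample_frames(path)
    fid.update(frames, real=False)
    clip_score.update(frames, [prompt] * frames.shape[0])

print("FID:", fid.compute().item())
print("CLIP score:", clip_score.compute().item())
```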
Binary file added .DS_Store
Binary file not shown.
3 changes: 3 additions & 0 deletions .gitignore
@@ -133,3 +133,6 @@ dmypy.json

# Pyre type checker
.pyre/

outputs/*
runs/*
2 changes: 1 addition & 1 deletion README.md
@@ -26,7 +26,7 @@
### Requirements & Installation

```bash
-git clone https://github.com/ExponentialML/Text-To-Video-Finetuning.git
+git clone git@github.com:xfanac/Text-To-Video-Finetuning.git
cd Text-To-Video-Finetuning
git lfs install
git clone https://huggingface.co/damo-vilab/text-to-video-ms-1.7b ./models/model_scope_diffusers/
17 changes: 17 additions & 0 deletions batch_inference.sh
@@ -0,0 +1,17 @@
#!/bin/bash
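# Generate videos for every prompt with the un-finetuned base model and with
# each saved checkpoint, so outputs can be compared across training iterations.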

check_points='./outputs/train_2023-11-20T23-10-32/checkpoint-*'
suffix=unique_token_dog_only_$(date +%s)
prompts=("\"A « dog is playing ball\"" "\"A « dog is running\"" "\"A dog\"" "\"A dog is running\"")
for prompt in "${prompts[@]}"
do
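# Baseline: run the base model on this prompt (outputs tagged with _benchmark).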
per_file_suffix=_"$suffix"_benchmark
eval "python inference.py --prompt $prompt --model ./models/model_scope_diffusers --sdp --suffix $per_file_suffix --output-dir ./output/$suffix"
for checkpoint_file in $check_points
do
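# Keep only what follows the last '-' (e.g. .../checkpoint-1000 -> 1000).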
iteration="${checkpoint_file##*-}"
per_file_suffix=_"$suffix"_"$iteration"

eval "python inference.py --prompt $prompt --model $checkpoint_file --sdp --suffix $per_file_suffix --output-dir ./output/$suffix"
done
done
232 changes: 232 additions & 0 deletions configs/v2/fine_tune_single_video_config.yaml
@@ -0,0 +1,232 @@
# Pretrained diffusers model path.
pretrained_model_path: "./models/model_scope_diffusers/" #https://huggingface.co/damo-vilab/text-to-video-ms-1.7b/tree/main

# The folder where your training outputs will be placed.
output_dir: "./outputs"

# You can train multiple datasets at once. They will be joined together for training.
# Simply remove the line you don't need, or keep them all for mixed training.

# 'image': A folder of images and captions (.txt)
# 'folder': A folder of videos and captions (.txt)
# 'json': The JSON file created with automatic BLIP2 captions using https://github.com/ExponentialML/Video-BLIP2-Preprocessor
# 'single_video': A single video file.mp4 and text prompt
dataset_types:
- 'single_video'

# Adds offset noise to training. See https://www.crosslabs.org/blog/diffusion-with-offset-noise
# If this is enabled, rescale_schedule will be disabled.
offset_noise_strength: 0.1
use_offset_noise: False

# Uses schedule rescale, also known as the "better" offset noise. See https://arxiv.org/pdf/2305.08891.pdf
# If this is enabled, offset noise will be disabled.
rescale_schedule: False

# When True, this extends all items in all enabled datasets to the highest length.
# For example, if you have 200 videos and 10 images, 10 images will be duplicated to the length of 200.
extend_dataset: False

# Caches the latents (Frames-Image -> VAE -> Latent) to an HDD or SSD.
# The latents will be saved under your training folder, and loaded automatically for training.
# This saves memory, speeds up training, and takes very little disk space.
cache_latents: True

# If you have cached latents set to `True` and have a directory of cached latents,
# you can skip the caching process and load previously saved ones.
cached_latent_dir: null #/path/to/cached_latents

# Train the text encoder for the model. LoRA Training overrides this setting.
train_text_encoder: False

# https://github.com/cloneofsimo/lora (NOT Compatible with webui extension)
# This is the first, original implementation of LoRA by cloneofsimo.
# Use this version if you want to maintain compatibility with the original implementation.

# https://github.com/ExponentialML/Stable-LoRA/tree/main (Compatible with webui text2video extension)
# This is an implementation based off of the original LoRA repository by Microsoft, and the default LoRA method here.
# It works differently, using embeddings instead of the intermediate activations (Linear || Conv).
# This means there is no extra function applied when doing low-rank adaptation.
# It solely saves the weight differential between the initialized weights and updates.

# "cloneofsimo" or "stable_lora"
lora_version: "cloneofsimo"

# Use LoRA for the UNET model.
use_unet_lora: True

# Use LoRA for the Text Encoder. If this is set, the text encoder for the model will not be trained.
use_text_lora: True

# LoRA Dropout. This parameter sets the probability of randomly zeroing out elements. Helps prevent overfitting.
# See: https://pytorch.org/docs/stable/generated/torch.nn.Dropout.html
lora_unet_dropout: 0.1

lora_text_dropout: 0.1

# https://github.com/kabachuha/sd-webui-text2video
# This saves a LoRA that is compatible with the text2video webui extension.
# It only works when the lora version is 'stable_lora'.
# This is also a DIFFERENT implementation from Kohya's, so it will NOT work with that implementation.
save_lora_for_webui: True

# The LoRA file will be converted to a different format to be compatible with the webui extension.
# The difference between this and 'save_lora_for_webui' is that you can continue training a Diffusers pipeline model
# when this version is set to False
only_lora_for_webui: False

# Choose whether or not to save the full pretrained model weights for both checkpoints and after training.
# The only time you want this off is if you're doing full LoRA training.
save_pretrained_model: False

# The modules to use for LoRA. Different from 'trainable_modules'.
unet_lora_modules:
- "UNet3DConditionModel"
#- "ResnetBlock2D"
#- "TransformerTemporalModel"
#- "Transformer2DModel"
#- "CrossAttention"
#- "Attention"
#- "GEGLU"
#- "TemporalConvLayer"

# The modules to use for LoRA. Different from `trainable_text_modules`.
text_encoder_lora_modules:
- "CLIPEncoderLayer"
#- "CLIPAttention"

# The rank for LoRA training. With ModelScope, the maximum should be 1024.
# VRAM usage increases with higher rank and decreases with lower rank.
lora_rank: 8

# Training data parameters
train_data:

# The width and height to which your training data will be resized.
width: 256
height: 256

# This will find the closest aspect ratio to your input width and height.
# For example, 512x512 width and height with a video of resolution 1280x720 will be resized to 512x256
use_bucketing: True

# The start frame index where your videos should start (Leave this at one for json and folder based training).
sample_start_idx: 1

# Used for 'folder'. The rate at which your frames are sampled. Does nothing for the 'json' and 'single_video' datasets.
fps: 24

# For 'single_video' and 'json'. The number of frames to "step" (1,2,3,4) (frame_step=2) -> (1,3,5,7, ...).
frame_step: 1

# The number of frames to sample. The higher this number, the higher the VRAM (acts similar to batch size).
n_sample_frames: 8

# 'single_video'
single_video_path: "/home/xinzhi/Text-To-Video-Finetuning/input/v_SoccerJuggling_g16_c01.mp4"

# The prompt when using a single video file
single_video_prompt: "A person is playing soccer."

# Fallback prompt if caption cannot be read. Enabled for 'image' and 'folder'.
fallback_prompt: ''

# 'folder'
path: "path/to/folder/of/videos/"

# 'json'
json_path: 'path/to/train/json/'

# 'image'
image_dir: 'path/to/image/directory'

# The prompt for all image files. Leave blank to use caption files (.txt)
single_img_prompt: ""

# Validation data parameters.
validation_data:

# A custom prompt that is different from your training dataset.
prompt: ""

# Whether or not to sample preview during training (Requires more VRAM).
sample_preview: True

# The number of frames to sample during validation.
num_frames: 16

# Height and width of validation sample.
width: 256
height: 256

# Number of inference steps when generating the video.
num_inference_steps: 25

# CFG scale
guidance_scale: 9

# Learning rate for AdamW
learning_rate: 5e-6

# Weight decay. Higher = more regularization. Lower = closer to dataset.
adam_weight_decay: 0

# Optimizer parameters for the UNET. Overrides base learning rate parameters.
extra_unet_params: null
#learning_rate: 1e-5
#adam_weight_decay: 1e-4

# Optimizer parameters for the Text Encoder. Overrides base learning rate parameters.
extra_text_encoder_params: null
#learning_rate: 5e-6
#adam_weight_decay: 0.2

# How many batches to train. Not to be confused with video frames.
train_batch_size: 1

# Maximum number of train steps. Model is saved after training.
max_train_steps: 5000

# Saves a model every nth step.
checkpointing_steps: 1000

# How many steps to do for validation if sample_preview is enabled.
validation_steps: 200

# Which modules we want to unfreeze for the UNET. Advanced usage.
trainable_modules:
- "all"
# If you want to ignore temporal attention entirely, remove "attn1-2" and replace with ".attentions"
# This is for self attention. Activates for spatial and temporal dimensions if n_sample_frames > 1
#- "attn1"

# This is for cross attention (image & text data). Activates for spatial and temporal dimensions if n_sample_frames > 1
#- "attn2"

# Convolution networks that hold temporal information. Activates for spatial and temporal dimensions if n_sample_frames > 1
#- 'temp_conv'


# Which modules we want to unfreeze for the Text Encoder. Advanced usage.
trainable_text_modules:
- "all"

# Seed for validation.
seed: 64

# Whether or not we want to use mixed precision with accelerate
mixed_precision: "fp16"

# This seems to be incompatible at the moment.
use_8bit_adam: False

# Trades VRAM usage for speed. You lose roughly 20% of training speed, but save a lot of VRAM.
# If you need to save more VRAM, it can also be enabled for the text encoder, but it roughly halves speed.
gradient_checkpointing: True
text_encoder_gradient_checkpointing: True

# Xformers must be installed for best memory savings and performance (< Pytorch 2.0)
enable_xformers_memory_efficient_attention: False

# Use scaled dot product attention (Only available with >= Torch 2.0)
enable_torch_2_attn: True
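
As an aside on two of the settings above, here is a small sketch (plain PyTorch, not the repo's dataset or training code) of what they amount to: the frame indices implied by sample_start_idx, frame_step, and n_sample_frames, and the per-sample, per-channel shift that offset noise adds when use_offset_noise is enabled.

```python
# Unofficial sketch of two settings from the config above; not the repo's code.
import torch

# 1) Frame selection: n_sample_frames frames starting at sample_start_idx,
#    stepping by frame_step (the config's "(frame_step=2) -> (1,3,5,7, ...)").
sample_start_idx, frame_step, n_sample_frames = 1, 2, 8
frame_indices = [sample_start_idx + i * frame_step for i in range(n_sample_frames)]
print(frame_indices)  # [1, 3, 5, 7, 9, 11, 13, 15]

# 2) Offset noise (see the linked crosslabs post), as commonly implemented:
#    on top of the usual Gaussian noise, add a constant shift drawn once per
#    sample and per channel.
latents = torch.randn(1, 4, 8, 32, 32)  # illustrative (batch, channels, frames, h, w)
offset_noise_strength = 0.1             # value from the config
noise = torch.randn_like(latents) + offset_noise_strength * torch.randn(
    latents.shape[0], latents.shape[1], 1, 1, 1, device=latents.device
)
```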
17 changes: 7 additions & 10 deletions configs/v2/low_vram_config_example.yaml
@@ -15,9 +15,6 @@ output_dir: "./outputs"
# 'json': The JSON file created with automatic BLIP2 captions using https://github.com/ExponentialML/Video-BLIP2-Preprocessor
# 'single_video': A single video file.mp4 and text prompt
dataset_types:
-- 'image'
-- 'folder'
-- 'json'
- 'single_video'

# Adds offset noise to training. See https://www.crosslabs.org/blog/diffusion-with-offset-noise
@@ -49,7 +46,7 @@ train_text_encoder: False
use_unet_lora: False

# Use LoRA for the Text Encoder.
-use_text_lora: True
+use_text_lora: False

# The modules to use for LoRA. Different from 'trainable_modules'.
unet_lora_modules:
@@ -84,13 +81,13 @@ train_data:
frame_step: 5

# The number of frames to sample. The higher this number, the higher the VRAM (acts similar to batch size).
-n_sample_frames: 2
+n_sample_frames: 4

# 'single_video'
single_video_path: "path/to/single/video.mp4"
single_video_path: "./input/v_SoccerJuggling_g16_c01.mp4"

# The prompt when using a single video file
-single_video_prompt: ""
+single_video_prompt: "A sks person is playing soccer."

# Fallback prompt if caption cannot be read. Enabled for 'image' and 'folder'.
fallback_prompt: ''
@@ -149,13 +146,13 @@ extra_text_encoder_params: null
train_batch_size: 1

# Maximum number of train steps. Model is saved after training.
-max_train_steps: 10000
+max_train_steps: 8000

# Saves a model every nth step.
-checkpointing_steps: 2500
+checkpointing_steps: 1000

# How many steps to do for validation if sample_preview is enabled.
-validation_steps: 100
+validation_steps: 1000

# Which modules we want to unfreeze for the UNET. Advanced usage.
trainable_modules: