@@ -38,7 +38,7 @@ class DDIMSchedulerOutput(BaseOutput):
         prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images):
             Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the
             denoising loop.
-        pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images, *optional*):
+        pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images):
             The predicted denoised sample `(x_{0})` based on the model output from the current timestep.
             `pred_original_sample` can be used to preview progress or for guidance.
     """
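For orientation, the sketch below shows how the two fields documented above are typically consumed in a denoising loop. It is a minimal, hypothetical example and not part of this change; the random tensors stand in for a real UNet prediction and latent.

```python
import torch
from diffusers import DDIMScheduler

scheduler = DDIMScheduler(num_train_timesteps=1000)
scheduler.set_timesteps(50)

sample = torch.randn(1, 3, 64, 64)           # placeholder latent
for t in scheduler.timesteps:
    model_output = torch.randn_like(sample)  # placeholder for a UNet noise prediction
    out = scheduler.step(model_output, t, sample)
    sample = out.prev_sample                 # x_{t-1}: used as the next model input
    preview = out.pred_original_sample       # x_0 estimate: preview progress or apply guidance
```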
@@ -49,36 +49,36 @@ class DDIMSchedulerOutput(BaseOutput):
 
 # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
 def betas_for_alpha_bar(
-    num_diffusion_timesteps: int,
-    max_beta: float = 0.999,
-    alpha_transform_type: Literal["cosine", "exp"] = "cosine",
-) -> torch.Tensor:
+    num_diffusion_timesteps,
+    max_beta=0.999,
+    alpha_transform_type="cosine",
+):
     """
     Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
     (1-beta) over time from t = [0,1].
 
     Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
     to that part of the diffusion process.
 
+
     Args:
-        num_diffusion_timesteps (`int`):
-            The number of betas to produce.
-        max_beta (`float`, defaults to 0.999):
-            The maximum beta to use; use values lower than 1 to avoid numerical instability.
-        alpha_transform_type (`Literal["cosine", "exp"]`, defaults to `"cosine"`):
-            The type of noise schedule for `alpha_bar`. Must be one of `"cosine"` or `"exp"`.
+        num_diffusion_timesteps (`int`): the number of betas to produce.
+        max_beta (`float`): the maximum beta to use; use values lower than 1 to
+            prevent singularities.
+        alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
+            Choose from `cosine` or `exp`
 
     Returns:
-        `torch.Tensor`: The betas used by the scheduler to step the model outputs.
+        betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
     """
     if alpha_transform_type == "cosine":
 
-        def alpha_bar_fn(t: float) -> float:
+        def alpha_bar_fn(t):
             return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2
 
     elif alpha_transform_type == "exp":
 
-        def alpha_bar_fn(t: float) -> float:
+        def alpha_bar_fn(t):
             return math.exp(t * -12.0)
 
     else:
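The hunk above only shows the `alpha_bar_fn` definitions. The standalone sketch below (hypothetical helper name `cosine_betas`, not part of this diff) illustrates how such a schedule is typically turned into discrete betas by keeping the cumulative product of `(1 - beta)` consistent with `alpha_bar`.

```python
import math

import torch


def cosine_betas(num_diffusion_timesteps: int, max_beta: float = 0.999) -> torch.Tensor:
    # alpha_bar(t): cumulative product of (1 - beta) as a continuous function of t in [0, 1]
    def alpha_bar_fn(t):
        return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2

    betas = []
    for i in range(num_diffusion_timesteps):
        t1 = i / num_diffusion_timesteps
        t2 = (i + 1) / num_diffusion_timesteps
        # choose beta_i so that prod(1 - beta) tracks alpha_bar, capped at max_beta for stability
        betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta))
    return torch.tensor(betas, dtype=torch.float32)


betas = cosine_betas(1000)  # 1000 increasing betas, each at most max_beta
```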
@@ -281,21 +281,13 @@ def _get_variance(self, timestep: int, prev_timestep: int) -> torch.Tensor:
     # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
     def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
         """
-        Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
+        "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
         prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
         s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
         pixels from saturation at each step. We find that dynamic thresholding results in significantly better
-        photorealism as well as better image-text alignment, especially when using very large guidance weights.
-
-        See https://huggingface.co/papers/2205.11487
+        photorealism as well as better image-text alignment, especially when using very large guidance weights."
 
-        Args:
-            sample (`torch.Tensor`):
-                The sample to threshold.
-
-        Returns:
-            `torch.Tensor`:
-                The thresholded sample.
+        https://huggingface.co/papers/2205.11487
         """
         dtype = sample.dtype
         batch_size, channels, *remaining_dims = sample.shape
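A minimal sketch of the dynamic thresholding rule quoted in the docstring, assuming a batch of image-shaped `x_0` predictions and a percentile `ratio`; the scheduler's actual `_threshold_sample` additionally handles dtype upcasting and arbitrary trailing dimensions.

```python
import torch


def dynamic_threshold(x0_pred: torch.Tensor, ratio: float = 0.995) -> torch.Tensor:
    batch_size = x0_pred.shape[0]
    flat = x0_pred.reshape(batch_size, -1)
    s = torch.quantile(flat.abs(), ratio, dim=1)      # per-sample percentile of |x_0|
    s = torch.clamp(s, min=1.0).view(batch_size, 1)   # if s <= 1 the thresholding is a no-op
    flat = torch.clamp(flat, -s, s) / s               # clip to [-s, s], then rescale into [-1, 1]
    return flat.reshape(x0_pred.shape)


thresholded = dynamic_threshold(torch.randn(2, 3, 64, 64) * 3)  # values pushed back into [-1, 1]
```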
@@ -509,24 +501,6 @@ def add_noise(
         noise: torch.Tensor,
         timesteps: torch.IntTensor,
     ) -> torch.Tensor:
-        """
-        Add noise to the original samples according to the noise magnitude at each timestep.
-
-        This implements the forward diffusion process using the formula: `noisy_sample = sqrt(alpha_prod) *
-        original_sample + sqrt(1 - alpha_prod) * noise`
-
-        Args:
-            original_samples (`torch.Tensor`):
-                The original clean samples to which noise will be added.
-            noise (`torch.Tensor`):
-                The noise tensor to add, typically sampled from a Gaussian distribution.
-            timesteps (`torch.IntTensor`):
-                The timesteps indicating the noise level from the diffusion schedule.
-
-        Returns:
-            `torch.Tensor`:
-                The noisy samples with noise added according to the timestep schedule.
-        """
         # Make sure alphas_cumprod and timestep have same device and dtype as original_samples
         # Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement
         # for the subsequent add_noise calls
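The removed docstring spelled out the forward-diffusion formula `noisy_sample = sqrt(alpha_prod) * original_sample + sqrt(1 - alpha_prod) * noise`. The standalone sketch below (hypothetical helper name, not the scheduler's method) illustrates it, including the broadcasting of the per-timestep scalars over image dimensions.

```python
import torch


def add_noise_sketch(
    original_samples: torch.Tensor,
    noise: torch.Tensor,
    timesteps: torch.IntTensor,
    alphas_cumprod: torch.Tensor,
) -> torch.Tensor:
    alphas_cumprod = alphas_cumprod.to(device=original_samples.device, dtype=original_samples.dtype)
    timesteps = timesteps.to(original_samples.device)
    sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5
    sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5
    # expand (batch,) -> (batch, 1, 1, ...) so the per-timestep scalars broadcast over channel/spatial dims
    while sqrt_alpha_prod.dim() < original_samples.dim():
        sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1)
        sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)
    return sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise
```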
@@ -549,27 +523,6 @@ def add_noise(
 
     # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity
     def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor:
-        """
-        Compute the velocity prediction for v-prediction models.
-
-        The velocity is computed using the formula: `velocity = sqrt(alpha_prod) * noise - sqrt(1 - alpha_prod) *
-        sample`
-
-        This is used in v-prediction models where the model directly predicts the velocity instead of the noise or the
-        sample. See section 2.4 of [Imagen Video](https://huggingface.co/papers/2210.02303) paper.
-
-        Args:
-            sample (`torch.Tensor`):
-                The input sample (x_t) at the current timestep.
-            noise (`torch.Tensor`):
-                The noise tensor corresponding to the sample.
-            timesteps (`torch.IntTensor`):
-                The timesteps at which to compute the velocity.
-
-        Returns:
-            `torch.Tensor`:
-                The velocity prediction computed from the sample and noise at the given timesteps.
-        """
         # Make sure alphas_cumprod and timestep have same device and dtype as sample
         self.alphas_cumprod = self.alphas_cumprod.to(device=sample.device)
         alphas_cumprod = self.alphas_cumprod.to(dtype=sample.dtype)
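Similarly, the removed `get_velocity` docstring described the v-prediction target `velocity = sqrt(alpha_prod) * noise - sqrt(1 - alpha_prod) * sample` (Imagen Video, section 2.4). A matching standalone sketch, again with a hypothetical helper name:

```python
import torch


def velocity_sketch(
    sample: torch.Tensor,
    noise: torch.Tensor,
    timesteps: torch.IntTensor,
    alphas_cumprod: torch.Tensor,
) -> torch.Tensor:
    alphas_cumprod = alphas_cumprod.to(device=sample.device, dtype=sample.dtype)
    timesteps = timesteps.to(sample.device)
    sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5
    sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5
    while sqrt_alpha_prod.dim() < sample.dim():
        sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1)
        sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)
    # v-prediction target: sqrt(alpha_bar_t) * noise - sqrt(1 - alpha_bar_t) * x_t
    return sqrt_alpha_prod * noise - sqrt_one_minus_alpha_prod * sample
```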