first commit
commit
0835b2db16
@ -0,0 +1,12 @@
|
||||
.idea
|
||||
.DS_Store
|
||||
__pycache__
|
||||
outputs/*
|
||||
other
|
||||
prolly_delete
|
||||
coverage
|
||||
downloads
|
||||
.coverage
|
||||
.coveragerc
|
||||
/imaginairy/data/stable-diffusion-v1.yaml
|
||||
/imaginairy/data/stable-diffusion-v1-4.ckpt
|
@ -0,0 +1,12 @@
|
||||
# ImaginAIry
|
||||
|
||||
AI imagined images.
|
||||
|
||||
|
||||
|
||||
|
||||
# Models
|
||||
|
||||
- LDM - Latent Diffusion
|
||||
- Stable Diffusion
|
||||
-
|
@ -0,0 +1,370 @@
|
||||
import argparse
|
||||
import os
|
||||
import random
|
||||
import re
|
||||
import subprocess
|
||||
from contextlib import nullcontext
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from PIL import Image
|
||||
from einops import rearrange
|
||||
from omegaconf import OmegaConf
|
||||
from pytorch_lightning import seed_everything
|
||||
from torch import autocast
|
||||
|
||||
from imaginairy.models.diffusion.ddim import DDIMSampler
|
||||
from imaginairy.models.diffusion.plms import PLMSSampler
|
||||
from imaginairy.utils import get_device, instantiate_from_config
|
||||
|
||||
LIB_PATH = os.path.dirname(__file__)
|
||||
|
||||
|
||||
def load_model_from_config(config, ckpt, verbose=False):
|
||||
print(f"Loading model from {ckpt}")
|
||||
pl_sd = torch.load(ckpt, map_location="cpu")
|
||||
if "global_step" in pl_sd:
|
||||
print(f"Global Step: {pl_sd['global_step']}")
|
||||
sd = pl_sd["state_dict"]
|
||||
model = instantiate_from_config(config.model)
|
||||
m, u = model.load_state_dict(sd, strict=False)
|
||||
if len(m) > 0 and verbose:
|
||||
print("missing keys:")
|
||||
print(m)
|
||||
if len(u) > 0 and verbose:
|
||||
print("unexpected keys:")
|
||||
print(u)
|
||||
|
||||
model.cuda()
|
||||
model.eval()
|
||||
return model
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser()
|
||||
|
||||
parser.add_argument(
|
||||
"--prompt", type=str, nargs="?", default=None, help="the prompt to render"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--outdir",
|
||||
type=str,
|
||||
nargs="?",
|
||||
help="dir to write results to",
|
||||
default="outputs/txt2img-samples",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--skip_save",
|
||||
action="store_true",
|
||||
help="do not save individual samples. For speed measurements.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--steps",
|
||||
type=int,
|
||||
default=50,
|
||||
help="number of sampling steps",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--plms", action="store_true", help="use plms sampling", default=True
|
||||
)
|
||||
parser.add_argument(
|
||||
"--fixed_code",
|
||||
action="store_true",
|
||||
help="if enabled, uses the same starting code across samples ",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--ddim_eta",
|
||||
type=float,
|
||||
default=0.0,
|
||||
help="ddim eta (eta=0.0 corresponds to deterministic sampling",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--n",
|
||||
type=int,
|
||||
default=1,
|
||||
help="how many images",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--H",
|
||||
type=int,
|
||||
default=512,
|
||||
help="image height, in pixel space",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--W",
|
||||
type=int,
|
||||
default=512,
|
||||
help="image width, in pixel space",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--C",
|
||||
type=int,
|
||||
default=4,
|
||||
help="latent channels",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--f",
|
||||
type=int,
|
||||
default=8,
|
||||
help="downsampling factor",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--scale",
|
||||
type=float,
|
||||
default=7.5,
|
||||
help="unconditional guidance scale: eps = eps(x, empty) + scale * (eps(x, cond) - eps(x, empty))",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--config",
|
||||
type=str,
|
||||
default="configs/stable-diffusion/v1-inference.yaml",
|
||||
help="path to config which constructs model",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--ckpt",
|
||||
type=str,
|
||||
default="data/sd-v1-4.ckpt",
|
||||
help="path to checkpoint of model",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--seed",
|
||||
type=int,
|
||||
default=None,
|
||||
help="the seed (for reproducible sampling)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--precision",
|
||||
type=str,
|
||||
help="evaluate at this precision",
|
||||
choices=["full", "autocast"],
|
||||
default="autocast",
|
||||
)
|
||||
opt = parser.parse_args()
|
||||
if opt.plms:
|
||||
sampler = "PLMS"
|
||||
else:
|
||||
sampler = "DDIM"
|
||||
prompt = ImaginePrompt(
|
||||
opt.prompt,
|
||||
seed=opt.seed,
|
||||
sampler_type=sampler,
|
||||
steps=opt.steps,
|
||||
height=opt.H,
|
||||
width=opt.W,
|
||||
upscale=True,
|
||||
fix_faces=True,
|
||||
)
|
||||
imagine(
|
||||
[prompt],
|
||||
config=opt.config,
|
||||
ckpt=opt.ckpt,
|
||||
outdir=opt.outdir,
|
||||
fixed_code=opt.fixed_code,
|
||||
latent_channels=opt.C,
|
||||
precision=opt.precision,
|
||||
downsampling_factor=opt.f,
|
||||
skip_save=opt.skip_save,
|
||||
ddim_eta=opt.ddim_eta,
|
||||
)
|
||||
|
||||
|
||||
class WeightedPrompt:
|
||||
def __init__(self, text, weight=1):
|
||||
self.text = text
|
||||
self.weight = weight
|
||||
|
||||
def __str__(self):
|
||||
return f"{self.weight}*({self.text})"
|
||||
|
||||
|
||||
class ImaginePrompt:
|
||||
def __init__(
|
||||
self,
|
||||
prompt=None,
|
||||
seed=None,
|
||||
prompt_strength=7.5,
|
||||
sampler_type="PLMS",
|
||||
steps=50,
|
||||
height=512,
|
||||
width=512,
|
||||
upscale=True,
|
||||
fix_faces=True,
|
||||
parts=None,
|
||||
):
|
||||
prompt = prompt if prompt is not None else "a scenic landscape"
|
||||
if isinstance(prompt, str):
|
||||
self.prompts = [WeightedPrompt(prompt, 1)]
|
||||
else:
|
||||
self.prompts = prompt
|
||||
self.prompts.sort(key=lambda p: p.weight, reverse=True)
|
||||
self.seed = random.randint(1, 1_000_000_000) if seed is None else seed
|
||||
self.prompt_strength = prompt_strength
|
||||
self.sampler_type = sampler_type
|
||||
self.steps = steps
|
||||
self.height = height
|
||||
self.width = width
|
||||
self.upscale = upscale
|
||||
self.fix_faces = fix_faces
|
||||
self.parts = parts or {}
|
||||
|
||||
@property
|
||||
def prompt_text(self):
|
||||
if len(self.prompts) == 1:
|
||||
return self.prompts[0].text
|
||||
return "|".join(str(p) for p in self.prompts)
|
||||
|
||||
|
||||
def imagine(
|
||||
prompts,
|
||||
config="data/stable-diffusion-v1.yaml",
|
||||
ckpt="data/stable-diffusion-v1-4.ckpt",
|
||||
outdir="outputs/txt2img-samples",
|
||||
fixed_code=None,
|
||||
latent_channels=4,
|
||||
downsampling_factor=8,
|
||||
precision="autocast",
|
||||
skip_save=False,
|
||||
ddim_eta=0.0,
|
||||
):
|
||||
config = OmegaConf.load(f"{LIB_PATH}/{config}")
|
||||
model = load_model_from_config(config, f"{LIB_PATH}/{ckpt}")
|
||||
|
||||
model = model.to(get_device())
|
||||
|
||||
os.makedirs(outdir, exist_ok=True)
|
||||
outpath = outdir
|
||||
|
||||
sample_path = os.path.join(outpath)
|
||||
big_path = os.path.join(sample_path, "esrgan")
|
||||
os.makedirs(sample_path, exist_ok=True)
|
||||
os.makedirs(big_path, exist_ok=True)
|
||||
base_count = len(os.listdir(sample_path))
|
||||
|
||||
precision_scope = autocast if precision == "autocast" else nullcontext
|
||||
with (torch.no_grad(), precision_scope("cuda")):
|
||||
for prompt in prompts:
|
||||
seed_everything(prompt.seed)
|
||||
uc = None
|
||||
if prompt.prompt_strength != 1.0:
|
||||
uc = model.get_learned_conditioning(1 * [""])
|
||||
total_weight = sum(wp.weight for wp in prompt.prompts)
|
||||
c = sum(
|
||||
[
|
||||
model.get_learned_conditioning(wp.text) * (wp.weight / total_weight)
|
||||
for wp in prompt.prompts
|
||||
]
|
||||
)
|
||||
# c = model.get_learned_conditioning(prompt.prompt_text)
|
||||
|
||||
shape = [
|
||||
latent_channels,
|
||||
prompt.height // downsampling_factor,
|
||||
prompt.width // downsampling_factor,
|
||||
]
|
||||
|
||||
def img_callback(samples, i):
|
||||
return
|
||||
samples = model.decode_first_stage(samples)
|
||||
samples = torch.clamp((samples + 1.0) / 2.0, min=0.0, max=1.0)
|
||||
for pred_x0 in samples:
|
||||
pred_x0 = 255.0 * rearrange(pred_x0.cpu().numpy(), "c h w -> h w c")
|
||||
filename = f"{base_count:08}_S{seed}_step{i:04}.jpg"
|
||||
Image.fromarray(pred_x0.astype(np.uint8)).save(
|
||||
os.path.join(sample_path, filename)
|
||||
)
|
||||
|
||||
start_code = None
|
||||
if fixed_code:
|
||||
start_code = torch.randn(
|
||||
[1, latent_channels, prompt.height, prompt.width],
|
||||
device=get_device(),
|
||||
)
|
||||
sampler = get_sampler(prompt.sampler_type, model)
|
||||
samples_ddim, _ = sampler.sample(
|
||||
S=prompt.steps,
|
||||
conditioning=c,
|
||||
batch_size=1,
|
||||
shape=shape,
|
||||
verbose=False,
|
||||
unconditional_guidance_scale=prompt.prompt_strength,
|
||||
unconditional_conditioning=uc,
|
||||
eta=ddim_eta,
|
||||
x_T=start_code,
|
||||
img_callback=img_callback,
|
||||
)
|
||||
|
||||
x_samples_ddim = model.decode_first_stage(samples_ddim)
|
||||
x_samples_ddim = torch.clamp((x_samples_ddim + 1.0) / 2.0, min=0.0, max=1.0)
|
||||
|
||||
if not skip_save:
|
||||
for x_sample in x_samples_ddim:
|
||||
x_sample = 255.0 * rearrange(
|
||||
x_sample.cpu().numpy(), "c h w -> h w c"
|
||||
)
|
||||
basefilename = f"{base_count:06}_{prompt.seed}_{prompt.sampler_type}{prompt.steps}_PS{prompt.prompt_strength}_{prompt_normalized(prompt.prompt_text)}"
|
||||
filepath = os.path.join(sample_path, f"{basefilename}.jpg")
|
||||
img = Image.fromarray(x_sample.astype(np.uint8))
|
||||
if prompt.fix_faces:
|
||||
img = fix_faces_GFPGAN(img)
|
||||
img.save(filepath)
|
||||
if prompt.upscale:
|
||||
enlarge_realesrgan2x(
|
||||
filepath,
|
||||
os.path.join(big_path, basefilename) + ".jpg",
|
||||
)
|
||||
base_count += 1
|
||||
return f"{basefilename}.jpg"
|
||||
|
||||
|
||||
def prompt_normalized(prompt):
|
||||
return re.sub(r"[^a-zA-Z0-9.,]+", "_", prompt)[:130]
|
||||
|
||||
|
||||
DOWNLOADED_FILES_PATH = f"{LIB_PATH}/../downloads/"
|
||||
ESRGAN_PATH = DOWNLOADED_FILES_PATH + "realesrgan-ncnn-vulkan/realesrgan-ncnn-vulkan"
|
||||
|
||||
|
||||
def enlarge_realesrgan2x(src, dst):
|
||||
process = subprocess.Popen(
|
||||
[ESRGAN_PATH, "-i", src, "-o", dst, "-n", "realesrgan-x4plus"]
|
||||
)
|
||||
process.wait()
|
||||
|
||||
|
||||
def get_sampler(sampler_type, model):
|
||||
sampler_type = sampler_type.upper()
|
||||
if sampler_type == "PLMS":
|
||||
return PLMSSampler(model)
|
||||
elif sampler_type == "DDIM":
|
||||
return DDIMSampler(model)
|
||||
|
||||
|
||||
def gfpgan_model():
|
||||
from gfpgan import GFPGANer
|
||||
|
||||
return GFPGANer(
|
||||
model_path=DOWNLOADED_FILES_PATH
|
||||
+ "GFPGAN/experiments/pretrained_models/GFPGANv1.3.pth",
|
||||
upscale=1,
|
||||
arch="clean",
|
||||
channel_multiplier=2,
|
||||
bg_upsampler=None,
|
||||
device=torch.device(get_device()),
|
||||
)
|
||||
|
||||
|
||||
def fix_faces_GFPGAN(image):
|
||||
image = image.convert("RGB")
|
||||
cropped_faces, restored_faces, restored_img = gfpgan_model().enhance(
|
||||
np.array(image, dtype=np.uint8),
|
||||
has_aligned=False,
|
||||
only_center_face=False,
|
||||
paste_back=True,
|
||||
)
|
||||
res = Image.fromarray(restored_img)
|
||||
|
||||
return res
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
@ -0,0 +1,151 @@
|
||||
import torch
|
||||
import pytorch_lightning as pl
|
||||
import torch.nn.functional as F
|
||||
|
||||
from taming.modules.vqvae.quantize import VectorQuantizer2 as VectorQuantizer
|
||||
|
||||
from imaginairy.modules.diffusionmodules.model import Encoder, Decoder
|
||||
from imaginairy.modules.distributions import DiagonalGaussianDistribution
|
||||
|
||||
from imaginairy.utils import instantiate_from_config
|
||||
|
||||
|
||||
class VQModel(pl.LightningModule):
|
||||
def __init__(
|
||||
self,
|
||||
ddconfig,
|
||||
lossconfig,
|
||||
n_embed,
|
||||
embed_dim,
|
||||
ckpt_path=None,
|
||||
ignore_keys=[],
|
||||
image_key="image",
|
||||
colorize_nlabels=None,
|
||||
monitor=None,
|
||||
batch_resize_range=None,
|
||||
scheduler_config=None,
|
||||
lr_g_factor=1.0,
|
||||
remap=None,
|
||||
sane_index_shape=False, # tell vector quantizer to return indices as bhw
|
||||
):
|
||||
super().__init__()
|
||||
self.embed_dim = embed_dim
|
||||
self.n_embed = n_embed
|
||||
self.image_key = image_key
|
||||
self.encoder = Encoder(**ddconfig)
|
||||
self.decoder = Decoder(**ddconfig)
|
||||
self.loss = instantiate_from_config(lossconfig)
|
||||
self.quantize = VectorQuantizer(
|
||||
n_embed,
|
||||
embed_dim,
|
||||
beta=0.25,
|
||||
remap=remap,
|
||||
sane_index_shape=sane_index_shape,
|
||||
)
|
||||
self.quant_conv = torch.nn.Conv2d(ddconfig["z_channels"], embed_dim, 1)
|
||||
self.post_quant_conv = torch.nn.Conv2d(embed_dim, ddconfig["z_channels"], 1)
|
||||
if colorize_nlabels is not None:
|
||||
assert type(colorize_nlabels) == int
|
||||
self.register_buffer("colorize", torch.randn(3, colorize_nlabels, 1, 1))
|
||||
if monitor is not None:
|
||||
self.monitor = monitor
|
||||
self.batch_resize_range = batch_resize_range
|
||||
if self.batch_resize_range is not None:
|
||||
print(
|
||||
f"{self.__class__.__name__}: Using per-batch resizing in range {batch_resize_range}."
|
||||
)
|
||||
|
||||
if ckpt_path is not None:
|
||||
self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys)
|
||||
self.scheduler_config = scheduler_config
|
||||
self.lr_g_factor = lr_g_factor
|
||||
|
||||
|
||||
|
||||
class VQModelInterface(VQModel):
|
||||
def __init__(self, embed_dim, *args, **kwargs):
|
||||
super().__init__(embed_dim=embed_dim, *args, **kwargs)
|
||||
self.embed_dim = embed_dim
|
||||
|
||||
def encode(self, x):
|
||||
h = self.encoder(x)
|
||||
h = self.quant_conv(h)
|
||||
return h
|
||||
|
||||
def decode(self, h, force_not_quantize=False):
|
||||
# also go through quantization layer
|
||||
if not force_not_quantize:
|
||||
quant, emb_loss, info = self.quantize(h)
|
||||
else:
|
||||
quant = h
|
||||
quant = self.post_quant_conv(quant)
|
||||
dec = self.decoder(quant)
|
||||
return dec
|
||||
|
||||
|
||||
class AutoencoderKL(pl.LightningModule):
|
||||
def __init__(
|
||||
self,
|
||||
ddconfig,
|
||||
lossconfig,
|
||||
embed_dim,
|
||||
ckpt_path=None,
|
||||
ignore_keys=[],
|
||||
image_key="image",
|
||||
colorize_nlabels=None,
|
||||
monitor=None,
|
||||
):
|
||||
super().__init__()
|
||||
self.image_key = image_key
|
||||
self.encoder = Encoder(**ddconfig)
|
||||
self.decoder = Decoder(**ddconfig)
|
||||
self.loss = instantiate_from_config(lossconfig)
|
||||
assert ddconfig["double_z"]
|
||||
self.quant_conv = torch.nn.Conv2d(2 * ddconfig["z_channels"], 2 * embed_dim, 1)
|
||||
self.post_quant_conv = torch.nn.Conv2d(embed_dim, ddconfig["z_channels"], 1)
|
||||
self.embed_dim = embed_dim
|
||||
if colorize_nlabels is not None:
|
||||
assert type(colorize_nlabels) == int
|
||||
self.register_buffer("colorize", torch.randn(3, colorize_nlabels, 1, 1))
|
||||
if monitor is not None:
|
||||
self.monitor = monitor
|
||||
if ckpt_path is not None:
|
||||
self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys)
|
||||
|
||||
def init_from_ckpt(self, path, ignore_keys=list()):
|
||||
sd = torch.load(path, map_location="cpu")["state_dict"]
|
||||
keys = list(sd.keys())
|
||||
for k in keys:
|
||||
for ik in ignore_keys:
|
||||
if k.startswith(ik):
|
||||
print("Deleting key {} from state_dict.".format(k))
|
||||
del sd[k]
|
||||
self.load_state_dict(sd, strict=False)
|
||||
print(f"Restored from {path}")
|
||||
|
||||
def encode(self, x):
|
||||
h = self.encoder(x)
|
||||
moments = self.quant_conv(h)
|
||||
posterior = DiagonalGaussianDistribution(moments)
|
||||
return posterior
|
||||
|
||||
def decode(self, z):
|
||||
z = self.post_quant_conv(z)
|
||||
dec = self.decoder(z)
|
||||
return dec
|
||||
|
||||
def forward(self, input, sample_posterior=True):
|
||||
posterior = self.encode(input)
|
||||
if sample_posterior:
|
||||
z = posterior.sample()
|
||||
else:
|
||||
z = posterior.mode()
|
||||
dec = self.decode(z)
|
||||
return dec, posterior
|
||||
|
||||
def get_input(self, batch, k):
|
||||
x = batch[k]
|
||||
if len(x.shape) == 3:
|
||||
x = x[..., None]
|
||||
x = x.permute(0, 3, 1, 2).to(memory_format=torch.contiguous_format).float()
|
||||
return x
|
@ -0,0 +1,373 @@
|
||||
"""SAMPLING ONLY."""
|
||||
|
||||
import torch
|
||||
import numpy as np
|
||||
from tqdm import tqdm
|
||||
|
||||
from imaginairy.modules.diffusionmodules.util import (
|
||||
make_ddim_sampling_parameters,
|
||||
make_ddim_timesteps,
|
||||
noise_like,
|
||||
extract_into_tensor,
|
||||
)
|
||||
from imaginairy.utils import get_device
|
||||
|
||||
|
||||
class DDIMSampler:
|
||||
def __init__(self, model, schedule="linear", **kwargs):
|
||||
super().__init__()
|
||||
self.model = model
|
||||
self.ddpm_num_timesteps = model.num_timesteps
|
||||
self.schedule = schedule
|
||||
self.device_available = get_device()
|
||||
|
||||
def register_buffer(self, name, attr):
|
||||
if type(attr) == torch.Tensor:
|
||||
if attr.device != torch.device(self.device_available):
|
||||
attr = attr.to(torch.float32).to(torch.device(self.device_available))
|
||||
setattr(self, name, attr)
|
||||
|
||||
def make_schedule(
|
||||
self, ddim_num_steps, ddim_discretize="uniform", ddim_eta=0.0, verbose=True
|
||||
):
|
||||
self.ddim_timesteps = make_ddim_timesteps(
|
||||
ddim_discr_method=ddim_discretize,
|
||||
num_ddim_timesteps=ddim_num_steps,
|
||||
num_ddpm_timesteps=self.ddpm_num_timesteps,
|
||||
verbose=verbose,
|
||||
)
|
||||
alphas_cumprod = self.model.alphas_cumprod
|
||||
assert (
|
||||
alphas_cumprod.shape[0] == self.ddpm_num_timesteps
|
||||
), "alphas have to be defined for each timestep"
|
||||
to_torch = lambda x: x.clone().detach().to(torch.float32).to(self.model.device)
|
||||
|
||||
self.register_buffer("betas", to_torch(self.model.betas))
|
||||
self.register_buffer("alphas_cumprod", to_torch(alphas_cumprod))
|
||||
self.register_buffer(
|
||||
"alphas_cumprod_prev", to_torch(self.model.alphas_cumprod_prev)
|
||||
)
|
||||
|
||||
# calculations for diffusion q(x_t | x_{t-1}) and others
|
||||
self.register_buffer(
|
||||
"sqrt_alphas_cumprod", to_torch(np.sqrt(alphas_cumprod.cpu()))
|
||||
)
|
||||
self.register_buffer(
|
||||
"sqrt_one_minus_alphas_cumprod",
|
||||
to_torch(np.sqrt(1.0 - alphas_cumprod.cpu())),
|
||||
)
|
||||
self.register_buffer(
|
||||
"log_one_minus_alphas_cumprod", to_torch(np.log(1.0 - alphas_cumprod.cpu()))
|
||||
)
|
||||
self.register_buffer(
|
||||
"sqrt_recip_alphas_cumprod", to_torch(np.sqrt(1.0 / alphas_cumprod.cpu()))
|
||||
)
|
||||
self.register_buffer(
|
||||
"sqrt_recipm1_alphas_cumprod",
|
||||
to_torch(np.sqrt(1.0 / alphas_cumprod.cpu() - 1)),
|
||||
)
|
||||
|
||||
# ddim sampling parameters
|
||||
ddim_sigmas, ddim_alphas, ddim_alphas_prev = make_ddim_sampling_parameters(
|
||||
alphacums=alphas_cumprod.cpu(),
|
||||
ddim_timesteps=self.ddim_timesteps,
|
||||
eta=ddim_eta,
|
||||
verbose=verbose,
|
||||
)
|
||||
self.register_buffer("ddim_sigmas", ddim_sigmas)
|
||||
self.register_buffer("ddim_alphas", ddim_alphas)
|
||||
self.register_buffer("ddim_alphas_prev", ddim_alphas_prev)
|
||||
self.register_buffer("ddim_sqrt_one_minus_alphas", np.sqrt(1.0 - ddim_alphas))
|
||||
sigmas_for_original_sampling_steps = ddim_eta * torch.sqrt(
|
||||
(1 - self.alphas_cumprod_prev)
|
||||
/ (1 - self.alphas_cumprod)
|
||||
* (1 - self.alphas_cumprod / self.alphas_cumprod_prev)
|
||||
)
|
||||
self.register_buffer(
|
||||
"ddim_sigmas_for_original_num_steps", sigmas_for_original_sampling_steps
|
||||
)
|
||||
|
||||
@torch.no_grad()
|
||||
def sample(
|
||||
self,
|
||||
S,
|
||||
batch_size,
|
||||
shape,
|
||||
conditioning=None,
|
||||
callback=None,
|
||||
normals_sequence=None,
|
||||
img_callback=None,
|
||||
quantize_x0=False,
|
||||
eta=0.0,
|
||||
mask=None,
|
||||
x0=None,
|
||||
temperature=1.0,
|
||||
noise_dropout=0.0,
|
||||
score_corrector=None,
|
||||
corrector_kwargs=None,
|
||||
verbose=True,
|
||||
x_T=None,
|
||||
log_every_t=100,
|
||||
unconditional_guidance_scale=1.0,
|
||||
unconditional_conditioning=None,
|
||||
# this has to come in the same format as the conditioning, # e.g. as encoded tokens, ...
|
||||
**kwargs,
|
||||
):
|
||||
if conditioning is not None:
|
||||
if isinstance(conditioning, dict):
|
||||
cbs = conditioning[list(conditioning.keys())[0]].shape[0]
|
||||
if cbs != batch_size:
|
||||
print(
|
||||
f"Warning: Got {cbs} conditionings but batch-size is {batch_size}"
|
||||
)
|
||||
else:
|
||||
if conditioning.shape[0] != batch_size:
|
||||
print(
|
||||
f"Warning: Got {conditioning.shape[0]} conditionings but batch-size is {batch_size}"
|
||||
)
|
||||
|
||||
self.make_schedule(ddim_num_steps=S, ddim_eta=eta, verbose=verbose)
|
||||
# sampling
|
||||
C, H, W = shape
|
||||
size = (batch_size, C, H, W)
|
||||
print(f"Data shape for DDIM sampling is {size}, eta {eta}")
|
||||
|
||||
samples, intermediates = self.ddim_sampling(
|
||||
conditioning,
|
||||
size,
|
||||
callback=callback,
|
||||
img_callback=img_callback,
|
||||
quantize_denoised=quantize_x0,
|
||||
mask=mask,
|
||||
x0=x0,
|
||||
ddim_use_original_steps=False,
|
||||
noise_dropout=noise_dropout,
|
||||
temperature=temperature,
|
||||
score_corrector=score_corrector,
|
||||
corrector_kwargs=corrector_kwargs,
|
||||
x_T=x_T,
|
||||
log_every_t=log_every_t,
|
||||
unconditional_guidance_scale=unconditional_guidance_scale,
|
||||
unconditional_conditioning=unconditional_conditioning,
|
||||
)
|
||||
return samples, intermediates
|
||||
|
||||
@torch.no_grad()
|
||||
def ddim_sampling(
|
||||
self,
|
||||
cond,
|
||||
shape,
|
||||
x_T=None,
|
||||
ddim_use_original_steps=False,
|
||||
callback=None,
|
||||
timesteps=None,
|
||||
quantize_denoised=False,
|
||||
mask=None,
|
||||
x0=None,
|
||||
img_callback=None,
|
||||
log_every_t=100,
|
||||
temperature=1.0,
|
||||
noise_dropout=0.0,
|
||||
score_corrector=None,
|
||||
corrector_kwargs=None,
|
||||
unconditional_guidance_scale=1.0,
|
||||
unconditional_conditioning=None,
|
||||
):
|
||||
device = self.model.betas.device
|
||||
b = shape[0]
|
||||
if x_T is None:
|
||||
img = torch.randn(shape, device=device)
|
||||
else:
|
||||
img = x_T
|
||||
|
||||
if timesteps is None:
|
||||
timesteps = (
|
||||
self.ddpm_num_timesteps
|
||||
if ddim_use_original_steps
|
||||
else self.ddim_timesteps
|
||||
)
|
||||
elif timesteps is not None and not ddim_use_original_steps:
|
||||
subset_end = (
|
||||
int(
|
||||
min(timesteps / self.ddim_timesteps.shape[0], 1)
|
||||
* self.ddim_timesteps.shape[0]
|
||||
)
|
||||
- 1
|
||||
)
|
||||
timesteps = self.ddim_timesteps[:subset_end]
|
||||
|
||||
intermediates = {"x_inter": [img], "pred_x0": [img]}
|
||||
time_range = (
|
||||
reversed(range(0, timesteps))
|
||||
if ddim_use_original_steps
|
||||
else np.flip(timesteps)
|
||||
)
|
||||
total_steps = timesteps if ddim_use_original_steps else timesteps.shape[0]
|
||||
print(f"Running DDIM Sampling with {total_steps} timesteps")
|
||||
|
||||
iterator = tqdm(time_range, desc="DDIM Sampler", total=total_steps)
|
||||
|
||||
for i, step in enumerate(iterator):
|
||||
index = total_steps - i - 1
|
||||
ts = torch.full((b,), step, device=device, dtype=torch.long)
|
||||
|
||||
if mask is not None:
|
||||
assert x0 is not None
|
||||
img_orig = self.model.q_sample(
|
||||
x0, ts
|
||||
) # TODO: deterministic forward pass?
|
||||
img = img_orig * mask + (1.0 - mask) * img
|
||||
|
||||
outs = self.p_sample_ddim(
|
||||
img,
|
||||
cond,
|
||||
ts,
|
||||
index=index,
|
||||
use_original_steps=ddim_use_original_steps,
|
||||
quantize_denoised=quantize_denoised,
|
||||
temperature=temperature,
|
||||
noise_dropout=noise_dropout,
|
||||
score_corrector=score_corrector,
|
||||
corrector_kwargs=corrector_kwargs,
|
||||
unconditional_guidance_scale=unconditional_guidance_scale,
|
||||
unconditional_conditioning=unconditional_conditioning,
|
||||
)
|
||||
img, pred_x0 = outs
|
||||
if callback:
|
||||
callback(i)
|
||||
if img_callback:
|
||||
img_callback(pred_x0, i)
|
||||
|
||||
if index % log_every_t == 0 or index == total_steps - 1:
|
||||
intermediates["x_inter"].append(img)
|
||||
intermediates["pred_x0"].append(pred_x0)
|
||||
|
||||
return img, intermediates
|
||||
|
||||
@torch.no_grad()
|
||||
def p_sample_ddim(
|
||||
self,
|
||||
x,
|
||||
c,
|
||||
t,
|
||||
index,
|
||||
repeat_noise=False,
|
||||
use_original_steps=False,
|
||||
quantize_denoised=False,
|
||||
temperature=1.0,
|
||||
noise_dropout=0.0,
|
||||
score_corrector=None,
|
||||
corrector_kwargs=None,
|
||||
unconditional_guidance_scale=1.0,
|
||||
unconditional_conditioning=None,
|
||||
):
|
||||
b, *_, device = *x.shape, x.device
|
||||
|
||||
if unconditional_conditioning is None or unconditional_guidance_scale == 1.0:
|
||||
e_t = self.model.apply_model(x, t, c)
|
||||
else:
|
||||
x_in = torch.cat([x] * 2)
|
||||
t_in = torch.cat([t] * 2)
|
||||
c_in = torch.cat([unconditional_conditioning, c])
|
||||
e_t_uncond, e_t = self.model.apply_model(x_in, t_in, c_in).chunk(2)
|
||||
e_t = e_t_uncond + unconditional_guidance_scale * (e_t - e_t_uncond)
|
||||
|
||||
if score_corrector is not None:
|
||||
assert self.model.parameterization == "eps"
|
||||
e_t = score_corrector.modify_score(
|
||||
self.model, e_t, x, t, c, **corrector_kwargs
|
||||
)
|
||||
|
||||
alphas = self.model.alphas_cumprod if use_original_steps else self.ddim_alphas
|
||||
alphas_prev = (
|
||||
self.model.alphas_cumprod_prev
|
||||
if use_original_steps
|
||||
else self.ddim_alphas_prev
|
||||
)
|
||||
sqrt_one_minus_alphas = (
|
||||
self.model.sqrt_one_minus_alphas_cumprod
|
||||
if use_original_steps
|
||||
else self.ddim_sqrt_one_minus_alphas
|
||||
)
|
||||
sigmas = (
|
||||
self.model.ddim_sigmas_for_original_num_steps
|
||||
if use_original_steps
|
||||
else self.ddim_sigmas
|
||||
)
|
||||
# select parameters corresponding to the currently considered timestep
|
||||
a_t = torch.full((b, 1, 1, 1), alphas[index], device=device)
|
||||
a_prev = torch.full((b, 1, 1, 1), alphas_prev[index], device=device)
|
||||
sigma_t = torch.full((b, 1, 1, 1), sigmas[index], device=device)
|
||||
sqrt_one_minus_at = torch.full(
|
||||
(b, 1, 1, 1), sqrt_one_minus_alphas[index], device=device
|
||||
)
|
||||
|
||||
# current prediction for x_0
|
||||
pred_x0 = (x - sqrt_one_minus_at * e_t) / a_t.sqrt()
|
||||
if quantize_denoised:
|
||||
pred_x0, _, *_ = self.model.first_stage_model.quantize(pred_x0)
|
||||
# direction pointing to x_t
|
||||
dir_xt = (1.0 - a_prev - sigma_t**2).sqrt() * e_t
|
||||
noise = sigma_t * noise_like(x.shape, device, repeat_noise) * temperature
|
||||
if noise_dropout > 0.0:
|
||||
noise = torch.nn.functional.dropout(noise, p=noise_dropout)
|
||||
x_prev = a_prev.sqrt() * pred_x0 + dir_xt + noise
|
||||
return x_prev, pred_x0
|
||||
|
||||
@torch.no_grad()
|
||||
def stochastic_encode(self, x0, t, use_original_steps=False, noise=None):
|
||||
# fast, but does not allow for exact reconstruction
|
||||
# t serves as an index to gather the correct alphas
|
||||
if use_original_steps:
|
||||
sqrt_alphas_cumprod = self.sqrt_alphas_cumprod
|
||||
sqrt_one_minus_alphas_cumprod = self.sqrt_one_minus_alphas_cumprod
|
||||
else:
|
||||
sqrt_alphas_cumprod = torch.sqrt(self.ddim_alphas)
|
||||
sqrt_one_minus_alphas_cumprod = self.ddim_sqrt_one_minus_alphas
|
||||
|
||||
if noise is None:
|
||||
noise = torch.randn_like(x0)
|
||||
return (
|
||||
extract_into_tensor(sqrt_alphas_cumprod, t, x0.shape) * x0
|
||||
+ extract_into_tensor(sqrt_one_minus_alphas_cumprod, t, x0.shape) * noise
|
||||
)
|
||||
|
||||
@torch.no_grad()
|
||||
def decode(
|
||||
self,
|
||||
x_latent,
|
||||
cond,
|
||||
t_start,
|
||||
unconditional_guidance_scale=1.0,
|
||||
unconditional_conditioning=None,
|
||||
use_original_steps=False,
|
||||
):
|
||||
|
||||
timesteps = (
|
||||
np.arange(self.ddpm_num_timesteps)
|
||||
if use_original_steps
|
||||
else self.ddim_timesteps
|
||||
)
|
||||
timesteps = timesteps[:t_start]
|
||||
|
||||
time_range = np.flip(timesteps)
|
||||
total_steps = timesteps.shape[0]
|
||||
print(f"Running DDIM Sampling with {total_steps} timesteps")
|
||||
|
||||
iterator = tqdm(time_range, desc="Decoding image", total=total_steps)
|
||||
x_dec = x_latent
|
||||
for i, step in enumerate(iterator):
|
||||
index = total_steps - i - 1
|
||||
ts = torch.full(
|
||||
(x_latent.shape[0],), step, device=x_latent.device, dtype=torch.long
|
||||
)
|
||||
x_dec, _ = self.p_sample_ddim(
|
||||
x_dec,
|
||||
cond,
|
||||
ts,
|
||||
index=index,
|
||||
use_original_steps=use_original_steps,
|
||||
unconditional_guidance_scale=unconditional_guidance_scale,
|
||||
unconditional_conditioning=unconditional_conditioning,
|
||||
)
|
||||
return x_dec
|
@ -0,0 +1,904 @@
|
||||
"""
|
||||
wild mixture of
|
||||
https://github.com/lucidrains/denoising-diffusion-pytorch/blob/7706bdfc6f527f58d33f84b7b522e61e6e3164b3/denoising_diffusion_pytorch/denoising_diffusion_pytorch.py
|
||||
https://github.com/openai/improved-diffusion/blob/e94489283bb876ac1477d5dd7709bbbd2d9902ce/improved_diffusion/gaussian_diffusion.py
|
||||
https://github.com/CompVis/taming-transformers
|
||||
-- merci
|
||||
"""
|
||||
|
||||
from functools import partial
|
||||
|
||||
import numpy as np
|
||||
import pytorch_lightning as pl
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from einops import rearrange, repeat
|
||||
from torchvision.utils import make_grid
|
||||
from tqdm import tqdm
|
||||
|
||||
from imaginairy.models.autoencoder import (
|
||||
VQModelInterface,
|
||||
)
|
||||
from imaginairy.modules.diffusionmodules.util import (
|
||||
make_beta_schedule,
|
||||
extract_into_tensor,
|
||||
noise_like,
|
||||
)
|
||||
from imaginairy.modules.distributions import DiagonalGaussianDistribution
|
||||
from imaginairy.utils import print_params, instantiate_from_config
|
||||
|
||||
__conditioning_keys__ = {"concat": "c_concat", "crossattn": "c_crossattn", "adm": "y"}
|
||||
|
||||
|
||||
def disabled_train(self, mode=True):
|
||||
"""Overwrite model.train with this function to make sure train/eval mode
|
||||
does not change anymore."""
|
||||
return self
|
||||
|
||||
|
||||
def uniform_on_device(r1, r2, shape, device):
|
||||
return (r1 - r2) * torch.rand(*shape, device=device) + r2
|
||||
|
||||
|
||||
class DDPM(pl.LightningModule):
|
||||
# classic DDPM with Gaussian diffusion, in image space
|
||||
def __init__(
|
||||
self,
|
||||
unet_config,
|
||||
timesteps=1000,
|
||||
beta_schedule="linear",
|
||||
loss_type="l2",
|
||||
ckpt_path=None,
|
||||
ignore_keys=[],
|
||||
load_only_unet=False,
|
||||
monitor="val/loss",
|
||||
first_stage_key="image",
|
||||
image_size=256,
|
||||
channels=3,
|
||||
log_every_t=100,
|
||||
clip_denoised=True,
|
||||
linear_start=1e-4,
|
||||
linear_end=2e-2,
|
||||
cosine_s=8e-3,
|
||||
given_betas=None,
|
||||
original_elbo_weight=0.0,
|
||||
v_posterior=0.0, # weight for choosing posterior variance as sigma = (1-v) * beta_tilde + v * beta
|
||||
l_simple_weight=1.0,
|
||||
conditioning_key=None,
|
||||
parameterization="eps", # all assuming fixed variance schedules
|
||||
scheduler_config=None,
|
||||
use_positional_encodings=False,
|
||||
learn_logvar=False,
|
||||
logvar_init=0.0,
|
||||
):
|
||||
super().__init__()
|
||||
assert parameterization in [
|
||||
"eps",
|
||||
"x0",
|
||||
], 'currently only supporting "eps" and "x0"'
|
||||
self.parameterization = parameterization
|
||||
print(
|
||||
f"{self.__class__.__name__}: Running in {self.parameterization}-prediction mode"
|
||||
)
|
||||
self.cond_stage_model = None
|
||||
self.clip_denoised = clip_denoised
|
||||
self.log_every_t = log_every_t
|
||||
self.first_stage_key = first_stage_key
|
||||
self.image_size = image_size # try conv?
|
||||
self.channels = channels
|
||||
self.use_positional_encodings = use_positional_encodings
|
||||
self.model = DiffusionWrapper(unet_config, conditioning_key)
|
||||
print_params(self.model)
|
||||
|
||||
self.use_scheduler = scheduler_config is not None
|
||||
if self.use_scheduler:
|
||||
self.scheduler_config = scheduler_config
|
||||
|
||||
self.v_posterior = v_posterior
|
||||
self.original_elbo_weight = original_elbo_weight
|
||||
self.l_simple_weight = l_simple_weight
|
||||
|
||||
if monitor is not None:
|
||||
self.monitor = monitor
|
||||
if ckpt_path is not None:
|
||||
self.init_from_ckpt(
|
||||
ckpt_path, ignore_keys=ignore_keys, only_model=load_only_unet
|
||||
)
|
||||
|
||||
self.register_schedule(
|
||||
given_betas=given_betas,
|
||||
beta_schedule=beta_schedule,
|
||||
timesteps=timesteps,
|
||||
linear_start=linear_start,
|
||||
linear_end=linear_end,
|
||||
cosine_s=cosine_s,
|
||||
)
|
||||
|
||||
self.loss_type = loss_type
|
||||
|
||||
self.learn_logvar = learn_logvar
|
||||
self.logvar = torch.full(fill_value=logvar_init, size=(self.num_timesteps,))
|
||||
if self.learn_logvar:
|
||||
self.logvar = nn.Parameter(self.logvar, requires_grad=True)
|
||||
|
||||
def register_schedule(
|
||||
self,
|
||||
given_betas=None,
|
||||
beta_schedule="linear",
|
||||
timesteps=1000,
|
||||
linear_start=1e-4,
|
||||
linear_end=2e-2,
|
||||
cosine_s=8e-3,
|
||||
):
|
||||
if given_betas is not None:
|
||||
betas = given_betas
|
||||
else:
|
||||
betas = make_beta_schedule(
|
||||
beta_schedule,
|
||||
timesteps,
|
||||
linear_start=linear_start,
|
||||
linear_end=linear_end,
|
||||
cosine_s=cosine_s,
|
||||
)
|
||||
alphas = 1.0 - betas
|
||||
alphas_cumprod = np.cumprod(alphas, axis=0)
|
||||
alphas_cumprod_prev = np.append(1.0, alphas_cumprod[:-1])
|
||||
|
||||
(timesteps,) = betas.shape
|
||||
self.num_timesteps = int(timesteps)
|
||||
self.linear_start = linear_start
|
||||
self.linear_end = linear_end
|
||||
assert (
|
||||
alphas_cumprod.shape[0] == self.num_timesteps
|
||||
), "alphas have to be defined for each timestep"
|
||||
|
||||
to_torch = partial(torch.tensor, dtype=torch.float32)
|
||||
|
||||
self.register_buffer("betas", to_torch(betas))
|
||||
self.register_buffer("alphas_cumprod", to_torch(alphas_cumprod))
|
||||
self.register_buffer("alphas_cumprod_prev", to_torch(alphas_cumprod_prev))
|
||||
|
||||
# calculations for diffusion q(x_t | x_{t-1}) and others
|
||||
self.register_buffer("sqrt_alphas_cumprod", to_torch(np.sqrt(alphas_cumprod)))
|
||||
self.register_buffer(
|
||||
"sqrt_one_minus_alphas_cumprod", to_torch(np.sqrt(1.0 - alphas_cumprod))
|
||||
)
|
||||
self.register_buffer(
|
||||
"log_one_minus_alphas_cumprod", to_torch(np.log(1.0 - alphas_cumprod))
|
||||
)
|
||||
self.register_buffer(
|
||||
"sqrt_recip_alphas_cumprod", to_torch(np.sqrt(1.0 / alphas_cumprod))
|
||||
)
|
||||
self.register_buffer(
|
||||
"sqrt_recipm1_alphas_cumprod", to_torch(np.sqrt(1.0 / alphas_cumprod - 1))
|
||||
)
|
||||
|
||||
# calculations for posterior q(x_{t-1} | x_t, x_0)
|
||||
posterior_variance = (1 - self.v_posterior) * betas * (
|
||||
1.0 - alphas_cumprod_prev
|
||||
) / (1.0 - alphas_cumprod) + self.v_posterior * betas
|
||||
# above: equal to 1. / (1. / (1. - alpha_cumprod_tm1) + alpha_t / beta_t)
|
||||
self.register_buffer("posterior_variance", to_torch(posterior_variance))
|
||||
# below: log calculation clipped because the posterior variance is 0 at the beginning of the diffusion chain
|
||||
self.register_buffer(
|
||||
"posterior_log_variance_clipped",
|
||||
to_torch(np.log(np.maximum(posterior_variance, 1e-20))),
|
||||
)
|
||||
self.register_buffer(
|
||||
"posterior_mean_coef1",
|
||||
to_torch(betas * np.sqrt(alphas_cumprod_prev) / (1.0 - alphas_cumprod)),
|
||||
)
|
||||
self.register_buffer(
|
||||
"posterior_mean_coef2",
|
||||
to_torch(
|
||||
(1.0 - alphas_cumprod_prev) * np.sqrt(alphas) / (1.0 - alphas_cumprod)
|
||||
),
|
||||
)
|
||||
|
||||
if self.parameterization == "eps":
|
||||
lvlb_weights = self.betas ** 2 / (
|
||||
2
|
||||
* self.posterior_variance
|
||||
* to_torch(alphas)
|
||||
* (1 - self.alphas_cumprod)
|
||||
)
|
||||
elif self.parameterization == "x0":
|
||||
lvlb_weights = (
|
||||
0.5
|
||||
* np.sqrt(torch.Tensor(alphas_cumprod))
|
||||
/ (2.0 * 1 - torch.Tensor(alphas_cumprod))
|
||||
)
|
||||
else:
|
||||
raise NotImplementedError("mu not supported")
|
||||
# TODO how to choose this term
|
||||
lvlb_weights[0] = lvlb_weights[1]
|
||||
self.register_buffer("lvlb_weights", lvlb_weights, persistent=False)
|
||||
assert not torch.isnan(self.lvlb_weights).all()
|
||||
|
||||
|
||||
|
||||
class LatentDiffusion(DDPM):
|
||||
"""main class"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
first_stage_config,
|
||||
cond_stage_config,
|
||||
num_timesteps_cond=None,
|
||||
cond_stage_key="image",
|
||||
cond_stage_trainable=False,
|
||||
concat_mode=True,
|
||||
cond_stage_forward=None,
|
||||
conditioning_key=None,
|
||||
scale_factor=1.0,
|
||||
scale_by_std=False,
|
||||
*args,
|
||||
**kwargs,
|
||||
):
|
||||
self.num_timesteps_cond = 1 if num_timesteps_cond is None else num_timesteps_cond
|
||||
self.scale_by_std = scale_by_std
|
||||
assert self.num_timesteps_cond <= kwargs["timesteps"]
|
||||
# for backwards compatibility after implementation of DiffusionWrapper
|
||||
if conditioning_key is None:
|
||||
conditioning_key = "concat" if concat_mode else "crossattn"
|
||||
if cond_stage_config == "__is_unconditional__":
|
||||
conditioning_key = None
|
||||
ckpt_path = kwargs.pop("ckpt_path", None)
|
||||
ignore_keys = kwargs.pop("ignore_keys", [])
|
||||
super().__init__(conditioning_key=conditioning_key, *args, **kwargs)
|
||||
self.concat_mode = concat_mode
|
||||
self.cond_stage_trainable = cond_stage_trainable
|
||||
self.cond_stage_key = cond_stage_key
|
||||
try:
|
||||
self.num_downs = len(first_stage_config.params.ddconfig.ch_mult) - 1
|
||||
except:
|
||||
self.num_downs = 0
|
||||
if not scale_by_std:
|
||||
self.scale_factor = scale_factor
|
||||
else:
|
||||
self.register_buffer("scale_factor", torch.tensor(scale_factor))
|
||||
self.instantiate_first_stage(first_stage_config)
|
||||
self.instantiate_cond_stage(cond_stage_config)
|
||||
self.cond_stage_forward = cond_stage_forward
|
||||
self.clip_denoised = False
|
||||
self.bbox_tokenizer = None
|
||||
|
||||
self.restarted_from_ckpt = False
|
||||
if ckpt_path is not None:
|
||||
self.init_from_ckpt(ckpt_path, ignore_keys)
|
||||
self.restarted_from_ckpt = True
|
||||
|
||||
def make_cond_schedule(
|
||||
self,
|
||||
):
|
||||
self.cond_ids = torch.full(
|
||||
size=(self.num_timesteps,),
|
||||
fill_value=self.num_timesteps - 1,
|
||||
dtype=torch.long,
|
||||
)
|
||||
ids = torch.round(
|
||||
torch.linspace(0, self.num_timesteps - 1, self.num_timesteps_cond)
|
||||
).long()
|
||||
self.cond_ids[: self.num_timesteps_cond] = ids
|
||||
|
||||
def register_schedule(
|
||||
self,
|
||||
given_betas=None,
|
||||
beta_schedule="linear",
|
||||
timesteps=1000,
|
||||
linear_start=1e-4,
|
||||
linear_end=2e-2,
|
||||
cosine_s=8e-3,
|
||||
):
|
||||
super().register_schedule(
|
||||
given_betas, beta_schedule, timesteps, linear_start, linear_end, cosine_s
|
||||
)
|
||||
|
||||
self.shorten_cond_schedule = self.num_timesteps_cond > 1
|
||||
if self.shorten_cond_schedule:
|
||||
self.make_cond_schedule()
|
||||
|
||||
def instantiate_first_stage(self, config):
|
||||
model = instantiate_from_config(config)
|
||||
self.first_stage_model = model.eval()
|
||||
self.first_stage_model.train = disabled_train
|
||||
for param in self.first_stage_model.parameters():
|
||||
param.requires_grad = False
|
||||
|
||||
def instantiate_cond_stage(self, config):
|
||||
if not self.cond_stage_trainable:
|
||||
if config == "__is_first_stage__":
|
||||
print("Using first stage also as cond stage.")
|
||||
self.cond_stage_model = self.first_stage_model
|
||||
elif config == "__is_unconditional__":
|
||||
print(f"Training {self.__class__.__name__} as an unconditional model.")
|
||||
self.cond_stage_model = None
|
||||
# self.be_unconditional = True
|
||||
else:
|
||||
model = instantiate_from_config(config)
|
||||
self.cond_stage_model = model.eval()
|
||||
self.cond_stage_model.train = disabled_train
|
||||
for param in self.cond_stage_model.parameters():
|
||||
param.requires_grad = False
|
||||
else:
|
||||
assert config != "__is_first_stage__"
|
||||
assert config != "__is_unconditional__"
|
||||
model = instantiate_from_config(config)
|
||||
self.cond_stage_model = model
|
||||
|
||||
def _get_denoise_row_from_list(
|
||||
self, samples, desc="", force_no_decoder_quantization=False
|
||||
):
|
||||
denoise_row = []
|
||||
for zd in tqdm(samples, desc=desc):
|
||||
denoise_row.append(
|
||||
self.decode_first_stage(
|
||||
zd.to(self.device), force_not_quantize=force_no_decoder_quantization
|
||||
)
|
||||
)
|
||||
n_imgs_per_row = len(denoise_row)
|
||||
denoise_row = torch.stack(denoise_row) # n_log_step, n_row, C, H, W
|
||||
denoise_grid = rearrange(denoise_row, "n b c h w -> b n c h w")
|
||||
denoise_grid = rearrange(denoise_grid, "b n c h w -> (b n) c h w")
|
||||
denoise_grid = make_grid(denoise_grid, nrow=n_imgs_per_row)
|
||||
return denoise_grid
|
||||
|
||||
def get_first_stage_encoding(self, encoder_posterior):
|
||||
if isinstance(encoder_posterior, DiagonalGaussianDistribution):
|
||||
z = encoder_posterior.sample()
|
||||
elif isinstance(encoder_posterior, torch.Tensor):
|
||||
z = encoder_posterior
|
||||
else:
|
||||
raise NotImplementedError(
|
||||
f"encoder_posterior of type '{type(encoder_posterior)}' not yet implemented"
|
||||
)
|
||||
return self.scale_factor * z
|
||||
|
||||
def get_learned_conditioning(self, c):
|
||||
if self.cond_stage_forward is None:
|
||||
if hasattr(self.cond_stage_model, "encode") and callable(
|
||||
self.cond_stage_model.encode
|
||||
):
|
||||
c = self.cond_stage_model.encode(c)
|
||||
if isinstance(c, DiagonalGaussianDistribution):
|
||||
c = c.mode()
|
||||
else:
|
||||
c = self.cond_stage_model(c)
|
||||
else:
|
||||
assert hasattr(self.cond_stage_model, self.cond_stage_forward)
|
||||
c = getattr(self.cond_stage_model, self.cond_stage_forward)(c)
|
||||
return c
|
||||
|
||||
def meshgrid(self, h, w):
|
||||
y = torch.arange(0, h).view(h, 1, 1).repeat(1, w, 1)
|
||||
x = torch.arange(0, w).view(1, w, 1).repeat(h, 1, 1)
|
||||
|
||||
arr = torch.cat([y, x], dim=-1)
|
||||
return arr
|
||||
|
||||
def delta_border(self, h, w):
|
||||
"""
|
||||
:param h: height
|
||||
:param w: width
|
||||
:return: normalized distance to image border,
|
||||
wtith min distance = 0 at border and max dist = 0.5 at image center
|
||||
"""
|
||||
lower_right_corner = torch.tensor([h - 1, w - 1]).view(1, 1, 2)
|
||||
arr = self.meshgrid(h, w) / lower_right_corner
|
||||
dist_left_up = torch.min(arr, dim=-1, keepdims=True)[0]
|
||||
dist_right_down = torch.min(1 - arr, dim=-1, keepdims=True)[0]
|
||||
edge_dist = torch.min(
|
||||
torch.cat([dist_left_up, dist_right_down], dim=-1), dim=-1
|
||||
)[0]
|
||||
return edge_dist
|
||||
|
||||
def get_weighting(self, h, w, Ly, Lx, device):
|
||||
weighting = self.delta_border(h, w)
|
||||
weighting = torch.clip(
|
||||
weighting,
|
||||
self.split_input_params["clip_min_weight"],
|
||||
self.split_input_params["clip_max_weight"],
|
||||
)
|
||||
weighting = weighting.view(1, h * w, 1).repeat(1, 1, Ly * Lx).to(device)
|
||||
|
||||
if self.split_input_params["tie_braker"]:
|
||||
L_weighting = self.delta_border(Ly, Lx)
|
||||
L_weighting = torch.clip(
|
||||
L_weighting,
|
||||
self.split_input_params["clip_min_tie_weight"],
|
||||
self.split_input_params["clip_max_tie_weight"],
|
||||
)
|
||||
|
||||
L_weighting = L_weighting.view(1, 1, Ly * Lx).to(device)
|
||||
weighting = weighting * L_weighting
|
||||
return weighting
|
||||
|
||||
def get_fold_unfold(
|
||||
self, x, kernel_size, stride, uf=1, df=1
|
||||
): # todo load once not every time, shorten code
|
||||
"""
|
||||
:param x: img of size (bs, c, h, w)
|
||||
:return: n img crops of size (n, bs, c, kernel_size[0], kernel_size[1])
|
||||
"""
|
||||
bs, nc, h, w = x.shape
|
||||
|
||||
# number of crops in image
|
||||
Ly = (h - kernel_size[0]) // stride[0] + 1
|
||||
Lx = (w - kernel_size[1]) // stride[1] + 1
|
||||
|
||||
if uf == 1 and df == 1:
|
||||
fold_params = dict(
|
||||
kernel_size=kernel_size, dilation=1, padding=0, stride=stride
|
||||
)
|
||||
unfold = torch.nn.Unfold(**fold_params)
|
||||
|
||||
fold = torch.nn.Fold(output_size=x.shape[2:], **fold_params)
|
||||
|
||||
weighting = self.get_weighting(
|
||||
kernel_size[0], kernel_size[1], Ly, Lx, x.device
|
||||
).to(x.dtype)
|
||||
normalization = fold(weighting).view(1, 1, h, w) # normalizes the overlap
|
||||
weighting = weighting.view((1, 1, kernel_size[0], kernel_size[1], Ly * Lx))
|
||||
|
||||
elif uf > 1 and df == 1:
|
||||
fold_params = dict(
|
||||
kernel_size=kernel_size, dilation=1, padding=0, stride=stride
|
||||
)
|
||||
unfold = torch.nn.Unfold(**fold_params)
|
||||
|
||||
fold_params2 = dict(
|
||||
kernel_size=(kernel_size[0] * uf, kernel_size[0] * uf),
|
||||
dilation=1,
|
||||
padding=0,
|
||||
stride=(stride[0] * uf, stride[1] * uf),
|
||||
)
|
||||
fold = torch.nn.Fold(
|
||||
output_size=(x.shape[2] * uf, x.shape[3] * uf), **fold_params2
|
||||
)
|
||||
|
||||
weighting = self.get_weighting(
|
||||
kernel_size[0] * uf, kernel_size[1] * uf, Ly, Lx, x.device
|
||||
).to(x.dtype)
|
||||
normalization = fold(weighting).view(
|
||||
1, 1, h * uf, w * uf
|
||||
) # normalizes the overlap
|
||||
weighting = weighting.view(
|
||||
(1, 1, kernel_size[0] * uf, kernel_size[1] * uf, Ly * Lx)
|
||||
)
|
||||
|
||||
elif df > 1 and uf == 1:
|
||||
fold_params = dict(
|
||||
kernel_size=kernel_size, dilation=1, padding=0, stride=stride
|
||||
)
|
||||
unfold = torch.nn.Unfold(**fold_params)
|
||||
|
||||
fold_params2 = dict(
|
||||
kernel_size=(kernel_size[0] // df, kernel_size[0] // df),
|
||||
dilation=1,
|
||||
padding=0,
|
||||
stride=(stride[0] // df, stride[1] // df),
|
||||
)
|
||||
fold = torch.nn.Fold(
|
||||
output_size=(x.shape[2] // df, x.shape[3] // df), **fold_params2
|
||||
)
|
||||
|
||||
weighting = self.get_weighting(
|
||||
kernel_size[0] // df, kernel_size[1] // df, Ly, Lx, x.device
|
||||
).to(x.dtype)
|
||||
normalization = fold(weighting).view(
|
||||
1, 1, h // df, w // df
|
||||
) # normalizes the overlap
|
||||
weighting = weighting.view(
|
||||
(1, 1, kernel_size[0] // df, kernel_size[1] // df, Ly * Lx)
|
||||
)
|
||||
|
||||
else:
|
||||
raise NotImplementedError
|
||||
|
||||
return fold, unfold, normalization, weighting
|
||||
|
||||
@torch.no_grad()
|
||||
def get_input(
|
||||
self,
|
||||
batch,
|
||||
k,
|
||||
return_first_stage_outputs=False,
|
||||
force_c_encode=False,
|
||||
cond_key=None,
|
||||
return_original_cond=False,
|
||||
bs=None,
|
||||
):
|
||||
x = super().get_input(batch, k)
|
||||
if bs is not None:
|
||||
x = x[:bs]
|
||||
x = x.to(self.device)
|
||||
encoder_posterior = self.encode_first_stage(x)
|
||||
z = self.get_first_stage_encoding(encoder_posterior).detach()
|
||||
|
||||
if self.model.conditioning_key is not None:
|
||||
if cond_key is None:
|
||||
cond_key = self.cond_stage_key
|
||||
if cond_key != self.first_stage_key:
|
||||
if cond_key in ["caption", "coordinates_bbox"]:
|
||||
xc = batch[cond_key]
|
||||
elif cond_key == "class_label":
|
||||
xc = batch
|
||||
else:
|
||||
xc = super().get_input(batch, cond_key).to(self.device)
|
||||
else:
|
||||
xc = x
|
||||
if not self.cond_stage_trainable or force_c_encode:
|
||||
if isinstance(xc, dict) or isinstance(xc, list):
|
||||
# import pudb; pudb.set_trace()
|
||||
c = self.get_learned_conditioning(xc)
|
||||
else:
|
||||
c = self.get_learned_conditioning(xc.to(self.device))
|
||||
else:
|
||||
c = xc
|
||||
if bs is not None:
|
||||
c = c[:bs]
|
||||
|
||||
if self.use_positional_encodings:
|
||||
pos_x, pos_y = self.compute_latent_shifts(batch)
|
||||
ckey = __conditioning_keys__[self.model.conditioning_key]
|
||||
c = {ckey: c, "pos_x": pos_x, "pos_y": pos_y}
|
||||
|
||||
else:
|
||||
c = None
|
||||
xc = None
|
||||
if self.use_positional_encodings:
|
||||
pos_x, pos_y = self.compute_latent_shifts(batch)
|
||||
c = {"pos_x": pos_x, "pos_y": pos_y}
|
||||
out = [z, c]
|
||||
if return_first_stage_outputs:
|
||||
xrec = self.decode_first_stage(z)
|
||||
out.extend([x, xrec])
|
||||
if return_original_cond:
|
||||
out.append(xc)
|
||||
return out
|
||||
|
||||
@torch.no_grad()
|
||||
def decode_first_stage(self, z, predict_cids=False, force_not_quantize=False):
|
||||
if predict_cids:
|
||||
if z.dim() == 4:
|
||||
z = torch.argmax(z.exp(), dim=1).long()
|
||||
z = self.first_stage_model.quantize.get_codebook_entry(z, shape=None)
|
||||
z = rearrange(z, "b h w c -> b c h w").contiguous()
|
||||
|
||||
z = 1.0 / self.scale_factor * z
|
||||
|
||||
if hasattr(self, "split_input_params"):
|
||||
if self.split_input_params["patch_distributed_vq"]:
|
||||
ks = self.split_input_params["ks"] # eg. (128, 128)
|
||||
stride = self.split_input_params["stride"] # eg. (64, 64)
|
||||
uf = self.split_input_params["vqf"]
|
||||
bs, nc, h, w = z.shape
|
||||
if ks[0] > h or ks[1] > w:
|
||||
ks = (min(ks[0], h), min(ks[1], w))
|
||||
print("reducing Kernel")
|
||||
|
||||
if stride[0] > h or stride[1] > w:
|
||||
stride = (min(stride[0], h), min(stride[1], w))
|
||||
print("reducing stride")
|
||||
|
||||
fold, unfold, normalization, weighting = self.get_fold_unfold(
|
||||
z, ks, stride, uf=uf
|
||||
)
|
||||
|
||||
z = unfold(z) # (bn, nc * prod(**ks), L)
|
||||
# 1. Reshape to img shape
|
||||
z = z.view(
|
||||
(z.shape[0], -1, ks[0], ks[1], z.shape[-1])
|
||||
) # (bn, nc, ks[0], ks[1], L )
|
||||
|
||||
# 2. apply model loop over last dim
|
||||
if isinstance(self.first_stage_model, VQModelInterface):
|
||||
output_list = [
|
||||
self.first_stage_model.decode(
|
||||
z[:, :, :, :, i],
|
||||
force_not_quantize=predict_cids or force_not_quantize,
|
||||
)
|
||||
for i in range(z.shape[-1])
|
||||
]
|
||||
else:
|
||||
|
||||
output_list = [
|
||||
self.first_stage_model.decode(z[:, :, :, :, i])
|
||||
for i in range(z.shape[-1])
|
||||
]
|
||||
|
||||
o = torch.stack(output_list, axis=-1) # # (bn, nc, ks[0], ks[1], L)
|
||||
o = o * weighting
|
||||
# Reverse 1. reshape to img shape
|
||||
o = o.view((o.shape[0], -1, o.shape[-1])) # (bn, nc * ks[0] * ks[1], L)
|
||||
# stitch crops together
|
||||
decoded = fold(o)
|
||||
decoded = decoded / normalization # norm is shape (1, 1, h, w)
|
||||
return decoded
|
||||
else:
|
||||
if isinstance(self.first_stage_model, VQModelInterface):
|
||||
return self.first_stage_model.decode(
|
||||
z, force_not_quantize=predict_cids or force_not_quantize
|
||||
)
|
||||
else:
|
||||
return self.first_stage_model.decode(z)
|
||||
|
||||
else:
|
||||
if isinstance(self.first_stage_model, VQModelInterface):
|
||||
return self.first_stage_model.decode(
|
||||
z, force_not_quantize=predict_cids or force_not_quantize
|
||||
)
|
||||
else:
|
||||
return self.first_stage_model.decode(z)
|
||||
|
||||
def apply_model(self, x_noisy, t, cond, return_ids=False):
|
||||
|
||||
if isinstance(cond, dict):
|
||||
# hybrid case, cond is exptected to be a dict
|
||||
pass
|
||||
else:
|
||||
if not isinstance(cond, list):
|
||||
cond = [cond]
|
||||
key = (
|
||||
"c_concat" if self.model.conditioning_key == "concat" else "c_crossattn"
|
||||
)
|
||||
cond = {key: cond}
|
||||
|
||||
if hasattr(self, "split_input_params"):
|
||||
assert len(cond) == 1 # todo can only deal with one conditioning atm
|
||||
assert not return_ids
|
||||
ks = self.split_input_params["ks"] # eg. (128, 128)
|
||||
stride = self.split_input_params["stride"] # eg. (64, 64)
|
||||
|
||||
h, w = x_noisy.shape[-2:]
|
||||
|
||||
fold, unfold, normalization, weighting = self.get_fold_unfold(
|
||||
x_noisy, ks, stride
|
||||
)
|
||||
|
||||
z = unfold(x_noisy) # (bn, nc * prod(**ks), L)
|
||||
# Reshape to img shape
|
||||
z = z.view(
|
||||
(z.shape[0], -1, ks[0], ks[1], z.shape[-1])
|
||||
) # (bn, nc, ks[0], ks[1], L )
|
||||
z_list = [z[:, :, :, :, i] for i in range(z.shape[-1])]
|
||||
|
||||
if (
|
||||
self.cond_stage_key in ["image", "LR_image", "segmentation", "bbox_img"]
|
||||
and self.model.conditioning_key
|
||||
): # todo check for completeness
|
||||
c_key = next(iter(cond.keys())) # get key
|
||||
c = next(iter(cond.values())) # get value
|
||||
assert len(c) == 1 # todo extend to list with more than one elem
|
||||
c = c[0] # get element
|
||||
|
||||
c = unfold(c)
|
||||
c = c.view(
|
||||
(c.shape[0], -1, ks[0], ks[1], c.shape[-1])
|
||||
) # (bn, nc, ks[0], ks[1], L )
|
||||
|
||||
cond_list = [{c_key: [c[:, :, :, :, i]]} for i in range(c.shape[-1])]
|
||||
|
||||
elif self.cond_stage_key == "coordinates_bbox":
|
||||
assert (
|
||||
"original_image_size" in self.split_input_params
|
||||
), "BoudingBoxRescaling is missing original_image_size"
|
||||
|
||||
# assuming padding of unfold is always 0 and its dilation is always 1
|
||||
n_patches_per_row = int((w - ks[0]) / stride[0] + 1)
|
||||
full_img_h, full_img_w = self.split_input_params["original_image_size"]
|
||||
# as we are operating on latents, we need the factor from the original image size to the
|
||||
# spatial latent size to properly rescale the crops for regenerating the bbox annotations
|
||||
num_downs = self.first_stage_model.encoder.num_resolutions - 1
|
||||
rescale_latent = 2 ** (num_downs)
|
||||
|
||||
# get top left postions of patches as conforming for the bbbox tokenizer, therefore we
|
||||
# need to rescale the tl patch coordinates to be in between (0,1)
|
||||
tl_patch_coordinates = [
|
||||
(
|
||||
rescale_latent
|
||||
* stride[0]
|
||||
* (patch_nr % n_patches_per_row)
|
||||
/ full_img_w,
|
||||
rescale_latent
|
||||
* stride[1]
|
||||
* (patch_nr // n_patches_per_row)
|
||||
/ full_img_h,
|
||||
)
|
||||
for patch_nr in range(z.shape[-1])
|
||||
]
|
||||
|
||||
# patch_limits are tl_coord, width and height coordinates as (x_tl, y_tl, h, w)
|
||||
patch_limits = [
|
||||
(
|
||||
x_tl,
|
||||
y_tl,
|
||||
rescale_latent * ks[0] / full_img_w,
|
||||
rescale_latent * ks[1] / full_img_h,
|
||||
)
|
||||
for x_tl, y_tl in tl_patch_coordinates
|
||||
]
|
||||
# patch_values = [(np.arange(x_tl,min(x_tl+ks, 1.)),np.arange(y_tl,min(y_tl+ks, 1.))) for x_tl, y_tl in tl_patch_coordinates]
|
||||
|
||||
# tokenize crop coordinates for the bounding boxes of the respective patches
|
||||
patch_limits_tknzd = [
|
||||
torch.LongTensor(self.bbox_tokenizer._crop_encoder(bbox))[None].to(
|
||||
self.device
|
||||
)
|
||||
for bbox in patch_limits
|
||||
] # list of length l with tensors of shape (1, 2)
|
||||
print(patch_limits_tknzd[0].shape)
|
||||
# cut tknzd crop position from conditioning
|
||||
assert isinstance(cond, dict), "cond must be dict to be fed into model"
|
||||
cut_cond = cond["c_crossattn"][0][..., :-2].to(self.device)
|
||||
print(cut_cond.shape)
|
||||
|
||||
adapted_cond = torch.stack(
|
||||
[torch.cat([cut_cond, p], dim=1) for p in patch_limits_tknzd]
|
||||
)
|
||||
adapted_cond = rearrange(adapted_cond, "l b n -> (l b) n")
|
||||
print(adapted_cond.shape)
|
||||
adapted_cond = self.get_learned_conditioning(adapted_cond)
|
||||
print(adapted_cond.shape)
|
||||
adapted_cond = rearrange(
|
||||
adapted_cond, "(l b) n d -> l b n d", l=z.shape[-1]
|
||||
)
|
||||
print(adapted_cond.shape)
|
||||
|
||||
cond_list = [{"c_crossattn": [e]} for e in adapted_cond]
|
||||
|
||||
else:
|
||||
cond_list = [
|
||||
cond for i in range(z.shape[-1])
|
||||
] # Todo make this more efficient
|
||||
|
||||
# apply model by loop over crops
|
||||
output_list = [
|
||||
self.model(z_list[i], t, **cond_list[i]) for i in range(z.shape[-1])
|
||||
]
|
||||
assert not isinstance(
|
||||
output_list[0], tuple
|
||||
) # todo cant deal with multiple model outputs check this never happens
|
||||
|
||||
o = torch.stack(output_list, axis=-1)
|
||||
o = o * weighting
|
||||
# Reverse reshape to img shape
|
||||
o = o.view((o.shape[0], -1, o.shape[-1])) # (bn, nc * ks[0] * ks[1], L)
|
||||
# stitch crops together
|
||||
x_recon = fold(o) / normalization
|
||||
|
||||
else:
|
||||
x_recon = self.model(x_noisy, t, **cond)
|
||||
|
||||
if isinstance(x_recon, tuple) and not return_ids:
|
||||
return x_recon[0]
|
||||
else:
|
||||
return x_recon
|
||||
|
||||
def p_mean_variance(
|
||||
self,
|
||||
x,
|
||||
c,
|
||||
t,
|
||||
clip_denoised: bool,
|
||||
return_codebook_ids=False,
|
||||
quantize_denoised=False,
|
||||
return_x0=False,
|
||||
score_corrector=None,
|
||||
corrector_kwargs=None,
|
||||
):
|
||||
t_in = t
|
||||
model_out = self.apply_model(x, t_in, c, return_ids=return_codebook_ids)
|
||||
|
||||
if score_corrector is not None:
|
||||
assert self.parameterization == "eps"
|
||||
model_out = score_corrector.modify_score(
|
||||
self, model_out, x, t, c, **corrector_kwargs
|
||||
)
|
||||
|
||||
if return_codebook_ids:
|
||||
model_out, logits = model_out
|
||||
|
||||
if self.parameterization == "eps":
|
||||
x_recon = self.predict_start_from_noise(x, t=t, noise=model_out)
|
||||
elif self.parameterization == "x0":
|
||||
x_recon = model_out
|
||||
else:
|
||||
raise NotImplementedError()
|
||||
|
||||
if clip_denoised:
|
||||
x_recon.clamp_(-1.0, 1.0)
|
||||
if quantize_denoised:
|
||||
x_recon, _, [_, _, indices] = self.first_stage_model.quantize(x_recon)
|
||||
model_mean, posterior_variance, posterior_log_variance = self.q_posterior(
|
||||
x_start=x_recon, x_t=x, t=t
|
||||
)
|
||||
if return_codebook_ids:
|
||||
return model_mean, posterior_variance, posterior_log_variance, logits
|
||||
elif return_x0:
|
||||
return model_mean, posterior_variance, posterior_log_variance, x_recon
|
||||
else:
|
||||
return model_mean, posterior_variance, posterior_log_variance
|
||||
|
||||
@torch.no_grad()
|
||||
def p_sample(
|
||||
self,
|
||||
x,
|
||||
c,
|
||||
t,
|
||||
clip_denoised=False,
|
||||
repeat_noise=False,
|
||||
return_codebook_ids=False,
|
||||
quantize_denoised=False,
|
||||
return_x0=False,
|
||||
temperature=1.0,
|
||||
noise_dropout=0.0,
|
||||
score_corrector=None,
|
||||
corrector_kwargs=None,
|
||||
):
|
||||
b, *_, device = *x.shape, x.device
|
||||
outputs = self.p_mean_variance(
|
||||
x=x,
|
||||
c=c,
|
||||
t=t,
|
||||
clip_denoised=clip_denoised,
|
||||
return_codebook_ids=return_codebook_ids,
|
||||
quantize_denoised=quantize_denoised,
|
||||
return_x0=return_x0,
|
||||
score_corrector=score_corrector,
|
||||
corrector_kwargs=corrector_kwargs,
|
||||
)
|
||||
if return_codebook_ids:
|
||||
raise DeprecationWarning("Support dropped.")
|
||||
model_mean, _, model_log_variance, logits = outputs
|
||||
elif return_x0:
|
||||
model_mean, _, model_log_variance, x0 = outputs
|
||||
else:
|
||||
model_mean, _, model_log_variance = outputs
|
||||
|
||||
noise = noise_like(x.shape, device, repeat_noise) * temperature
|
||||
if noise_dropout > 0.0:
|
||||
noise = torch.nn.functional.dropout(noise, p=noise_dropout)
|
||||
# no noise when t == 0
|
||||
nonzero_mask = (1 - (t == 0).float()).reshape(b, *((1,) * (len(x.shape) - 1)))
|
||||
|
||||
if return_codebook_ids:
|
||||
return model_mean + nonzero_mask * (
|
||||
0.5 * model_log_variance
|
||||
).exp() * noise, logits.argmax(dim=1)
|
||||
if return_x0:
|
||||
return (
|
||||
model_mean + nonzero_mask * (0.5 * model_log_variance).exp() * noise,
|
||||
x0,
|
||||
)
|
||||
else:
|
||||
return model_mean + nonzero_mask * (0.5 * model_log_variance).exp() * noise
|
||||
|
||||
|
||||
class DiffusionWrapper(pl.LightningModule):
|
||||
def __init__(self, diff_model_config, conditioning_key):
|
||||
super().__init__()
|
||||
self.diffusion_model = instantiate_from_config(diff_model_config)
|
||||
self.conditioning_key = conditioning_key
|
||||
assert self.conditioning_key in [None, "concat", "crossattn", "hybrid", "adm"]
|
||||
|
||||
def forward(self, x, t, c_concat: list = None, c_crossattn: list = None):
|
||||
if self.conditioning_key is None:
|
||||
out = self.diffusion_model(x, t)
|
||||
elif self.conditioning_key == "concat":
|
||||
xc = torch.cat([x] + c_concat, dim=1)
|
||||
out = self.diffusion_model(xc, t)
|
||||
elif self.conditioning_key == "crossattn":
|
||||
cc = torch.cat(c_crossattn, 1)
|
||||
out = self.diffusion_model(x, t, context=cc)
|
||||
elif self.conditioning_key == "hybrid":
|
||||
xc = torch.cat([x] + c_concat, dim=1)
|
||||
cc = torch.cat(c_crossattn, 1)
|
||||
out = self.diffusion_model(xc, t, context=cc)
|
||||
elif self.conditioning_key == "adm":
|
||||
cc = c_crossattn[0]
|
||||
out = self.diffusion_model(x, t, y=cc)
|
||||
else:
|
||||
raise NotImplementedError()
|
||||
|
||||
return out
|
@ -0,0 +1,360 @@
|
||||
"""SAMPLING ONLY."""
|
||||
|
||||
import torch
|
||||
import numpy as np
|
||||
from tqdm import tqdm
|
||||
|
||||
from imaginairy.modules.diffusionmodules.util import (
|
||||
make_ddim_sampling_parameters,
|
||||
make_ddim_timesteps,
|
||||
noise_like,
|
||||
)
|
||||
from imaginairy.utils import get_device
|
||||
|
||||
|
||||
class PLMSSampler(object):
|
||||
def __init__(self, model, schedule="linear", **kwargs):
|
||||
super().__init__()
|
||||
self.model = model
|
||||
self.ddpm_num_timesteps = model.num_timesteps
|
||||
self.schedule = schedule
|
||||
self.device_available = get_device()
|
||||
|
||||
def register_buffer(self, name, attr):
|
||||
if type(attr) == torch.Tensor:
|
||||
if attr.device != torch.device(self.device_available):
|
||||
attr = attr.to(torch.float32).to(torch.device(self.device_available))
|
||||
setattr(self, name, attr)
|
||||
|
||||
def make_schedule(
|
||||
self, ddim_num_steps, ddim_discretize="uniform", ddim_eta=0.0, verbose=True
|
||||
):
|
||||
if ddim_eta != 0:
|
||||
raise ValueError("ddim_eta must be 0 for PLMS")
|
||||
self.ddim_timesteps = make_ddim_timesteps(
|
||||
ddim_discr_method=ddim_discretize,
|
||||
num_ddim_timesteps=ddim_num_steps,
|
||||
num_ddpm_timesteps=self.ddpm_num_timesteps,
|
||||
verbose=verbose,
|
||||
)
|
||||
alphas_cumprod = self.model.alphas_cumprod
|
||||
assert (
|
||||
alphas_cumprod.shape[0] == self.ddpm_num_timesteps
|
||||
), "alphas have to be defined for each timestep"
|
||||
to_torch = lambda x: x.clone().detach().to(torch.float32).to(self.model.device)
|
||||
|
||||
self.register_buffer("betas", to_torch(self.model.betas))
|
||||
self.register_buffer("alphas_cumprod", to_torch(alphas_cumprod))
|
||||
self.register_buffer(
|
||||
"alphas_cumprod_prev", to_torch(self.model.alphas_cumprod_prev)
|
||||
)
|
||||
|
||||
# calculations for diffusion q(x_t | x_{t-1}) and others
|
||||
self.register_buffer(
|
||||
"sqrt_alphas_cumprod", to_torch(np.sqrt(alphas_cumprod.cpu()))
|
||||
)
|
||||
self.register_buffer(
|
||||
"sqrt_one_minus_alphas_cumprod",
|
||||
to_torch(np.sqrt(1.0 - alphas_cumprod.cpu())),
|
||||
)
|
||||
self.register_buffer(
|
||||
"log_one_minus_alphas_cumprod", to_torch(np.log(1.0 - alphas_cumprod.cpu()))
|
||||
)
|
||||
self.register_buffer(
|
||||
"sqrt_recip_alphas_cumprod", to_torch(np.sqrt(1.0 / alphas_cumprod.cpu()))
|
||||
)
|
||||
self.register_buffer(
|
||||
"sqrt_recipm1_alphas_cumprod",
|
||||
to_torch(np.sqrt(1.0 / alphas_cumprod.cpu() - 1)),
|
||||
)
|
||||
|
||||
# ddim sampling parameters
|
||||
ddim_sigmas, ddim_alphas, ddim_alphas_prev = make_ddim_sampling_parameters(
|
||||
alphacums=alphas_cumprod.cpu(),
|
||||
ddim_timesteps=self.ddim_timesteps,
|
||||
eta=ddim_eta,
|
||||
verbose=verbose,
|
||||
)
|
||||
self.register_buffer("ddim_sigmas", ddim_sigmas)
|
||||
self.register_buffer("ddim_alphas", ddim_alphas)
|
||||
self.register_buffer("ddim_alphas_prev", ddim_alphas_prev)
|
||||
self.register_buffer("ddim_sqrt_one_minus_alphas", np.sqrt(1.0 - ddim_alphas))
|
||||
sigmas_for_original_sampling_steps = ddim_eta * torch.sqrt(
|
||||
(1 - self.alphas_cumprod_prev)
|
||||
/ (1 - self.alphas_cumprod)
|
||||
* (1 - self.alphas_cumprod / self.alphas_cumprod_prev)
|
||||
)
|
||||
self.register_buffer(
|
||||
"ddim_sigmas_for_original_num_steps", sigmas_for_original_sampling_steps
|
||||
)
|
||||
|
||||
@torch.no_grad()
|
||||
def sample(
|
||||
self,
|
||||
S,
|
||||
batch_size,
|
||||
shape,
|
||||
conditioning=None,
|
||||
callback=None,
|
||||
normals_sequence=None,
|
||||
img_callback=None,
|
||||
quantize_x0=False,
|
||||
eta=0.0,
|
||||
mask=None,
|
||||
x0=None,
|
||||
temperature=1.0,
|
||||
noise_dropout=0.0,
|
||||
score_corrector=None,
|
||||
corrector_kwargs=None,
|
||||
verbose=True,
|
||||
x_T=None,
|
||||
log_every_t=100,
|
||||
unconditional_guidance_scale=1.0,
|
||||
unconditional_conditioning=None,
|
||||
# this has to come in the same format as the conditioning, # e.g. as encoded tokens, ...
|
||||
**kwargs,
|
||||
):
|
||||
if conditioning is not None:
|
||||
if isinstance(conditioning, dict):
|
||||
cbs = conditioning[list(conditioning.keys())[0]].shape[0]
|
||||
if cbs != batch_size:
|
||||
print(
|
||||
f"Warning: Got {cbs} conditionings but batch-size is {batch_size}"
|
||||
)
|
||||
else:
|
||||
if conditioning.shape[0] != batch_size:
|
||||
print(
|
||||
f"Warning: Got {conditioning.shape[0]} conditionings but batch-size is {batch_size}"
|
||||
)
|
||||
|
||||
self.make_schedule(ddim_num_steps=S, ddim_eta=eta, verbose=verbose)
|
||||
# sampling
|
||||
C, H, W = shape
|
||||
size = (batch_size, C, H, W)
|
||||
print(f"Data shape for PLMS sampling is {size}")
|
||||
|
||||
samples, intermediates = self.plms_sampling(
|
||||
conditioning,
|
||||
size,
|
||||
callback=callback,
|
||||
img_callback=img_callback,
|
||||
quantize_denoised=quantize_x0,
|
||||
mask=mask,
|
||||
x0=x0,
|
||||
ddim_use_original_steps=False,
|
||||
noise_dropout=noise_dropout,
|
||||
temperature=temperature,
|
||||
score_corrector=score_corrector,
|
||||
corrector_kwargs=corrector_kwargs,
|
||||
x_T=x_T,
|
||||
log_every_t=log_every_t,
|
||||
unconditional_guidance_scale=unconditional_guidance_scale,
|
||||
unconditional_conditioning=unconditional_conditioning,
|
||||
)
|
||||
return samples, intermediates
|
||||
|
||||
@torch.no_grad()
|
||||
def plms_sampling(
|
||||
self,
|
||||
cond,
|
||||
shape,
|
||||
x_T=None,
|
||||
ddim_use_original_steps=False,
|
||||
callback=None,
|
||||
timesteps=None,
|
||||
quantize_denoised=False,
|
||||
mask=None,
|
||||
x0=None,
|
||||
img_callback=None,
|
||||
log_every_t=100,
|
||||
temperature=1.0,
|
||||
noise_dropout=0.0,
|
||||
score_corrector=None,
|
||||
corrector_kwargs=None,
|
||||
unconditional_guidance_scale=1.0,
|
||||
unconditional_conditioning=None,
|
||||
):
|
||||
device = self.model.betas.device
|
||||
b = shape[0]
|
||||
if x_T is None:
|
||||
img = torch.randn(shape, device=device)
|
||||
else:
|
||||
img = x_T
|
||||
|
||||
if timesteps is None:
|
||||
timesteps = (
|
||||
self.ddpm_num_timesteps
|
||||
if ddim_use_original_steps
|
||||
else self.ddim_timesteps
|
||||
)
|
||||
elif timesteps is not None and not ddim_use_original_steps:
|
||||
subset_end = (
|
||||
int(
|
||||
min(timesteps / self.ddim_timesteps.shape[0], 1)
|
||||
* self.ddim_timesteps.shape[0]
|
||||
)
|
||||
- 1
|
||||
)
|
||||
timesteps = self.ddim_timesteps[:subset_end]
|
||||
|
||||
intermediates = {"x_inter": [img], "pred_x0": [img]}
|
||||
time_range = (
|
||||
list(reversed(range(0, timesteps)))
|
||||
if ddim_use_original_steps
|
||||
else np.flip(timesteps)
|
||||
)
|
||||
total_steps = timesteps if ddim_use_original_steps else timesteps.shape[0]
|
||||
print(f"Running PLMS Sampling with {total_steps} timesteps")
|
||||
|
||||
iterator = tqdm(time_range, desc="PLMS Sampler", total=total_steps)
|
||||
old_eps = []
|
||||
|
||||
for i, step in enumerate(iterator):
|
||||
index = total_steps - i - 1
|
||||
ts = torch.full((b,), step, device=device, dtype=torch.long)
|
||||
ts_next = torch.full(
|
||||
(b,),
|
||||
time_range[min(i + 1, len(time_range) - 1)],
|
||||
device=device,
|
||||
dtype=torch.long,
|
||||
)
|
||||
|
||||
if mask is not None:
|
||||
assert x0 is not None
|
||||
img_orig = self.model.q_sample(
|
||||
x0, ts
|
||||
) # TODO: deterministic forward pass?
|
||||
img = img_orig * mask + (1.0 - mask) * img
|
||||
|
||||
outs = self.p_sample_plms(
|
||||
img,
|
||||
cond,
|
||||
ts,
|
||||
index=index,
|
||||
use_original_steps=ddim_use_original_steps,
|
||||
quantize_denoised=quantize_denoised,
|
||||
temperature=temperature,
|
||||
noise_dropout=noise_dropout,
|
||||
score_corrector=score_corrector,
|
||||
corrector_kwargs=corrector_kwargs,
|
||||
unconditional_guidance_scale=unconditional_guidance_scale,
|
||||
unconditional_conditioning=unconditional_conditioning,
|
||||
old_eps=old_eps,
|
||||
t_next=ts_next,
|
||||
)
|
||||
img, pred_x0, e_t = outs
|
||||
old_eps.append(e_t)
|
||||
if len(old_eps) >= 4:
|
||||
old_eps.pop(0)
|
||||
if callback:
|
||||
callback(i)
|
||||
if img_callback:
|
||||
img_callback(pred_x0, i)
|
||||
|
||||
if index % log_every_t == 0 or index == total_steps - 1:
|
||||
intermediates["x_inter"].append(img)
|
||||
intermediates["pred_x0"].append(pred_x0)
|
||||
|
||||
return img, intermediates
|
||||
|
||||
@torch.no_grad()
|
||||
def p_sample_plms(
|
||||
self,
|
||||
x,
|
||||
c,
|
||||
t,
|
||||
index,
|
||||
repeat_noise=False,
|
||||
use_original_steps=False,
|
||||
quantize_denoised=False,
|
||||
temperature=1.0,
|
||||
noise_dropout=0.0,
|
||||
score_corrector=None,
|
||||
corrector_kwargs=None,
|
||||
unconditional_guidance_scale=1.0,
|
||||
unconditional_conditioning=None,
|
||||
old_eps=None,
|
||||
t_next=None,
|
||||
):
|
||||
b, *_, device = *x.shape, x.device
|
||||
|
||||
def get_model_output(x, t):
|
||||
if (
|
||||
unconditional_conditioning is None
|
||||
or unconditional_guidance_scale == 1.0
|
||||
):
|
||||
e_t = self.model.apply_model(x, t, c)
|
||||
else:
|
||||
x_in = torch.cat([x] * 2)
|
||||
t_in = torch.cat([t] * 2)
|
||||
c_in = torch.cat([unconditional_conditioning, c])
|
||||
e_t_uncond, e_t = self.model.apply_model(x_in, t_in, c_in).chunk(2)
|
||||
e_t = e_t_uncond + unconditional_guidance_scale * (e_t - e_t_uncond)
|
||||
|
||||
if score_corrector is not None:
|
||||
assert self.model.parameterization == "eps"
|
||||
e_t = score_corrector.modify_score(
|
||||
self.model, e_t, x, t, c, **corrector_kwargs
|
||||
)
|
||||
|
||||
return e_t
|
||||
|
||||
alphas = self.model.alphas_cumprod if use_original_steps else self.ddim_alphas
|
||||
alphas_prev = (
|
||||
self.model.alphas_cumprod_prev
|
||||
if use_original_steps
|
||||
else self.ddim_alphas_prev
|
||||
)
|
||||
sqrt_one_minus_alphas = (
|
||||
self.model.sqrt_one_minus_alphas_cumprod
|
||||
if use_original_steps
|
||||
else self.ddim_sqrt_one_minus_alphas
|
||||
)
|
||||
sigmas = (
|
||||
self.model.ddim_sigmas_for_original_num_steps
|
||||
if use_original_steps
|
||||
else self.ddim_sigmas
|
||||
)
|
||||
|
||||
def get_x_prev_and_pred_x0(e_t, index):
|
||||
# select parameters corresponding to the currently considered timestep
|
||||
a_t = torch.full((b, 1, 1, 1), alphas[index], device=device)
|
||||
a_prev = torch.full((b, 1, 1, 1), alphas_prev[index], device=device)
|
||||
sigma_t = torch.full((b, 1, 1, 1), sigmas[index], device=device)
|
||||
sqrt_one_minus_at = torch.full(
|
||||
(b, 1, 1, 1), sqrt_one_minus_alphas[index], device=device
|
||||
)
|
||||
|
||||
# current prediction for x_0
|
||||
pred_x0 = (x - sqrt_one_minus_at * e_t) / a_t.sqrt()
|
||||
if quantize_denoised:
|
||||
pred_x0, _, *_ = self.model.first_stage_model.quantize(pred_x0)
|
||||
# direction pointing to x_t
|
||||
dir_xt = (1.0 - a_prev - sigma_t**2).sqrt() * e_t
|
||||
noise = sigma_t * noise_like(x.shape, device, repeat_noise) * temperature
|
||||
if noise_dropout > 0.0:
|
||||
noise = torch.nn.functional.dropout(noise, p=noise_dropout)
|
||||
x_prev = a_prev.sqrt() * pred_x0 + dir_xt + noise
|
||||
return x_prev, pred_x0
|
||||
|
||||
e_t = get_model_output(x, t)
|
||||
if len(old_eps) == 0:
|
||||
# Pseudo Improved Euler (2nd order)
|
||||
x_prev, pred_x0 = get_x_prev_and_pred_x0(e_t, index)
|
||||
e_t_next = get_model_output(x_prev, t_next)
|
||||
e_t_prime = (e_t + e_t_next) / 2
|
||||
elif len(old_eps) == 1:
|
||||
# 2nd order Pseudo Linear Multistep (Adams-Bashforth)
|
||||
e_t_prime = (3 * e_t - old_eps[-1]) / 2
|
||||
elif len(old_eps) == 2:
|
||||
# 3nd order Pseudo Linear Multistep (Adams-Bashforth)
|
||||
e_t_prime = (23 * e_t - 16 * old_eps[-1] + 5 * old_eps[-2]) / 12
|
||||
elif len(old_eps) >= 3:
|
||||
# 4nd order Pseudo Linear Multistep (Adams-Bashforth)
|
||||
e_t_prime = (
|
||||
55 * e_t - 59 * old_eps[-1] + 37 * old_eps[-2] - 9 * old_eps[-3]
|
||||
) / 24
|
||||
|
||||
x_prev, pred_x0 = get_x_prev_and_pred_x0(e_t_prime, index)
|
||||
|
||||
return x_prev, pred_x0, e_t
|
@ -0,0 +1,277 @@
|
||||
from inspect import isfunction
|
||||
import math
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
from torch import nn, einsum
|
||||
from einops import rearrange, repeat
|
||||
|
||||
from imaginairy.modules.diffusionmodules.util import checkpoint
|
||||
|
||||
|
||||
def exists(val):
|
||||
return val is not None
|
||||
|
||||
|
||||
def uniq(arr):
|
||||
return {el: True for el in arr}.keys()
|
||||
|
||||
|
||||
def default(val, d):
|
||||
if exists(val):
|
||||
return val
|
||||
return d() if isfunction(d) else d
|
||||
|
||||
|
||||
def max_neg_value(t):
|
||||
return -torch.finfo(t.dtype).max
|
||||
|
||||
|
||||
def init_(tensor):
|
||||
dim = tensor.shape[-1]
|
||||
std = 1 / math.sqrt(dim)
|
||||
tensor.uniform_(-std, std)
|
||||
return tensor
|
||||
|
||||
|
||||
# feedforward
|
||||
class GEGLU(nn.Module):
|
||||
def __init__(self, dim_in, dim_out):
|
||||
super().__init__()
|
||||
self.proj = nn.Linear(dim_in, dim_out * 2)
|
||||
|
||||
def forward(self, x):
|
||||
x, gate = self.proj(x).chunk(2, dim=-1)
|
||||
return x * F.gelu(gate)
|
||||
|
||||
|
||||
class FeedForward(nn.Module):
|
||||
def __init__(self, dim, dim_out=None, mult=4, glu=False, dropout=0.0):
|
||||
super().__init__()
|
||||
inner_dim = int(dim * mult)
|
||||
dim_out = default(dim_out, dim)
|
||||
project_in = (
|
||||
nn.Sequential(nn.Linear(dim, inner_dim), nn.GELU())
|
||||
if not glu
|
||||
else GEGLU(dim, inner_dim)
|
||||
)
|
||||
|
||||
self.net = nn.Sequential(
|
||||
project_in, nn.Dropout(dropout), nn.Linear(inner_dim, dim_out)
|
||||
)
|
||||
|
||||
def forward(self, x):
|
||||
return self.net(x)
|
||||
|
||||
|
||||
def zero_module(module):
|
||||
"""
|
||||
Zero out the parameters of a module and return it.
|
||||
"""
|
||||
for p in module.parameters():
|
||||
p.detach().zero_()
|
||||
return module
|
||||
|
||||
|
||||
def Normalize(in_channels):
|
||||
return torch.nn.GroupNorm(
|
||||
num_groups=32, num_channels=in_channels, eps=1e-6, affine=True
|
||||
)
|
||||
|
||||
|
||||
class LinearAttention(nn.Module):
|
||||
def __init__(self, dim, heads=4, dim_head=32):
|
||||
super().__init__()
|
||||
self.heads = heads
|
||||
hidden_dim = dim_head * heads
|
||||
self.to_qkv = nn.Conv2d(dim, hidden_dim * 3, 1, bias=False)
|
||||
self.to_out = nn.Conv2d(hidden_dim, dim, 1)
|
||||
|
||||
def forward(self, x):
|
||||
b, c, h, w = x.shape
|
||||
qkv = self.to_qkv(x)
|
||||
q, k, v = rearrange(
|
||||
qkv, "b (qkv heads c) h w -> qkv b heads c (h w)", heads=self.heads, qkv=3
|
||||
)
|
||||
k = k.softmax(dim=-1)
|
||||
context = torch.einsum("bhdn,bhen->bhde", k, v)
|
||||
out = torch.einsum("bhde,bhdn->bhen", context, q)
|
||||
out = rearrange(
|
||||
out, "b heads c (h w) -> b (heads c) h w", heads=self.heads, h=h, w=w
|
||||
)
|
||||
return self.to_out(out)
|
||||
|
||||
|
||||
class SpatialSelfAttention(nn.Module):
|
||||
def __init__(self, in_channels):
|
||||
super().__init__()
|
||||
self.in_channels = in_channels
|
||||
|
||||
self.norm = Normalize(in_channels)
|
||||
self.q = torch.nn.Conv2d(
|
||||
in_channels, in_channels, kernel_size=1, stride=1, padding=0
|
||||
)
|
||||
self.k = torch.nn.Conv2d(
|
||||
in_channels, in_channels, kernel_size=1, stride=1, padding=0
|
||||
)
|
||||
self.v = torch.nn.Conv2d(
|
||||
in_channels, in_channels, kernel_size=1, stride=1, padding=0
|
||||
)
|
||||
self.proj_out = torch.nn.Conv2d(
|
||||
in_channels, in_channels, kernel_size=1, stride=1, padding=0
|
||||
)
|
||||
|
||||
def forward(self, x):
|
||||
h_ = x
|
||||
h_ = self.norm(h_)
|
||||
q = self.q(h_)
|
||||
k = self.k(h_)
|
||||
v = self.v(h_)
|
||||
|
||||
# compute attention
|
||||
b, c, h, w = q.shape
|
||||
q = rearrange(q, "b c h w -> b (h w) c")
|
||||
k = rearrange(k, "b c h w -> b c (h w)")
|
||||
w_ = torch.einsum("bij,bjk->bik", q, k)
|
||||
|
||||
w_ = w_ * (int(c) ** (-0.5))
|
||||
w_ = torch.nn.functional.softmax(w_, dim=2)
|
||||
|
||||
# attend to values
|
||||
v = rearrange(v, "b c h w -> b c (h w)")
|
||||
w_ = rearrange(w_, "b i j -> b j i")
|
||||
h_ = torch.einsum("bij,bjk->bik", v, w_)
|
||||
h_ = rearrange(h_, "b c (h w) -> b c h w", h=h)
|
||||
h_ = self.proj_out(h_)
|
||||
|
||||
return x + h_
|
||||
|
||||
|
||||
class CrossAttention(nn.Module):
|
||||
def __init__(self, query_dim, context_dim=None, heads=8, dim_head=64, dropout=0.0):
|
||||
super().__init__()
|
||||
inner_dim = dim_head * heads
|
||||
context_dim = default(context_dim, query_dim)
|
||||
|
||||
self.scale = dim_head**-0.5
|
||||
self.heads = heads
|
||||
|
||||
self.to_q = nn.Linear(query_dim, inner_dim, bias=False)
|
||||
self.to_k = nn.Linear(context_dim, inner_dim, bias=False)
|
||||
self.to_v = nn.Linear(context_dim, inner_dim, bias=False)
|
||||
|
||||
self.to_out = nn.Sequential(
|
||||
nn.Linear(inner_dim, query_dim), nn.Dropout(dropout)
|
||||
)
|
||||
|
||||
def forward(self, x, context=None, mask=None):
|
||||
h = self.heads
|
||||
|
||||
q = self.to_q(x)
|
||||
context = default(context, x)
|
||||
k = self.to_k(context)
|
||||
v = self.to_v(context)
|
||||
|
||||
q, k, v = map(lambda t: rearrange(t, "b n (h d) -> (b h) n d", h=h), (q, k, v))
|
||||
|
||||
sim = einsum("b i d, b j d -> b i j", q, k) * self.scale
|
||||
|
||||
if exists(mask):
|
||||
mask = rearrange(mask, "b ... -> b (...)")
|
||||
max_neg_value = -torch.finfo(sim.dtype).max
|
||||
mask = repeat(mask, "b j -> (b h) () j", h=h)
|
||||
sim.masked_fill_(~mask, max_neg_value)
|
||||
|
||||
# attention, what we cannot get enough of
|
||||
attn = sim.softmax(dim=-1)
|
||||
|
||||
out = einsum("b i j, b j d -> b i d", attn, v)
|
||||
out = rearrange(out, "(b h) n d -> b n (h d)", h=h)
|
||||
return self.to_out(out)
|
||||
|
||||
|
||||
class BasicTransformerBlock(nn.Module):
|
||||
def __init__(
|
||||
self,
|
||||
dim,
|
||||
n_heads,
|
||||
d_head,
|
||||
dropout=0.0,
|
||||
context_dim=None,
|
||||
gated_ff=True,
|
||||
checkpoint=True,
|
||||
):
|
||||
super().__init__()
|
||||
self.attn1 = CrossAttention(
|
||||
query_dim=dim, heads=n_heads, dim_head=d_head, dropout=dropout
|
||||
) # is a self-attention
|
||||
self.ff = FeedForward(dim, dropout=dropout, glu=gated_ff)
|
||||
self.attn2 = CrossAttention(
|
||||
query_dim=dim,
|
||||
context_dim=context_dim,
|
||||
heads=n_heads,
|
||||
dim_head=d_head,
|
||||
dropout=dropout,
|
||||
) # is self-attn if context is none
|
||||
self.norm1 = nn.LayerNorm(dim)
|
||||
self.norm2 = nn.LayerNorm(dim)
|
||||
self.norm3 = nn.LayerNorm(dim)
|
||||
self.checkpoint = checkpoint
|
||||
|
||||
def forward(self, x, context=None):
|
||||
return checkpoint(
|
||||
self._forward, (x, context), self.parameters(), self.checkpoint
|
||||
)
|
||||
|
||||
def _forward(self, x, context=None):
|
||||
x = self.attn1(self.norm1(x)) + x
|
||||
x = self.attn2(self.norm2(x), context=context) + x
|
||||
x = self.ff(self.norm3(x)) + x
|
||||
return x
|
||||
|
||||
|
||||
class SpatialTransformer(nn.Module):
|
||||
"""
|
||||
Transformer block for image-like data.
|
||||
First, project the input (aka embedding)
|
||||
and reshape to b, t, d.
|
||||
Then apply standard transformer action.
|
||||
Finally, reshape to image
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self, in_channels, n_heads, d_head, depth=1, dropout=0.0, context_dim=None
|
||||
):
|
||||
super().__init__()
|
||||
self.in_channels = in_channels
|
||||
inner_dim = n_heads * d_head
|
||||
self.norm = Normalize(in_channels)
|
||||
|
||||
self.proj_in = nn.Conv2d(
|
||||
in_channels, inner_dim, kernel_size=1, stride=1, padding=0
|
||||
)
|
||||
|
||||
self.transformer_blocks = nn.ModuleList(
|
||||
[
|
||||
BasicTransformerBlock(
|
||||
inner_dim, n_heads, d_head, dropout=dropout, context_dim=context_dim
|
||||
)
|
||||
for d in range(depth)
|
||||
]
|
||||
)
|
||||
|
||||
self.proj_out = zero_module(
|
||||
nn.Conv2d(inner_dim, in_channels, kernel_size=1, stride=1, padding=0)
|
||||
)
|
||||
|
||||
def forward(self, x, context=None):
|
||||
# note: if no context is given, cross-attention defaults to self-attention
|
||||
b, c, h, w = x.shape
|
||||
x_in = x
|
||||
x = self.norm(x)
|
||||
x = self.proj_in(x)
|
||||
x = rearrange(x, "b c h w -> b (h w) c")
|
||||
for block in self.transformer_blocks:
|
||||
x = block(x, context=context)
|
||||
x = rearrange(x, "b (h w) c -> b c h w", h=h, w=w)
|
||||
x = self.proj_out(x)
|
||||
return x + x_in
|
@ -0,0 +1,146 @@
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
import clip
|
||||
from einops import repeat
|
||||
from transformers import CLIPTokenizer, CLIPTextModel
|
||||
import kornia
|
||||
|
||||
from imaginairy.utils import get_device, print_params
|
||||
|
||||
|
||||
class AbstractEncoder(nn.Module):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
|
||||
def encode(self, *args, **kwargs):
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
class FrozenCLIPEmbedder(AbstractEncoder):
|
||||
"""Uses the CLIP transformer encoder for text (from Hugging Face)"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
version="openai/clip-vit-large-patch14",
|
||||
device=get_device(),
|
||||
max_length=77,
|
||||
):
|
||||
super().__init__()
|
||||
self.tokenizer = CLIPTokenizer.from_pretrained(version)
|
||||
self.transformer = CLIPTextModel.from_pretrained(version)
|
||||
self.device = device
|
||||
self.max_length = max_length
|
||||
self.freeze()
|
||||
|
||||
def freeze(self):
|
||||
self.transformer = self.transformer.eval()
|
||||
for param in self.parameters():
|
||||
param.requires_grad = False
|
||||
|
||||
def forward(self, text):
|
||||
batch_encoding = self.tokenizer(
|
||||
text,
|
||||
truncation=True,
|
||||
max_length=self.max_length,
|
||||
return_length=True,
|
||||
return_overflowing_tokens=False,
|
||||
padding="max_length",
|
||||
return_tensors="pt",
|
||||
)
|
||||
tokens = batch_encoding["input_ids"].to(self.device)
|
||||
outputs = self.transformer(input_ids=tokens)
|
||||
|
||||
z = outputs.last_hidden_state
|
||||
return z
|
||||
|
||||
def encode(self, text):
|
||||
return self(text)
|
||||
|
||||
|
||||
class FrozenCLIPTextEmbedder(nn.Module):
|
||||
"""
|
||||
Uses the CLIP transformer encoder for text.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
version="ViT-L/14",
|
||||
device=get_device(),
|
||||
max_length=77,
|
||||
n_repeat=1,
|
||||
normalize=True,
|
||||
):
|
||||
super().__init__()
|
||||
self.model, _ = clip.load(version, jit=False, device=device)
|
||||
self.device = device
|
||||
self.max_length = max_length
|
||||
self.n_repeat = n_repeat
|
||||
self.normalize = normalize
|
||||
|
||||
def freeze(self):
|
||||
self.model = self.model.eval()
|
||||
for param in self.parameters():
|
||||
param.requires_grad = False
|
||||
|
||||
def forward(self, text):
|
||||
tokens = clip.tokenize(text).to(self.device)
|
||||
z = self.model.encode_text(tokens)
|
||||
if self.normalize:
|
||||
z = z / torch.linalg.norm(z, dim=1, keepdim=True)
|
||||
return z
|
||||
|
||||
def encode(self, text):
|
||||
z = self(text)
|
||||
if z.ndim == 2:
|
||||
z = z[:, None, :]
|
||||
z = repeat(z, "b 1 d -> b k d", k=self.n_repeat)
|
||||
return z
|
||||
|
||||
|
||||
class FrozenClipImageEmbedder(nn.Module):
|
||||
"""
|
||||
Uses the CLIP image encoder.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
model,
|
||||
jit=False,
|
||||
device=get_device(),
|
||||
antialias=False,
|
||||
):
|
||||
super().__init__()
|
||||
self.model, _ = clip.load(name=model, device=device, jit=jit)
|
||||
|
||||
self.antialias = antialias
|
||||
|
||||
self.register_buffer(
|
||||
"mean", torch.Tensor([0.48145466, 0.4578275, 0.40821073]), persistent=False
|
||||
)
|
||||
self.register_buffer(
|
||||
"std", torch.Tensor([0.26862954, 0.26130258, 0.27577711]), persistent=False
|
||||
)
|
||||
|
||||
def preprocess(self, x):
|
||||
# normalize to [0,1]
|
||||
x = kornia.geometry.resize(
|
||||
x,
|
||||
(224, 224),
|
||||
interpolation="bicubic",
|
||||
align_corners=True,
|
||||
antialias=self.antialias,
|
||||
)
|
||||
x = (x + 1.0) / 2.0
|
||||
# renormalize according to clip
|
||||
x = kornia.enhance.normalize(x, self.mean, self.std)
|
||||
return x
|
||||
|
||||
def forward(self, x):
|
||||
# x is assumed to be in range [-1,1]
|
||||
return self.model.encode_image(self.preprocess(x))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
model = FrozenCLIPEmbedder()
|
||||
print_params(model, verbose=True)
|
@ -0,0 +1,642 @@
|
||||
# pytorch_diffusion + derived encoder decoder
|
||||
import math
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from einops import rearrange
|
||||
|
||||
from imaginairy.modules.attention import LinearAttention
|
||||
from imaginairy.utils import instantiate_from_config
|
||||
|
||||
|
||||
def get_timestep_embedding(timesteps, embedding_dim):
|
||||
"""
|
||||
This matches the implementation in Denoising Diffusion Probabilistic Models:
|
||||
From Fairseq.
|
||||
Build sinusoidal embeddings.
|
||||
This matches the implementation in tensor2tensor, but differs slightly
|
||||
from the description in Section 3.5 of "Attention Is All You Need".
|
||||
"""
|
||||
assert len(timesteps.shape) == 1
|
||||
|
||||
half_dim = embedding_dim // 2
|
||||
emb = math.log(10000) / (half_dim - 1)
|
||||
emb = torch.exp(torch.arange(half_dim, dtype=torch.float32) * -emb)
|
||||
emb = emb.to(device=timesteps.device)
|
||||
emb = timesteps.float()[:, None] * emb[None, :]
|
||||
emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
|
||||
if embedding_dim % 2 == 1: # zero pad
|
||||
emb = torch.nn.functional.pad(emb, (0, 1, 0, 0))
|
||||
return emb
|
||||
|
||||
|
||||
def nonlinearity(x):
|
||||
# swish
|
||||
return x * torch.sigmoid(x)
|
||||
|
||||
|
||||
def Normalize(in_channels, num_groups=32):
|
||||
return torch.nn.GroupNorm(
|
||||
num_groups=num_groups, num_channels=in_channels, eps=1e-6, affine=True
|
||||
)
|
||||
|
||||
|
||||
class Upsample(nn.Module):
|
||||
def __init__(self, in_channels, with_conv):
|
||||
super().__init__()
|
||||
self.with_conv = with_conv
|
||||
if self.with_conv:
|
||||
self.conv = torch.nn.Conv2d(
|
||||
in_channels, in_channels, kernel_size=3, stride=1, padding=1
|
||||
)
|
||||
|
||||
def forward(self, x):
|
||||
x = torch.nn.functional.interpolate(x, scale_factor=2.0, mode="nearest")
|
||||
if self.with_conv:
|
||||
x = self.conv(x)
|
||||
return x
|
||||
|
||||
|
||||
class Downsample(nn.Module):
|
||||
def __init__(self, in_channels, with_conv):
|
||||
super().__init__()
|
||||
self.with_conv = with_conv
|
||||
if self.with_conv:
|
||||
# no asymmetric padding in torch conv, must do it ourselves
|
||||
self.conv = torch.nn.Conv2d(
|
||||
in_channels, in_channels, kernel_size=3, stride=2, padding=0
|
||||
)
|
||||
|
||||
def forward(self, x):
|
||||
if self.with_conv:
|
||||
pad = (0, 1, 0, 1)
|
||||
x = torch.nn.functional.pad(x, pad, mode="constant", value=0)
|
||||
x = self.conv(x)
|
||||
else:
|
||||
x = torch.nn.functional.avg_pool2d(x, kernel_size=2, stride=2)
|
||||
return x
|
||||
|
||||
|
||||
class ResnetBlock(nn.Module):
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
in_channels,
|
||||
out_channels=None,
|
||||
conv_shortcut=False,
|
||||
dropout,
|
||||
temb_channels=512,
|
||||
):
|
||||
super().__init__()
|
||||
self.in_channels = in_channels
|
||||
out_channels = in_channels if out_channels is None else out_channels
|
||||
self.out_channels = out_channels
|
||||
self.use_conv_shortcut = conv_shortcut
|
||||
|
||||
self.norm1 = Normalize(in_channels)
|
||||
self.conv1 = torch.nn.Conv2d(
|
||||
in_channels, out_channels, kernel_size=3, stride=1, padding=1
|
||||
)
|
||||
if temb_channels > 0:
|
||||
self.temb_proj = torch.nn.Linear(temb_channels, out_channels)
|
||||
self.norm2 = Normalize(out_channels)
|
||||
self.dropout = torch.nn.Dropout(dropout)
|
||||
self.conv2 = torch.nn.Conv2d(
|
||||
out_channels, out_channels, kernel_size=3, stride=1, padding=1
|
||||
)
|
||||
if self.in_channels != self.out_channels:
|
||||
if self.use_conv_shortcut:
|
||||
self.conv_shortcut = torch.nn.Conv2d(
|
||||
in_channels, out_channels, kernel_size=3, stride=1, padding=1
|
||||
)
|
||||
else:
|
||||
self.nin_shortcut = torch.nn.Conv2d(
|
||||
in_channels, out_channels, kernel_size=1, stride=1, padding=0
|
||||
)
|
||||
|
||||
def forward(self, x, temb):
|
||||
h = x
|
||||
h = self.norm1(h)
|
||||
h = nonlinearity(h)
|
||||
h = self.conv1(h)
|
||||
|
||||
if temb is not None:
|
||||
h = h + self.temb_proj(nonlinearity(temb))[:, :, None, None]
|
||||
|
||||
h = self.norm2(h)
|
||||
h = nonlinearity(h)
|
||||
h = self.dropout(h)
|
||||
h = self.conv2(h)
|
||||
|
||||
if self.in_channels != self.out_channels:
|
||||
if self.use_conv_shortcut:
|
||||
x = self.conv_shortcut(x)
|
||||
else:
|
||||
x = self.nin_shortcut(x)
|
||||
|
||||
return x + h
|
||||
|
||||
|
||||
class LinAttnBlock(LinearAttention):
|
||||
"""to match AttnBlock usage"""
|
||||
|
||||
def __init__(self, in_channels):
|
||||
super().__init__(dim=in_channels, heads=1, dim_head=in_channels)
|
||||
|
||||
|
||||
class AttnBlock(nn.Module):
|
||||
def __init__(self, in_channels):
|
||||
super().__init__()
|
||||
self.in_channels = in_channels
|
||||
|
||||
self.norm = Normalize(in_channels)
|
||||
self.q = torch.nn.Conv2d(
|
||||
in_channels, in_channels, kernel_size=1, stride=1, padding=0
|
||||
)
|
||||
self.k = torch.nn.Conv2d(
|
||||
in_channels, in_channels, kernel_size=1, stride=1, padding=0
|
||||
)
|
||||
self.v = torch.nn.Conv2d(
|
||||
in_channels, in_channels, kernel_size=1, stride=1, padding=0
|
||||
)
|
||||
self.proj_out = torch.nn.Conv2d(
|
||||
in_channels, in_channels, kernel_size=1, stride=1, padding=0
|
||||
)
|
||||
|
||||
def forward(self, x):
|
||||
h_ = x
|
||||
h_ = self.norm(h_)
|
||||
q = self.q(h_)
|
||||
k = self.k(h_)
|
||||
v = self.v(h_)
|
||||
|
||||
# compute attention
|
||||
b, c, h, w = q.shape
|
||||
q = q.reshape(b, c, h * w)
|
||||
q = q.permute(0, 2, 1) # b,hw,c
|
||||
k = k.reshape(b, c, h * w) # b,c,hw
|
||||
w_ = torch.bmm(q, k) # b,hw,hw w[b,i,j]=sum_c q[b,i,c]k[b,c,j]
|
||||
w_ = w_ * (int(c) ** (-0.5))
|
||||
w_ = torch.nn.functional.softmax(w_, dim=2)
|
||||
|
||||
# attend to values
|
||||
v = v.reshape(b, c, h * w)
|
||||
w_ = w_.permute(0, 2, 1) # b,hw,hw (first hw of k, second of q)
|
||||
h_ = torch.bmm(v, w_) # b, c,hw (hw of q) h_[b,c,j] = sum_i v[b,c,i] w_[b,i,j]
|
||||
h_ = h_.reshape(b, c, h, w)
|
||||
|
||||
h_ = self.proj_out(h_)
|
||||
|
||||
return x + h_
|
||||
|
||||
|
||||
def make_attn(in_channels, attn_type="vanilla"):
|
||||
assert attn_type in ["vanilla", "linear", "none"], f"attn_type {attn_type} unknown"
|
||||
print(f"making attention of type '{attn_type}' with {in_channels} in_channels")
|
||||
if attn_type == "vanilla":
|
||||
return AttnBlock(in_channels)
|
||||
elif attn_type == "none":
|
||||
return nn.Identity(in_channels)
|
||||
else:
|
||||
return LinAttnBlock(in_channels)
|
||||
|
||||
|
||||
class Encoder(nn.Module):
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
ch,
|
||||
out_ch,
|
||||
ch_mult=(1, 2, 4, 8),
|
||||
num_res_blocks,
|
||||
attn_resolutions,
|
||||
dropout=0.0,
|
||||
resamp_with_conv=True,
|
||||
in_channels,
|
||||
resolution,
|
||||
z_channels,
|
||||
double_z=True,
|
||||
use_linear_attn=False,
|
||||
attn_type="vanilla",
|
||||
**ignore_kwargs,
|
||||
):
|
||||
super().__init__()
|
||||
if use_linear_attn:
|
||||
attn_type = "linear"
|
||||
self.ch = ch
|
||||
self.temb_ch = 0
|
||||
self.num_resolutions = len(ch_mult)
|
||||
self.num_res_blocks = num_res_blocks
|
||||
self.resolution = resolution
|
||||
self.in_channels = in_channels
|
||||
|
||||
# downsampling
|
||||
self.conv_in = torch.nn.Conv2d(
|
||||
in_channels, self.ch, kernel_size=3, stride=1, padding=1
|
||||
)
|
||||
|
||||
curr_res = resolution
|
||||
in_ch_mult = (1,) + tuple(ch_mult)
|
||||
self.in_ch_mult = in_ch_mult
|
||||
self.down = nn.ModuleList()
|
||||
for i_level in range(self.num_resolutions):
|
||||
block = nn.ModuleList()
|
||||
attn = nn.ModuleList()
|
||||
block_in = ch * in_ch_mult[i_level]
|
||||
block_out = ch * ch_mult[i_level]
|
||||
for i_block in range(self.num_res_blocks):
|
||||
block.append(
|
||||
ResnetBlock(
|
||||
in_channels=block_in,
|
||||
out_channels=block_out,
|
||||
temb_channels=self.temb_ch,
|
||||
dropout=dropout,
|
||||
)
|
||||
)
|
||||
block_in = block_out
|
||||
if curr_res in attn_resolutions:
|
||||
attn.append(make_attn(block_in, attn_type=attn_type))
|
||||
down = nn.Module()
|
||||
down.block = block
|
||||
down.attn = attn
|
||||
if i_level != self.num_resolutions - 1:
|
||||
down.downsample = Downsample(block_in, resamp_with_conv)
|
||||
curr_res = curr_res // 2
|
||||
self.down.append(down)
|
||||
|
||||
# middle
|
||||
self.mid = nn.Module()
|
||||
self.mid.block_1 = ResnetBlock(
|
||||
in_channels=block_in,
|
||||
out_channels=block_in,
|
||||
temb_channels=self.temb_ch,
|
||||
dropout=dropout,
|
||||
)
|
||||
self.mid.attn_1 = make_attn(block_in, attn_type=attn_type)
|
||||
self.mid.block_2 = ResnetBlock(
|
||||
in_channels=block_in,
|
||||
out_channels=block_in,
|
||||
temb_channels=self.temb_ch,
|
||||
dropout=dropout,
|
||||
)
|
||||
|
||||
# end
|
||||
self.norm_out = Normalize(block_in)
|
||||
self.conv_out = torch.nn.Conv2d(
|
||||
block_in,
|
||||
2 * z_channels if double_z else z_channels,
|
||||
kernel_size=3,
|
||||
stride=1,
|
||||
padding=1,
|
||||
)
|
||||
|
||||
def forward(self, x):
|
||||
# timestep embedding
|
||||
temb = None
|
||||
|
||||
# downsampling
|
||||
hs = [self.conv_in(x)]
|
||||
for i_level in range(self.num_resolutions):
|
||||
for i_block in range(self.num_res_blocks):
|
||||
h = self.down[i_level].block[i_block](hs[-1], temb)
|
||||
if len(self.down[i_level].attn) > 0:
|
||||
h = self.down[i_level].attn[i_block](h)
|
||||
hs.append(h)
|
||||
if i_level != self.num_resolutions - 1:
|
||||
hs.append(self.down[i_level].downsample(hs[-1]))
|
||||
|
||||
# middle
|
||||
h = hs[-1]
|
||||
h = self.mid.block_1(h, temb)
|
||||
h = self.mid.attn_1(h)
|
||||
h = self.mid.block_2(h, temb)
|
||||
|
||||
# end
|
||||
h = self.norm_out(h)
|
||||
h = nonlinearity(h)
|
||||
h = self.conv_out(h)
|
||||
return h
|
||||
|
||||
|
||||
class Decoder(nn.Module):
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
ch,
|
||||
out_ch,
|
||||
ch_mult=(1, 2, 4, 8),
|
||||
num_res_blocks,
|
||||
attn_resolutions,
|
||||
dropout=0.0,
|
||||
resamp_with_conv=True,
|
||||
in_channels,
|
||||
resolution,
|
||||
z_channels,
|
||||
give_pre_end=False,
|
||||
tanh_out=False,
|
||||
use_linear_attn=False,
|
||||
attn_type="vanilla",
|
||||
**ignorekwargs,
|
||||
):
|
||||
super().__init__()
|
||||
if use_linear_attn:
|
||||
attn_type = "linear"
|
||||
self.ch = ch
|
||||
self.temb_ch = 0
|
||||
self.num_resolutions = len(ch_mult)
|
||||
self.num_res_blocks = num_res_blocks
|
||||
self.resolution = resolution
|
||||
self.in_channels = in_channels
|
||||
self.give_pre_end = give_pre_end
|
||||
self.tanh_out = tanh_out
|
||||
|
||||
# compute in_ch_mult, block_in and curr_res at lowest res
|
||||
in_ch_mult = (1,) + tuple(ch_mult)
|
||||
block_in = ch * ch_mult[self.num_resolutions - 1]
|
||||
curr_res = resolution // 2 ** (self.num_resolutions - 1)
|
||||
self.z_shape = (1, z_channels, curr_res, curr_res)
|
||||
print(
|
||||
"Working with z of shape {} = {} dimensions.".format(
|
||||
self.z_shape, np.prod(self.z_shape)
|
||||
)
|
||||
)
|
||||
|
||||
# z to block_in
|
||||
self.conv_in = torch.nn.Conv2d(
|
||||
z_channels, block_in, kernel_size=3, stride=1, padding=1
|
||||
)
|
||||
|
||||
# middle
|
||||
self.mid = nn.Module()
|
||||
self.mid.block_1 = ResnetBlock(
|
||||
in_channels=block_in,
|
||||
out_channels=block_in,
|
||||
temb_channels=self.temb_ch,
|
||||
dropout=dropout,
|
||||
)
|
||||
self.mid.attn_1 = make_attn(block_in, attn_type=attn_type)
|
||||
self.mid.block_2 = ResnetBlock(
|
||||
in_channels=block_in,
|
||||
out_channels=block_in,
|
||||
temb_channels=self.temb_ch,
|
||||
dropout=dropout,
|
||||
)
|
||||
|
||||
# upsampling
|
||||
self.up = nn.ModuleList()
|
||||
for i_level in reversed(range(self.num_resolutions)):
|
||||
block = nn.ModuleList()
|
||||
attn = nn.ModuleList()
|
||||
block_out = ch * ch_mult[i_level]
|
||||
for i_block in range(self.num_res_blocks + 1):
|
||||
block.append(
|
||||
ResnetBlock(
|
||||
in_channels=block_in,
|
||||
out_channels=block_out,
|
||||
temb_channels=self.temb_ch,
|
||||
dropout=dropout,
|
||||
)
|
||||
)
|
||||
block_in = block_out
|
||||
if curr_res in attn_resolutions:
|
||||
attn.append(make_attn(block_in, attn_type=attn_type))
|
||||
up = nn.Module()
|
||||
up.block = block
|
||||
up.attn = attn
|
||||
if i_level != 0:
|
||||
up.upsample = Upsample(block_in, resamp_with_conv)
|
||||
curr_res = curr_res * 2
|
||||
self.up.insert(0, up) # prepend to get consistent order
|
||||
|
||||
# end
|
||||
self.norm_out = Normalize(block_in)
|
||||
self.conv_out = torch.nn.Conv2d(
|
||||
block_in, out_ch, kernel_size=3, stride=1, padding=1
|
||||
)
|
||||
|
||||
def forward(self, z):
|
||||
# assert z.shape[1:] == self.z_shape[1:]
|
||||
self.last_z_shape = z.shape
|
||||
|
||||
# timestep embedding
|
||||
temb = None
|
||||
|
||||
# z to block_in
|
||||
h = self.conv_in(z)
|
||||
|
||||
# middle
|
||||
h = self.mid.block_1(h, temb)
|
||||
h = self.mid.attn_1(h)
|
||||
h = self.mid.block_2(h, temb)
|
||||
|
||||
# upsampling
|
||||
for i_level in reversed(range(self.num_resolutions)):
|
||||
for i_block in range(self.num_res_blocks + 1):
|
||||
h = self.up[i_level].block[i_block](h, temb)
|
||||
if len(self.up[i_level].attn) > 0:
|
||||
h = self.up[i_level].attn[i_block](h)
|
||||
if i_level != 0:
|
||||
h = self.up[i_level].upsample(h)
|
||||
|
||||
# end
|
||||
if self.give_pre_end:
|
||||
return h
|
||||
|
||||
h = self.norm_out(h)
|
||||
h = nonlinearity(h)
|
||||
h = self.conv_out(h)
|
||||
if self.tanh_out:
|
||||
h = torch.tanh(h)
|
||||
return h
|
||||
|
||||
|
||||
class LatentRescaler(nn.Module):
|
||||
def __init__(self, factor, in_channels, mid_channels, out_channels, depth=2):
|
||||
super().__init__()
|
||||
# residual block, interpolate, residual block
|
||||
self.factor = factor
|
||||
self.conv_in = nn.Conv2d(
|
||||
in_channels, mid_channels, kernel_size=3, stride=1, padding=1
|
||||
)
|
||||
self.res_block1 = nn.ModuleList(
|
||||
[
|
||||
ResnetBlock(
|
||||
in_channels=mid_channels,
|
||||
out_channels=mid_channels,
|
||||
temb_channels=0,
|
||||
dropout=0.0,
|
||||
)
|
||||
for _ in range(depth)
|
||||
]
|
||||
)
|
||||
self.attn = AttnBlock(mid_channels)
|
||||
self.res_block2 = nn.ModuleList(
|
||||
[
|
||||
ResnetBlock(
|
||||
in_channels=mid_channels,
|
||||
out_channels=mid_channels,
|
||||
temb_channels=0,
|
||||
dropout=0.0,
|
||||
)
|
||||
for _ in range(depth)
|
||||
]
|
||||
)
|
||||
|
||||
self.conv_out = nn.Conv2d(
|
||||
mid_channels,
|
||||
out_channels,
|
||||
kernel_size=1,
|
||||
)
|
||||
|
||||
def forward(self, x):
|
||||
x = self.conv_in(x)
|
||||
for block in self.res_block1:
|
||||
x = block(x, None)
|
||||
x = torch.nn.functional.interpolate(
|
||||
x,
|
||||
size=(
|
||||
int(round(x.shape[2] * self.factor)),
|
||||
int(round(x.shape[3] * self.factor)),
|
||||
),
|
||||
)
|
||||
x = self.attn(x)
|
||||
for block in self.res_block2:
|
||||
x = block(x, None)
|
||||
x = self.conv_out(x)
|
||||
return x
|
||||
|
||||
|
||||
class Upsampler(nn.Module):
|
||||
def __init__(self, in_size, out_size, in_channels, out_channels, ch_mult=2):
|
||||
super().__init__()
|
||||
assert out_size >= in_size
|
||||
num_blocks = int(np.log2(out_size // in_size)) + 1
|
||||
factor_up = 1.0 + (out_size % in_size)
|
||||
print(
|
||||
f"Building {self.__class__.__name__} with in_size: {in_size} --> out_size {out_size} and factor {factor_up}"
|
||||
)
|
||||
self.rescaler = LatentRescaler(
|
||||
factor=factor_up,
|
||||
in_channels=in_channels,
|
||||
mid_channels=2 * in_channels,
|
||||
out_channels=in_channels,
|
||||
)
|
||||
self.decoder = Decoder(
|
||||
out_ch=out_channels,
|
||||
resolution=out_size,
|
||||
z_channels=in_channels,
|
||||
num_res_blocks=2,
|
||||
attn_resolutions=[],
|
||||
in_channels=None,
|
||||
ch=in_channels,
|
||||
ch_mult=[ch_mult for _ in range(num_blocks)],
|
||||
)
|
||||
|
||||
def forward(self, x):
|
||||
x = self.rescaler(x)
|
||||
x = self.decoder(x)
|
||||
return x
|
||||
|
||||
|
||||
class Resize(nn.Module):
|
||||
def __init__(self, in_channels=None, learned=False, mode="bilinear"):
|
||||
super().__init__()
|
||||
self.with_conv = learned
|
||||
self.mode = mode
|
||||
if self.with_conv:
|
||||
print(
|
||||
f"Note: {self.__class__.__name} uses learned downsampling and will ignore the fixed {mode} mode"
|
||||
)
|
||||
raise NotImplementedError()
|
||||
assert in_channels is not None
|
||||
# no asymmetric padding in torch conv, must do it ourselves
|
||||
self.conv = torch.nn.Conv2d(
|
||||
in_channels, in_channels, kernel_size=4, stride=2, padding=1
|
||||
)
|
||||
|
||||
def forward(self, x, scale_factor=1.0):
|
||||
if scale_factor == 1.0:
|
||||
return x
|
||||
else:
|
||||
x = torch.nn.functional.interpolate(
|
||||
x, mode=self.mode, align_corners=False, scale_factor=scale_factor
|
||||
)
|
||||
return x
|
||||
|
||||
|
||||
class FirstStagePostProcessor(nn.Module):
|
||||
def __init__(
|
||||
self,
|
||||
ch_mult: list,
|
||||
in_channels,
|
||||
pretrained_model: nn.Module = None,
|
||||
reshape=False,
|
||||
n_channels=None,
|
||||
dropout=0.0,
|
||||
pretrained_config=None,
|
||||
):
|
||||
super().__init__()
|
||||
if pretrained_config is None:
|
||||
assert (
|
||||
pretrained_model is not None
|
||||
), 'Either "pretrained_model" or "pretrained_config" must not be None'
|
||||
self.pretrained_model = pretrained_model
|
||||
else:
|
||||
assert (
|
||||
pretrained_config is not None
|
||||
), 'Either "pretrained_model" or "pretrained_config" must not be None'
|
||||
self.instantiate_pretrained(pretrained_config)
|
||||
|
||||
self.do_reshape = reshape
|
||||
|
||||
if n_channels is None:
|
||||
n_channels = self.pretrained_model.encoder.ch
|
||||
|
||||
self.proj_norm = Normalize(in_channels, num_groups=in_channels // 2)
|
||||
self.proj = nn.Conv2d(
|
||||
in_channels, n_channels, kernel_size=3, stride=1, padding=1
|
||||
)
|
||||
|
||||
blocks = []
|
||||
downs = []
|
||||
ch_in = n_channels
|
||||
for m in ch_mult:
|
||||
blocks.append(
|
||||
ResnetBlock(
|
||||
in_channels=ch_in, out_channels=m * n_channels, dropout=dropout
|
||||
)
|
||||
)
|
||||
ch_in = m * n_channels
|
||||
downs.append(Downsample(ch_in, with_conv=False))
|
||||
|
||||
self.model = nn.ModuleList(blocks)
|
||||
self.downsampler = nn.ModuleList(downs)
|
||||
|
||||
def instantiate_pretrained(self, config):
|
||||
model = instantiate_from_config(config)
|
||||
self.pretrained_model = model.eval()
|
||||
# self.pretrained_model.train = False
|
||||
for param in self.pretrained_model.parameters():
|
||||
param.requires_grad = False
|
||||
|
||||
@torch.no_grad()
|
||||
def encode_with_pretrained(self, x):
|
||||
c = self.pretrained_model.encode(x)
|
||||
if isinstance(c, DiagonalGaussianDistribution):
|
||||
c = c.mode()
|
||||
return c
|
||||
|
||||
def forward(self, x):
|
||||
z_fs = self.encode_with_pretrained(x)
|
||||
z = self.proj_norm(z_fs)
|
||||
z = self.proj(z)
|
||||
z = nonlinearity(z)
|
||||
|
||||
for submodel, downmodel in zip(self.model, self.downsampler):
|
||||
z = submodel(z, temb=None)
|
||||
z = downmodel(z)
|
||||
|
||||
if self.do_reshape:
|
||||
z = rearrange(z, "b c h w -> b (h w) c")
|
||||
return z
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,294 @@
|
||||
# adopted from
|
||||
# https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py
|
||||
# and
|
||||
# https://github.com/lucidrains/denoising-diffusion-pytorch/blob/7706bdfc6f527f58d33f84b7b522e61e6e3164b3/denoising_diffusion_pytorch/denoising_diffusion_pytorch.py
|
||||
# and
|
||||
# https://github.com/openai/guided-diffusion/blob/0ba878e517b276c45d1195eb29f6f5f72659a05b/guided_diffusion/nn.py
|
||||
#
|
||||
# thanks!
|
||||
|
||||
|
||||
import math
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import numpy as np
|
||||
from einops import repeat
|
||||
|
||||
from imaginairy.utils import instantiate_from_config
|
||||
|
||||
|
||||
def make_beta_schedule(
|
||||
schedule, n_timestep, linear_start=1e-4, linear_end=2e-2, cosine_s=8e-3
|
||||
):
|
||||
if schedule == "linear":
|
||||
betas = (
|
||||
torch.linspace(
|
||||
linear_start**0.5, linear_end**0.5, n_timestep, dtype=torch.float64
|
||||
)
|
||||
** 2
|
||||
)
|
||||
|
||||
elif schedule == "cosine":
|
||||
timesteps = (
|
||||
torch.arange(n_timestep + 1, dtype=torch.float64) / n_timestep + cosine_s
|
||||
)
|
||||
alphas = timesteps / (1 + cosine_s) * np.pi / 2
|
||||
alphas = torch.cos(alphas).pow(2)
|
||||
alphas = alphas / alphas[0]
|
||||
betas = 1 - alphas[1:] / alphas[:-1]
|
||||
betas = np.clip(betas, a_min=0, a_max=0.999)
|
||||
|
||||
elif schedule == "sqrt_linear":
|
||||
betas = torch.linspace(
|
||||
linear_start, linear_end, n_timestep, dtype=torch.float64
|
||||
)
|
||||
elif schedule == "sqrt":
|
||||
betas = (
|
||||
torch.linspace(linear_start, linear_end, n_timestep, dtype=torch.float64)
|
||||
** 0.5
|
||||
)
|
||||
else:
|
||||
raise ValueError(f"schedule '{schedule}' unknown.")
|
||||
return betas.numpy()
|
||||
|
||||
|
||||
def make_ddim_timesteps(
|
||||
ddim_discr_method, num_ddim_timesteps, num_ddpm_timesteps, verbose=True
|
||||
):
|
||||
if ddim_discr_method == "uniform":
|
||||
c = num_ddpm_timesteps // num_ddim_timesteps
|
||||
ddim_timesteps = np.asarray(list(range(0, num_ddpm_timesteps, c)))
|
||||
elif ddim_discr_method == "quad":
|
||||
ddim_timesteps = (
|
||||
(np.linspace(0, np.sqrt(num_ddpm_timesteps * 0.8), num_ddim_timesteps)) ** 2
|
||||
).astype(int)
|
||||
else:
|
||||
raise NotImplementedError(
|
||||
f'There is no ddim discretization method called "{ddim_discr_method}"'
|
||||
)
|
||||
|
||||
# assert ddim_timesteps.shape[0] == num_ddim_timesteps
|
||||
# add one to get the final alpha values right (the ones from first scale to data during sampling)
|
||||
steps_out = ddim_timesteps + 1
|
||||
if verbose:
|
||||
print(f"Selected timesteps for ddim sampler: {steps_out}")
|
||||
return steps_out
|
||||
|
||||
|
||||
def make_ddim_sampling_parameters(alphacums, ddim_timesteps, eta, verbose=True):
|
||||
# select alphas for computing the variance schedule
|
||||
alphas = alphacums[ddim_timesteps]
|
||||
alphas_prev = np.asarray([alphacums[0]] + alphacums[ddim_timesteps[:-1]].tolist())
|
||||
|
||||
# according the the formula provided in https://arxiv.org/abs/2010.02502
|
||||
sigmas = eta * np.sqrt(
|
||||
(1 - alphas_prev) / (1 - alphas) * (1 - alphas / alphas_prev)
|
||||
)
|
||||
if verbose:
|
||||
print(
|
||||
f"Selected alphas for ddim sampler: a_t: {alphas}; a_(t-1): {alphas_prev}"
|
||||
)
|
||||
print(
|
||||
f"For the chosen value of eta, which is {eta}, "
|
||||
f"this results in the following sigma_t schedule for ddim sampler {sigmas}"
|
||||
)
|
||||
return sigmas, alphas, alphas_prev
|
||||
|
||||
|
||||
def betas_for_alpha_bar(num_diffusion_timesteps, alpha_bar, max_beta=0.999):
|
||||
"""
|
||||
Create a beta schedule that discretizes the given alpha_t_bar function,
|
||||
which defines the cumulative product of (1-beta) over time from t = [0,1].
|
||||
:param num_diffusion_timesteps: the number of betas to produce.
|
||||
:param alpha_bar: a lambda that takes an argument t from 0 to 1 and
|
||||
produces the cumulative product of (1-beta) up to that
|
||||
part of the diffusion process.
|
||||
:param max_beta: the maximum beta to use; use values lower than 1 to
|
||||
prevent singularities.
|
||||
"""
|
||||
betas = []
|
||||
for i in range(num_diffusion_timesteps):
|
||||
t1 = i / num_diffusion_timesteps
|
||||
t2 = (i + 1) / num_diffusion_timesteps
|
||||
betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta))
|
||||
return np.array(betas)
|
||||
|
||||
|
||||
def extract_into_tensor(a, t, x_shape):
|
||||
b, *_ = t.shape
|
||||
out = a.gather(-1, t)
|
||||
return out.reshape(b, *((1,) * (len(x_shape) - 1)))
|
||||
|
||||
|
||||
def checkpoint(func, inputs, params, flag):
|
||||
"""
|
||||
Evaluate a function without caching intermediate activations, allowing for
|
||||
reduced memory at the expense of extra compute in the backward pass.
|
||||
:param func: the function to evaluate.
|
||||
:param inputs: the argument sequence to pass to `func`.
|
||||
:param params: a sequence of parameters `func` depends on but does not
|
||||
explicitly take as arguments.
|
||||
:param flag: if False, disable gradient checkpointing.
|
||||
"""
|
||||
if flag:
|
||||
args = tuple(inputs) + tuple(params)
|
||||
return CheckpointFunction.apply(func, len(inputs), *args)
|
||||
else:
|
||||
return func(*inputs)
|
||||
|
||||
|
||||
class CheckpointFunction(torch.autograd.Function):
|
||||
@staticmethod
|
||||
def forward(ctx, run_function, length, *args):
|
||||
ctx.run_function = run_function
|
||||
ctx.input_tensors = list(args[:length])
|
||||
ctx.input_params = list(args[length:])
|
||||
|
||||
with torch.no_grad():
|
||||
output_tensors = ctx.run_function(*ctx.input_tensors)
|
||||
return output_tensors
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, *output_grads):
|
||||
ctx.input_tensors = [x.detach().requires_grad_(True) for x in ctx.input_tensors]
|
||||
with torch.enable_grad():
|
||||
# Fixes a bug where the first op in run_function modifies the
|
||||
# Tensor storage in place, which is not allowed for detach()'d
|
||||
# Tensors.
|
||||
shallow_copies = [x.view_as(x) for x in ctx.input_tensors]
|
||||
output_tensors = ctx.run_function(*shallow_copies)
|
||||
input_grads = torch.autograd.grad(
|
||||
output_tensors,
|
||||
ctx.input_tensors + ctx.input_params,
|
||||
output_grads,
|
||||
allow_unused=True,
|
||||
)
|
||||
del ctx.input_tensors
|
||||
del ctx.input_params
|
||||
del output_tensors
|
||||
return (None, None) + input_grads
|
||||
|
||||
|
||||
def timestep_embedding(timesteps, dim, max_period=10000, repeat_only=False):
|
||||
"""
|
||||
Create sinusoidal timestep embeddings.
|
||||
:param timesteps: a 1-D Tensor of N indices, one per batch element.
|
||||
These may be fractional.
|
||||
:param dim: the dimension of the output.
|
||||
:param max_period: controls the minimum frequency of the embeddings.
|
||||
:return: an [N x dim] Tensor of positional embeddings.
|
||||
"""
|
||||
if not repeat_only:
|
||||
half = dim // 2
|
||||
freqs = torch.exp(
|
||||
-math.log(max_period)
|
||||
* torch.arange(start=0, end=half, dtype=torch.float32)
|
||||
/ half
|
||||
).to(device=timesteps.device)
|
||||
args = timesteps[:, None].float() * freqs[None]
|
||||
embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
|
||||
if dim % 2:
|
||||
embedding = torch.cat(
|
||||
[embedding, torch.zeros_like(embedding[:, :1])], dim=-1
|
||||
)
|
||||
else:
|
||||
embedding = repeat(timesteps, "b -> b d", d=dim)
|
||||
return embedding
|
||||
|
||||
|
||||
def zero_module(module):
|
||||
"""
|
||||
Zero out the parameters of a module and return it.
|
||||
"""
|
||||
for p in module.parameters():
|
||||
p.detach().zero_()
|
||||
return module
|
||||
|
||||
|
||||
def scale_module(module, scale):
|
||||
"""
|
||||
Scale the parameters of a module and return it.
|
||||
"""
|
||||
for p in module.parameters():
|
||||
p.detach().mul_(scale)
|
||||
return module
|
||||
|
||||
|
||||
def mean_flat(tensor):
|
||||
"""
|
||||
Take the mean over all non-batch dimensions.
|
||||
"""
|
||||
return tensor.mean(dim=list(range(1, len(tensor.shape))))
|
||||
|
||||
|
||||
def normalization(channels):
|
||||
"""
|
||||
Make a standard normalization layer.
|
||||
:param channels: number of input channels.
|
||||
:return: an nn.Module for normalization.
|
||||
"""
|
||||
return GroupNorm32(32, channels)
|
||||
|
||||
|
||||
# PyTorch 1.7 has SiLU, but we support PyTorch 1.5.
|
||||
class SiLU(nn.Module):
|
||||
def forward(self, x):
|
||||
return x * torch.sigmoid(x)
|
||||
|
||||
|
||||
class GroupNorm32(nn.GroupNorm):
|
||||
def forward(self, x):
|
||||
return super().forward(x.float()).type(x.dtype)
|
||||
|
||||
|
||||
def conv_nd(dims, *args, **kwargs):
|
||||
"""
|
||||
Create a 1D, 2D, or 3D convolution module.
|
||||
"""
|
||||
if dims == 1:
|
||||
return nn.Conv1d(*args, **kwargs)
|
||||
elif dims == 2:
|
||||
return nn.Conv2d(*args, **kwargs)
|
||||
elif dims == 3:
|
||||
return nn.Conv3d(*args, **kwargs)
|
||||
raise ValueError(f"unsupported dimensions: {dims}")
|
||||
|
||||
|
||||
def linear(*args, **kwargs):
|
||||
"""
|
||||
Create a linear module.
|
||||
"""
|
||||
return nn.Linear(*args, **kwargs)
|
||||
|
||||
|
||||
def avg_pool_nd(dims, *args, **kwargs):
|
||||
"""
|
||||
Create a 1D, 2D, or 3D average pooling module.
|
||||
"""
|
||||
if dims == 1:
|
||||
return nn.AvgPool1d(*args, **kwargs)
|
||||
elif dims == 2:
|
||||
return nn.AvgPool2d(*args, **kwargs)
|
||||
elif dims == 3:
|
||||
return nn.AvgPool3d(*args, **kwargs)
|
||||
raise ValueError(f"unsupported dimensions: {dims}")
|
||||
|
||||
|
||||
class HybridConditioner(nn.Module):
|
||||
def __init__(self, c_concat_config, c_crossattn_config):
|
||||
super().__init__()
|
||||
self.concat_conditioner = instantiate_from_config(c_concat_config)
|
||||
self.crossattn_conditioner = instantiate_from_config(c_crossattn_config)
|
||||
|
||||
def forward(self, c_concat, c_crossattn):
|
||||
c_concat = self.concat_conditioner(c_concat)
|
||||
c_crossattn = self.crossattn_conditioner(c_crossattn)
|
||||
return {"c_concat": [c_concat], "c_crossattn": [c_crossattn]}
|
||||
|
||||
|
||||
def noise_like(shape, device, repeat=False):
|
||||
repeat_noise = lambda: torch.randn((1, *shape[1:]), device=device).repeat(
|
||||
shape[0], *((1,) * (len(shape) - 1))
|
||||
)
|
||||
noise = lambda: torch.randn(shape, device=device)
|
||||
return repeat_noise() if repeat else noise()
|
@ -0,0 +1,102 @@
|
||||
import torch
|
||||
import numpy as np
|
||||
|
||||
|
||||
class AbstractDistribution:
|
||||
def sample(self):
|
||||
raise NotImplementedError()
|
||||
|
||||
def mode(self):
|
||||
raise NotImplementedError()
|
||||
|
||||
|
||||
class DiracDistribution(AbstractDistribution):
|
||||
def __init__(self, value):
|
||||
self.value = value
|
||||
|
||||
def sample(self):
|
||||
return self.value
|
||||
|
||||
def mode(self):
|
||||
return self.value
|
||||
|
||||
|
||||
class DiagonalGaussianDistribution(object):
|
||||
def __init__(self, parameters, deterministic=False):
|
||||
self.parameters = parameters
|
||||
self.mean, self.logvar = torch.chunk(parameters, 2, dim=1)
|
||||
self.logvar = torch.clamp(self.logvar, -30.0, 20.0)
|
||||
self.deterministic = deterministic
|
||||
self.std = torch.exp(0.5 * self.logvar)
|
||||
self.var = torch.exp(self.logvar)
|
||||
if self.deterministic:
|
||||
self.var = self.std = torch.zeros_like(self.mean).to(
|
||||
device=self.parameters.device
|
||||
)
|
||||
|
||||
def sample(self):
|
||||
x = self.mean + self.std * torch.randn(self.mean.shape).to(
|
||||
device=self.parameters.device
|
||||
)
|
||||
return x
|
||||
|
||||
def kl(self, other=None):
|
||||
if self.deterministic:
|
||||
return torch.Tensor([0.0])
|
||||
else:
|
||||
if other is None:
|
||||
return 0.5 * torch.sum(
|
||||
torch.pow(self.mean, 2) + self.var - 1.0 - self.logvar,
|
||||
dim=[1, 2, 3],
|
||||
)
|
||||
else:
|
||||
return 0.5 * torch.sum(
|
||||
torch.pow(self.mean - other.mean, 2) / other.var
|
||||
+ self.var / other.var
|
||||
- 1.0
|
||||
- self.logvar
|
||||
+ other.logvar,
|
||||
dim=[1, 2, 3],
|
||||
)
|
||||
|
||||
def nll(self, sample, dims=[1, 2, 3]):
|
||||
if self.deterministic:
|
||||
return torch.Tensor([0.0])
|
||||
logtwopi = np.log(2.0 * np.pi)
|
||||
return 0.5 * torch.sum(
|
||||
logtwopi + self.logvar + torch.pow(sample - self.mean, 2) / self.var,
|
||||
dim=dims,
|
||||
)
|
||||
|
||||
def mode(self):
|
||||
return self.mean
|
||||
|
||||
|
||||
def normal_kl(mean1, logvar1, mean2, logvar2):
|
||||
"""
|
||||
source: https://github.com/openai/guided-diffusion/blob/27c20a8fab9cb472df5d6bdd6c8d11c8f430b924/guided_diffusion/losses.py#L12
|
||||
Compute the KL divergence between two gaussians.
|
||||
Shapes are automatically broadcasted, so batches can be compared to
|
||||
scalars, among other use cases.
|
||||
"""
|
||||
tensor = None
|
||||
for obj in (mean1, logvar1, mean2, logvar2):
|
||||
if isinstance(obj, torch.Tensor):
|
||||
tensor = obj
|
||||
break
|
||||
assert tensor is not None, "at least one argument must be a Tensor"
|
||||
|
||||
# Force variances to be Tensors. Broadcasting helps convert scalars to
|
||||
# Tensors, but it does not work for torch.exp().
|
||||
logvar1, logvar2 = [
|
||||
x if isinstance(x, torch.Tensor) else torch.tensor(x).to(tensor)
|
||||
for x in (logvar1, logvar2)
|
||||
]
|
||||
|
||||
return 0.5 * (
|
||||
-1.0
|
||||
+ logvar2
|
||||
- logvar1
|
||||
+ torch.exp(logvar1 - logvar2)
|
||||
+ ((mean1 - mean2) ** 2) * torch.exp(-logvar2)
|
||||
)
|
@ -0,0 +1,37 @@
|
||||
import importlib
|
||||
from functools import lru_cache
|
||||
|
||||
import torch
|
||||
|
||||
|
||||
@lru_cache()
|
||||
def get_device():
|
||||
if torch.cuda.is_available():
|
||||
return "cuda"
|
||||
elif torch.backends.mps.is_available():
|
||||
return "mps"
|
||||
else:
|
||||
return "cpu"
|
||||
|
||||
|
||||
def print_params(model):
|
||||
total_params = sum(p.numel() for p in model.parameters())
|
||||
print(f"{model.__class__.__name__} has {total_params * 1.e-6:.2f} M params.")
|
||||
|
||||
|
||||
def instantiate_from_config(config):
|
||||
if not "target" in config:
|
||||
if config == "__is_first_stage__":
|
||||
return None
|
||||
elif config == "__is_unconditional__":
|
||||
return None
|
||||
raise KeyError("Expected key `target` to instantiate.")
|
||||
return get_obj_from_str(config["target"])(**config.get("params", dict()))
|
||||
|
||||
|
||||
def get_obj_from_str(string, reload=False):
|
||||
module, cls = string.rsplit(".", 1)
|
||||
if reload:
|
||||
module_imp = importlib.import_module(module)
|
||||
importlib.reload(module_imp)
|
||||
return getattr(importlib.import_module(module, package=None), cls)
|
@ -0,0 +1,19 @@
|
||||
albumentations==0.4.3
|
||||
diffusers
|
||||
#opencv-python==4.1.2.30
|
||||
pudb==2019.2
|
||||
invisible-watermark
|
||||
imageio==2.9.0
|
||||
imageio-ffmpeg==0.4.2
|
||||
pytorch-lightning==1.4.2
|
||||
omegaconf==2.1.1
|
||||
test-tube>=0.7.5
|
||||
streamlit>=0.73.1
|
||||
einops==0.3.0
|
||||
torch-fidelity==0.3.0
|
||||
transformers==4.19.2
|
||||
torchmetrics==0.6.0
|
||||
kornia==0.6
|
||||
-e git+https://github.com/CompVis/taming-transformers.git@master#egg=taming-transformers
|
||||
-e git+https://github.com/openai/CLIP.git@main#egg=clip
|
||||
-e .
|
@ -0,0 +1,32 @@
|
||||
from setuptools import setup, find_packages
|
||||
|
||||
setup(
|
||||
name='imaginairy',
|
||||
version='0.0.1',
|
||||
description='AI imagined images.',
|
||||
packages=find_packages("imaginairy"),
|
||||
install_requires=[
|
||||
'torch',
|
||||
'numpy',
|
||||
'tqdm',
|
||||
"albumentations==0.4.3",
|
||||
"diffusers",
|
||||
# opencv-python==4.1.2.30
|
||||
"pudb==2019.2",
|
||||
"invisible-watermark",
|
||||
"imageio==2.9.0",
|
||||
"imageio-ffmpeg==0.4.2",
|
||||
"pytorch-lightning==1.4.2",
|
||||
"omegaconf==2.1.1",
|
||||
"test-tube>=0.7.5",
|
||||
"streamlit>=0.73.1",
|
||||
"einops==0.3.0",
|
||||
"torch-fidelity==0.3.0",
|
||||
"transformers==4.19.2",
|
||||
"torchmetrics==0.6.0",
|
||||
"kornia==0.6",
|
||||
"realesrgan",
|
||||
"-e git+https://github.com/CompVis/taming-transformers.git@master#egg=taming-transformers",
|
||||
"-e git+https://github.com/openai/CLIP.git@main#egg=clip",
|
||||
],
|
||||
)
|
Loading…
Reference in New Issue