diff --git a/README.md b/README.md
index b3a8b2f..4b02906 100644
--- a/README.md
+++ b/README.md
@@ -300,7 +300,8 @@ docker run -it --gpus all -v $HOME/.cache/huggingface:/root/.cache/huggingface -
 
 ## ChangeLog
 
-- perf: tiled encoding of images (removes memory bottleneck)
+- feature: 🎉🎉 Make large images while retaining composition. Try `imagine "a flower" -w 1920 -h 1080 --upscale`
+- perf: sliced encoding of images to latents (removes memory bottleneck)
 - perf: use Silu for performance improvement over nonlinearity
 - perf: `xformers` added as a dependency for linux and windows. Gives a nice speed boost.
 - perf: sliced attention now runs on MacOS. A typo prevented that from happening previously.
@@ -555,6 +556,7 @@ would be uncorrelated to the rest of the surrounding image. It created terrible
   - https://github.com/huggingface/diffusers/pull/532/files
   - https://github.com/HazyResearch/flash-attention
   - https://github.com/chavinlo/sda-node
+  - https://github.com/AminRezaei0x443/memory-efficient-attention/issues/7
 - Development Environment
   - ✅ add tests
diff --git a/imaginairy/api.py b/imaginairy/api.py
index 878106d..98baf9f 100755
--- a/imaginairy/api.py
+++ b/imaginairy/api.py
@@ -3,6 +3,7 @@ import math
 import os
 import re
 
+from imaginairy.enhancers.upscale_riverwing import upscale_latent
 from imaginairy.schema import SafetyMode
 
 logger = logging.getLogger(__name__)
@@ -269,6 +270,7 @@ def _generate_single_image(
         with lc.timing("conditioning"):
             # need to expand if doing batches
             neutral_conditioning = _prompts_to_embeddings(prompt.negative_prompt, model)
+            _prompts_to_embeddings("", model)
             log_conditioning(neutral_conditioning, "neutral conditioning")
             if prompt.conditioning is not None:
                 positive_conditioning = prompt.conditioning
@@ -425,40 +427,43 @@ def _generate_single_image(
         }
         log_latent(init_latent_noised, "init_latent_noised")
 
-        comp_samples = _generate_composition_latent(
-            sampler=sampler,
-            sampler_kwargs={
-                "num_steps": prompt.steps,
-                "initial_latent": init_latent_noised,
-                "positive_conditioning": positive_conditioning,
-                "neutral_conditioning": neutral_conditioning,
-                "guidance_scale": prompt.prompt_strength,
-                "t_start": t_enc,
-                "mask": mask_latent,
-                "orig_latent": init_latent,
-                "shape": shape,
-                "batch_size": 1,
-                "denoiser_cls": denoiser_cls,
-            },
-        )
-        if comp_samples is not None:
-            noise = noise[:, :, : comp_samples.shape[2], : comp_samples.shape[3]]
-
-            schedule = NoiseSchedule(
-                model_num_timesteps=model.num_timesteps,
-                ddim_num_steps=prompt.steps,
-                model_alphas_cumprod=model.alphas_cumprod,
-                ddim_discretize="uniform",
-            )
-            t_enc = int(prompt.steps * 0.8)
-            init_latent_noised = noise_an_image(
-                comp_samples,
-                torch.tensor([t_enc - 1]).to(get_device()),
-                schedule=schedule,
-                noise=noise,
+        if prompt.allow_compose_phase:
+            comp_samples = _generate_composition_latent(
+                sampler=sampler,
+                sampler_kwargs={
+                    "num_steps": prompt.steps,
+                    "initial_latent": init_latent_noised,
+                    "positive_conditioning": positive_conditioning,
+                    "neutral_conditioning": neutral_conditioning,
+                    "guidance_scale": prompt.prompt_strength,
+                    "t_start": t_enc,
+                    "mask": mask_latent,
+                    "orig_latent": init_latent,
+                    "shape": shape,
+                    "batch_size": 1,
+                    "denoiser_cls": denoiser_cls,
+                },
             )
+            if comp_samples is not None:
+                result_images["composition"] = comp_samples
+                noise = noise[:, :, : comp_samples.shape[2], : comp_samples.shape[3]]
+
+                schedule = NoiseSchedule(
+                    model_num_timesteps=model.num_timesteps,
+                    ddim_num_steps=prompt.steps,
+                    model_alphas_cumprod=model.alphas_cumprod,
+                    ddim_discretize="uniform",
+                )
+                t_enc = int(prompt.steps * 0.75)
+                init_latent_noised = noise_an_image(
+                    comp_samples,
+                    torch.tensor([t_enc - 1]).to(get_device()),
+                    schedule=schedule,
+                    noise=noise,
+                )
+
+                log_latent(comp_samples, "comp_samples")
 
-            log_latent(comp_samples, "comp_samples")
         with lc.timing("sampling"):
             samples = sampler.sample(
                 num_steps=prompt.steps,
@@ -575,8 +580,7 @@ def _generate_composition_latent(
     from torch.nn import functional as F
 
-    new_kwargs = deepcopy(sampler_kwargs)
-    b, c, h, w = orig_shape = new_kwargs["shape"]
+    b, c, h, w = orig_shape = sampler_kwargs["shape"]
     max_compose_gen_size = 768
     shrink_scale = calc_scale_to_fit_within(
         height=h,
@@ -586,6 +590,8 @@ def _generate_composition_latent(
     if shrink_scale >= 1:
         return None
 
+    new_kwargs = deepcopy(sampler_kwargs)
+
     # shrink everything
     new_shape = b, c, int(round(h * shrink_scale)), int(round(w * shrink_scale))
     initial_latent = new_kwargs["initial_latent"]
@@ -622,7 +628,7 @@ def _generate_composition_latent(
         }
     )
     samples = sampler.sample(**new_kwargs)
-    # samples = upscale_latent(samples)
+    samples = upscale_latent(samples)
     samples = F.interpolate(samples, size=orig_shape[2:], mode="bilinear")
     return samples
diff --git a/imaginairy/cmds.py b/imaginairy/cmds.py
index 230b2af..61bb26d 100644
--- a/imaginairy/cmds.py
+++ b/imaginairy/cmds.py
@@ -136,6 +136,11 @@ common_options = [
         is_flag=True,
         help="Any images rendered will be tileable in the Y direction.",
     ),
+    click.option(
+        "--allow-compose-phase/--no-compose-phase",
+        default=True,
+        help="Allow the image to be composed at a lower resolution.",
+    ),
     click.option(
         "--mask-image",
         metavar="PATH|URL",
@@ -342,6 +347,7 @@ def imagine_cmd(
     tile,
     tile_x,
     tile_y,
+    allow_compose_phase,
     mask_image,
     mask_prompt,
     mask_mode,
@@ -387,6 +393,7 @@ def imagine_cmd(
         tile,
         tile_x,
         tile_y,
+        allow_compose_phase,
         mask_image,
         mask_prompt,
         mask_mode,
@@ -591,6 +598,7 @@ def _imagine_cmd(
     tile,
     tile_x,
     tile_y,
+    allow_compose_phase,
    mask_image,
     mask_prompt,
     mask_mode,
@@ -705,6 +713,7 @@ def _imagine_cmd(
             fix_faces=fix_faces,
             fix_faces_fidelity=fix_faces_fidelity,
             tile_mode=_tile_mode,
+            allow_compose_phase=allow_compose_phase,
             model=model_weights_path,
             model_config_path=model_config_path,
         )
diff --git a/imaginairy/enhancers/upscale_riverwing.py b/imaginairy/enhancers/upscale_riverwing.py
index 0850de6..67716cc 100644
--- a/imaginairy/enhancers/upscale_riverwing.py
+++ b/imaginairy/enhancers/upscale_riverwing.py
@@ -23,7 +23,13 @@ class NoiseLevelAndTextConditionedUpscaler(nn.Module):
     def forward(self, inp, sigma, low_res, low_res_sigma, c, **kwargs):
         cross_cond, cross_cond_padding, pooler = c
-        c_in = 1 / (low_res_sigma**2 + self.sigma_data**2) ** 0.5
+        sigma_data = self.sigma_data
+        # 'MPS does not support power op with int64 input'
+        if isinstance(low_res_sigma, torch.Tensor):
+            low_res_sigma = low_res_sigma.to(torch.float32)
+        if isinstance(sigma_data, torch.Tensor):
+            sigma_data = sigma_data.to(torch.float32)
+        c_in = 1 / (low_res_sigma**2 + sigma_data**2) ** 0.5
         c_noise = low_res_sigma.log1p()[:, None]
         c_in = append_dims(c_in, low_res.ndim)
         low_res_noise_embed = self.low_res_noise_embed(c_noise)
@@ -200,7 +206,7 @@ def upscale_latent(
     low_res_latent,
     upscale_prompt="",
     seed=0,
-    steps=30,
+    steps=15,
     guidance_scale=1.0,
     batch_size=1,
     num_samples=1,
diff --git a/imaginairy/schema.py b/imaginairy/schema.py
index 426b670..80f3ec4 100644
--- a/imaginairy/schema.py
+++ b/imaginairy/schema.py
@@ -110,6 +110,7 @@ class ImaginePrompt:
         sampler_type=config.DEFAULT_SAMPLER,
         conditioning=None,
         tile_mode="",
+        allow_compose_phase=True,
         model=config.DEFAULT_MODEL,
         model_config_path=None,
         is_intermediate=False,
@@ -136,8 +137,10 @@ class ImaginePrompt:
         self.mask_modify_original = mask_modify_original
         self.outpaint = outpaint
         self.tile_mode = tile_mode
+        self.allow_compose_phase = allow_compose_phase
         self.model = model
         self.model_config_path = model_config_path
+        # we don't want to save intermediate images
         self.is_intermediate = is_intermediate
         self.collect_progress_latents = collect_progress_latents
@@ -284,7 +287,10 @@ class ImagineResult:
     ):
         import torch
 
-        from imaginairy.img_utils import torch_img_to_pillow_img
+        from imaginairy.img_utils import (
+            model_latent_to_pillow_img,
+            torch_img_to_pillow_img,
+        )
         from imaginairy.utils import get_device, get_hardware_description
 
         self.prompt = prompt
@@ -305,7 +311,10 @@ class ImagineResult:
         for img_type, r_img in result_images.items():
             if isinstance(r_img, torch.Tensor):
-                r_img = torch_img_to_pillow_img(r_img)
+                if r_img.shape[1] == 4:
+                    r_img = model_latent_to_pillow_img(r_img)
+                else:
+                    r_img = torch_img_to_pillow_img(r_img)
             self.images[img_type] = r_img
 
         self.timings = timings
diff --git a/imaginairy/vendored/k_diffusion/models/image_v1.py b/imaginairy/vendored/k_diffusion/models/image_v1.py
index a8fc678..06a10e9 100644
--- a/imaginairy/vendored/k_diffusion/models/image_v1.py
+++ b/imaginairy/vendored/k_diffusion/models/image_v1.py
@@ -142,6 +142,10 @@ class UBlock(layers.ConditionedSequential):
     def forward(self, input, cond, skip=None):
         if skip is not None:
+            if input.shape[-2:] != skip.shape[-2:]:
+                input = nn.functional.interpolate(
+                    input, size=skip.shape[-2:], mode="bilinear"
+                )
             input = torch.cat([input, skip], dim=1)
         return super().forward(input, cond)
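
The `api.py` hunks above implement two-phase generation: when the requested size is larger than the model composes reliably at, the image is first composed at a reduced size, the resulting latent is upscaled and re-noised to roughly 75% of the schedule, and ordinary sampling then finishes the detail at full resolution. Below is a minimal, self-contained sketch of that flow, not the repo's exact code: `sample_latent`, `upscale_latent`, and `noise_to_step` are stand-in callables for `sampler.sample`, the riverwing latent upscaler, and `noise_an_image`, and the `768 // 8` latent-space threshold is one reading of `max_compose_gen_size`, assumed here rather than taken from the source.

```python
import torch
import torch.nn.functional as F

# Assumption: latent height/width for a 768px image with an f=8 autoencoder.
MAX_COMPOSE_SIZE = 768 // 8


def compose_then_refine(shape, steps, sample_latent, upscale_latent, noise_to_step):
    """Compose at low resolution, then re-noise and refine at full resolution."""
    b, c, h, w = shape
    shrink_scale = MAX_COMPOSE_SIZE / max(h, w)
    if shrink_scale >= 1:
        # Small enough to compose directly; no separate composition phase.
        return sample_latent(shape, t_start=steps, initial_latent=None)

    # 1. Sample the overall composition at a size the model handles well.
    small = (b, c, int(round(h * shrink_scale)), int(round(w * shrink_scale)))
    comp = sample_latent(small, t_start=steps, initial_latent=None)

    # 2. Upscale in latent space, then snap to the exact target size.
    comp = upscale_latent(comp)
    comp = F.interpolate(comp, size=(h, w), mode="bilinear")

    # 3. Re-noise to ~75% of the schedule so sampling redoes detail, not layout.
    t_enc = int(steps * 0.75)
    return sample_latent(shape, t_start=t_enc, initial_latent=noise_to_step(comp, t_enc))


# Toy stand-ins, just to show the call shape:
sample = lambda shape, t_start, initial_latent: (
    torch.randn(shape) if initial_latent is None else initial_latent
)
up2x = lambda lat: F.interpolate(lat, scale_factor=2, mode="bilinear")
renoise = lambda lat, t: lat + torch.randn_like(lat)
print(compose_then_refine((1, 4, 135, 240), 30, sample, up2x, renoise).shape)
# torch.Size([1, 4, 135, 240])  (a 1080x1920 image in latent space)
```

Re-noising to a partial timestep is what preserves the composition: the early, layout-setting denoising steps are skipped at full resolution, so the final sampling pass only reconstructs high-frequency detail on top of the upscaled composition.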
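The `upscale_riverwing.py` hunk is an Apple MPS workaround: applying `** 2` to an int64 tensor fails with the quoted error, "MPS does not support power op with int64 input", so both sigma operands are cast to float32 before the power op. A standalone sketch of the same guard follows; the function name is illustrative, not part of the library.

```python
import torch


def safe_c_in(low_res_sigma, sigma_data):
    """Compute 1 / sqrt(low_res_sigma**2 + sigma_data**2) without int64 pow.

    MPS rejects `tensor ** 2` on int64 inputs, so cast tensors to float32 first.
    """
    if isinstance(low_res_sigma, torch.Tensor):
        low_res_sigma = low_res_sigma.to(torch.float32)
    if isinstance(sigma_data, torch.Tensor):
        sigma_data = sigma_data.to(torch.float32)
    return 1 / (low_res_sigma**2 + sigma_data**2) ** 0.5


print(safe_c_in(torch.tensor([2]), 0.5))  # tensor([0.4851]); input was int64
```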
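The vendored `UBlock.forward` change makes the upscaler's U-Net tolerant of odd feature-map sizes. The compose phase produces latents of arbitrary size via `int(round(h * shrink_scale))`, and odd sizes do not survive a downsample/upsample round trip (25 becomes 13, which upsamples to 26), so the decoder tensor can disagree with its encoder skip by a pixel and `torch.cat` would raise. A standalone demonstration with made-up sizes:

```python
import torch
from torch import nn

decoder_feat = torch.randn(1, 32, 26, 26)  # a 13x13 map after a 2x upsample
skip_feat = torch.randn(1, 32, 25, 25)     # skip saved from the 25x25 encoder stage

# Without this guard, torch.cat raises because 26x26 != 25x25.
if decoder_feat.shape[-2:] != skip_feat.shape[-2:]:
    decoder_feat = nn.functional.interpolate(
        decoder_feat, size=skip_feat.shape[-2:], mode="bilinear"
    )
print(torch.cat([decoder_feat, skip_feat], dim=1).shape)
# torch.Size([1, 64, 25, 25])
```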