feature: generate large images

Added a composition stage so large images are more coherent
pull/259/head
Bryce 1 year ago committed by Bryce Drennan
parent b93b6a4d7c
commit 2aef6089e0

@ -53,15 +53,17 @@ Use prompt strength to control how strong the edit is. For extra control you can
with prompt-based masking.
```bash
>> aimg edit scenic_landscape.jpg "make it winter" --prompt-strength 20
>> aimg edit scenic_landscape.jpg "make it winter" --steps 30 --arg-schedule "prompt_strength[2:25:0.5]" --compilation-anim
>> aimg edit dog.jpg "make the dog red" --prompt-strength 5
>> aimg edit bowl_of_fruit.jpg "replace the fruit with strawberries"
>> aimg edit freckled_woman.jpg "make her a cyborg" --prompt-strength 13
# enter imaginairy shell
>> aimg
🤖🧠> edit scenic_landscape.jpg -p "make it winter" --prompt-strength 20
🤖🧠> edit scenic_landscape.jpg -p "make it winter" --steps 30 --arg-schedule "prompt_strength[2:25:0.5]" --compilation-anim
🤖🧠> edit dog.jpg -p "make the dog red" --prompt-strength 5
🤖🧠> edit bowl_of_fruit.jpg -p "replace the fruit with strawberries"
🤖🧠> edit freckled_woman.jpg -p "make her a cyborg" --prompt-strength 13
# create a comparison gif
>> aimg edit pearl_girl.jpg "make her wear clown makeup" --compare-gif
🤖🧠> edit pearl_girl.jpg -p "make her wear clown makeup" --compare-gif
# create an animation showing the edit with increasing prompt strengths
>> aimg edit mona-lisa.jpg "make it a color professional photo headshot" --negative-prompt "old, ugly, blurry" --arg-schedule "prompt-strength[2:8:0.5]" --compilation-anim gif
🤖🧠> edit mona-lisa.jpg -p "make it a color professional photo headshot" --negative-prompt "old, ugly, blurry" --arg-schedule "prompt-strength[2:8:0.5]" --compilation-anim gif
```
@ -570,6 +572,7 @@ would be uncorrelated to the rest of the surrounding image. It created terrible
- ✅ add k-diffusion sampling methods
- ✅ tiling
- ✅ generation videos/gifs
- [Attend and Excite](https://attendandexcite.github.io/Attend-and-Excite/)
- Compositional Visual Generation
- https://github.com/energy-based-model/Compositional-Visual-Generation-with-Composable-Diffusion-Models-PyTorch
- https://colab.research.google.com/github/energy-based-model/Compositional-Visual-Generation-with-Composable-Diffusion-Models-PyTorch/blob/main/notebooks/demo.ipynb#scrollTo=wt_j3uXZGFAS
@ -600,6 +603,10 @@ would be uncorrelated to the rest of the surrounding image. It created terrible
- ✅ text based image masking
- ✅ ClipSeg - https://github.com/timojl/clipseg
- https://github.com/facebookresearch/detectron2
- https://x-decoder-vl.github.io/
- Maskless editing
- ✅ instruct-pix2pix
-
- Attention Control Methods
- https://github.com/bloc97/CrossAttentionControl
- https://github.com/ChenWu98/cycle-diffusion
@ -609,7 +616,10 @@ would be uncorrelated to the rest of the surrounding image. It created terrible
- ✅ realesrgan
- ldm
- https://github.com/lowfuel/progrock-stable
- gobig
- [txt2imghd](https://github.com/jquesnelle/txt2imghd/blob/master/txt2imghd.py)
- latent scaling + reprocessing
- stability upscaler
- rivers have wings upscaler
- stable super-res?
- todo: try with 1-0-0-0 mask at full image resolution (re-encoding the entire image + predicted image at every step)
- todo: use a gaussian pyramid and feed only the "high-detail" level of the pyramid into the next step
@ -684,6 +694,7 @@ would be uncorrelated to the rest of the surrounding image. It created terrible
- https://stablecog.com/
## Further Reading
- https://github.com/vdumoulin/conv_arithmetic/blob/master/README.md
- [Prompt Engineering Handbook](https://openart.ai/promptbook)
- Differences between samplers
- https://www.reddit.com/r/StableDiffusion/comments/xbeyw3/can_anyone_offer_a_little_guidance_on_the/

@ -1,4 +1,5 @@
import logging
import math
import os
import re
@ -194,6 +195,8 @@ def _generate_single_image(
progress_img_interval_min_s=0.1,
half_mode=None,
add_caption=False,
suppress_inpaint=False,
return_latent=False,
):
import torch.nn
from PIL import Image, ImageOps
@ -241,7 +244,8 @@ def _generate_single_image(
weights_location=prompt.model,
config_path=prompt.model_config_path,
half_mode=half_mode,
for_inpainting=prompt.mask_image or prompt.mask_prompt or prompt.outpaint,
for_inpainting=(prompt.mask_image or prompt.mask_prompt or prompt.outpaint)
and not suppress_inpaint,
)
progress_latents = []
@ -288,6 +292,9 @@ def _generate_single_image(
c_cat = []
c_cat_neutral = None
result_images = {}
seed_everything(prompt.seed)
noise = randn_seeded(seed=prompt.seed, size=shape).to(get_device())
if prompt.init_image:
starting_image = prompt.init_image
generation_strength = 1 - prompt.init_image_strength
@ -341,10 +348,11 @@ def _generate_single_image(
shape = init_latent.shape
log_latent(init_latent, "init_latent")
seed_everything(prompt.seed)
noise = randn_seeded(seed=prompt.seed, size=init_latent.size())
noise = noise.to(get_device())
noise = randn_seeded(seed=prompt.seed, size=init_latent.shape).to(
get_device()
)
# noise = noise[:, :, : init_latent.shape[2], : init_latent.shape[3]]
schedule = NoiseSchedule(
model_num_timesteps=model.num_timesteps,
@ -417,6 +425,40 @@ def _generate_single_image(
}
log_latent(init_latent_noised, "init_latent_noised")
comp_samples = _generate_composition_latent(
sampler=sampler,
sampler_kwargs={
"num_steps": prompt.steps,
"initial_latent": init_latent_noised,
"positive_conditioning": positive_conditioning,
"neutral_conditioning": neutral_conditioning,
"guidance_scale": prompt.prompt_strength,
"t_start": t_enc,
"mask": mask_latent,
"orig_latent": init_latent,
"shape": shape,
"batch_size": 1,
"denoiser_cls": denoiser_cls,
},
)
if comp_samples is not None:
noise = noise[:, :, : comp_samples.shape[2], : comp_samples.shape[3]]
schedule = NoiseSchedule(
model_num_timesteps=model.num_timesteps,
ddim_num_steps=prompt.steps,
model_alphas_cumprod=model.alphas_cumprod,
ddim_discretize="uniform",
)
t_enc = int(prompt.steps * 0.8)
init_latent_noised = noise_an_image(
comp_samples,
torch.tensor([t_enc - 1]).to(get_device()),
schedule=schedule,
noise=noise,
)
log_latent(comp_samples, "comp_samples")
with lc.timing("sampling"):
samples = sampler.sample(
num_steps=prompt.steps,
@ -431,6 +473,8 @@ def _generate_single_image(
batch_size=1,
denoiser_cls=denoiser_cls,
)
if return_latent:
return samples
with lc.timing("decoding"):
gen_imgs_t = model.decode_first_stage(samples)
@ -441,6 +485,11 @@ def _generate_single_image(
log_img(mask_final, "reconstituting mask")
mask_final = ImageOps.invert(mask_final)
gen_img = Image.composite(gen_img, init_image, mask_final)
gen_img = combine_image(
original_img=init_image,
generated_img=gen_img,
mask_img=mask_image_orig,
)
log_img(gen_img, "reconstituted image")
upscaled_img = None
@ -504,6 +553,80 @@ def _prompts_to_embeddings(prompts, model):
return conditioning
def calc_scale_to_fit_within(
height,
width,
max_size,
):
if max(height, width) < max_size:
return 1
if width > height:
return max_size / width
return max_size / height
def _generate_composition_latent(
sampler,
sampler_kwargs,
):
from copy import deepcopy
from torch.nn import functional as F
new_kwargs = deepcopy(sampler_kwargs)
b, c, h, w = orig_shape = new_kwargs["shape"]
max_compose_gen_size = 768
shrink_scale = calc_scale_to_fit_within(
height=h,
width=w,
max_size=int(math.ceil(max_compose_gen_size / 8)),
)
if shrink_scale >= 1:
return None
# shrink everything
new_shape = b, c, int(round(h * shrink_scale)), int(round(w * shrink_scale))
initial_latent = new_kwargs["initial_latent"]
if initial_latent is not None:
initial_latent = F.interpolate(initial_latent, size=new_shape[2:], mode="area")
for cond in [
new_kwargs["positive_conditioning"],
new_kwargs["neutral_conditioning"],
]:
cond["c_concat"] = [
F.interpolate(c, size=new_shape[2:], mode="area") for c in cond["c_concat"]
]
mask_latent = new_kwargs["mask"]
if mask_latent is not None:
mask_latent = F.interpolate(mask_latent, size=new_shape[2:], mode="area")
orig_latent = new_kwargs["orig_latent"]
if orig_latent is not None:
orig_latent = F.interpolate(orig_latent, size=new_shape[2:], mode="area")
t_start = new_kwargs["t_start"]
if t_start is not None:
gen_strength = new_kwargs["t_start"] / new_kwargs["num_steps"]
t_start = int(round(15 * gen_strength))
new_kwargs.update(
{
"num_steps": 15,
"initial_latent": initial_latent,
"t_start": t_start,
"mask": mask_latent,
"orig_latent": orig_latent,
"shape": new_shape,
}
)
samples = sampler.sample(**new_kwargs)
# samples = upscale_latent(samples)
samples = F.interpolate(samples, size=orig_shape[2:], mode="bilinear")
return samples
def prompt_normalized(prompt):
return re.sub(r"[^a-zA-Z0-9.,\[\]-]+", "_", prompt)[:130]
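
For orientation, here is a minimal self-contained sketch of the composition flow wired in above: run a short low-resolution pass to settle the overall composition, upscale that latent back to the target size, re-noise it, and let the main sampler refine detail at full resolution. `sample_fn` and the re-noising line are stand-ins for the real `sampler.sample` and `noise_an_image`/`NoiseSchedule` machinery, not the library API.

```python
# Minimal sketch of the composition-then-refine flow (stand-in names, not the library API).
import torch
import torch.nn.functional as F

MAX_COMPOSE_LATENT = 768 // 8  # latent-space cap, mirroring max_compose_gen_size / 8


def fit_scale(h, w, max_size):
    # Same idea as calc_scale_to_fit_within: shrink only when the latent exceeds the cap.
    if max(h, w) < max_size:
        return 1.0
    return max_size / max(h, w)


def composition_then_refine(initial_latent, sample_fn, steps=30):
    """`sample_fn(latent, steps)` stands in for the diffusion sampler call."""
    b, c, h, w = initial_latent.shape
    scale = fit_scale(h, w, MAX_COMPOSE_LATENT)
    if scale >= 1:
        return sample_fn(initial_latent, steps)  # small enough; no composition stage

    # 1. cheap low-resolution pass to establish the overall composition
    small = F.interpolate(
        initial_latent,
        size=(int(round(h * scale)), int(round(w * scale))),
        mode="area",
    )
    comp = sample_fn(small, 15)

    # 2. bring the composition latent back up to the target size
    comp = F.interpolate(comp, size=(h, w), mode="bilinear")

    # 3. re-noise at ~80% strength (a crude linear blend here; the real code uses
    #    noise_an_image with a proper NoiseSchedule) and refine at full resolution
    t_frac = 0.8
    noised = (1 - t_frac) * comp + t_frac * torch.randn_like(comp)
    return sample_fn(noised, steps)


if __name__ == "__main__":
    fake_sampler = lambda latent, n: latent  # placeholder "sampler"
    out = composition_then_refine(torch.randn(1, 4, 128, 128), fake_sampler)
    print(out.shape)  # torch.Size([1, 4, 128, 128])
```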

@ -0,0 +1,299 @@
from functools import lru_cache
import numpy as np
import torch
import torch.nn.functional as F
from pytorch_lightning import seed_everything
from torch import nn
from imaginairy.model_manager import hf_hub_download
from imaginairy.utils import get_device, platform_appropriate_autocast
from imaginairy.vendored import k_diffusion as K
from imaginairy.vendored.k_diffusion import layers
from imaginairy.vendored.k_diffusion.models.image_v1 import ImageDenoiserModelV1
from imaginairy.vendored.k_diffusion.utils import append_dims
class NoiseLevelAndTextConditionedUpscaler(nn.Module):
def __init__(self, inner_model, sigma_data=1.0, embed_dim=256):
super().__init__()
self.inner_model = inner_model
self.sigma_data = sigma_data
self.low_res_noise_embed = K.layers.FourierFeatures(1, embed_dim, std=2)
def forward(self, inp, sigma, low_res, low_res_sigma, c, **kwargs):
cross_cond, cross_cond_padding, pooler = c
c_in = 1 / (low_res_sigma**2 + self.sigma_data**2) ** 0.5
c_noise = low_res_sigma.log1p()[:, None]
c_in = append_dims(c_in, low_res.ndim)
low_res_noise_embed = self.low_res_noise_embed(c_noise)
low_res_in = F.interpolate(low_res, scale_factor=2, mode="nearest") * c_in
mapping_cond = torch.cat([low_res_noise_embed, pooler], dim=1)
return self.inner_model(
inp,
sigma,
unet_cond=low_res_in,
mapping_cond=mapping_cond,
cross_cond=cross_cond,
cross_cond_padding=cross_cond_padding,
**kwargs,
)
@lru_cache(maxsize=1)
def get_upscaler_model(
model_path,
pooler_dim=768,
train=False,
device=get_device(),
):
config = {
"type": "image_v1",
"input_channels": 4,
"input_size": [48, 48],
"patch_size": 1,
"mapping_out": 768,
"mapping_cond_dim": 896,
"unet_cond_dim": 4,
"depths": [4, 4, 4, 4],
"channels": [384, 384, 768, 768],
"self_attn_depths": [False, False, False, True],
"cross_attn_depths": [False, True, True, True],
"cross_cond_dim": 768,
"has_variance": True,
"dropout_rate": 0.0,
"augment_prob": 0.0,
"augment_wrapper": False,
"sigma_data": 1.0,
"sigma_min": 1e-2,
"sigma_max": 20,
"sigma_sample_density": {"type": "lognormal", "mean": -0.5, "std": 1.2},
"skip_stages": 0,
}
model = ImageDenoiserModelV1(
config["input_channels"],
config["mapping_out"],
config["depths"],
config["channels"],
config["self_attn_depths"],
config["cross_attn_depths"],
patch_size=config["patch_size"],
dropout_rate=config["dropout_rate"],
mapping_cond_dim=config["mapping_cond_dim"]
+ (9 if config["augment_wrapper"] else 0),
unet_cond_dim=config["unet_cond_dim"],
cross_cond_dim=config["cross_cond_dim"],
skip_stages=config["skip_stages"],
has_variance=config["has_variance"],
)
model = NoiseLevelAndTextConditionedUpscaler(
model,
sigma_data=config["sigma_data"],
embed_dim=config["mapping_cond_dim"] - pooler_dim,
)
ckpt = torch.load(model_path, map_location="cpu")
model.load_state_dict(ckpt["model_ema"])
model = layers.DenoiserWithVariance(model, sigma_data=config["sigma_data"])
if not train:
model = model.eval().requires_grad_(False)
return model.to(device)
class CFGUpscaler(nn.Module):
def __init__(self, model, uc, cond_scale, device):
super().__init__()
self.inner_model = model
self.uc = uc
self.cond_scale = cond_scale
self.device = device
def forward(self, x, sigma, low_res, low_res_sigma, c):
if self.cond_scale in (0.0, 1.0):
# Shortcut for when we don't need to run both.
if self.cond_scale == 0.0:
c_in = self.uc
elif self.cond_scale == 1.0:
c_in = c
return self.inner_model(
x, sigma, low_res=low_res, low_res_sigma=low_res_sigma, c=c_in
)
x_in = torch.cat([x] * 2)
sigma_in = torch.cat([sigma] * 2)
low_res_in = torch.cat([low_res] * 2)
low_res_sigma_in = torch.cat([low_res_sigma] * 2)
c_in = [torch.cat([uc_item, c_item]) for uc_item, c_item in zip(self.uc, c)]
uncond, cond = self.inner_model(
x_in, sigma_in, low_res=low_res_in, low_res_sigma=low_res_sigma_in, c=c_in
).chunk(2)
return uncond + (cond - uncond) * self.cond_scale
class CLIPTokenizerTransform:
def __init__(self, version="openai/clip-vit-large-patch14", max_length=77):
from transformers import CLIPTokenizer
self.tokenizer = CLIPTokenizer.from_pretrained(version)
self.max_length = max_length
def __call__(self, text):
indexer = 0 if isinstance(text, str) else ...
tok_out = self.tokenizer(
text,
truncation=True,
max_length=self.max_length,
return_length=True,
return_overflowing_tokens=False,
padding="max_length",
return_tensors="pt",
)
input_ids = tok_out["input_ids"][indexer]
attention_mask = 1 - tok_out["attention_mask"][indexer]
return input_ids, attention_mask
class CLIPEmbedder(nn.Module):
"""Uses the CLIP transformer encoder for text (from Hugging Face)."""
def __init__(self, version="openai/clip-vit-large-patch14", device="cuda"):
super().__init__()
from transformers import CLIPTextModel, logging
logging.set_verbosity_error()
self.transformer = CLIPTextModel.from_pretrained(version)
self.transformer = self.transformer.eval().requires_grad_(False).to(device)
@property
def device(self):
return self.transformer.device
def forward(self, tok_out):
input_ids, cross_cond_padding = tok_out
clip_out = self.transformer(
input_ids=input_ids.to(self.device), output_hidden_states=True
)
return (
clip_out.hidden_states[-1],
cross_cond_padding.to(self.device),
clip_out.pooler_output,
)
@lru_cache()
def clip_up_models():
with platform_appropriate_autocast():
tok_up = CLIPTokenizerTransform()
text_encoder_up = CLIPEmbedder(device=get_device())
return text_encoder_up, tok_up
@torch.no_grad()
def condition_up(prompts):
text_encoder_up, tok_up = clip_up_models()
return text_encoder_up(tok_up(prompts))
@torch.no_grad()
def upscale_latent(
low_res_latent,
upscale_prompt="",
seed=0,
steps=30,
guidance_scale=1.0,
batch_size=1,
num_samples=1,
# Amount of noise to add per step (0.0=deterministic). Used in all samplers except `k_euler`.
eta=1.0,
device=get_device(),
):
# Add noise to the latent vectors before upscaling. This theoretically can make the model work better on out-of-distribution inputs, but mostly just seems to make it match the input less, so it's turned off by default.
noise_aug_level = 0 # @param {type: 'slider', min: 0.0, max: 0.6, step:0.025}
noise_aug_type = "gaussian" # @param ["gaussian", "fake"]
# @markdown Sampler settings. `k_dpm_adaptive` uses an adaptive solver with error tolerance `tol_scale`, all other use a fixed number of steps.
sampler = "k_dpm_2_ancestral" # @param ["k_euler", "k_euler_ancestral", "k_dpm_2_ancestral", "k_dpm_fast", "k_dpm_adaptive"]
tol_scale = 0.25 # @param {type: 'number'}
seed_everything(seed)
# uc = condition_up(batch_size * ["blurry, low resolution, 720p, grainy"])
uc = condition_up(batch_size * [""])
c = condition_up(batch_size * [upscale_prompt])
[_, C, H, W] = low_res_latent.shape
# Noise levels from stable diffusion.
sigma_min, sigma_max = 0.029167532920837402, 14.614642143249512
model_up = get_upscaler_model(
model_path=hf_hub_download(
"pcuenq/k-upscaler", "laion_text_cond_latent_upscaler_2_1_00470000_slim.pth"
),
device=device,
)
model_wrap = CFGUpscaler(model_up, uc, cond_scale=guidance_scale, device=device)
low_res_sigma = torch.full([batch_size], noise_aug_level, device=device)
x_shape = [batch_size, C, 2 * H, 2 * W]
def do_sample(noise, extra_args):
# We take log-linear steps in noise-level from sigma_max to sigma_min, using one of the k diffusion samplers.
sigmas = (
torch.linspace(np.log(sigma_max), np.log(sigma_min), steps + 1)
.exp()
.to(device)
)
if sampler == "k_euler":
return K.sampling.sample_euler(
model_wrap, noise * sigma_max, sigmas, extra_args=extra_args
)
if sampler == "k_euler_ancestral":
return K.sampling.sample_euler_ancestral(
model_wrap, noise * sigma_max, sigmas, extra_args=extra_args, eta=eta
)
if sampler == "k_dpm_2_ancestral":
return K.sampling.sample_dpm_2_ancestral(
model_wrap, noise * sigma_max, sigmas, extra_args=extra_args, eta=eta
)
if sampler == "k_dpm_fast":
return K.sampling.sample_dpm_fast(
model_wrap,
noise * sigma_max,
sigma_min,
sigma_max,
steps,
extra_args=extra_args,
eta=eta,
)
if sampler == "k_dpm_adaptive":
sampler_opts = {
"s_noise": 1.0,
"rtol": tol_scale * 0.05,
"atol": tol_scale / 127.5,
"pcoeff": 0.2,
"icoeff": 0.4,
"dcoeff": 0,
}
return K.sampling.sample_dpm_adaptive(
model_wrap,
noise * sigma_max,
sigma_min,
sigma_max,
extra_args=extra_args,
eta=eta,
**sampler_opts,
)
raise ValueError(f"Unknown sampler {sampler}")
for _ in range((num_samples - 1) // batch_size + 1):
if noise_aug_type == "gaussian":
latent_noised = low_res_latent + noise_aug_level * torch.randn_like(
low_res_latent
)
elif noise_aug_type == "fake":
latent_noised = low_res_latent * (noise_aug_level**2 + 1) ** 0.5
extra_args = {"low_res": latent_noised, "low_res_sigma": low_res_sigma, "c": c}
noise = torch.randn(x_shape, device=device)
up_latents = do_sample(noise, extra_args)
return up_latents
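
As a quick sanity check of the schedule used by `do_sample` above: the sigmas step log-linearly from `sigma_max` down to `sigma_min`, so each step multiplies the noise level by a constant factor. The constants are copied from the code above (`math.log` used here in place of `np.log`; same values).

```python
import math
import torch

sigma_min, sigma_max = 0.029167532920837402, 14.614642143249512
steps = 30

# Same schedule as do_sample: linear in log-space from sigma_max down to sigma_min.
sigmas = torch.linspace(math.log(sigma_max), math.log(sigma_min), steps + 1).exp()
assert torch.isclose(sigmas[0], torch.tensor(sigma_max))
assert torch.isclose(sigmas[-1], torch.tensor(sigma_min))

# Linear spacing in log-space means a constant ratio between consecutive noise levels.
ratios = sigmas[1:] / sigmas[:-1]
assert torch.allclose(ratios, ratios[0])
print(f"per-step noise decay factor: {ratios[0].item():.3f}")  # ~0.813
```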

@ -76,7 +76,9 @@ def torch_img_to_pillow_img(img_t: torch.Tensor):
elif img_t.shape[1] == 3:
colorspace = "RGB"
else:
raise ValueError("Unsupported colorspace")
raise ValueError(
f"Unsupported colorspace. {img_t.shape[1]} channels in {img_t.shape} shape"
)
img_t = rearrange(img_t, "b c h w -> b h w c")
img_t = torch.clamp((img_t + 1.0) / 2.0, min=0.0, max=1.0)
img_np = (255.0 * img_t).cpu().numpy().astype(np.uint8)[0]

@ -91,14 +91,17 @@ class ImageLoggingContext:
self.last_progress_img_ts = 0
self.last_progress_img_step = -1000
self._prev_log_context = None
def __enter__(self):
global _CURRENT_LOGGING_CONTEXT # noqa
self._prev_log_context = _CURRENT_LOGGING_CONTEXT
_CURRENT_LOGGING_CONTEXT = self
return self
def __exit__(self, exc_type, exc_val, exc_tb):
global _CURRENT_LOGGING_CONTEXT # noqa
_CURRENT_LOGGING_CONTEXT = None
_CURRENT_LOGGING_CONTEXT = self._prev_log_context
def timing(self, description):
return TimingContext(self, description)

@ -1,5 +1,6 @@
# pylama:ignore=W0613
import logging
import math
from contextlib import contextmanager
import pytorch_lightning as pl
@ -317,3 +318,62 @@ class IdentityFirstStage(torch.nn.Module):
def forward(self, x, *args, **kwargs):
return x
def chunk_latent(tensor, chunk_size=64, overlap_size=8):
# Get the shape of the tensor
batch_size, num_channels, height, width = tensor.shape
# Calculate the number of chunks along each dimension
num_rows = int(math.ceil(height / chunk_size))
num_cols = int(math.ceil(width / chunk_size))
# Initialize a list to store the chunks
chunks = []
# Loop over the rows and columns
for row in range(num_rows):
for col in range(num_cols):
# Calculate the start and end indices for the chunk along each dimension
row_start = max(row * chunk_size - overlap_size, 0)
row_end = min(row_start + chunk_size + overlap_size, height)
col_start = max(col * chunk_size - overlap_size, 0)
col_end = min(col_start + chunk_size + overlap_size, width)
# Extract the chunk from the tensor and append it to the list of chunks
chunk = tensor[:, :, row_start:row_end, col_start:col_end]
chunks.append((chunk, row_start, col_start))
return chunks, num_rows, num_cols
def merge_tensors(tensor_list, num_rows, num_cols):
print(f"num_rows: {num_rows}")
print(f"num_cols: {num_cols}")
n, channel, h, w = tensor_list[0].size()
assert n == 1
final_width = 0
final_height = 0
for col_idx in range(num_cols):
final_width += tensor_list[col_idx].size()[3]
for row_idx in range(num_rows):
final_height += tensor_list[row_idx * num_cols].size()[2]
final_tensor = torch.zeros([1, channel, final_height, final_width])
print(f"final size {final_tensor.size()}")
for row_idx in range(num_rows):
for col_idx in range(num_cols):
list_idx = row_idx * num_cols + col_idx
chunk = tensor_list[list_idx]
print(f"chunk size: {chunk.size()}")
_, _, chunk_h, chunk_w = chunk.size()
final_tensor[
:,
:,
row_idx * h : row_idx * h + chunk_h,
col_idx * w : col_idx * w + chunk_w,
] = chunk
return final_tensor
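
To make the overlap arithmetic in `chunk_latent` concrete, these are the slice bounds it produces for a 96x96 latent with the default `chunk_size=64` and `overlap_size=8` (the index math is repeated inline rather than imported):

```python
import math

height = width = 96
chunk_size, overlap_size = 64, 8

num_rows = math.ceil(height / chunk_size)  # 2
num_cols = math.ceil(width / chunk_size)   # 2

for row in range(num_rows):
    for col in range(num_cols):
        # identical index math to chunk_latent above
        row_start = max(row * chunk_size - overlap_size, 0)
        row_end = min(row_start + chunk_size + overlap_size, height)
        col_start = max(col * chunk_size - overlap_size, 0)
        col_end = min(col_start + chunk_size + overlap_size, width)
        print((row_start, row_end), (col_start, col_end))

# (0, 72) (0, 72)    top-left chunk: 64px plus 8px of overlap toward the interior
# (0, 72) (56, 96)   top-right chunk starts 8px early so the seams overlap
# (56, 96) (0, 72)
# (56, 96) (56, 96)
```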

@ -1,74 +1,188 @@
import re
import torch
from PIL import Image, ImageDraw
from torch import nn
from imaginairy.img_utils import torch_img_to_pillow_img
def prepare_image_for_outpaint(
img, mask=None, up=None, down=None, left=None, right=None, _all=0, snap_multiple=8
def outpaint_calculations(
img_width,
img_height,
up=None,
down=None,
left=None,
right=None,
_all=0,
snap_multiple=8,
):
up = up if up is not None else _all
down = down if down is not None else _all
left = left if left is not None else _all
right = right if right is not None else _all
lft_pct = left / (left + right)
rgt_pct = right / (left + right)
up_pct = up / (up + down)
dwn_pct = down / (up + down)
lft_pct = left / (left + right) if left + right else 0
rgt_pct = right / (left + right) if left + right else 0
up_pct = up / (up + down) if up + down else 0
dwn_pct = down / (up + down) if up + down else 0
new_width = round((img.width + left + right) / snap_multiple) * snap_multiple
new_height = round((img.height + up + down) / snap_multiple) * snap_multiple
height_addition = max(new_height - img.height, 0)
width_addition = max(new_width - img.width, 0)
new_width = round((img_width + left + right) / snap_multiple) * snap_multiple
new_height = round((img_height + up + down) / snap_multiple) * snap_multiple
height_addition = max(new_height - img_height, 0)
width_addition = max(new_width - img_width, 0)
up = int(round(height_addition * up_pct))
down = int(round(height_addition * dwn_pct))
left = int(round(width_addition * lft_pct))
right = int(round(width_addition * rgt_pct))
expanded_image = Image.new(
"RGB", (img.width + left + right, img.height + up + down), (0, 0, 0)
return up, down, left, right, new_width, new_height
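
A quick worked example of the snap-to-multiple-of-8 math above, for a 512x512 image extended 101px to the right (values recomputed inline, not imported):

```python
# Worked example of outpaint_calculations' snapping for right=101 on a 512x512 image.
img_width, img_height = 512, 512
up, down, left, right, snap_multiple = 0, 0, 0, 101, 8

new_width = round((img_width + left + right) / snap_multiple) * snap_multiple  # round(613/8)*8 = 616
new_height = round((img_height + up + down) / snap_multiple) * snap_multiple   # 512, already a multiple of 8
width_addition = max(new_width - img_width, 0)                                 # 104
rgt_pct = right / (left + right) if left + right else 0                        # 1.0 -> all of it goes right
print(new_width, new_height, int(round(width_addition * rgt_pct)))             # 616 512 104
```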
def prepare_tensor_for_outpaint(
img, mask=None, up=None, down=None, left=None, right=None, _all=0, snap_multiple=8
):
up, down, left, right, new_width, new_height = outpaint_calculations(
img_width=img.shape[2],
img_height=img.shape[1],
up=up,
down=down,
left=left,
right=right,
_all=_all,
snap_multiple=snap_multiple,
)
expanded_image.paste(img, (left, up))
def resize(img_t, h, w):
new_size = (img_t.shape[0], h, w)
return nn.functional.interpolate(img_t, size=new_size, mode="nearest")
def paste(dst, src, y, x):
dst[:, y : y + src.shape[1], x : x + src.shape[2]] = src
expanded_img = torch.zeros(
img.shape[0], img.shape[1] + up + down, img.shape[2] + left + right
)
expanded_img[:, up : up + img.shape[1], left : left + img.shape[2]] = img
# extend border pixels outward; this helps prevent lines at the boundary, since reducing masks to the
# 64x64 latent space can introduce some inaccuracies
if up > 0:
top_row = img[:, 0, :]
paste(expanded_img, resize(top_row, h=up, w=expanded_img.shape[2]), y=0, x=0)
paste(expanded_img, resize(top_row, h=up, w=img.shape[2]), y=0, x=left)
if down > 0:
bottom_row = img[:, -1, :]
paste(
expanded_img,
resize(bottom_row, h=down, w=expanded_img.shape[2]),
y=expanded_img.shape[1] - down,
x=0,
)
paste(
expanded_img,
resize(bottom_row, h=down, w=img.shape[2]),
y=expanded_img.shape[1] - down,
x=left,
)
if left > 0:
left_column = img[:, :, 0]
paste(
expanded_img, resize(left_column, h=expanded_img.shape[1], w=left), y=0, x=0
)
paste(expanded_img, resize(left_column, h=img.shape[1], w=left), y=up, x=0)
if right > 0:
right_column = img[:, :, -1]
paste(
expanded_img,
resize(right_column, h=expanded_img.shape[1], w=right),
y=0,
x=expanded_img.shape[2] - right,
)
paste(
expanded_img,
resize(right_column, h=img.shape[1], w=right),
y=up,
x=expanded_img.shape[2] - right,
)
# create a mask for the new boundaries
expanded_mask = torch.zeros_like(expanded_img)
if mask is None:
# set to black
expanded_mask[:, up : up + img.shape[1], left : left + img.shape[2]] = 1
else:
expanded_mask[:, up : up + mask.shape[1], left : left + mask.shape[2]] = mask
return expanded_img, expanded_mask
def prepare_image_for_outpaint(
img, mask=None, up=None, down=None, left=None, right=None, _all=0, snap_multiple=8
):
up, down, left, right, new_width, new_height = outpaint_calculations(
img_width=img.width,
img_height=img.height,
up=up,
down=down,
left=left,
right=right,
_all=_all,
snap_multiple=snap_multiple,
)
ran_img_t = torch.randn((1, 3, new_height, new_width), device="cpu")
expanded_image = torch_img_to_pillow_img(ran_img_t)
# expanded_image = Image.new(
# "RGB", (img.width + left + right, img.height + up + down), (0, 0, 0)
# )
expanded_image.paste(img, (left, up))
# extend border pixels outward; this helps prevent lines at the boundary, since reducing masks to the
# 64x64 latent space can introduce some inaccuracies
alpha = 20
if up > 0:
top_row = img.crop((0, 0, img.width, 1))
top_row.putalpha(alpha)
expanded_image.paste(
img.crop((0, 0, img.width, 1)).resize((expanded_image.width, up)),
top_row.resize((expanded_image.width, up)),
(0, 0),
)
expanded_image.paste(
img.crop((0, 0, img.width, 1)).resize((img.width, up)),
top_row.resize((img.width, up)),
(left, 0),
)
if down > 0:
bottom_row = img.crop((0, img.height - 1, img.width, img.height))
bottom_row.putalpha(alpha)
expanded_image.paste(
img.crop((0, img.height - 1, img.width, img.height)).resize(
(expanded_image.width, down)
),
bottom_row.resize((expanded_image.width, down)),
(0, expanded_image.height - down),
)
expanded_image.paste(
img.crop((0, img.height - 1, img.width, img.height)).resize(
(img.width, down)
),
bottom_row.resize((img.width, down)),
(left, expanded_image.height - down),
)
if left > 0:
left_column = img.crop((0, 0, 1, img.height))
left_column.putalpha(alpha)
expanded_image.paste(
img.crop((0, 0, 1, img.height)).resize((left, expanded_image.height)),
left_column.resize((left, expanded_image.height)),
(0, 0),
)
expanded_image.paste(
img.crop((0, 0, 1, img.height)).resize((left, img.height)),
left_column.resize((left, img.height)),
(0, up),
)
if right > 0:
right_column = img.crop((img.width - 1, 0, img.width, img.height))
right_column.putalpha(alpha)
expanded_image.paste(
img.crop((img.width - 1, 0, img.width, img.height)).resize(
(right, expanded_image.height)
),
right_column.resize((right, expanded_image.height)),
(expanded_image.width - right, 0),
)
expanded_image.paste(

@ -238,7 +238,7 @@ class ImaginePrompt:
"negative_prompt": negative_prompts,
"init_image": str(self.init_image),
"init_image_strength": self.init_image_strength,
"seed": self.seed,
# "seed": self.seed,
"steps": self.steps,
"height": self.height,
"width": self.width,
