fix: improved large images using composition

Bryce 2023-02-28 20:54:26 -08:00 committed by Bryce Drennan
parent a449fbd5e2
commit 3b777b98d8
4 changed files with 120 additions and 26 deletions
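
In short: instead of denoising a full-resolution canvas from scratch, the new path first renders a small "composition" image (long side capped at 512px), upscales it, and re-encodes it as the init latent for a shortened final pass (t_enc moves from 0.75 to 0.65 of the step count). A rough sketch of the sizing logic follows; composition_size is a hypothetical helper, and the body of calc_scale_to_fit_within is assumed from the single line of it visible in the diff.

# Hypothetical sketch of the composition sizing. Only the final
# "return max_size / height" of calc_scale_to_fit_within appears in the
# diff below; the rest of its body is assumed here.
CUTOFF = 512  # prompts at or below this size skip the composition pass

def calc_scale_to_fit_within(height, width, max_size):
    if width <= max_size and height <= max_size:
        return 1.0
    if width > height:
        return max_size / width
    return max_size / height

def composition_size(width, height):
    if width <= CUTOFF and height <= CUTOFF:
        return None  # small enough to generate directly
    scale = calc_scale_to_fit_within(height=height, width=width, max_size=CUTOFF)
    return int(width * scale), int(height * scale)

print(composition_size(1920, 1080))  # -> (512, 288), matching the new 1920x1080 test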

View File

@@ -478,28 +478,28 @@ def _generate_single_image(
             and not is_controlnet_model
             and not model.cond_stage_key == "edit"
         ):
-            comp_samples = _generate_composition_latent(
-                sampler=sampler,
-                sampler_kwargs={
-                    "num_steps": prompt.steps,
-                    "noise": noise,
-                    "positive_conditioning": positive_conditioning,
-                    "neutral_conditioning": neutral_conditioning,
-                    "guidance_scale": prompt.prompt_strength,
-                    "t_start": t_enc,
-                    "mask": mask_latent,
-                    "orig_latent": init_latent,
-                    "shape": shape,
-                    "batch_size": 1,
-                    "denoiser_cls": denoiser_cls,
-                },
-            )
-            if comp_samples is not None:
-                result_images["composition"] = comp_samples
-                noise = noise[:, :, : comp_samples.shape[2], : comp_samples.shape[3]]
-                t_enc = int(prompt.steps * 0.75)
-                log_latent(comp_samples, "comp_samples")
-                init_latent = comp_samples
+            if prompt.init_image:
+                comp_image = _generate_composition_image(
+                    prompt=prompt,
+                    target_height=init_image.height,
+                    target_width=init_image.width,
+                )
+            else:
+                comp_image = _generate_composition_image(
+                    prompt=prompt,
+                    target_height=prompt.height,
+                    target_width=prompt.width,
+                )
+            if comp_image is not None:
+                result_images["composition"] = comp_image
+                # noise = noise[:, :, : comp_image.height, : comp_image.shape[3]]
+                t_enc = int(prompt.steps * 0.65)
+                log_img(comp_image, "comp_image")
+                comp_image_t = pillow_img_to_torch_image(comp_image)
+                comp_image_t = comp_image_t.to(get_device())
+                init_latent = model.get_first_stage_encoding(
+                    model.encode_first_stage(comp_image_t)
+                )
     with lc.timing("sampling"):
         samples = sampler.sample(
             num_steps=prompt.steps,
@@ -611,7 +611,68 @@ def calc_scale_to_fit_within(
     return max_size / height
 
 
-def _generate_composition_latent(
+def _scale_latent(
+    latent,
+    model,
+    h,
+    w,
+):
+    from torch.nn import functional as F
+
+    # convert to non-latent-space first
+    img = model.decode_first_stage(latent)
+    img = F.interpolate(img, size=(h, w), mode="bicubic", align_corners=False)
+    latent = model.get_first_stage_encoding(model.encode_first_stage(img))
+    return latent
+
+
+def _generate_composition_image(prompt, target_height, target_width):
+    from copy import copy
+
+    from PIL import Image
+
+    cutoff = 512
+
+    if prompt.width <= cutoff and prompt.height <= cutoff:
+        return None
+
+    composition_prompt = copy(prompt)
+    shrink_scale = calc_scale_to_fit_within(
+        height=prompt.height,
+        width=prompt.width,
+        max_size=cutoff,
+    )
+    composition_prompt.width = int(prompt.width * shrink_scale)
+    composition_prompt.height = int(prompt.height * shrink_scale)
+    composition_prompt.steps = None
+    composition_prompt.upscaled = False
+    composition_prompt.fix_faces = False
+    composition_prompt.mask_modify_original = False
+
+    composition_prompt.validate()
+
+    result = _generate_single_image(composition_prompt)
+    img = result.images["generated"]
+    while img.width < target_width:
+        from imaginairy.enhancers.upscale_realesrgan import upscale_image
+
+        img = upscale_image(img)
+
+    # samples = _generate_single_image(composition_prompt, return_latent=True)
+    # while samples.shape[-1] * 8 < target_width:
+    #     samples = upscale_latent(samples)
+    #
+    # img = model_latent_to_pillow_img(samples)
+
+    img = img.resize(
+        (target_width, target_height),
+        resample=Image.Resampling.LANCZOS,
+    )
+
+    return img
+
+
+def _generate_composition_latentz(
     sampler,
     sampler_kwargs,
 ):
@@ -644,9 +705,16 @@ def _generate_composition_latent(
         new_kwargs["positive_conditioning"],
         new_kwargs["neutral_conditioning"],
     ]:
+        print(cond["c_concat"])
+        for c in cond["c_concat"]:
+            print(f"downscaling {c.shape} ")
         cond["c_concat"] = [
-            F.interpolate(c, size=new_shape[2:], mode="area") for c in cond["c_concat"]
+            _scale_latent(
+                latent=c, model=sampler.model, h=new_shape[2] * 8, w=new_shape[3] * 8
+            )
+            for c in cond["c_concat"]
         ]
+        print(cond["c_concat"])
 
     mask_latent = new_kwargs["mask"]
     if mask_latent is not None:
@@ -654,7 +722,13 @@ def _generate_composition_latent(
     orig_latent = new_kwargs["orig_latent"]
     if orig_latent is not None:
-        orig_latent = F.interpolate(orig_latent, size=new_shape[2:], mode="area")
+        orig_latent = _scale_latent(
+            latent=orig_latent,
+            model=sampler.model,
+            h=new_shape[2] * 8,
+            w=new_shape[3] * 8,
+        )
 
     t_start = new_kwargs["t_start"]
     if t_start is not None:
         gen_strength = new_kwargs["t_start"] / new_kwargs["num_steps"]
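
Two details worth noting from the api changes above. First, _scale_latent resizes a latent by decoding to pixel space, bicubic-resizing, and re-encoding, replacing the previous direct F.interpolate(..., mode="area") on the latent tensor. Second, _generate_composition_image repeatedly upscales the small render until it reaches the target width, then resizes down to the exact requested size. A minimal stand-in for that second step (pure PIL so it stays runnable; the real code uses Real-ESRGAN via upscale_image, which as far as I know enlarges 4x per pass):

# Pure-PIL stand-in for the upscale-then-exact-resize step; the plain
# 4x resize here is a placeholder for the Real-ESRGAN upscaler.
from PIL import Image

img = Image.new("RGB", (512, 288))  # the small composition render
target_w, target_h = 1920, 1080

while img.width < target_w:
    img = img.resize((img.width * 4, img.height * 4), Image.Resampling.NEAREST)

# overshoot, then land exactly on the requested size
img = img.resize((target_w, target_h), Image.Resampling.LANCZOS)
print(img.size)  # (1920, 1080)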

View File

@@ -97,7 +97,11 @@ class KDiffusionSampler(ImageSampler, ABC):
         sigmas = self.cv_denoiser.get_sigmas(num_steps)[t_start:]
 
         # see https://github.com/crowsonkb/k-diffusion/issues/43#issuecomment-1305195666
-        if self.short_name in (SamplerName.K_DPM_2, SamplerName.K_DPMPP_2M, SamplerName.K_DPM_2_ANCESTRAL):
+        if self.short_name in (
+            SamplerName.K_DPM_2,
+            SamplerName.K_DPMPP_2M,
+            SamplerName.K_DPM_2_ANCESTRAL,
+        ):
             sigmas = torch.cat([sigmas[:-2], sigmas[-1:]])
 
         # if our number of steps is zero, just return the initial latent
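
The sigma-schedule workaround above (from the linked k-diffusion issue) trims the second-to-last sigma for the DPM-2-family samplers, which reportedly misbehave on the final step of the default schedule. A toy illustration with made-up sigma values:

# Made-up sigma values; only the trimming logic mirrors the diff above.
import torch

sigmas = torch.tensor([14.6, 7.0, 3.1, 1.2, 0.4, 0.0])
trimmed = torch.cat([sigmas[:-2], sigmas[-1:]])
print(trimmed)  # -> the 0.4 entry is dropped; the final 0.0 is kept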

Binary file not shown (image added; 2.1 MiB).

View File

@@ -333,3 +333,19 @@ def test_controlnet(filename_base_for_outputs, control_mode):
     img_path = f"{filename_base_for_outputs}.png"
     assert_image_similar_to_expectation(result.img, img_path=img_path, threshold=24000)
+
+
+@pytest.mark.skipif(get_device() == "cpu", reason="Too slow to run on CPU")
+def test_large_image(filename_base_for_outputs):
+    prompt_text = "a stormy ocean. oil painting"
+    prompt = ImaginePrompt(
+        prompt_text,
+        width=1920,
+        height=1080,
+        steps=15,
+        seed=0,
+    )
+    result = next(imagine(prompt))
+
+    img_path = f"{filename_base_for_outputs}.png"
+    assert_image_similar_to_expectation(result.img, img_path=img_path, threshold=24000)