fix: improved large images using composition

2024-10-31 03:20:40 +00:00 · 2023-02-28 20:54:26 -08:00 · 2023-02-28 20:54:26 -08:00 · 3b777b98d8
commit 3b777b98d8
parent a449fbd5e2
4 changed files with 120 additions and 26 deletions
--- a/imaginairy/api.py
+++ b/imaginairy/api.py
@ -478,28 +478,28 @@ def _generate_single_image(
            and not is_controlnet_model
            and not model.cond_stage_key == "edit"
        ):
-            comp_samples = _generate_composition_latent(
-                sampler=sampler,
-                sampler_kwargs={
-                    "num_steps": prompt.steps,
-                    "noise": noise,
-                    "positive_conditioning": positive_conditioning,
-                    "neutral_conditioning": neutral_conditioning,
-                    "guidance_scale": prompt.prompt_strength,
-                    "t_start": t_enc,
-                    "mask": mask_latent,
-                    "orig_latent": init_latent,
-                    "shape": shape,
-                    "batch_size": 1,
-                    "denoiser_cls": denoiser_cls,
-                },
-            )
-            if comp_samples is not None:
-                result_images["composition"] = comp_samples
-                noise = noise[:, :, : comp_samples.shape[2], : comp_samples.shape[3]]
-                t_enc = int(prompt.steps * 0.75)
-                log_latent(comp_samples, "comp_samples")
-                init_latent = comp_samples
+            if prompt.init_image:
+                comp_image = _generate_composition_image(
+                    prompt=prompt,
+                    target_height=init_image.height,
+                    target_width=init_image.width,
+                )
+            else:
+                comp_image = _generate_composition_image(
+                    prompt=prompt,
+                    target_height=prompt.height,
+                    target_width=prompt.width,
+                )
+            if comp_image is not None:
+                result_images["composition"] = comp_image
+                # noise = noise[:, :, : comp_image.height, : comp_image.shape[3]]
+                t_enc = int(prompt.steps * 0.65)
+                log_img(comp_image, "comp_image")
+                comp_image_t = pillow_img_to_torch_image(comp_image)
+                comp_image_t = comp_image_t.to(get_device())
+                init_latent = model.get_first_stage_encoding(
+                    model.encode_first_stage(comp_image_t)
+                )
        with lc.timing("sampling"):
            samples = sampler.sample(
                num_steps=prompt.steps,
@ -611,7 +611,68 @@ def calc_scale_to_fit_within(
    return max_size / height


-def _generate_composition_latent(
+def _scale_latent(latent, model, h, w):
+    """Rescale ``latent`` by decoding, resizing in image space, and re-encoding.
+
+    ``h`` and ``w`` are the size passed to the bicubic interpolation of the
+    decoded tensor; the re-encoded latent is returned.
+    """
+    from torch.nn import functional
+
+    decoded = model.decode_first_stage(latent)
+    resized = functional.interpolate(
+        decoded, size=(h, w), mode="bicubic", align_corners=False
+    )
+    return model.get_first_stage_encoding(model.encode_first_stage(resized))
+
+
+def _generate_composition_image(prompt, target_height, target_width):
+    """Render a small composition of ``prompt`` and enlarge it to the target size.
+
+    Returns ``None`` when both prompt dimensions are at or below the cutoff
+    (512). Otherwise a copy of ``prompt`` is shrunk to fit within the cutoff,
+    rendered with ``_generate_single_image``, enlarged with ``upscale_image``
+    and finally resized to exactly ``(target_width, target_height)``.
+    """
+    from copy import copy
+
+    from PIL import Image
+
+    cutoff = 512
+    if prompt.width <= cutoff and prompt.height <= cutoff:
+        return None
+
+    from imaginairy.enhancers.upscale_realesrgan import upscale_image
+
+    composition_prompt = copy(prompt)
+    shrink_scale = calc_scale_to_fit_within(
+        height=prompt.height,
+        width=prompt.width,
+        max_size=cutoff,
+    )
+    composition_prompt.width = int(prompt.width * shrink_scale)
+    composition_prompt.height = int(prompt.height * shrink_scale)
+
+    # turn off the post-processing flags for the intermediate render; steps
+    # is reset to None, presumably so validate() picks a default (confirm)
+    composition_prompt.steps = None
+    composition_prompt.upscaled = False
+    composition_prompt.fix_faces = False
+    composition_prompt.mask_modify_original = False
+    composition_prompt.validate()
+
+    img = _generate_single_image(composition_prompt).images["generated"]
+    # enlarge until BOTH dimensions reach the target so the final resize
+    # below never has to stretch the image upward
+    while img.width < target_width or img.height < target_height:
+        img = upscale_image(img)
+
+    return img.resize(
+        (target_width, target_height), resample=Image.Resampling.LANCZOS
+    )
+
+
+def _generate_composition_latentz(
    sampler,
    sampler_kwargs,
 ):
@ -644,9 +705,16 @@ def _generate_composition_latent(
        new_kwargs["positive_conditioning"],
        new_kwargs["neutral_conditioning"],
    ]:
+        print(cond["c_concat"])
+        for c in cond["c_concat"]:
+            print(f"downscaling {c.shape} ")
        cond["c_concat"] = [
-            F.interpolate(c, size=new_shape[2:], mode="area") for c in cond["c_concat"]
+            _scale_latent(
+                latent=c, model=sampler.model, h=new_shape[2] * 8, w=new_shape[3] * 8
+            )
+            for c in cond["c_concat"]
        ]
+        print(cond["c_concat"])

    mask_latent = new_kwargs["mask"]
    if mask_latent is not None:
@ -654,7 +722,13 @@ def _generate_composition_latent(

    orig_latent = new_kwargs["orig_latent"]
    if orig_latent is not None:
-        orig_latent = F.interpolate(orig_latent, size=new_shape[2:], mode="area")
+        orig_latent = _scale_latent(
+            latent=orig_latent,
+            model=sampler.model,
+            h=new_shape[2] * 8,
+            w=new_shape[3] * 8,
+        )
+
    t_start = new_kwargs["t_start"]
    if t_start is not None:
        gen_strength = new_kwargs["t_start"] / new_kwargs["num_steps"]
--- a/imaginairy/samplers/kdiff.py
+++ b/imaginairy/samplers/kdiff.py
@ -97,7 +97,11 @@ class KDiffusionSampler(ImageSampler, ABC):
        sigmas = self.cv_denoiser.get_sigmas(num_steps)[t_start:]

        # see https://github.com/crowsonkb/k-diffusion/issues/43#issuecomment-1305195666
-        if self.short_name in (SamplerName.K_DPM_2, SamplerName.K_DPMPP_2M, SamplerName.K_DPM_2_ANCESTRAL):
+        if self.short_name in (
+            SamplerName.K_DPM_2,
+            SamplerName.K_DPMPP_2M,
+            SamplerName.K_DPM_2_ANCESTRAL,
+        ):
            sigmas = torch.cat([sigmas[:-2], sigmas[-1:]])

        # if our number of steps is zero, just return the initial latent
--- a/tests/expected_output/test_large_image_.png
+++ b/tests/expected_output/test_large_image_.png
--- a/tests/test_api.py
+++ b/tests/test_api.py
@ -333,3 +333,19 @@ def test_controlnet(filename_base_for_outputs, control_mode):

    img_path = f"{filename_base_for_outputs}.png"
    assert_image_similar_to_expectation(result.img, img_path=img_path, threshold=24000)
+
+
+@pytest.mark.skipif(get_device() == "cpu", reason="Too slow to run on CPU")
+def test_large_image(filename_base_for_outputs):
+    """Render a 1920x1080 prompt and compare it with the stored expectation."""
+    large_prompt = ImaginePrompt(
+        "a stormy ocean. oil painting",
+        width=1920,
+        height=1080,
+        steps=15,
+        seed=0,
+    )
+    first_result = next(imagine(large_prompt))
+    assert_image_similar_to_expectation(
+        first_result.img, img_path=f"{filename_base_for_outputs}.png", threshold=24000
+    )