@@ -5,7 +5,9 @@ import numpy as np
 import torch
 from tqdm import tqdm
 
+from imaginairy.img_log import log_latent
 from imaginairy.modules.diffusion.util import (
+    extract_into_tensor,
     make_ddim_sampling_parameters,
     make_ddim_timesteps,
     noise_like,
@@ -172,7 +174,7 @@ class PLMSSampler:
             img = torch.randn(shape, device="cpu").to(device)
         else:
             img = x_T
-
+        log_latent(img, "initial img")
         if timesteps is None:
             timesteps = (
                 self.ddpm_num_timesteps
@@ -217,7 +219,7 @@ class PLMSSampler:
                 ) # TODO: deterministic forward pass?
                 img = img_orig * mask + (1.0 - mask) * img
 
-            outs = self.p_sample_plms(
+            img, pred_x0, e_t = self.p_sample_plms(
                 img,
                 cond,
                 ts,
@@ -233,7 +235,6 @@ class PLMSSampler:
                 old_eps=old_eps,
                 t_next=ts_next,
             )
-            img, pred_x0, e_t = outs
             old_eps.append(e_t)
             if len(old_eps) >= 4:
                 old_eps.pop(0)
@@ -277,7 +278,11 @@ class PLMSSampler:
                 t_in = torch.cat([t] * 2)
                 c_in = torch.cat([unconditional_conditioning, c])
                 e_t_uncond, e_t = self.model.apply_model(x_in, t_in, c_in).chunk(2)
+                log_latent(e_t_uncond, "noise pred uncond")
+                log_latent(e_t, "noise pred cond")
+
                 e_t = e_t_uncond + unconditional_guidance_scale * (e_t - e_t_uncond)
+                log_latent(e_t, "noise pred combined")
 
             if score_corrector is not None:
                 assert self.model.parameterization == "eps"
@@ -326,6 +331,7 @@ class PLMSSampler:
             return x_prev, pred_x0
 
         e_t = get_model_output(x, t)
         if len(old_eps) == 0:
             # Pseudo Improved Euler (2nd order)
             x_prev, pred_x0 = get_x_prev_and_pred_x0(e_t, index)
@@ -344,5 +350,97 @@ class PLMSSampler:
             ) / 24
 
         x_prev, pred_x0 = get_x_prev_and_pred_x0(e_t_prime, index)
+        log_latent(x_prev, "x_prev")
+        log_latent(pred_x0, "pred_x0")
 
         return x_prev, pred_x0, e_t
+
+    @torch.no_grad()
+    def stochastic_encode(self, init_latent, t, noise=None):
+        # fast, but does not allow for exact reconstruction
+        # t serves as an index to gather the correct alphas
+        sqrt_alphas_cumprod = torch.sqrt(self.ddim_alphas)
+        sqrt_one_minus_alphas_cumprod = self.ddim_sqrt_one_minus_alphas
+
+        if noise is None:
+            noise = torch.randn_like(init_latent, device="cpu").to(get_device())
+        return (
+            extract_into_tensor(sqrt_alphas_cumprod, t, init_latent.shape) * init_latent
+            + extract_into_tensor(sqrt_one_minus_alphas_cumprod, t, init_latent.shape)
+            * noise
+        )
+
+    @torch.no_grad()
+    def decode(
+        self,
+        x_latent,
+        cond,
+        t_start,
+        unconditional_guidance_scale=1.0,
+        unconditional_conditioning=None,
+        img_callback=None,
+        score_corrector=None,
+        temperature=1.0,
+        mask=None,
+        orig_latent=None,
+    ):
+
+        timesteps = self.ddim_timesteps[:t_start]
+
+        time_range = np.flip(timesteps)
+        total_steps = timesteps.shape[0]
+
+        iterator = tqdm(time_range, desc="PLMS altering image", total=total_steps)
+        x_dec = x_latent
+        old_eps = []
+
+        for i, step in enumerate(iterator):
+            index = total_steps - i - 1
+            ts = torch.full(
+                (x_latent.shape[0],), step, device=x_latent.device, dtype=torch.long
+            )
+            ts_next = torch.full(
+                (x_latent.shape[0],),
+                time_range[min(i + 1, len(time_range) - 1)],
+                device=x_latent.device,
+                dtype=torch.long,
+            )
+
+            if mask is not None:
+                assert orig_latent is not None
+                xdec_orig = self.model.q_sample(orig_latent, ts)
+                log_latent(xdec_orig, "xdec_orig")
+                log_latent(xdec_orig * mask, "masked_xdec_orig")
+                x_dec = xdec_orig * mask + (1.0 - mask) * x_dec
+                log_latent(x_dec, "x_dec")
+
+            x_dec, pred_x0, e_t = self.p_sample_plms(
+                x_dec,
+                cond,
+                ts,
+                index=index,
+                unconditional_guidance_scale=unconditional_guidance_scale,
+                unconditional_conditioning=unconditional_conditioning,
+                temperature=temperature,
+                old_eps=old_eps,
+                t_next=ts_next,
+            )
+            # original_loss = ((x_dec - x_latent).abs().mean()*70)
+            # sigma_t = torch.full((1, 1, 1, 1), self.ddim_sigmas[index], device=get_device())
+            # x_dec = x_dec.detach() + (original_loss * 0.1) ** 2
+            # cond_grad = -torch.autograd.grad(original_loss, x_dec)[0]
+            # x_dec = x_dec.detach() + cond_grad * sigma_t ** 2
+            ## x_dec_alt = x_dec + (original_loss * 0.1) ** 2
+
+            old_eps.append(e_t)
+            if len(old_eps) >= 4:
+                old_eps.pop(0)
+
+            if img_callback:
+                img_callback(x_dec, "x_dec")
+                img_callback(pred_x0, "pred_x0")
+
+            log_latent(x_dec, f"x_dec {i}")
+            log_latent(pred_x0, f"pred_x0 {i}")
+        return x_dec
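
Note: the new stochastic_encode method above is the closed-form forward-diffusion step
x_t = sqrt(alpha_bar_t) * x_0 + sqrt(1 - alpha_bar_t) * noise, evaluated over the sampler's
DDIM alpha schedule, and decode then walks the PLMS update loop back down from t_start under
the prompt conditioning. The snippet below is a minimal, self-contained sketch of that noising
formula only; the schedule length, latent shape, and timestep are illustrative assumptions,
not values taken from this diff.

import torch

# Illustrative linear-beta schedule; in the sampler the values come from the
# precomputed ddim_alphas / ddim_sqrt_one_minus_alphas buffers.
num_timesteps = 50
betas = torch.linspace(1e-4, 2e-2, num_timesteps)
alphas_cumprod = torch.cumprod(1.0 - betas, dim=0)

def stochastic_encode_sketch(x0, t, noise=None):
    # x_t = sqrt(alpha_bar_t) * x_0 + sqrt(1 - alpha_bar_t) * noise
    if noise is None:
        noise = torch.randn_like(x0)
    a_bar_t = alphas_cumprod[t].view(-1, 1, 1, 1)
    return a_bar_t.sqrt() * x0 + (1.0 - a_bar_t).sqrt() * noise

init_latent = torch.randn(1, 4, 64, 64)  # stand-in for an encoded image latent
t = torch.tensor([30])                   # how far into the schedule to noise
noisy_latent = stochastic_encode_sketch(init_latent, t)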