feature: (wip) better image to image

I tried it with the DDIM sampler and it didn't work.

Probably need to use the k-diffusion sampler with it, from a846393251/find_noise.py.

needs https://github.com/crowsonkb/k-diffusion
pull/1/head
Bryce 2 years ago
parent 438c2868ad
commit 84a73cb5a2

@ -4,6 +4,7 @@ AI imagined images.
"just works" on Linux and OSX(M1).
## Examples
```bash
>> pip install imaginairy
>> imagine "a scenic landscape" "a photo of a dog" "photo of a fruit bowl" "portrait photo of a freckled woman"
@ -27,7 +28,7 @@ Generating 🖼 : "portrait photo of a freckled woman" 512x512px seed:500686645
<img src="assets/000056_293284644_PLMS40_PS7.5_photo_of_a_bowl_of_fruit.jpg" width="256" height="256">
<img src="assets/000078_260972468_PLMS40_PS7.5_portrait_photo_of_a_freckled_woman.jpg" width="256" height="256">
# Features
## Features
- It makes images from text descriptions!
- Generate images either in code or from command line.
@ -36,7 +37,7 @@ Generating 🖼 : "portrait photo of a freckled woman" 512x512px seed:500686645
- WeightedPrompts let you smash together separate prompts (cat-dog)
- Tile Mode creates tileable images
# How To
## How To
```python
from imaginairy import imagine_images, imagine_image_files, ImaginePrompt, WeightedPrompt
@ -79,13 +80,17 @@ OR
- https://laion.ai/blog/laion-5b/
# Todo
- performance optimizations
- https://github.com/huggingface/diffusers/blob/main/docs/source/optimization/fp16.mdx
- https://github.com/neonsecret/stable-diffusion
- ✅ https://github.com/CompVis/stable-diffusion/compare/main...Doggettx:stable-diffusion:autocast-improvements#
- ✅ https://www.reddit.com/r/StableDiffusion/comments/xalaws/test_update_for_less_memory_usage_and_higher/
- deploy to pypi
- add tests
- set up ci (test/lint/format)
- add docs
- notify https://github.com/CompVis/stable-diffusion/issues/25
- remove yaml config
- performance optimizations https://github.com/huggingface/diffusers/blob/main/docs/source/optimization/fp16.mdx
- delete more unused code
- Interface improvements
- init-image at command line
@ -93,7 +98,9 @@ OR
- webserver interface (low priority, this is a library)
- Image Generation Features
- upscaling
- https://github.com/lowfuel/progrock-stable
- face improvements
- codeformer
- image describe feature - https://replicate.com/methexis-inc/img2prompt
- outpainting
- inpainting
@ -110,7 +117,9 @@ OR
- tiling
- output show-work videos
- image variations https://github.com/lstein/stable-diffusion/blob/main/VARIATIONS.md
- textual inversion https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/sd_textual_inversion_training.ipynb#scrollTo=50JuJUM8EG1h
- textual inversion
- https://www.reddit.com/r/StableDiffusion/comments/xbwb5y/how_to_run_textual_inversion_locally_train_your/
- https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/sd_textual_inversion_training.ipynb#scrollTo=50JuJUM8EG1h
- zooming videos? a la disco diffusion
- fix saturation at high CFG https://www.reddit.com/r/StableDiffusion/comments/xalo78/fixing_excessive_contrastsaturation_resulting/

@ -6,30 +6,30 @@ from contextlib import nullcontext
from functools import lru_cache
import numpy as np
import PIL
import torch
import torch.nn
from einops import rearrange
from omegaconf import OmegaConf
from PIL import Image
from PIL import Image, ImageDraw
from pytorch_lightning import seed_everything
from torch import autocast
from transformers import cached_path
from imaginairy.modules.diffusion.ddim import DDIMSampler
from imaginairy.modules.diffusion.plms import PLMSSampler
from imaginairy.modules.find_noise import find_noise_for_latent
from imaginairy.safety import is_nsfw
from imaginairy.schema import ImaginePrompt, ImagineResult
from imaginairy.utils import (
fix_torch_nn_layer_norm,
get_device,
img_path_to_torch_image,
instantiate_from_config,
)
LIB_PATH = os.path.dirname(__file__)
logger = logging.getLogger(__name__)
# leave undocumented. I'd ask that no one publicize this flag
IMAGINAIRY_ALLOW_NSFW = os.getenv("IMAGINAIRY_ALLOW_NSFW", "False")
IMAGINAIRY_ALLOW_NSFW = bool(IMAGINAIRY_ALLOW_NSFW == "I AM A RESPONSIBLE ADULT")
@ -56,20 +56,6 @@ def load_model_from_config(config):
return model
def load_img(path, max_height=512, max_width=512):
image = Image.open(path).convert("RGB")
w, h = image.size
logger.info(f"loaded input image of size ({w}, {h}) from {path}")
resize_ratio = min(max_width / w, max_height / h)
w, h = int(w * resize_ratio), int(h * resize_ratio)
w, h = map(lambda x: x - x % 64, (w, h))  # round down to integer multiple of 64
image = image.resize((w, h), resample=PIL.Image.LANCZOS)
image = np.array(image).astype(np.float32) / 255.0
image = image[None].transpose(0, 3, 1, 2)
image = torch.from_numpy(image)
return 2.0 * image - 1.0, w, h
def patch_conv(**patch):
"""https://github.com/replicate/cog-stable-diffusion/compare/main...TomMoore515:material_stable_diffusion:main"""
cls = torch.nn.Conv2d
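For context on the tile-mode trick the docstring above links to: the idea is to monkey-patch `torch.nn.Conv2d` so that every convolution uses circular padding, which makes the decoded image wrap seamlessly at its edges. A minimal sketch of that idea (not the exact patch used here):

```python
import contextlib
import torch


@contextlib.contextmanager
def circular_conv2d():
    """Temporarily force newly constructed Conv2d layers to use circular padding."""
    cls = torch.nn.Conv2d
    orig_init = cls.__init__

    def patched_init(self, *args, **kwargs):
        kwargs["padding_mode"] = "circular"  # wrap-around padding -> tileable output
        orig_init(self, *args, **kwargs)

    cls.__init__ = patched_init
    try:
        yield
    finally:
        cls.__init__ = orig_init
```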
@ -115,7 +101,7 @@ def imagine_image_files(
if output_file_extension not in {"jpg", "png"}:
raise ValueError("Must output a png or jpg")
def _record_steps(samples, i, model, prompt):
def _record_steps(samples, description, model, prompt):
nonlocal step_count
step_count += 1
samples = model.decode_first_stage(samples)
@ -125,9 +111,10 @@ def imagine_image_files(
for pred_x0 in samples:
pred_x0 = 255.0 * rearrange(pred_x0.cpu().numpy(), "c h w -> h w c")
filename = f"{base_count:08}_S{prompt.seed}_step{step_count:04}.jpg"
Image.fromarray(pred_x0.astype(np.uint8)).save(
os.path.join(steps_path, filename)
)
img = Image.fromarray(pred_x0.astype(np.uint8))
draw = ImageDraw.Draw(img)
draw.text((10, 10), str(description))
img.save(os.path.join(steps_path, filename))
img_callback = _record_steps if record_step_images else None
for result in imagine_images(
@ -190,10 +177,10 @@ def imagine_images(
for wp in prompt.prompts
]
)
if img_callback:
def _img_callback(samples, i):
img_callback(samples, i, model, prompt)
def _img_callback(samples, description):
if img_callback:
img_callback(samples, description, model, prompt)
shape = [
latent_channels,
@ -209,19 +196,18 @@ def imagine_images(
sampler.make_schedule(ddim_num_steps=ddim_steps, ddim_eta=ddim_eta)
t_enc = int(generation_strength * ddim_steps)
init_image, w, h = load_img(prompt.init_image)
init_image, w, h = img_path_to_torch_image(prompt.init_image)
init_image = init_image.to(get_device())
init_latent = model.encode_first_stage(init_image)
noised_init_latent = model.get_first_stage_encoding(init_latent)
_img_callback(init_latent.mean, 0)
_img_callback(noised_init_latent, 0)
init_latent = model.get_first_stage_encoding(
model.encode_first_stage(init_image)
)
_img_callback(init_latent, "init_latent")
# encode (scaled latent)
z_enc = sampler.stochastic_encode(
noised_init_latent,
torch.tensor([t_enc]).to(get_device()),
init_latent, torch.tensor([t_enc]).to(get_device())
)
_img_callback(noised_init_latent, 0)
_img_callback(z_enc, "z_enc")
# decode it
samples = sampler.decode(
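For a sense of scale: the number of denoising steps re-run on the init image grows with the strength setting. A rough sketch of that relationship (assuming `generation_strength` comes from the prompt's `init_image_strength`):

```python
ddim_steps = 50
generation_strength = 0.3  # i.e. the prompt's init_image_strength (assumption)
t_enc = int(generation_strength * ddim_steps)  # -> 15

# stochastic_encode noises the init latent forward to step t_enc;
# sampler.decode then runs those ~15 steps back toward the prompt,
# so a low strength keeps the result close to the starting image.
print(t_enc)
```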

@ -45,6 +45,21 @@ def configure_logging(level="INFO"):
@click.command()
@click.argument("prompt_texts", nargs=-1)
@click.option(
"--prompt-strength",
default=7.5,
show_default=True,
help="How closely to follow the prompt. Image looks unnatural at higher values",
)
@click.option(
"--init-image",
help="Starting image.",
)
@click.option(
"--init-image-strength",
default=0.3,
help="Starting image.",
)
@click.option("--outdir", default="./outputs", help="where to write results to")
@click.option(
"-r",
@ -76,12 +91,6 @@ def configure_logging(level="INFO"):
type=int,
help="What seed to use for randomness. Allows reproducible image renders",
)
@click.option(
"--prompt-strength",
default=7.5,
show_default=True,
help="How closely to follow the prompt. Image looks unnatural at higher values",
)
@click.option(
"--sampler-type",
default="PLMS",
@ -109,13 +118,15 @@ def configure_logging(level="INFO"):
)
def imagine_cmd(
prompt_texts,
prompt_strength,
init_image,
init_image_strength,
outdir,
repeats,
height,
width,
steps,
seed,
prompt_strength,
sampler_type,
ddim_eta,
log_level,
@ -139,12 +150,14 @@ def imagine_cmd(
for prompt_text in prompt_texts:
prompt = ImaginePrompt(
prompt_text,
prompt_strength=prompt_strength,
init_image=init_image,
init_image_strength=init_image_strength,
seed=seed,
sampler_type=sampler_type,
steps=steps,
height=height,
width=width,
prompt_strength=prompt_strength,
upscale=False,
fix_faces=False,
)
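Once these options are wired through, an image-to-image run could be kicked off from the command line roughly like this (a sketch; the input path is a placeholder):

```bash
>> imagine "portrait photo of a freckled woman" \
    --init-image ./starting_photo.jpg \
    --init-image-strength 0.3 \
    --prompt-strength 7.5 \
    --outdir ./outputs
```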

@ -378,6 +378,6 @@ class DDIMSampler:
# x_dec = x_dec.detach() + cond_grad * sigma_t ** 2
## x_dec_alt = x_dec + (original_loss * 0.1) ** 2
if img_callback:
img_callback(x_dec, i)
img_callback(pred_x0, i)
img_callback(x_dec, f"x_dec {i}")
img_callback(pred_x0, f"pred_x0 {i}")
return x_dec

@ -0,0 +1,102 @@
"""
I tried it with the DDIM sampler and it didn't work.
Probably need to use the k-diffusion sampler with it
from https://gist.githubusercontent.com/trygvebw/c71334dd127d537a15e9d59790f7f5e1/raw/a846393251f5be8289d4febc75a19f1f962aabcc/find_noise.py
needs https://github.com/crowsonkb/k-diffusion
"""
from contextlib import nullcontext
import torch
from einops import repeat
from torch import autocast
from imaginairy.utils import get_device, pillow_img_to_torch_image
def pil_img_to_latent(model, img, batch_size=1, device="cuda", half=True):
# init_image = pil_img_to_torch(img, half=half).to(device)
init_image = pillow_img_to_torch_image(img).to(get_device())
init_image = repeat(init_image, "1 ... -> b ...", b=batch_size)
if half:
return model.get_first_stage_encoding(
model.encode_first_stage(init_image.half())
)
return model.get_first_stage_encoding(model.encode_first_stage(init_image))
def find_noise_for_image(
model, pil_img, prompt, steps=50, cond_scale=1.0, verbose=False, half=True
):
img_latent = pil_img_to_latent(
model, pil_img, batch_size=1, device="cuda", half=half
)
return find_noise_for_latent(
model,
img_latent,
prompt,
steps=steps,
cond_scale=cond_scale,
verbose=verbose,
half=half,
)
def find_noise_for_latent(
model, img_latent, prompt, steps=50, cond_scale=1.0, verbose=False, half=True
):
import k_diffusion as K
x = img_latent
_autocast = autocast if get_device() in ("cuda", "cpu") else nullcontext
with (torch.no_grad(), _autocast(get_device())):
uncond = model.get_learned_conditioning([""])
cond = model.get_learned_conditioning([prompt])
s_in = x.new_ones([x.shape[0]])
dnw = K.external.CompVisDenoiser(model)
sigmas = dnw.get_sigmas(steps).flip(0)
if verbose:
print(sigmas)
with (torch.no_grad(), _autocast(get_device())):
for i in range(1, len(sigmas)):
x_in = torch.cat([x] * 2)
sigma_in = torch.cat([sigmas[i] * s_in] * 2)
cond_in = torch.cat([uncond, cond])
c_out, c_in = [
K.utils.append_dims(k, x_in.ndim) for k in dnw.get_scalings(sigma_in)
]
t = dnw.sigma_to_t(sigma_in)
eps = model.apply_model(x_in * c_in, t, cond=cond_in)
denoised_uncond, denoised_cond = (x_in + eps * c_out).chunk(2)
denoised = denoised_uncond + (denoised_cond - denoised_uncond) * cond_scale
d = (x - denoised) / sigmas[i]
dt = sigmas[i] - sigmas[i - 1]
x = x + d * dt
# This shouldn't be necessary, but solved some VRAM issues
del (
x_in,
sigma_in,
cond_in,
c_out,
c_in,
t,
)
del eps, denoised_uncond, denoised_cond, denoised, d, dt
# collect_and_empty()
return (x / x.std()) * sigmas[-1]
if __name__ == "__main__":
pass
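A rough sketch of how this module might be exercised once a model is loaded (the `model` argument is assumed to be an already-loaded CompVis Stable Diffusion LatentDiffusion model; loading it is out of scope here, and the helper name below is only illustrative):

```python
from PIL import Image

from imaginairy.modules.find_noise import find_noise_for_image


def reconstruct_noise(model, image_path, prompt, steps=50):
    """Recover a starting noise tensor that roughly reproduces `image_path`.

    `model` must be an already-loaded Stable Diffusion LatentDiffusion model.
    """
    img = Image.open(image_path).convert("RGB")
    noise = find_noise_for_image(model, img, prompt, steps=steps, cond_scale=1.0)
    # The result is rescaled to the largest sigma, so it can stand in for the
    # random starting latent of a k-diffusion sampler.
    return noise
```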

@ -5,9 +5,14 @@ from contextlib import contextmanager
from functools import lru_cache
from typing import List, Optional
import numpy as np
import PIL
import torch
from PIL import Image
from torch import Tensor
from imaginairy.api import logger
logger = logging.getLogger(__name__)
@ -95,3 +100,21 @@ def fix_torch_nn_layer_norm():
yield
finally:
functional.layer_norm = orig_function
def img_path_to_torch_image(path, max_height=512, max_width=512):
image = Image.open(path).convert("RGB")
logger.info(f"loaded input image of size {image.size} from {path}")
return pillow_img_to_torch_image(image, max_height=max_height, max_width=max_width)
def pillow_img_to_torch_image(image, max_height=512, max_width=512):
w, h = image.size
resize_ratio = min(max_width / w, max_height / h)
w, h = int(w * resize_ratio), int(h * resize_ratio)
w, h = map(lambda x: x - x % 64, (w, h))  # round down to integer multiple of 64
image = image.resize((w, h), resample=PIL.Image.LANCZOS)
image = np.array(image).astype(np.float32) / 255.0
image = image[None].transpose(0, 3, 1, 2)
image = torch.from_numpy(image)
return 2.0 * image - 1.0, w, h
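For reference, the inverse of this normalization (going from a `[-1, 1]` tensor back to a PIL image) looks roughly like the sketch below; the function name is just illustrative:

```python
import numpy as np
import torch
from einops import rearrange
from PIL import Image


def torch_image_to_pillow_img(tensor: torch.Tensor) -> Image.Image:
    """Undo pillow_img_to_torch_image's scaling for a single image (batch of 1)."""
    img = (tensor.squeeze(0).clamp(-1, 1) + 1.0) / 2.0  # back to [0, 1]
    img = 255.0 * rearrange(img.cpu().numpy(), "c h w -> h w c")
    return Image.fromarray(img.astype(np.uint8))
```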

@ -6,6 +6,8 @@
#
absl-py==1.2.0
# via tensorboard
accelerate==0.12.0
# via k-diffusion
aiohttp==3.8.1
# via fsspec
aiosignal==1.2.0
@ -19,31 +21,43 @@ async-timeout==4.0.2
attrs==22.1.0
# via
# aiohttp
# jsonschema
# pytest
black==22.8.0
# via -r requirements-dev.in
cachetools==5.2.0
# via google-auth
certifi==2022.6.15.1
# via requests
charset-normalizer==2.1.1
# via
# aiohttp
# requests
# sentry-sdk
chardet==4.0.0
# via requests
charset-normalizer==2.1.1
# via aiohttp
clean-fid==0.1.30
# via k-diffusion
click==8.1.3
# via
# black
# imaginairy (setup.py)
clip @ git+https://github.com/openai/CLIP.git@d50d76daa670286dd6cacf3bcd80b5e4823fc8e1
# via imaginairy (setup.py)
# wandb
clip @ git+https://github.com/openai/CLIP
# via
# imaginairy (setup.py)
# k-diffusion
coverage==6.4.4
# via -r requirements-dev.in
diffusers==0.3.0
# via imaginairy (setup.py)
dill==0.3.5.1
# via pylint
docker-pycreds==0.4.0
# via wandb
einops==0.3.0
# via imaginairy (setup.py)
# via
# imaginairy (setup.py)
# k-diffusion
filelock==3.8.0
# via
# diffusers
@ -59,6 +73,10 @@ ftfy==6.1.1
# via clip
future==0.18.2
# via pytorch-lightning
gitdb==4.0.9
# via gitpython
gitpython==3.1.27
# via wandb
google-auth==2.11.0
# via
# google-auth-oauthlib
@ -71,12 +89,14 @@ huggingface-hub==0.9.1
# via
# diffusers
# transformers
idna==3.3
idna==2.10
# via
# requests
# yarl
imageio==2.9.0
# via imaginairy (setup.py)
# via
# imaginairy (setup.py)
# scikit-image
importlib-metadata==4.12.0
# via diffusers
iniconfig==1.1.1
@ -85,8 +105,16 @@ isort==5.10.1
# via
# -r requirements-dev.in
# pylint
kornia==0.6
jsonmerge==1.8.0
# via k-diffusion
jsonschema==4.16.0
# via jsonmerge
k-diffusion @ git+https://github.com/crowsonkb/k-diffusion.git@71ba7d6735e9cba1945b429a21345960eb3f151c
# via imaginairy (setup.py)
kornia==0.6
# via
# imaginairy (setup.py)
# k-diffusion
lazy-object-proxy==1.7.1
# via astroid
markdown==3.4.1
@ -103,13 +131,21 @@ multidict==6.0.2
# yarl
mypy-extensions==0.4.3
# via black
networkx==2.8.6
# via scikit-image
numpy==1.23.3
# via
# accelerate
# clean-fid
# diffusers
# imageio
# imaginairy (setup.py)
# pytorch-lightning
# pywavelets
# scikit-image
# scipy
# tensorboard
# tifffile
# torchmetrics
# torchvision
# transformers
@ -119,18 +155,25 @@ omegaconf==2.1.1
# via imaginairy (setup.py)
packaging==21.3
# via
# accelerate
# huggingface-hub
# kornia
# pytest
# pytorch-lightning
# scikit-image
# torchmetrics
# transformers
pathspec==0.10.1
# via black
pathtools==0.1.2
# via wandb
pillow==9.2.0
# via
# clean-fid
# diffusers
# imageio
# k-diffusion
# scikit-image
# torchvision
platformdirs==2.5.2
# via
@ -138,8 +181,16 @@ platformdirs==2.5.2
# pylint
pluggy==1.0.0
# via pytest
promise==2.3
# via wandb
protobuf==3.19.4
# via tensorboard
# via
# tensorboard
# wandb
psutil==5.9.2
# via
# accelerate
# wandb
py==1.11.0
# via pytest
pyasn1==0.4.8
@ -164,23 +215,30 @@ pylint==2.15.2
# via -r requirements-dev.in
pyparsing==3.0.9
# via packaging
pyrsistent==0.18.1
# via jsonschema
pytest==7.1.3
# via -r requirements-dev.in
pytorch-lightning==1.4.2
# via imaginairy (setup.py)
pywavelets==1.3.0
# via scikit-image
pyyaml==6.0
# via
# accelerate
# huggingface-hub
# omegaconf
# pytorch-lightning
# transformers
# wandb
regex==2022.9.11
# via
# clip
# diffusers
# transformers
requests==2.28.1
requests==2.25.1
# via
# clean-fid
# diffusers
# fsspec
# huggingface-hub
@ -188,14 +246,36 @@ requests==2.28.1
# tensorboard
# torchvision
# transformers
# wandb
requests-oauthlib==1.3.1
# via google-auth-oauthlib
resize-right==0.0.2
# via k-diffusion
rsa==4.9
# via google-auth
scikit-image==0.19.3
# via k-diffusion
scipy==1.9.1
# via
# clean-fid
# k-diffusion
# scikit-image
# torchdiffeq
sentry-sdk==1.9.8
# via wandb
setproctitle==1.3.2
# via wandb
shortuuid==1.0.9
# via wandb
six==1.16.0
# via
# docker-pycreds
# google-auth
# grpcio
# promise
# wandb
smmap==5.0.0
# via gitdb
snowballstemmer==2.2.0
# via pydocstyle
tensorboard==2.10.0
@ -204,6 +284,8 @@ tensorboard-data-server==0.6.1
# via tensorboard
tensorboard-plugin-wit==1.8.1
# via tensorboard
tifffile==2022.8.12
# via scikit-image
tokenizers==0.12.1
# via transformers
tomli==2.0.1
@ -215,26 +297,36 @@ tomlkit==0.11.4
# via pylint
torch==1.12.1
# via
# accelerate
# clean-fid
# clip
# diffusers
# imaginairy (setup.py)
# k-diffusion
# kornia
# pytorch-lightning
# torchdiffeq
# torchmetrics
# torchvision
torchdiffeq==0.2.3
# via k-diffusion
torchmetrics==0.6.0
# via
# imaginairy (setup.py)
# pytorch-lightning
torchvision==0.13.1
# via
# clean-fid
# clip
# imaginairy (setup.py)
# k-diffusion
tqdm==4.64.1
# via
# clean-fid
# clip
# huggingface-hub
# imaginairy (setup.py)
# k-diffusion
# pytorch-lightning
# transformers
transformers==4.19.2
@ -246,7 +338,11 @@ typing-extensions==4.3.0
# torch
# torchvision
urllib3==1.26.12
# via requests
# via
# requests
# sentry-sdk
wandb==0.13.3
# via k-diffusion
wcwidth==0.2.5
# via ftfy
werkzeug==2.2.2

@ -23,6 +23,8 @@ setup(
"torchmetrics==0.6.0",
"torchvision>=0.13.1",
"kornia==0.6",
"clip @ git+https://github.com/openai/CLIP.git@d50d76daa670286dd6cacf3bcd80b5e4823fc8e1#egg=clip",
"clip @ git+https://github.com/openai/CLIP",
# k-diffusion for use with find_noise.py
# "k-diffusion@git+https://github.com/crowsonkb/k-diffusion.git@71ba7d6735e9cba1945b429a21345960eb3f151c#egg=k-diffusion",
],
)
