feature: (wip) better image to image

I tried it with the DDIM sampler and it didn't work.

Probably need to use the k-diffusion sampler with it, from a846393251/find_noise.py.

needs https://github.com/crowsonkb/k-diffusion
pull/1/head
Bryce 2 years ago
parent 438c2868ad
commit 84a73cb5a2

@ -4,6 +4,7 @@ AI imagined images.
"just works" on Linux and OSX(M1).
## Examples
```bash
>> pip install imaginairy
>> imagine "a scenic landscape" "a photo of a dog" "photo of a fruit bowl" "portrait photo of a freckled woman"
@ -27,7 +28,7 @@ Generating 🖼 : "portrait photo of a freckled woman" 512x512px seed:500686645
<img src="assets/000056_293284644_PLMS40_PS7.5_photo_of_a_bowl_of_fruit.jpg" width="256" height="256">
<img src="assets/000078_260972468_PLMS40_PS7.5_portrait_photo_of_a_freckled_woman.jpg" width="256" height="256">
# Features
## Features
- It makes images from text descriptions!
- Generate images either in code or from command line.
@ -36,7 +37,7 @@ Generating 🖼 : "portrait photo of a freckled woman" 512x512px seed:500686645
- WeightedPrompts let you smash together separate prompts (cat-dog)
- Tile Mode creates tileable images
# How To
## How To
```python
from imaginairy import imagine_images, imagine_image_files, ImaginePrompt, WeightedPrompt
@ -79,13 +80,17 @@ OR
- https://laion.ai/blog/laion-5b/
# Todo
- performance optimizations
- https://github.com/huggingface/diffusers/blob/main/docs/source/optimization/fp16.mdx
- https://github.com/neonsecret/stable-diffusion
- ✅ https://github.com/CompVis/stable-diffusion/compare/main...Doggettx:stable-diffusion:autocast-improvements#
- ✅ https://www.reddit.com/r/StableDiffusion/comments/xalaws/test_update_for_less_memory_usage_and_higher/
- deploy to pypi
- add tests
- set up ci (test/lint/format)
- add docs
- notify https://github.com/CompVis/stable-diffusion/issues/25
- remove yaml config
- performance optimizations https://github.com/huggingface/diffusers/blob/main/docs/source/optimization/fp16.mdx
- delete more unused code
- Interface improvements
- init-image at command line
@ -93,7 +98,9 @@ OR
- webserver interface (low priority, this is a library)
- Image Generation Features
- upscaling
- https://github.com/lowfuel/progrock-stable
- face improvements
- codeformer
- image describe feature - https://replicate.com/methexis-inc/img2prompt
- outpainting
- inpainting
@ -110,7 +117,9 @@ OR
- tiling
- output show-work videos
- image variations https://github.com/lstein/stable-diffusion/blob/main/VARIATIONS.md
- textual inversion https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/sd_textual_inversion_training.ipynb#scrollTo=50JuJUM8EG1h
- textual inversion
- https://www.reddit.com/r/StableDiffusion/comments/xbwb5y/how_to_run_textual_inversion_locally_train_your/
- https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/sd_textual_inversion_training.ipynb#scrollTo=50JuJUM8EG1h
- zooming videos? a la disco diffusion
- fix saturation at high CFG https://www.reddit.com/r/StableDiffusion/comments/xalo78/fixing_excessive_contrastsaturation_resulting/

@ -6,30 +6,30 @@ from contextlib import nullcontext
from functools import lru_cache
import numpy as np
import PIL
import torch
import torch.nn
from einops import rearrange
from omegaconf import OmegaConf
from PIL import Image
from PIL import Image, ImageDraw
from pytorch_lightning import seed_everything
from torch import autocast
from transformers import cached_path
from imaginairy.modules.diffusion.ddim import DDIMSampler
from imaginairy.modules.diffusion.plms import PLMSSampler
from imaginairy.modules.find_noise import find_noise_for_latent
from imaginairy.safety import is_nsfw
from imaginairy.schema import ImaginePrompt, ImagineResult
from imaginairy.utils import (
fix_torch_nn_layer_norm,
get_device,
img_path_to_torch_image,
instantiate_from_config,
)
LIB_PATH = os.path.dirname(__file__)
logger = logging.getLogger(__name__)
# leave undocumented. I'd ask that no one publicize this flag
IMAGINAIRY_ALLOW_NSFW = os.getenv("IMAGINAIRY_ALLOW_NSFW", "False")
IMAGINAIRY_ALLOW_NSFW = bool(IMAGINAIRY_ALLOW_NSFW == "I AM A RESPONSIBLE ADULT")
@ -56,20 +56,6 @@ def load_model_from_config(config):
return model
def load_img(path, max_height=512, max_width=512):
image = Image.open(path).convert("RGB")
w, h = image.size
logger.info(f"loaded input image of size ({w}, {h}) from {path}")
resize_ratio = min(max_width / w, max_height / h)
w, h = int(w * resize_ratio), int(h * resize_ratio)
w, h = map(lambda x: x - x % 64, (w, h))  # round down to integer multiple of 64
image = image.resize((w, h), resample=PIL.Image.LANCZOS)
image = np.array(image).astype(np.float32) / 255.0
image = image[None].transpose(0, 3, 1, 2)
image = torch.from_numpy(image)
return 2.0 * image - 1.0, w, h
def patch_conv(**patch):
"""https://github.com/replicate/cog-stable-diffusion/compare/main...TomMoore515:material_stable_diffusion:main"""
cls = torch.nn.Conv2d
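For context on the tile-mode trick the docstring above links to: the idea is to monkey-patch `torch.nn.Conv2d` so that every convolution uses circular padding, which makes the decoded image wrap seamlessly at its edges. A minimal sketch of that idea (not the exact patch used here):

```python
import contextlib
import torch


@contextlib.contextmanager
def circular_conv2d():
    """Temporarily force newly constructed Conv2d layers to use circular padding."""
    cls = torch.nn.Conv2d
    orig_init = cls.__init__

    def patched_init(self, *args, **kwargs):
        kwargs["padding_mode"] = "circular"  # wrap-around padding -> tileable output
        orig_init(self, *args, **kwargs)

    cls.__init__ = patched_init
    try:
        yield
    finally:
        cls.__init__ = orig_init
```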
@ -115,7 +101,7 @@ def imagine_image_files(
if output_file_extension not in {"jpg", "png"}:
raise ValueError("Must output a png or jpg")
def _record_steps(samples, i, model, prompt):
def _record_steps(samples, description, model, prompt):
nonlocal step_count
step_count += 1
samples = model.decode_first_stage(samples)
@ -125,9 +111,10 @@ def imagine_image_files(
for pred_x0 in samples:
pred_x0 = 255.0 * rearrange(pred_x0.cpu().numpy(), "c h w -> h w c")
filename = f"{base_count:08}_S{prompt.seed}_step{step_count:04}.jpg"
Image.fromarray(pred_x0.astype(np.uint8)).save(
os.path.join(steps_path, filename)
)
img = Image.fromarray(pred_x0.astype(np.uint8))
draw = ImageDraw.Draw(img)
draw.text((10, 10), str(description))
img.save(os.path.join(steps_path, filename))
img_callback = _record_steps if record_step_images else None
for result in imagine_images(
@ -190,10 +177,10 @@ def imagine_images(
for wp in prompt.prompts
]
)
if img_callback:
def _img_callback(samples, i):
img_callback(samples, i, model, prompt)
def _img_callback(samples, description):
if img_callback:
img_callback(samples, description, model, prompt)
shape = [
latent_channels,
@ -209,19 +196,18 @@ def imagine_images(
sampler.make_schedule(ddim_num_steps=ddim_steps, ddim_eta=ddim_eta)
t_enc = int(generation_strength * ddim_steps)
init_image, w, h = load_img(prompt.init_image)
init_image, w, h = img_path_to_torch_image(prompt.init_image)
init_image = init_image.to(get_device())
init_latent = model.encode_first_stage(init_image)
noised_init_latent = model.get_first_stage_encoding(init_latent)
_img_callback(init_latent.mean, 0)
_img_callback(noised_init_latent, 0)
init_latent = model.get_first_stage_encoding(
model.encode_first_stage(init_image)
)
_img_callback(init_latent, "init_latent")
# encode (scaled latent)
z_enc = sampler.stochastic_encode(
noised_init_latent,
torch.tensor([t_enc]).to(get_device()),
init_latent, torch.tensor([t_enc]).to(get_device())
)
_img_callback(noised_init_latent, 0)
_img_callback(z_enc, "z_enc")
# decode it
samples = sampler.decode(
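For a sense of scale: the number of denoising steps re-run on the init image grows with the strength setting. A rough sketch of that relationship (assuming `generation_strength` comes from the prompt's `init_image_strength`):

```python
ddim_steps = 50
generation_strength = 0.3  # i.e. the prompt's init_image_strength (assumption)
t_enc = int(generation_strength * ddim_steps)  # -> 15

# stochastic_encode noises the init latent forward to step t_enc;
# sampler.decode then runs those ~15 steps back toward the prompt,
# so a low strength keeps the result close to the starting image.
print(t_enc)
```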

@ -45,6 +45,21 @@ def configure_logging(level="INFO"):
@click.command()
@click.argument("prompt_texts", nargs=-1)
@click.option(
"--prompt-strength",
default=7.5,
show_default=True,
help="How closely to follow the prompt. Image looks unnatural at higher values",
)
@click.option(
"--init-image",
help="Starting image.",
)
@click.option(
"--init-image-strength",
default=0.3,
help="Starting image.",
)
@click.option("--outdir", default="./outputs", help="where to write results to")
@click.option(
"-r",
@ -76,12 +91,6 @@ def configure_logging(level="INFO"):
type=int,
help="What seed to use for randomness. Allows reproducible image renders",
)
@click.option(
"--prompt-strength",
default=7.5,
show_default=True,
help="How closely to follow the prompt. Image looks unnatural at higher values",
)
@click.option(
"--sampler-type",
default="PLMS",
@ -109,13 +118,15 @@ def configure_logging(level="INFO"):
)
def imagine_cmd(
prompt_texts,
prompt_strength,
init_image,
init_image_strength,
outdir,
repeats,
height,
width,
steps,
seed,
prompt_strength,
sampler_type,
ddim_eta,
log_level,
@ -139,12 +150,14 @@ def imagine_cmd(
for prompt_text in prompt_texts:
prompt = ImaginePrompt(
prompt_text,
prompt_strength=prompt_strength,
init_image=init_image,
init_image_strength=init_image_strength,
seed=seed,
sampler_type=sampler_type,
steps=steps,
height=height,
width=width,
prompt_strength=prompt_strength,
upscale=False,
fix_faces=False,
)
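Once these options are wired through, an image-to-image run could be kicked off from the command line roughly like this (a sketch; the input path is a placeholder):

```bash
>> imagine "portrait photo of a freckled woman" \
    --init-image ./starting_photo.jpg \
    --init-image-strength 0.3 \
    --prompt-strength 7.5 \
    --outdir ./outputs
```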

@ -378,6 +378,6 @@ class DDIMSampler:
# x_dec = x_dec.detach() + cond_grad * sigma_t ** 2
## x_dec_alt = x_dec + (original_loss * 0.1) ** 2
if img_callback:
img_callback(x_dec, i)
img_callback(pred_x0, i)
img_callback(x_dec, f"x_dec {i}")
img_callback(pred_x0, f"pred_x0 {i}")
return x_dec

@ -0,0 +1,102 @@
"""
I tried it with the DDIM sampler and it didn't work.
Probably need to use the k-diffusion sampler with it
from https://gist.githubusercontent.com/trygvebw/c71334dd127d537a15e9d59790f7f5e1/raw/a846393251f5be8289d4febc75a19f1f962aabcc/find_noise.py
needs https://github.com/crowsonkb/k-diffusion
"""
from contextlib import nullcontext
import torch
from einops import repeat
from torch import autocast
from imaginairy.utils import get_device, pillow_img_to_torch_image
def pil_img_to_latent(model, img, batch_size=1, device="cuda", half=True):
# init_image = pil_img_to_torch(img, half=half).to(device)
init_image = pillow_img_to_torch_image(img).to(get_device())
init_image = repeat(init_image, "1 ... -> b ...", b=batch_size)
if half:
return model.get_first_stage_encoding(
model.encode_first_stage(init_image.half())
)
return model.get_first_stage_encoding(model.encode_first_stage(init_image))
def find_noise_for_image(
model, pil_img, prompt, steps=50, cond_scale=1.0, verbose=False, half=True
):
img_latent = pil_img_to_latent(
model, pil_img, batch_size=1, device="cuda", half=half
)
return find_noise_for_latent(
model,
img_latent,
prompt,
steps=steps,
cond_scale=cond_scale,
verbose=verbose,
half=half,
)
def find_noise_for_latent(
model, img_latent, prompt, steps=50, cond_scale=1.0, verbose=False, half=True
):
import k_diffusion as K
x = img_latent
_autocast = autocast if get_device() in ("cuda", "cpu") else nullcontext
with (torch.no_grad(), _autocast(get_device())):
uncond = model.get_learned_conditioning([""])
cond = model.get_learned_conditioning([prompt])
s_in = x.new_ones([x.shape[0]])
dnw = K.external.CompVisDenoiser(model)
sigmas = dnw.get_sigmas(steps).flip(0)
if verbose:
print(sigmas)
with (torch.no_grad(), _autocast(get_device())):
for i in range(1, len(sigmas)):
x_in = torch.cat([x] * 2)
sigma_in = torch.cat([sigmas[i] * s_in] * 2)
cond_in = torch.cat([uncond, cond])
c_out, c_in = [
K.utils.append_dims(k, x_in.ndim) for k in dnw.get_scalings(sigma_in)
]
t = dnw.sigma_to_t(sigma_in)
eps = model.apply_model(x_in * c_in, t, cond=cond_in)
denoised_uncond, denoised_cond = (x_in + eps * c_out).chunk(2)
denoised = denoised_uncond + (denoised_cond - denoised_uncond) * cond_scale
d = (x - denoised) / sigmas[i]
dt = sigmas[i] - sigmas[i - 1]
x = x + d * dt
# This shouldn't be necessary, but solved some VRAM issues
del (
x_in,
sigma_in,
cond_in,
c_out,
c_in,
t,
)
del eps, denoised_uncond, denoised_cond, denoised, d, dt
# collect_and_empty()
return (x / x.std()) * sigmas[-1]
if __name__ == "__main__":
pass
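A rough sketch of how this module might be exercised once a model is loaded (the `model` argument is assumed to be an already-loaded CompVis Stable Diffusion LatentDiffusion model; loading it is out of scope here, and the helper name below is only illustrative):

```python
from PIL import Image

from imaginairy.modules.find_noise import find_noise_for_image


def reconstruct_noise(model, image_path, prompt, steps=50):
    """Recover a starting noise tensor that roughly reproduces `image_path`.

    `model` must be an already-loaded Stable Diffusion LatentDiffusion model.
    """
    img = Image.open(image_path).convert("RGB")
    noise = find_noise_for_image(model, img, prompt, steps=steps, cond_scale=1.0)
    # The result is rescaled to the largest sigma, so it can stand in for the
    # random starting latent of a k-diffusion sampler.
    return noise
```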

@ -5,9 +5,14 @@ from contextlib import contextmanager
from functools import lru_cache
from typing import List, Optional
import numpy as np
import PIL
import torch
from PIL import Image
from torch import Tensor
from imaginairy.api import logger
logger = logging.getLogger(__name__)
@ -95,3 +100,21 @@ def fix_torch_nn_layer_norm():
yield
finally:
functional.layer_norm = orig_function
def img_path_to_torch_image(path, max_height=512, max_width=512):
image = Image.open(path).convert("RGB")
logger.info(f"loaded input image of size {image.size} from {path}")
return pillow_img_to_torch_image(image, max_height=max_height, max_width=max_width)
def pillow_img_to_torch_image(image, max_height=512, max_width=512):
w, h = image.size
resize_ratio = min(max_width / w, max_height / h)
w, h = int(w * resize_ratio), int(h * resize_ratio)
w, h = map(lambda x: x - x % 64, (w, h))  # round down to integer multiple of 64
image = image.resize((w, h), resample=PIL.Image.LANCZOS)
image = np.array(image).astype(np.float32) / 255.0
image = image[None].transpose(0, 3, 1, 2)
image = torch.from_numpy(image)
return 2.0 * image - 1.0, w, h
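For reference, the inverse of this normalization (going from a `[-1, 1]` tensor back to a PIL image) looks roughly like the sketch below; the function name is just illustrative:

```python
import numpy as np
import torch
from einops import rearrange
from PIL import Image


def torch_image_to_pillow_img(tensor: torch.Tensor) -> Image.Image:
    """Undo pillow_img_to_torch_image's scaling for a single image (batch of 1)."""
    img = (tensor.squeeze(0).clamp(-1, 1) + 1.0) / 2.0  # back to [0, 1]
    img = 255.0 * rearrange(img.cpu().numpy(), "c h w -> h w c")
    return Image.fromarray(img.astype(np.uint8))
```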

@ -6,6 +6,8 @@
#
absl-py==1.2.0
# via tensorboard
accelerate==0.12.0
# via k-diffusion
aiohttp==3.8.1
# via fsspec
aiosignal==1.2.0
@ -19,31 +21,43 @@ async-timeout==4.0.2
attrs==22.1.0
# via
# aiohttp
# jsonschema
# pytest
black==22.8.0
# via -r requirements-dev.in
cachetools==5.2.0
# via google-auth
certifi==2022.6.15.1
# via requests
charset-normalizer==2.1.1
# via
# aiohttp
# requests
# sentry-sdk
chardet==4.0.0
# via requests
charset-normalizer==2.1.1
# via aiohttp
clean-fid==0.1.30
# via k-diffusion
click==8.1.3
# via
# black
# imaginairy (setup.py)
clip @ git+https://github.com/openai/CLIP.git@d50d76daa670286dd6cacf3bcd80b5e4823fc8e1
# via imaginairy (setup.py)
# wandb
clip @ git+https://github.com/openai/CLIP
# via
# imaginairy (setup.py)
# k-diffusion
coverage==6.4.4
# via -r requirements-dev.in
diffusers==0.3.0
# via imaginairy (setup.py)
dill==0.3.5.1
# via pylint
docker-pycreds==0.4.0
# via wandb
einops==0.3.0
# via imaginairy (setup.py)
# via
# imaginairy (setup.py)
# k-diffusion
filelock==3.8.0
# via
# diffusers
@ -59,6 +73,10 @@ ftfy==6.1.1
# via clip
future==0.18.2
# via pytorch-lightning
gitdb==4.0.9
# via gitpython
gitpython==3.1.27
# via wandb
google-auth==2.11.0
# via
# google-auth-oauthlib
@ -71,12 +89,14 @@ huggingface-hub==0.9.1
# via
# diffusers
# transformers
idna==3.3
idna==2.10
# via
# requests
# yarl
imageio==2.9.0
# via imaginairy (setup.py)
# via
# imaginairy (setup.py)
# scikit-image
importlib-metadata==4.12.0
# via diffusers
iniconfig==1.1.1
@ -85,8 +105,16 @@ isort==5.10.1
# via
# -r requirements-dev.in
# pylint
kornia==0.6
jsonmerge==1.8.0
# via k-diffusion
jsonschema==4.16.0
# via jsonmerge
k-diffusion @ git+https://github.com/crowsonkb/k-diffusion.git@71ba7d6735e9cba1945b429a21345960eb3f151c
# via imaginairy (setup.py)
kornia==0.6
# via
# imaginairy (setup.py)
# k-diffusion
lazy-object-proxy==1.7.1
# via astroid
markdown==3.4.1
@ -103,13 +131,21 @@ multidict==6.0.2
# yarl
mypy-extensions==0.4.3
# via black
networkx==2.8.6
# via scikit-image
numpy==1.23.3
# via
# accelerate
# clean-fid
# diffusers
# imageio
# imaginairy (setup.py)
# pytorch-lightning
# pywavelets
# scikit-image
# scipy
# tensorboard
# tifffile
# torchmetrics
# torchvision
# transformers
@ -119,18 +155,25 @@ omegaconf==2.1.1
# via imaginairy (setup.py)
packaging==21.3
# via
# accelerate
# huggingface-hub
# kornia
# pytest
# pytorch-lightning
# scikit-image
# torchmetrics
# transformers
pathspec==0.10.1
# via black
pathtools==0.1.2
# via wandb
pillow==9.2.0
# via
# clean-fid
# diffusers
# imageio
# k-diffusion
# scikit-image
# torchvision
platformdirs==2.5.2
# via
@ -138,8 +181,16 @@ platformdirs==2.5.2
# pylint
pluggy==1.0.0
# via pytest
promise==2.3
# via wandb
protobuf==3.19.4
# via tensorboard
# via
# tensorboard
# wandb
psutil==5.9.2
# via
# accelerate
# wandb
py==1.11.0
# via pytest
pyasn1==0.4.8
@ -164,23 +215,30 @@ pylint==2.15.2
# via -r requirements-dev.in
pyparsing==3.0.9
# via packaging
pyrsistent==0.18.1
# via jsonschema
pytest==7.1.3
# via -r requirements-dev.in
pytorch-lightning==1.4.2
# via imaginairy (setup.py)
pywavelets==1.3.0
# via scikit-image
pyyaml==6.0
# via
# accelerate
# huggingface-hub
# omegaconf
# pytorch-lightning
# transformers
# wandb
regex==2022.9.11
# via
# clip
# diffusers
# transformers
requests==2.28.1
requests==2.25.1
# via
# clean-fid
# diffusers
# fsspec
# huggingface-hub
@ -188,14 +246,36 @@ requests==2.28.1
# tensorboard
# torchvision
# transformers
# wandb
requests-oauthlib==1.3.1
# via google-auth-oauthlib
resize-right==0.0.2
# via k-diffusion
rsa==4.9
# via google-auth
scikit-image==0.19.3
# via k-diffusion
scipy==1.9.1
# via
# clean-fid
# k-diffusion
# scikit-image
# torchdiffeq
sentry-sdk==1.9.8
# via wandb
setproctitle==1.3.2
# via wandb
shortuuid==1.0.9
# via wandb
six==1.16.0
# via
# docker-pycreds
# google-auth
# grpcio
# promise
# wandb
smmap==5.0.0
# via gitdb
snowballstemmer==2.2.0
# via pydocstyle
tensorboard==2.10.0
@ -204,6 +284,8 @@ tensorboard-data-server==0.6.1
# via tensorboard
tensorboard-plugin-wit==1.8.1
# via tensorboard
tifffile==2022.8.12
# via scikit-image
tokenizers==0.12.1
# via transformers
tomli==2.0.1
@ -215,26 +297,36 @@ tomlkit==0.11.4
# via pylint
torch==1.12.1
# via
# accelerate
# clean-fid
# clip
# diffusers
# imaginairy (setup.py)
# k-diffusion
# kornia
# pytorch-lightning
# torchdiffeq
# torchmetrics
# torchvision
torchdiffeq==0.2.3
# via k-diffusion
torchmetrics==0.6.0
# via
# imaginairy (setup.py)
# pytorch-lightning
torchvision==0.13.1
# via
# clean-fid
# clip
# imaginairy (setup.py)
# k-diffusion
tqdm==4.64.1
# via
# clean-fid
# clip
# huggingface-hub
# imaginairy (setup.py)
# k-diffusion
# pytorch-lightning
# transformers
transformers==4.19.2
@ -246,7 +338,11 @@ typing-extensions==4.3.0
# torch
# torchvision
urllib3==1.26.12
# via requests
# via
# requests
# sentry-sdk
wandb==0.13.3
# via k-diffusion
wcwidth==0.2.5
# via ftfy
werkzeug==2.2.2

@ -23,6 +23,8 @@ setup(
"torchmetrics==0.6.0",
"torchvision>=0.13.1",
"kornia==0.6",
"clip @ git+https://github.com/openai/CLIP.git@d50d76daa670286dd6cacf3bcd80b5e4823fc8e1#egg=clip",
"clip @ git+https://github.com/openai/CLIP",
# k-diffusion for use with find_noise.py
# "k-diffusion@git+https://github.com/crowsonkb/k-diffusion.git@71ba7d6735e9cba1945b429a21345960eb3f151c#egg=k-diffusion",
],
)
