imaginAIry/imaginairy/img_utils.py
"""
image utils.
Library format cheat sheet:
Library Dim Order Channel Order Value Range Type
Pillow R, G, B, A 0-255 PIL.Image.Image
OpenCV B, G, R, A 0-255 np.ndarray
Torch (B), C, H, W R, G, B -1.0-1.0 torch.Tensor
"""
from typing import Sequence
import numpy as np
import PIL
import torch
from einops import rearrange, repeat
from PIL import Image, ImageDraw, ImageFont
from imaginairy.paths import PKG_ROOT
from imaginairy.schema import LazyLoadingImage
from imaginairy.utils import get_device


def pillow_fit_image_within(
    image: PIL.Image.Image, max_height=512, max_width=512, convert="RGB", snap_size=8
):
    image = image.convert(convert)
    w, h = image.size
    resize_ratio = 1
    if w > max_width or h > max_height:
        resize_ratio = min(max_width / w, max_height / h)
    elif w < max_width and h < max_height:
        # it's smaller than our target image, enlarge
        resize_ratio = max(max_width / w, max_height / h)

    if resize_ratio != 1:
        w, h = int(w * resize_ratio), int(h * resize_ratio)
    # resize to integer multiple of snap_size
    w -= w % snap_size
    h -= h % snap_size

    if (w, h) != image.size:
        image = image.resize((w, h), resample=Image.Resampling.LANCZOS)
    return image
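# Usage sketch for pillow_fit_image_within (hypothetical 1000x700 input; the
# values below follow from the arithmetic above, not from a test run):
#
#   img = Image.new("RGB", (1000, 700))
#   fitted = pillow_fit_image_within(img)
#   # resize_ratio = min(512/1000, 512/700) = 0.512 -> (512, 358),
#   # then snapped down to multiples of 8: fitted.size == (512, 352)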


def pillow_img_to_torch_image(img: PIL.Image.Image, convert="RGB"):
    if convert:
        img = img.convert(convert)
    img = np.array(img).astype(np.float32) / 255.0
    # b, h, w, c => b, c, h, w
    img = img[None].transpose(0, 3, 1, 2)
    img = torch.from_numpy(img)
    return 2.0 * img - 1.0
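# Usage sketch for pillow_img_to_torch_image (solid red square):
#
#   img = Image.new("RGB", (64, 64), color=(255, 0, 0))
#   t = pillow_img_to_torch_image(img)
#   # t.shape == torch.Size([1, 3, 64, 64]); the red channel is 1.0,
#   # green and blue are -1.0 after the 2*x - 1 rescale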


def pillow_mask_to_latent_mask(mask_img: PIL.Image.Image, downsampling_factor):
    mask_img = mask_img.resize(
        (
            mask_img.width // downsampling_factor,
            mask_img.height // downsampling_factor,
        ),
        resample=Image.Resampling.LANCZOS,
    )

    mask = np.array(mask_img).astype(np.float32) / 255.0
    mask = mask[None, None]
    mask = torch.from_numpy(mask)
    return mask
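# Usage sketch for pillow_mask_to_latent_mask (single-channel "L" mask; a
# downsampling factor of 8 matches a Stable Diffusion-style VAE):
#
#   mask = Image.new("L", (512, 512), color=255)
#   latent_mask = pillow_mask_to_latent_mask(mask, downsampling_factor=8)
#   # latent_mask.shape == torch.Size([1, 1, 64, 64]), values in 0.0-1.0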


def pillow_img_to_opencv_img(img: PIL.Image.Image):
    open_cv_image = np.array(img)
    # Convert RGB to BGR
    open_cv_image = open_cv_image[:, :, ::-1].copy()
    return open_cv_image


def torch_image_to_opencv_img(img: torch.Tensor):
    img = (img + 1) / 2
    img = img.detach().cpu().numpy()
    # assert there is only one image
    assert img.shape[0] == 1
    img = img[0]
    img = img.transpose(1, 2, 0)
    img = (img * 255).astype(np.uint8)
    # RGB to BGR
    img = img[:, :, ::-1]
    return img
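# Usage sketch for the OpenCV conversions (round trip through torch):
#
#   img = Image.new("RGB", (32, 32), color=(0, 128, 255))
#   bgr = pillow_img_to_opencv_img(img)    # np.ndarray (32, 32, 3), B, G, R order
#   t = pillow_img_to_torch_image(img)     # [1, 3, 32, 32], values in [-1, 1]
#   bgr2 = torch_image_to_opencv_img(t)    # back to a B, G, R uint8 array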


def torch_img_to_pillow_img(img_t: torch.Tensor):
    img_t = img_t.to(torch.float32).detach().cpu()
    if len(img_t.shape) == 3:
        img_t = img_t.unsqueeze(0)
    if img_t.shape[0] != 1:
        raise ValueError("Only batch size 1 supported")
    if img_t.shape[1] == 1:
        colorspace = "L"
    elif img_t.shape[1] == 3:
        colorspace = "RGB"
    else:
        msg = (
            f"Unsupported colorspace. {img_t.shape[1]} channels in {img_t.shape} shape"
        )
        raise ValueError(msg)
    img_t = rearrange(img_t, "b c h w -> b h w c")
    img_t = torch.clamp((img_t + 1.0) / 2.0, min=0.0, max=1.0)
    img_np = (255.0 * img_t).cpu().numpy().astype(np.uint8)[0]
    if colorspace == "L":
        img_np = img_np[:, :, 0]
    return Image.fromarray(img_np, colorspace)
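# Usage sketch for torch_img_to_pillow_img (an all-zeros tensor is mid-gray
# in the [-1, 1] convention):
#
#   t = torch.zeros(1, 3, 64, 64)
#   pil_img = torch_img_to_pillow_img(t)
#   # pil_img.size == (64, 64); every pixel is (127, 127, 127)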


def model_latent_to_pillow_img(latent: torch.Tensor) -> PIL.Image.Image:
    from imaginairy.model_manager import get_current_diffusion_model

    if len(latent.shape) == 3:
        latent = latent.unsqueeze(0)
    if latent.shape[0] != 1:
        raise ValueError("Only batch size 1 supported")
    model = get_current_diffusion_model()
    img_t = model.lda.decode(latent)
    return torch_img_to_pillow_img(img_t)


def model_latents_to_pillow_imgs(latents: torch.Tensor) -> Sequence[PIL.Image.Image]:
    return [model_latent_to_pillow_img(latent) for latent in latents]


def pillow_img_to_model_latent(model, img, batch_size=1, half=True):
    init_image = pillow_img_to_torch_image(img).to(get_device())
    init_image = repeat(init_image, "1 ... -> b ...", b=batch_size)
    if half:
        return model.get_first_stage_encoding(
            model.encode_first_stage(init_image.half())
        )
    return model.get_first_stage_encoding(model.encode_first_stage(init_image))
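# Usage sketch for the latent round trip (assumes a diffusion model is already
# loaded; `model` is a stand-in for whatever get_current_diffusion_model
# returns and is not constructed in this sketch):
#
#   latent = pillow_img_to_model_latent(model, img, half=False)
#   # for a Stable Diffusion-style VAE, a 512x512 image encodes to a
#   # latent of shape [1, 4, 64, 64]
#   round_tripped = model_latent_to_pillow_img(latent)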


def imgpaths_to_imgs(imgpaths):
    imgs = []
    for imgpath in imgpaths:
        if isinstance(imgpath, str):
            img = LazyLoadingImage(filepath=imgpath)
            imgs.append(img)
        else:
            imgs.append(imgpath)

    return imgs


def add_caption_to_image(
    img, caption, font_size=16, font_path=f"{PKG_ROOT}/data/DejaVuSans.ttf"
):
    draw = ImageDraw.Draw(img)

    font = ImageFont.truetype(font_path, font_size)

    x = 15
    y = img.height - 15 - font_size

    draw.text(
        (x, y),
        caption,
        font=font,
        fill=(255, 255, 255),
        stroke_width=3,
        stroke_fill=(0, 0, 0),
    )
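# Usage sketch for add_caption_to_image (draws in place on the image; the
# DejaVuSans font ships with the package per the default font_path):
#
#   img = Image.new("RGB", (512, 512))
#   add_caption_to_image(img, "a photo of a cat")
#   # white 16px text with a 3px black outline, anchored near the bottom-left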