imaginAIry/imaginairy/utils/img_utils.py

"""
image utils.

Library format cheat sheet:

Library     Dim Order       Channel Order       Value Range     Type
Pillow                      R, G, B, A          0-255           PIL.Image.Image
OpenCV                      B, G, R, A          0-255           np.ndarray
Torch       (B), C, H, W    R, G, B             -1.0-1.0        torch.Tensor

"""
from typing import Sequence

import numpy as np
import PIL
import torch
from einops import rearrange, repeat
from PIL import Image, ImageDraw, ImageFont

from imaginairy.schema import LazyLoadingImage
from imaginairy.utils import get_device
from imaginairy.utils.paths import PKG_ROOT


def pillow_fit_image_within(
    image: PIL.Image.Image | LazyLoadingImage,
    max_height=512,
    max_width=512,
    convert="RGB",
    snap_size=8,
) -> PIL.Image.Image:
    """Convert an image and scale it toward a ``max_width`` x ``max_height`` box.

    The image is first passed through ``image.convert(convert)``. Then:

    * if either side exceeds its limit, the image is shrunk by the smaller of
      the two ratios so that both sides fit;
    * if both sides are strictly below their limits, the image is enlarged by
      the *larger* of the two ratios;
    * otherwise the size is left alone.

    Finally each side is rounded down to a multiple of ``snap_size`` (but never
    below one snap step) and the image is resampled with LANCZOS when the
    resulting size differs from the current one.

    NOTE(review): because enlargement uses the larger ratio, a small image with
    a non-square aspect ratio ends up with one side beyond its limit (e.g.
    100x200 -> 512x1024 with the defaults). That pre-existing behaviour is
    preserved here since callers may rely on it -- confirm whether it is
    intended.

    Args:
        image: Source image; must provide ``convert``, ``size`` and ``resize``.
        max_height: Height limit in pixels.
        max_width: Width limit in pixels.
        convert: Mode handed to ``image.convert``.
        snap_size: Both output sides are multiples of this value.

    Returns:
        The converted image, resized if its dimensions had to change.
    """
    image = image.convert(convert)
    w, h = image.size

    # Keep the scale factor as an exact fraction ``num / den`` instead of a
    # float. ``int(w * (max_width / w))`` can land just below the intended
    # integer (49 * (512 / 49) == 511.99999999999994), which after snapping
    # silently costs a whole snap step (504 instead of 512).
    num, den = 1, 1
    if w > max_width or h > max_height:
        # shrink by the smaller ratio; ratios are compared by cross-multiplying
        if max_width * h <= max_height * w:
            num, den = max_width, w
        else:
            num, den = max_height, h
    elif w < max_width and h < max_height:
        # it's smaller than our target image, enlarge by the larger ratio
        if max_width * h >= max_height * w:
            num, den = max_width, w
        else:
            num, den = max_height, h

    if (num, den) != (1, 1):
        w, h = int(w * num // den), int(h * num // den)

    # resize to integer multiple of snap_size; clamp to one snap step so an
    # extreme aspect ratio cannot collapse a side to 0, which resize rejects
    w = max(w - w % snap_size, snap_size)
    h = max(h - h % snap_size, snap_size)

    if (w, h) != image.size:
        image = image.resize((w, h), resample=Image.Resampling.LANCZOS)
    return image


def pillow_img_to_torch_image(
    img: PIL.Image.Image | LazyLoadingImage, convert="RGB"
) -> torch.Tensor:
    """Turn an image into a float32 tensor laid out as (1, C, H, W) in [-1, 1].

    Args:
        img: Image to convert; anything ``np.array`` can turn into an
            (H, W, C) or (H, W) array of 0-255 values.
        convert: Mode passed to ``img.convert`` first. A falsy value skips the
            conversion and uses the image as-is.

    Returns:
        A tensor with a leading batch axis of size 1, channels first, with
        pixel values mapped from 0..255 to -1.0..1.0.
    """
    if convert:
        img = img.convert(convert)
    pixels = np.array(img).astype(np.float32) / 255.0
    if pixels.ndim == 2:
        # single-channel images come back as (H, W); add the channel axis so
        # the transpose below works instead of failing on a missing axis
        pixels = pixels[:, :, None]
    # b, h, w, c => b, c, h, w
    pixels = pixels[None].transpose(0, 3, 1, 2)
    # 0..1 => -1..1
    return 2.0 * torch.from_numpy(pixels) - 1.0


def pillow_mask_to_latent_mask(
    mask_img: PIL.Image.Image | LazyLoadingImage, downsampling_factor
) -> torch.Tensor:
    """Shrink a mask image and return it as a float32 tensor.

    The mask is resized (LANCZOS) to its width and height floor-divided by
    ``downsampling_factor``, its values are divided by 255, and two leading
    singleton axes are prepended.

    Args:
        mask_img: Mask image to shrink.
            NOTE(review): presumably a single-channel image so the result is
            (1, 1, H, W) -- the mode is not checked here; confirm with callers.
        downsampling_factor: Integer divisor applied to both dimensions.

    Returns:
        The downsampled mask as a tensor.
    """
    latent_size = (
        mask_img.width // downsampling_factor,
        mask_img.height // downsampling_factor,
    )
    shrunk = mask_img.resize(latent_size, resample=Image.Resampling.LANCZOS)

    # 0..255 => 0..1, then add the two leading singleton axes
    values = np.array(shrunk).astype(np.float32) / 255.0
    return torch.from_numpy(values[None, None])


def pillow_img_to_opencv_img(img: PIL.Image.Image | LazyLoadingImage):
    """Return the image as a freshly allocated array in OpenCV channel order.

    * (H, W) single-channel input is returned unchanged (there is nothing to
      reorder).
    * 4-channel input has its first three channels reversed while the last
      channel stays last (R, G, B, A -> B, G, R, A), matching the channel
      order listed for OpenCV in this module's cheat sheet. Reversing every
      channel would have produced A, B, G, R.
    * Any other channel count has its channel axis fully reversed
      (R, G, B -> B, G, R).

    Args:
        img: Image (or array-like) accepted by ``np.array``.

    Returns:
        A C-contiguous ``np.ndarray`` that does not share memory with ``img``.
    """
    pixels = np.array(img)
    if pixels.ndim == 2:
        # np.array already produced our own copy
        return pixels
    if pixels.shape[2] == 4:
        # fancy indexing may yield a non-contiguous layout, so normalise it
        return np.ascontiguousarray(pixels[:, :, [2, 1, 0, 3]])
    # Convert RGB to BGR; copy to get rid of the negative stride
    return pixels[:, :, ::-1].copy()


def torch_image_to_openvcv_img(img: torch.Tensor) -> np.ndarray:
    """Convert a single [-1, 1] image tensor into a uint8 BGR array.

    Args:
        img: Tensor shaped (1, C, H, W), or (C, H, W), which is treated as a
            batch of one.

    Returns:
        A C-contiguous (H, W, C) ``np.uint8`` array with the channel axis
        reversed (RGB -> BGR). Values outside [-1, 1] saturate to 0 / 255.

    Raises:
        ValueError: If the batch dimension is not 1. This was previously an
            ``assert``, which disappears under ``python -O``.
    """
    if img.dim() == 3:
        img = img.unsqueeze(0)
    if img.shape[0] != 1:
        raise ValueError("Only batch size 1 supported")

    # -1..1 => 0..1; clamp so slightly out-of-range values saturate instead of
    # wrapping around in the uint8 cast below
    img = torch.clamp((img + 1) / 2, min=0.0, max=1.0)
    img_np = img.detach().cpu().numpy()[0]
    # c, h, w => h, w, c
    img_np = img_np.transpose(1, 2, 0)
    img_np = (img_np * 255).astype(np.uint8)
    # RGB to BGR; copy so callers get a contiguous array, not a reversed view
    return img_np[:, :, ::-1].copy()


def torch_img_to_pillow_img(img_t: torch.Tensor) -> PIL.Image.Image:
    """Convert a single [-1, 1] image tensor into a Pillow image.

    Args:
        img_t: Tensor shaped (1, C, H, W) or (C, H, W) with C equal to 1
            (grayscale, mode "L") or 3 (mode "RGB").

    Returns:
        The image built from the tensor after mapping values to 0..255
        (clamped, truncated to uint8).

    Raises:
        ValueError: If the batch size is not 1 or the channel count is
            neither 1 nor 3.
    """
    # work on a float32 CPU copy regardless of where the tensor lives
    img_t = img_t.to(torch.float32).detach().cpu()
    if img_t.dim() == 3:
        img_t = img_t.unsqueeze(0)
    if img_t.shape[0] != 1:
        raise ValueError("Only batch size 1 supported")

    mode_by_channels = {1: "L", 3: "RGB"}
    if img_t.shape[1] not in mode_by_channels:
        msg = (
            f"Unsupported colorspace. {img_t.shape[1]} channels in {img_t.shape} shape"
        )
        raise ValueError(msg)
    colorspace = mode_by_channels[img_t.shape[1]]

    channels_last = rearrange(img_t, "b c h w -> b h w c")
    # -1..1 => 0..1, saturating anything outside the range
    unit_range = torch.clamp((channels_last + 1.0) / 2.0, min=0.0, max=1.0)
    pixels = (255.0 * unit_range).cpu().numpy().astype(np.uint8)[0]
    if colorspace == "L":
        # Pillow wants a plain (H, W) array for grayscale
        pixels = pixels[:, :, 0]
    return Image.fromarray(pixels, colorspace)


def model_latent_to_pillow_img(latent: torch.Tensor) -> PIL.Image.Image:
    """Decode one latent with the current diffusion model into a Pillow image.

    Args:
        latent: A single latent; a 3-dimensional tensor gets a leading batch
            axis added before decoding.

    Returns:
        The decoded image, produced by ``torch_img_to_pillow_img``.

    Raises:
        ValueError: If the (possibly added) batch dimension is not 1.
    """
    # function-level import kept from the original
    # NOTE(review): presumably avoids an import cycle or a heavy import at
    # module load time -- confirm
    from imaginairy.utils.model_manager import get_current_diffusion_model

    batch = latent.unsqueeze(0) if len(latent.shape) == 3 else latent
    if batch.shape[0] != 1:
        raise ValueError("Only batch size 1 supported")

    autoencoder = get_current_diffusion_model().lda
    return torch_img_to_pillow_img(autoencoder.decode(batch))


def model_latents_to_pillow_imgs(latents: torch.Tensor) -> Sequence[PIL.Image.Image]:
    """Decode each latent along the first axis into its own Pillow image.

    Returns:
        A list with one image per entry of ``latents``, in order.
    """
    return list(map(model_latent_to_pillow_img, latents))


def pillow_img_to_model_latent(
    model, img: PIL.Image.Image | LazyLoadingImage, batch_size=1, half=True
):
    """Encode an image into the model's first-stage latent.

    The image is converted to a tensor, moved to the device reported by
    ``get_device()``, repeated ``batch_size`` times along the batch axis and
    run through ``model.encode_first_stage`` followed by
    ``model.get_first_stage_encoding``.

    Args:
        model: Object providing ``encode_first_stage`` and
            ``get_first_stage_encoding``.
        img: Image to encode.
        batch_size: Number of copies of the image in the encoded batch.
        half: When true, the batch is cast with ``.half()`` before encoding.

    Returns:
        Whatever ``model.get_first_stage_encoding`` returns for the batch.
    """
    pixels = pillow_img_to_torch_image(img).to(get_device())
    batch = repeat(pixels, "1 ... -> b ...", b=batch_size)
    if half:
        batch = batch.half()
    return model.get_first_stage_encoding(model.encode_first_stage(batch))


def imgpaths_to_imgs(imgpaths):
    """Build a list where every string entry is wrapped in a LazyLoadingImage.

    Args:
        imgpaths: Iterable of entries. ``str`` entries are used as the
            ``filepath`` of a new ``LazyLoadingImage``; everything else is
            passed through untouched.

    Returns:
        A new list with the same length and order as ``imgpaths``.
    """
    return [
        LazyLoadingImage(filepath=entry) if isinstance(entry, str) else entry
        for entry in imgpaths
    ]


def add_caption_to_image(
    img: PIL.Image.Image | LazyLoadingImage,
    caption,
    font_size=16,
    font_path=f"{PKG_ROOT}/data/DejaVuSans.ttf",
):
    """Draw ``caption`` near the bottom-left corner of the image, in place.

    The text is white with a 3px black outline, 15px from the left edge, with
    its top placed ``15 + font_size`` pixels above the bottom edge.

    Args:
        img: Image to draw on. For a ``LazyLoadingImage`` the drawing happens
            on whatever ``as_pillow()`` returns.
        caption: Text to render.
        font_size: Font size handed to ``ImageFont.truetype``.
        font_path: Path of the TrueType font file to load.

    Returns:
        None; the Pillow image is modified directly.
    """
    if isinstance(img, LazyLoadingImage):
        img_pil = img.as_pillow()
    else:
        img_pil = img
    canvas = ImageDraw.Draw(img_pil)

    caption_font = ImageFont.truetype(font_path, font_size)

    margin = 15
    anchor = (margin, img_pil.height - margin - font_size)

    canvas.text(
        anchor,
        caption,
        font=caption_font,
        fill=(255, 255, 255),
        stroke_width=3,
        stroke_fill=(0, 0, 0),
    )
feature: controlnet 2023-02-12 07:42:19 +00:00			`"""`
			`image utils.`

			`Library format cheat sheet:`

			`Library Dim Order Channel Order Value Range Type`
			`Pillow R, G, B, A 0-255 PIL.Image.Image`
			`OpenCV B, G, R, A 0-255 np.ndarray`
			`Torch (B), C, H, W R, G, B -1.0-1.0 torch.Tensor`

			`"""`
feature: boolean logic masks Specify advanced text based masks using boolean logic and strength modifiers. Mask descriptions must be lowercase. Keywords uppercase. Valid symbols: `AND`, `OR`, `NOT`, `()`, and mask strength modifier `{1.5}` where `+` can be any of `+ - /`. Single-character boolean operators also work. When writing strength modifies know that pixel values are between 0 and 1. - feature: apply mask edits to original files - feature: auto-rotate images if exif data specifies to do so - fix: accept mask images in command line 2022-09-24 05:58:48 +00:00			`from typing import Sequence`

			`import numpy as np`
			`import PIL`
			`import torch`
			`from einops import rearrange, repeat`
feature: add compilation animations (#224) - add generation/compare gifs 2023-01-29 01:16:47 +00:00			`from PIL import Image, ImageDraw, ImageFont`
feature: boolean logic masks Specify advanced text based masks using boolean logic and strength modifiers. Mask descriptions must be lowercase. Keywords uppercase. Valid symbols: `AND`, `OR`, `NOT`, `()`, and mask strength modifier `{1.5}` where `+` can be any of `+ - /`. Single-character boolean operators also work. When writing strength modifies know that pixel values are between 0 and 1. - feature: apply mask edits to original files - feature: auto-rotate images if exif data specifies to do so - fix: accept mask images in command line 2022-09-24 05:58:48 +00:00
feature: add compilation animations (#224) - add generation/compare gifs 2023-01-29 01:16:47 +00:00			`from imaginairy.schema import LazyLoadingImage`
feature: boolean logic masks Specify advanced text based masks using boolean logic and strength modifiers. Mask descriptions must be lowercase. Keywords uppercase. Valid symbols: `AND`, `OR`, `NOT`, `()`, and mask strength modifier `{1.5}` where `+` can be any of `+ - /`. Single-character boolean operators also work. When writing strength modifies know that pixel values are between 0 and 1. - feature: apply mask edits to original files - feature: auto-rotate images if exif data specifies to do so - fix: accept mask images in command line 2022-09-24 05:58:48 +00:00			`from imaginairy.utils import get_device`
refactor: move a bunch of stuff to utils 2023-12-15 21:40:10 +00:00			`from imaginairy.utils.paths import PKG_ROOT`
feature: boolean logic masks Specify advanced text based masks using boolean logic and strength modifiers. Mask descriptions must be lowercase. Keywords uppercase. Valid symbols: `AND`, `OR`, `NOT`, `()`, and mask strength modifier `{1.5}` where `+` can be any of `+ - /`. Single-character boolean operators also work. When writing strength modifies know that pixel values are between 0 and 1. - feature: apply mask edits to original files - feature: auto-rotate images if exif data specifies to do so - fix: accept mask images in command line 2022-09-24 05:58:48 +00:00

feature: finetuning - feature: finetuning your own image models - feature: image prep command. crops to face or other interesting parts of photo - fix: back-compat for hf_hub_download - feature: add prune-ckpt command - feature: allow specification of model config file 2023-01-01 22:54:49 +00:00			`def pillow_fit_image_within(`
style: fix all the mypy typing issues ...or ignore them. 2023-12-12 06:29:36 +00:00			`image: PIL.Image.Image \| LazyLoadingImage,`
			`max_height=512,`
			`max_width=512,`
			`convert="RGB",`
			`snap_size=8,`
			`) -> PIL.Image.Image:`
feature: finetuning - feature: finetuning your own image models - feature: image prep command. crops to face or other interesting parts of photo - fix: back-compat for hf_hub_download - feature: add prune-ckpt command - feature: allow specification of model config file 2023-01-01 22:54:49 +00:00			`image = image.convert(convert)`
feature: boolean logic masks Specify advanced text based masks using boolean logic and strength modifiers. Mask descriptions must be lowercase. Keywords uppercase. Valid symbols: `AND`, `OR`, `NOT`, `()`, and mask strength modifier `{1.5}` where `+` can be any of `+ - /`. Single-character boolean operators also work. When writing strength modifies know that pixel values are between 0 and 1. - feature: apply mask edits to original files - feature: auto-rotate images if exif data specifies to do so - fix: accept mask images in command line 2022-09-24 05:58:48 +00:00			`w, h = image.size`
fix: handle small input images If input images didn't need resizing because they were already smaller than max width/height then they didn't get normalized to a multiple of 64. This caused an exception like the following: ```Sizes of tensors must match except in dimension 1. Expected size 4 but got size 3 for tensor number 1 in the list. ``` 2022-10-06 06:13:48 +00:00			`resize_ratio = 1`
fix: masking now works properly at strengths 0 and 1 2022-09-24 21:41:25 +00:00			`if w > max_width or h > max_height:`
			`resize_ratio = min(max_width / w, max_height / h)`
fix: handle small input images If input images didn't need resizing because they were already smaller than max width/height then they didn't get normalized to a multiple of 64. This caused an exception like the following: ```Sizes of tensors must match except in dimension 1. Expected size 4 but got size 3 for tensor number 1 in the list. ``` 2022-10-06 06:13:48 +00:00			`elif w < max_width and h < max_height:`
			`# it's smaller than our target image, enlarge`
			`resize_ratio = max(max_width / w, max_height / h)`

			`if resize_ratio != 1:`
fix: masking now works properly at strengths 0 and 1 2022-09-24 21:41:25 +00:00			`w, h = int(w * resize_ratio), int(h * resize_ratio)`
feature: image sizes can now be multiples of 8 instead of 64 from https://github.com/CompVis/stable-diffusion/issues/60#issuecomment-1240294667 2023-01-24 06:25:56 +00:00			`# resize to integer multiple of snap_size`
			`w -= w % snap_size`
			`h -= h % snap_size`
lint: new ruff linter 2023-01-02 04:14:22 +00:00
fix: handle small input images If input images didn't need resizing because they were already smaller than max width/height then they didn't get normalized to a multiple of 64. This caused an exception like the following: ```Sizes of tensors must match except in dimension 1. Expected size 4 but got size 3 for tensor number 1 in the list. ``` 2022-10-06 06:13:48 +00:00			`if (w, h) != image.size:`
fix: masking now works properly at strengths 0 and 1 2022-09-24 21:41:25 +00:00			`image = image.resize((w, h), resample=Image.Resampling.LANCZOS)`
style: fix lint issues 2022-09-24 07:29:45 +00:00			`return image`
feature: boolean logic masks Specify advanced text based masks using boolean logic and strength modifiers. Mask descriptions must be lowercase. Keywords uppercase. Valid symbols: `AND`, `OR`, `NOT`, `()`, and mask strength modifier `{1.5}` where `+` can be any of `+ - /`. Single-character boolean operators also work. When writing strength modifies know that pixel values are between 0 and 1. - feature: apply mask edits to original files - feature: auto-rotate images if exif data specifies to do so - fix: accept mask images in command line 2022-09-24 05:58:48 +00:00

style: fix all the mypy typing issues ...or ignore them. 2023-12-12 06:29:36 +00:00			`def pillow_img_to_torch_image(`
			`img: PIL.Image.Image \| LazyLoadingImage, convert="RGB"`
			`) -> torch.Tensor:`
refactor: cleanup image generation code 2023-02-15 16:02:36 +00:00			`if convert:`
			`img = img.convert(convert)`
ci: add type checker fix some typehint issues 2023-12-11 02:29:47 +00:00			`img_np = np.array(img).astype(np.float32) / 255.0`
feature: controlnet 2023-02-12 07:42:19 +00:00			`# b, h, w, c => b, c, h, w`
ci: add type checker fix some typehint issues 2023-12-11 02:29:47 +00:00			`img_np = img_np[None].transpose(0, 3, 1, 2)`
			`img_t = torch.from_numpy(img_np)`
			`return 2.0 * img_t - 1.0`
feature: boolean logic masks Specify advanced text based masks using boolean logic and strength modifiers. Mask descriptions must be lowercase. Keywords uppercase. Valid symbols: `AND`, `OR`, `NOT`, `()`, and mask strength modifier `{1.5}` where `+` can be any of `+ - /`. Single-character boolean operators also work. When writing strength modifies know that pixel values are between 0 and 1. - feature: apply mask edits to original files - feature: auto-rotate images if exif data specifies to do so - fix: accept mask images in command line 2022-09-24 05:58:48 +00:00

style: fix all the mypy typing issues ...or ignore them. 2023-12-12 06:29:36 +00:00			`def pillow_mask_to_latent_mask(`
			`mask_img: PIL.Image.Image \| LazyLoadingImage, downsampling_factor`
			`) -> torch.Tensor:`
refactor: cleanup image generation code 2023-02-15 16:02:36 +00:00			`mask_img = mask_img.resize(`
			`(`
			`mask_img.width // downsampling_factor,`
			`mask_img.height // downsampling_factor,`
			`),`
			`resample=Image.Resampling.LANCZOS,`
			`)`

			`mask = np.array(mask_img).astype(np.float32) / 255.0`
			`mask = mask[None, None]`
style: fix all the mypy typing issues ...or ignore them. 2023-12-12 06:29:36 +00:00			`mask_t = torch.from_numpy(mask)`
			`return mask_t`
feature: sliced latent decoding allows generation of bigger images. tile seams can be noticeable occasionally despite the feathering 2023-02-12 08:52:50 +00:00

style: fix all the mypy typing issues ...or ignore them. 2023-12-12 06:29:36 +00:00			`def pillow_img_to_opencv_img(img: PIL.Image.Image \| LazyLoadingImage):`
feature: boolean logic masks Specify advanced text based masks using boolean logic and strength modifiers. Mask descriptions must be lowercase. Keywords uppercase. Valid symbols: `AND`, `OR`, `NOT`, `()`, and mask strength modifier `{1.5}` where `+` can be any of `+ - /`. Single-character boolean operators also work. When writing strength modifies know that pixel values are between 0 and 1. - feature: apply mask edits to original files - feature: auto-rotate images if exif data specifies to do so - fix: accept mask images in command line 2022-09-24 05:58:48 +00:00			`open_cv_image = np.array(img)`
			`# Convert RGB to BGR`
			`open_cv_image = open_cv_image[:, :, ::-1].copy()`
			`return open_cv_image`


ci: add type checker fix some typehint issues 2023-12-11 02:29:47 +00:00			`def torch_image_to_openvcv_img(img: torch.Tensor) -> np.ndarray:`
feature: controlnet 2023-02-12 07:42:19 +00:00			`img = (img + 1) / 2`
ci: add type checker fix some typehint issues 2023-12-11 02:29:47 +00:00			`img_np = img.detach().cpu().numpy()`
feature: controlnet 2023-02-12 07:42:19 +00:00			`# assert there is only one image`
ci: add type checker fix some typehint issues 2023-12-11 02:29:47 +00:00			`assert img_np.shape[0] == 1`
			`img_np = img_np[0]`
			`img_np = img_np.transpose(1, 2, 0)`
			`img_np = (img_np * 255).astype(np.uint8)`
feature: controlnet 2023-02-12 07:42:19 +00:00			`# RGB to BGR`
ci: add type checker fix some typehint issues 2023-12-11 02:29:47 +00:00			`img_np = img_np[:, :, ::-1]`
			`return img_np`
feature: controlnet 2023-02-12 07:42:19 +00:00

style: fix all the mypy typing issues ...or ignore them. 2023-12-12 06:29:36 +00:00			`def torch_img_to_pillow_img(img_t: torch.Tensor) -> PIL.Image.Image:`
fix: images came out wrong if processed on mps 2023-02-15 20:43:19 +00:00			`img_t = img_t.to(torch.float32).detach().cpu()`
refactor: cleanup image generation code 2023-02-15 16:02:36 +00:00			`if len(img_t.shape) == 3:`
			`img_t = img_t.unsqueeze(0)`
			`if img_t.shape[0] != 1:`
			`raise ValueError("Only batch size 1 supported")`
			`if img_t.shape[1] == 1:`
			`colorspace = "L"`
			`elif img_t.shape[1] == 3:`
			`colorspace = "RGB"`
			`else:`
style: speed up linting and autoformatting. fix lints 2023-09-29 08:13:50 +00:00			`msg = (`
feature: generate large images Added a composition stage so large images are more coherent 2023-02-12 02:23:45 +00:00			`f"Unsupported colorspace. {img_t.shape[1]} channels in {img_t.shape} shape"`
			`)`
style: speed up linting and autoformatting. fix lints 2023-09-29 08:13:50 +00:00			`raise ValueError(msg)`
refactor: cleanup image generation code 2023-02-15 16:02:36 +00:00			`img_t = rearrange(img_t, "b c h w -> b h w c")`
			`img_t = torch.clamp((img_t + 1.0) / 2.0, min=0.0, max=1.0)`
			`img_np = (255.0 * img_t).cpu().numpy().astype(np.uint8)[0]`
			`if colorspace == "L":`
			`img_np = img_np[:, :, 0]`
			`return Image.fromarray(img_np, colorspace)`


			`def model_latent_to_pillow_img(latent: torch.Tensor) -> PIL.Image.Image:`
refactor: move model_manager to utils 2023-12-15 21:42:45 +00:00			`from imaginairy.utils.model_manager import get_current_diffusion_model`
feature: boolean logic masks Specify advanced text based masks using boolean logic and strength modifiers. Mask descriptions must be lowercase. Keywords uppercase. Valid symbols: `AND`, `OR`, `NOT`, `()`, and mask strength modifier `{1.5}` where `+` can be any of `+ - /`. Single-character boolean operators also work. When writing strength modifies know that pixel values are between 0 and 1. - feature: apply mask edits to original files - feature: auto-rotate images if exif data specifies to do so - fix: accept mask images in command line 2022-09-24 05:58:48 +00:00
refactor: cleanup image generation code 2023-02-15 16:02:36 +00:00			`if len(latent.shape) == 3:`
			`latent = latent.unsqueeze(0)`
			`if latent.shape[0] != 1:`
			`raise ValueError("Only batch size 1 supported")`
feature: inpainting model support; improved model manager 2022-10-23 21:46:45 +00:00			`model = get_current_diffusion_model()`
feature: use refiners library for generation BREAKING CHANGE - stable diffusion 1.5 + inpainting working - self-attention guidance working. improves image generation quality - tile-mode working - inpainting self-attention guidance working disable/broken features: - sd 1.4, 2.0, 2.1 - most of the samplers - pix2pix edit - most of the controlnets - memory management - python 3.8 support wip 2023-11-16 03:46:56 +00:00			`img_t = model.lda.decode(latent)`
refactor: cleanup image generation code 2023-02-15 16:02:36 +00:00			`return torch_img_to_pillow_img(img_t)`


			`def model_latents_to_pillow_imgs(latents: torch.Tensor) -> Sequence[PIL.Image.Image]:`
			`return [model_latent_to_pillow_img(latent) for latent in latents]`
feature: boolean logic masks Specify advanced text based masks using boolean logic and strength modifiers. Mask descriptions must be lowercase. Keywords uppercase. Valid symbols: `AND`, `OR`, `NOT`, `()`, and mask strength modifier `{1.5}` where `+` can be any of `+ - /`. Single-character boolean operators also work. When writing strength modifies know that pixel values are between 0 and 1. - feature: apply mask edits to original files - feature: auto-rotate images if exif data specifies to do so - fix: accept mask images in command line 2022-09-24 05:58:48 +00:00

style: fix all the mypy typing issues ...or ignore them. 2023-12-12 06:29:36 +00:00			`def pillow_img_to_model_latent(`
			`model, img: PIL.Image.Image \| LazyLoadingImage, batch_size=1, half=True`
			`):`
feature: boolean logic masks Specify advanced text based masks using boolean logic and strength modifiers. Mask descriptions must be lowercase. Keywords uppercase. Valid symbols: `AND`, `OR`, `NOT`, `()`, and mask strength modifier `{1.5}` where `+` can be any of `+ - /`. Single-character boolean operators also work. When writing strength modifies know that pixel values are between 0 and 1. - feature: apply mask edits to original files - feature: auto-rotate images if exif data specifies to do so - fix: accept mask images in command line 2022-09-24 05:58:48 +00:00			`init_image = pillow_img_to_torch_image(img).to(get_device())`
			`init_image = repeat(init_image, "1 ... -> b ...", b=batch_size)`
			`if half:`
			`return model.get_first_stage_encoding(`
			`model.encode_first_stage(init_image.half())`
			`)`
			`return model.get_first_stage_encoding(model.encode_first_stage(init_image))`
feature: image edit gifs and demo-reels 2023-01-22 01:36:47 +00:00

feature: add compilation animations (#224) - add generation/compare gifs 2023-01-29 01:16:47 +00:00			`def imgpaths_to_imgs(imgpaths):`
			`imgs = []`
			`for imgpath in imgpaths:`
			`if isinstance(imgpath, str):`
			`img = LazyLoadingImage(filepath=imgpath)`
			`imgs.append(img)`
			`else:`
			`imgs.append(imgpath)`

			`return imgs`


			`def add_caption_to_image(`
style: fix all the mypy typing issues ...or ignore them. 2023-12-12 06:29:36 +00:00			`img: PIL.Image.Image \| LazyLoadingImage,`
			`caption,`
			`font_size=16,`
			`font_path=f"{PKG_ROOT}/data/DejaVuSans.ttf",`
feature: add compilation animations (#224) - add generation/compare gifs 2023-01-29 01:16:47 +00:00			`):`
style: fix all the mypy typing issues ...or ignore them. 2023-12-12 06:29:36 +00:00			`img_pil = img.as_pillow() if isinstance(img, LazyLoadingImage) else img`
			`draw = ImageDraw.Draw(img_pil)`
feature: add compilation animations (#224) - add generation/compare gifs 2023-01-29 01:16:47 +00:00
			`font = ImageFont.truetype(font_path, font_size)`

			`x = 15`
style: fix all the mypy typing issues ...or ignore them. 2023-12-12 06:29:36 +00:00			`y = img_pil.height - 15 - font_size`
feature: image edit gifs and demo-reels 2023-01-22 01:36:47 +00:00
feature: add compilation animations (#224) - add generation/compare gifs 2023-01-29 01:16:47 +00:00			`draw.text(`
			`(x, y),`
			`caption,`
			`font=font,`
			`fill=(255, 255, 255),`
			`stroke_width=3,`
			`stroke_fill=(0, 0, 0),`
feature: image edit gifs and demo-reels 2023-01-22 01:36:47 +00:00			`)`