feature: generate large images

Added a composition stage so large images are more coherent
1 year ago · 2aef6089e0
parent b93b6a4d7c
commit 2aef6089e0
8 changed files with 653 additions and 41 deletions
--- a/README.md
+++ b/README.md
@ -53,15 +53,17 @@ Use prompt strength to control how strong the edit is. For extra control you can
 with prompt-based masking.

 ```bash
->> aimg edit scenic_landscape.jpg "make it winter" --prompt-strength 20
->> aimg edit scenic_landscape.jpg "make it winter" --steps 30 --arg-schedule "prompt_strength[2:25:0.5]" --compilation-anim
->> aimg edit dog.jpg "make the dog red" --prompt-strength 5
->> aimg edit bowl_of_fruit.jpg "replace the fruit with strawberries"
->> aimg edit freckled_woman.jpg "make her a cyborg" --prompt-strength 13
+# enter imaginairy shell
+>> aimg
+🤖🧠> edit scenic_landscape.jpg -p "make it winter" --prompt-strength 20
+🤖🧠> edit scenic_landscape.jpg -p "make it winter" --steps 30 --arg-schedule "prompt_strength[2:25:0.5]" --compilation-anim
+🤖🧠> edit dog.jpg -p "make the dog red" --prompt-strength 5
+🤖🧠> edit bowl_of_fruit.jpg -p "replace the fruit with strawberries"
+🤖🧠> edit freckled_woman.jpg -p "make her a cyborg" --prompt-strength 13
 # create a comparison gif
->> aimg edit pearl_girl.jpg "make her wear clown makeup" --compare-gif
+🤖🧠> edit pearl_girl.jpg -p "make her wear clown makeup" --compare-gif
 # create an animation showing the edit with increasing prompt strengths
->> aimg edit mona-lisa.jpg "make it a color professional photo headshot" --negative-prompt "old, ugly, blurry" --arg-schedule "prompt-strength[2:8:0.5]" --compilation-anim gif
+🤖🧠> edit mona-lisa.jpg -p "make it a color professional photo headshot" --negative-prompt "old, ugly, blurry" --arg-schedule "prompt-strength[2:8:0.5]" --compilation-anim gif
 ```


@ -570,6 +572,7 @@ would be uncorrelated to the rest of the surrounding image.  It created terrible
   - ✅ add k-diffusion sampling methods
   - ✅ tiling
   - ✅ generation videos/gifs
+   - [Attend and Excite](https://attendandexcite.github.io/Attend-and-Excite/)
   - Compositional Visual Generation
     - https://github.com/energy-based-model/Compositional-Visual-Generation-with-Composable-Diffusion-Models-PyTorch
     - https://colab.research.google.com/github/energy-based-model/Compositional-Visual-Generation-with-Composable-Diffusion-Models-PyTorch/blob/main/notebooks/demo.ipynb#scrollTo=wt_j3uXZGFAS
@ -600,6 +603,10 @@ would be uncorrelated to the rest of the surrounding image.  It created terrible
   - ✅ text based image masking
     - ✅ ClipSeg - https://github.com/timojl/clipseg
     - https://github.com/facebookresearch/detectron2
+     - https://x-decoder-vl.github.io/
+   - Maskless editing
+     - ✅ instruct-pix2pix
+     - 
   - Attention Control Methods
     - https://github.com/bloc97/CrossAttentionControl
     - https://github.com/ChenWu98/cycle-diffusion
@ -609,7 +616,10 @@ would be uncorrelated to the rest of the surrounding image.  It created terrible
     - ✅ realesrgan 
     - ldm
     - https://github.com/lowfuel/progrock-stable
-     - gobig
+     - [txt2imghd](https://github.com/jquesnelle/txt2imghd/blob/master/txt2imghd.py)
+     - latent scaling + reprocessing
+     - stability upscaler
+     - rivers have wings upscaler
     - stable super-res?
       - todo: try with 1-0-0-0 mask at full image resolution (rencoding entire image+predicted image at every step)
       - todo: use a gaussian pyramid and only include the "high-detail" level of the pyramid into the next step
@ -684,6 +694,7 @@ would be uncorrelated to the rest of the surrounding image.  It created terrible
 - https://stablecog.com/ 

 ## Further Reading
+ - https://github.com/vdumoulin/conv_arithmetic/blob/master/README.md
 - [Prompt Engineering Handbook](https://openart.ai/promptbook)
 - Differences between samplers
   - https://www.reddit.com/r/StableDiffusion/comments/xbeyw3/can_anyone_offer_a_little_guidance_on_the/
--- a/imaginairy/api.py
+++ b/imaginairy/api.py
@ -1,4 +1,5 @@
 import logging
+import math
 import os
 import re

@ -194,6 +195,8 @@ def _generate_single_image(
    progress_img_interval_min_s=0.1,
    half_mode=None,
    add_caption=False,
+    suppress_inpaint=False,
+    return_latent=False,
 ):
    import torch.nn
    from PIL import Image, ImageOps
@ -241,7 +244,8 @@ def _generate_single_image(
        weights_location=prompt.model,
        config_path=prompt.model_config_path,
        half_mode=half_mode,
-        for_inpainting=prompt.mask_image or prompt.mask_prompt or prompt.outpaint,
+        for_inpainting=(prompt.mask_image or prompt.mask_prompt or prompt.outpaint)
+        and not suppress_inpaint,
    )
    progress_latents = []

@ -288,6 +292,9 @@ def _generate_single_image(
        c_cat = []
        c_cat_neutral = None
        result_images = {}
+        seed_everything(prompt.seed)
+        noise = randn_seeded(seed=prompt.seed, size=shape).to(get_device())
+
        if prompt.init_image:
            starting_image = prompt.init_image
            generation_strength = 1 - prompt.init_image_strength
@ -341,10 +348,11 @@ def _generate_single_image(
            shape = init_latent.shape

            log_latent(init_latent, "init_latent")
-
            seed_everything(prompt.seed)
-            noise = randn_seeded(seed=prompt.seed, size=init_latent.size())
-            noise = noise.to(get_device())
+            noise = randn_seeded(seed=prompt.seed, size=init_latent.shape).to(
+                get_device()
+            )
+            # noise = noise[:, :, : init_latent.shape[2], : init_latent.shape[3]]

            schedule = NoiseSchedule(
                model_num_timesteps=model.num_timesteps,
@ -417,6 +425,40 @@ def _generate_single_image(
        }
        log_latent(init_latent_noised, "init_latent_noised")

+        comp_samples = _generate_composition_latent(
+            sampler=sampler,
+            sampler_kwargs={
+                "num_steps": prompt.steps,
+                "initial_latent": init_latent_noised,
+                "positive_conditioning": positive_conditioning,
+                "neutral_conditioning": neutral_conditioning,
+                "guidance_scale": prompt.prompt_strength,
+                "t_start": t_enc,
+                "mask": mask_latent,
+                "orig_latent": init_latent,
+                "shape": shape,
+                "batch_size": 1,
+                "denoiser_cls": denoiser_cls,
+            },
+        )
+        if comp_samples is not None:
+            noise = noise[:, :, : comp_samples.shape[2], : comp_samples.shape[3]]
+
+            schedule = NoiseSchedule(
+                model_num_timesteps=model.num_timesteps,
+                ddim_num_steps=prompt.steps,
+                model_alphas_cumprod=model.alphas_cumprod,
+                ddim_discretize="uniform",
+            )
+            t_enc = int(prompt.steps * 0.8)
+            init_latent_noised = noise_an_image(
+                comp_samples,
+                torch.tensor([t_enc - 1]).to(get_device()),
+                schedule=schedule,
+                noise=noise,
+            )
+
+        log_latent(comp_samples, "comp_samples")
        with lc.timing("sampling"):
            samples = sampler.sample(
                num_steps=prompt.steps,
@ -431,6 +473,8 @@ def _generate_single_image(
                batch_size=1,
                denoiser_cls=denoiser_cls,
            )
+        if return_latent:
+            return samples

        with lc.timing("decoding"):
            gen_imgs_t = model.decode_first_stage(samples)
@ -441,6 +485,11 @@ def _generate_single_image(
            log_img(mask_final, "reconstituting mask")
            mask_final = ImageOps.invert(mask_final)
            gen_img = Image.composite(gen_img, init_image, mask_final)
+            gen_img = combine_image(
+                original_img=init_image,
+                generated_img=gen_img,
+                mask_img=mask_image_orig,
+            )
            log_img(gen_img, "reconstituted image")

        upscaled_img = None
@ -504,6 +553,80 @@ def _prompts_to_embeddings(prompts, model):
    return conditioning


+def calc_scale_to_fit_within(
+    height,
+    width,
+    max_size,
+):
+    """Return the scale factor that brings the longer side down to `max_size`.
+
+    Returns the int 1 when both `height` and `width` are strictly smaller than
+    `max_size`; otherwise returns `max_size / longer_side` as a float, which is
+    at most 1.0.
+    """
+    # nothing to shrink: both sides are already below the limit
+    if max(height, width) < max_size:
+        return 1
+
+    # scale by whichever side is longer (height is used when the sides are equal)
+    if width > height:
+        return max_size / width
+
+    return max_size / height
+
+
+def _generate_composition_latent(
+    sampler,
+    sampler_kwargs,
+):
+    """Sample at a reduced latent size and stretch the result back to full size.
+
+    `sampler_kwargs` is deep-copied, its spatial inputs ("initial_latent",
+    the "c_concat" entries of both conditionings, "mask", "orig_latent") are
+    resized to the reduced shape, and `sampler.sample` is called with 15 steps.
+    The sampled latent is then bilinearly interpolated to the height/width of
+    the original `sampler_kwargs["shape"]`.
+
+    Returns None when the requested shape already fits within the size limit
+    (i.e. no shrinking is needed), otherwise the resized samples.
+    """
+    from copy import deepcopy
+
+    from torch.nn import functional as F
+
+    # deep copy so the replacements below do not touch the caller's kwargs
+    new_kwargs = deepcopy(sampler_kwargs)
+    b, c, h, w = orig_shape = new_kwargs["shape"]
+    max_compose_gen_size = 768
+    # limit is ceil(768 / 8) = 96 in the units of `shape`
+    # presumably 8 is the pixel-to-latent downscale factor — TODO confirm
+    shrink_scale = calc_scale_to_fit_within(
+        height=h,
+        width=w,
+        max_size=int(math.ceil(max_compose_gen_size / 8)),
+    )
+    if shrink_scale >= 1:
+        return None
+
+    # shrink everything
+    new_shape = b, c, int(round(h * shrink_scale)), int(round(w * shrink_scale))
+    initial_latent = new_kwargs["initial_latent"]
+    if initial_latent is not None:
+        initial_latent = F.interpolate(initial_latent, size=new_shape[2:], mode="area")
+
+    # NOTE(review): both conditionings are indexed with "c_concat" unconditionally;
+    # verify every caller supplies dicts containing that key
+    for cond in [
+        new_kwargs["positive_conditioning"],
+        new_kwargs["neutral_conditioning"],
+    ]:
+        cond["c_concat"] = [
+            F.interpolate(c, size=new_shape[2:], mode="area") for c in cond["c_concat"]
+        ]
+
+    mask_latent = new_kwargs["mask"]
+    if mask_latent is not None:
+        mask_latent = F.interpolate(mask_latent, size=new_shape[2:], mode="area")
+
+    orig_latent = new_kwargs["orig_latent"]
+    if orig_latent is not None:
+        orig_latent = F.interpolate(orig_latent, size=new_shape[2:], mode="area")
+    t_start = new_kwargs["t_start"]
+    if t_start is not None:
+        # keep the same fraction t_start / num_steps, rescaled to the 15 steps used below
+        gen_strength = new_kwargs["t_start"] / new_kwargs["num_steps"]
+        t_start = int(round(15 * gen_strength))
+    new_kwargs.update(
+        {
+            "num_steps": 15,
+            "initial_latent": initial_latent,
+            "t_start": t_start,
+            "mask": mask_latent,
+            "orig_latent": orig_latent,
+            "shape": new_shape,
+        }
+    )
+    samples = sampler.sample(**new_kwargs)
+    # samples = upscale_latent(samples)
+    # stretch back to the originally requested height/width
+    samples = F.interpolate(samples, size=orig_shape[2:], mode="bilinear")
+    return samples
+
+
 def prompt_normalized(prompt):
    return re.sub(r"[^a-zA-Z0-9.,\[\]-]+", "_", prompt)[:130]

--- a/imaginairy/enhancers/upscale_riverwing.py
+++ b/imaginairy/enhancers/upscale_riverwing.py
@ -0,0 +1,299 @@
+from functools import lru_cache
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+from pytorch_lightning import seed_everything
+from torch import nn
+
+from imaginairy.model_manager import hf_hub_download
+from imaginairy.utils import get_device, platform_appropriate_autocast
+from imaginairy.vendored import k_diffusion as K
+from imaginairy.vendored.k_diffusion import layers
+from imaginairy.vendored.k_diffusion.models.image_v1 import ImageDenoiserModelV1
+from imaginairy.vendored.k_diffusion.utils import append_dims
+
+
+class NoiseLevelAndTextConditionedUpscaler(nn.Module):
+    """Wraps `inner_model`, feeding it a 2x-upsampled low-res input plus conditioning.
+
+    `embed_dim` is the output size passed to the Fourier-feature layer that
+    embeds the low-res noise level.
+    """
+
+    def __init__(self, inner_model, sigma_data=1.0, embed_dim=256):
+        super().__init__()
+        self.inner_model = inner_model
+        self.sigma_data = sigma_data
+        self.low_res_noise_embed = K.layers.FourierFeatures(1, embed_dim, std=2)
+
+    def forward(self, inp, sigma, low_res, low_res_sigma, c, **kwargs):
+        """Call `inner_model` on `inp`/`sigma` with conditioning built from `low_res` and `c`.
+
+        `c` is unpacked as a 3-tuple `(cross_cond, cross_cond_padding, pooler)`.
+        Extra `kwargs` are forwarded to `inner_model` unchanged.
+        """
+        cross_cond, cross_cond_padding, pooler = c
+        # input scaling: 1 / sqrt(low_res_sigma^2 + sigma_data^2)
+        c_in = 1 / (low_res_sigma**2 + self.sigma_data**2) ** 0.5
+        # log(1 + sigma) with a trailing axis added before the Fourier embedding
+        c_noise = low_res_sigma.log1p()[:, None]
+        c_in = append_dims(c_in, low_res.ndim)
+        low_res_noise_embed = self.low_res_noise_embed(c_noise)
+        # nearest-neighbour 2x upsample of the low-res input, then scale by c_in
+        low_res_in = F.interpolate(low_res, scale_factor=2, mode="nearest") * c_in
+        # noise-level embedding and pooled text output are concatenated along dim 1
+        mapping_cond = torch.cat([low_res_noise_embed, pooler], dim=1)
+        return self.inner_model(
+            inp,
+            sigma,
+            unet_cond=low_res_in,
+            mapping_cond=mapping_cond,
+            cross_cond=cross_cond,
+            cross_cond_padding=cross_cond_padding,
+            **kwargs,
+        )
+
+
+# Cached with maxsize=1: only the most recent argument combination is kept.
+@lru_cache(maxsize=1)
+def get_upscaler_model(
+    model_path,
+    pooler_dim=768,
+    train=False,
+    # NOTE(review): this default is evaluated once, when the function is defined
+    device=get_device(),
+):
+    """Build the upscaler model, load its weights from `model_path`, and move it to `device`.
+
+    The architecture comes from the hard-coded `config` below. The checkpoint's
+    "model_ema" entry is loaded into the text/noise-conditioned wrapper, which
+    is then wrapped in `layers.DenoiserWithVariance`. Unless `train` is true,
+    the model is put in eval mode with gradients disabled.
+    """
+    config = {
+        "type": "image_v1",
+        "input_channels": 4,
+        "input_size": [48, 48],
+        "patch_size": 1,
+        "mapping_out": 768,
+        "mapping_cond_dim": 896,
+        "unet_cond_dim": 4,
+        "depths": [4, 4, 4, 4],
+        "channels": [384, 384, 768, 768],
+        "self_attn_depths": [False, False, False, True],
+        "cross_attn_depths": [False, True, True, True],
+        "cross_cond_dim": 768,
+        "has_variance": True,
+        "dropout_rate": 0.0,
+        "augment_prob": 0.0,
+        "augment_wrapper": False,
+        "sigma_data": 1.0,
+        "sigma_min": 1e-2,
+        "sigma_max": 20,
+        "sigma_sample_density": {"type": "lognormal", "mean": -0.5, "std": 1.2},
+        "skip_stages": 0,
+    }
+
+    model = ImageDenoiserModelV1(
+        config["input_channels"],
+        config["mapping_out"],
+        config["depths"],
+        config["channels"],
+        config["self_attn_depths"],
+        config["cross_attn_depths"],
+        patch_size=config["patch_size"],
+        dropout_rate=config["dropout_rate"],
+        # "augment_wrapper" is False in the config above, so nothing is added here
+        mapping_cond_dim=config["mapping_cond_dim"]
+        + (9 if config["augment_wrapper"] else 0),
+        unet_cond_dim=config["unet_cond_dim"],
+        cross_cond_dim=config["cross_cond_dim"],
+        skip_stages=config["skip_stages"],
+        has_variance=config["has_variance"],
+    )
+
+    model = NoiseLevelAndTextConditionedUpscaler(
+        model,
+        sigma_data=config["sigma_data"],
+        # noise-embedding width is what remains of mapping_cond_dim after the pooled text vector
+        embed_dim=config["mapping_cond_dim"] - pooler_dim,
+    )
+    # NOTE(review): torch.load may unpickle arbitrary objects — only load trusted checkpoints
+    ckpt = torch.load(model_path, map_location="cpu")
+    model.load_state_dict(ckpt["model_ema"])
+    model = layers.DenoiserWithVariance(model, sigma_data=config["sigma_data"])
+    if not train:
+        model = model.eval().requires_grad_(False)
+    return model.to(device)
+
+
+class CFGUpscaler(nn.Module):
+    """Combines conditioned and unconditioned predictions of `model` using `cond_scale`.
+
+    `uc` is the unconditional conditioning used alongside the `c` passed to
+    `forward`. `device` is stored on the instance but not read by `forward`.
+    """
+
+    def __init__(self, model, uc, cond_scale, device):
+        super().__init__()
+        self.inner_model = model
+        self.uc = uc
+        self.cond_scale = cond_scale
+        self.device = device
+
+    def forward(self, x, sigma, low_res, low_res_sigma, c):
+        """Return `uncond + (cond - uncond) * cond_scale`.
+
+        When `cond_scale` is exactly 0.0 or 1.0 the model is run only once,
+        with `self.uc` or `c` respectively.
+        """
+        if self.cond_scale in (0.0, 1.0):
+            # Shortcut for when we don't need to run both.
+            if self.cond_scale == 0.0:
+                c_in = self.uc
+            elif self.cond_scale == 1.0:
+                c_in = c
+            return self.inner_model(
+                x, sigma, low_res=low_res, low_res_sigma=low_res_sigma, c=c_in
+            )
+
+        # duplicate every input so both branches run in a single batched call
+        x_in = torch.cat([x] * 2)
+        sigma_in = torch.cat([sigma] * 2)
+        low_res_in = torch.cat([low_res] * 2)
+        low_res_sigma_in = torch.cat([low_res_sigma] * 2)
+        # unconditional items come first, matching the chunk(2) order below
+        c_in = [torch.cat([uc_item, c_item]) for uc_item, c_item in zip(self.uc, c)]
+        uncond, cond = self.inner_model(
+            x_in, sigma_in, low_res=low_res_in, low_res_sigma=low_res_sigma_in, c=c_in
+        ).chunk(2)
+        return uncond + (cond - uncond) * self.cond_scale
+
+
+class CLIPTokenizerTransform:
+    """Callable that tokenizes text with a pretrained `CLIPTokenizer`.
+
+    Output is padded/truncated to `max_length` tokens.
+    """
+
+    def __init__(self, version="openai/clip-vit-large-patch14", max_length=77):
+        # imported here rather than at module level
+        from transformers import CLIPTokenizer
+
+        self.tokenizer = CLIPTokenizer.from_pretrained(version)
+        self.max_length = max_length
+
+    def __call__(self, text):
+        """Return `(input_ids, attention_mask)` for `text`.
+
+        The returned mask is inverted (`1 - attention_mask`) relative to the
+        tokenizer's output. For a single string only the first row is returned;
+        for any other input all rows are kept.
+        """
+        # 0 selects the single row for a str; Ellipsis keeps every row otherwise
+        indexer = 0 if isinstance(text, str) else ...
+        tok_out = self.tokenizer(
+            text,
+            truncation=True,
+            max_length=self.max_length,
+            return_length=True,
+            return_overflowing_tokens=False,
+            padding="max_length",
+            return_tensors="pt",
+        )
+        input_ids = tok_out["input_ids"][indexer]
+        attention_mask = 1 - tok_out["attention_mask"][indexer]
+        return input_ids, attention_mask
+
+
+class CLIPEmbedder(nn.Module):
+    """Uses the CLIP transformer encoder for text (from Hugging Face)."""
+
+    def __init__(self, version="openai/clip-vit-large-patch14", device="cuda"):
+        super().__init__()
+        from transformers import CLIPTextModel, logging
+
+        # lowers the transformers library's log verbosity to errors only
+        logging.set_verbosity_error()
+        self.transformer = CLIPTextModel.from_pretrained(version)
+        # inference only: eval mode, gradients disabled, moved to `device`
+        self.transformer = self.transformer.eval().requires_grad_(False).to(device)
+
+    @property
+    def device(self):
+        """Device of the wrapped transformer."""
+        return self.transformer.device
+
+    def forward(self, tok_out):
+        """Encode tokenized text.
+
+        `tok_out` is unpacked as `(input_ids, cross_cond_padding)`. Returns a
+        3-tuple: the last entry of the model's hidden states, the padding mask
+        moved to this module's device, and the model's pooled output.
+        """
+        input_ids, cross_cond_padding = tok_out
+        clip_out = self.transformer(
+            input_ids=input_ids.to(self.device), output_hidden_states=True
+        )
+        return (
+            clip_out.hidden_states[-1],
+            cross_cond_padding.to(self.device),
+            clip_out.pooler_output,
+        )
+
+
+# Cached: the tokenizer and text encoder are built on the first call only.
+@lru_cache()
+def clip_up_models():
+    """Return `(text_encoder_up, tok_up)`: a `CLIPEmbedder` and a `CLIPTokenizerTransform`.
+
+    The text encoder is created on `get_device()`.
+    """
+    with platform_appropriate_autocast():
+        tok_up = CLIPTokenizerTransform()
+        text_encoder_up = CLIPEmbedder(device=get_device())
+    return text_encoder_up, tok_up
+
+
+@torch.no_grad()
+def condition_up(prompts):
+    """Tokenize `prompts` and run them through the cached CLIP text encoder.
+
+    Returns whatever `CLIPEmbedder.forward` returns for the tokenized prompts.
+    """
+    text_encoder_up, tok_up = clip_up_models()
+    return text_encoder_up(tok_up(prompts))
+
+
+@torch.no_grad()
+def upscale_latent(
+    low_res_latent,
+    upscale_prompt="",
+    seed=0,
+    steps=30,
+    guidance_scale=1.0,
+    batch_size=1,
+    num_samples=1,
+    # Amount of noise to add per step (0.0=deterministic). Used in all samplers except `k_euler`.
+    eta=1.0,
+    # NOTE(review): this default is evaluated once, when the function is defined
+    device=get_device(),
+):
+    """Upscale `low_res_latent` to twice its height and width with the k-diffusion upscaler.
+
+    The sampled tensor has shape `[batch_size, C, 2 * H, 2 * W]`, where C, H
+    and W are read from `low_res_latent.shape`. `upscale_prompt` is the text
+    conditioning; an empty prompt is used as the unconditional conditioning.
+    `seed` is passed to `seed_everything` before sampling.
+
+    NOTE(review): the loop at the bottom returns during its first iteration,
+    so `num_samples` does not produce additional samples; with
+    `num_samples <= 0` the loop body never runs and the function returns None.
+    """
+    # Add noise to the latent vectors before upscaling. This theoretically can make the model work better on out-of-distribution inputs, but mostly just seems to make it match the input less, so it's turned off by default.
+    noise_aug_level = 0  # @param {type: 'slider', min: 0.0, max: 0.6, step:0.025}
+    noise_aug_type = "gaussian"  # @param ["gaussian", "fake"]
+
+    # @markdown Sampler settings. `k_dpm_adaptive` uses an adaptive solver with error tolerance `tol_scale`, all other use a fixed number of steps.
+    # The sampler is fixed here, so only the matching branch of do_sample() is ever taken.
+    sampler = "k_dpm_2_ancestral"  # @param ["k_euler", "k_euler_ancestral", "k_dpm_2_ancestral", "k_dpm_fast", "k_dpm_adaptive"]
+
+    tol_scale = 0.25  # @param {type: 'number'}
+
+    seed_everything(seed)
+
+    # uc = condition_up(batch_size * ["blurry, low resolution, 720p, grainy"])
+    uc = condition_up(batch_size * [""])
+    c = condition_up(batch_size * [upscale_prompt])
+
+    [_, C, H, W] = low_res_latent.shape
+
+    # Noise levels from stable diffusion.
+    sigma_min, sigma_max = 0.029167532920837402, 14.614642143249512
+    model_up = get_upscaler_model(
+        model_path=hf_hub_download(
+            "pcuenq/k-upscaler", "laion_text_cond_latent_upscaler_2_1_00470000_slim.pth"
+        ),
+        device=device,
+    )
+    model_wrap = CFGUpscaler(model_up, uc, cond_scale=guidance_scale, device=device)
+    low_res_sigma = torch.full([batch_size], noise_aug_level, device=device)
+    x_shape = [batch_size, C, 2 * H, 2 * W]
+
+    def do_sample(noise, extra_args):
+        """Run the k-diffusion sampler named by `sampler`, starting from `noise * sigma_max`."""
+        # We take log-linear steps in noise-level from sigma_max to sigma_min, using one of the k diffusion samplers.
+        sigmas = (
+            torch.linspace(np.log(sigma_max), np.log(sigma_min), steps + 1)
+            .exp()
+            .to(device)
+        )
+        if sampler == "k_euler":
+            return K.sampling.sample_euler(
+                model_wrap, noise * sigma_max, sigmas, extra_args=extra_args
+            )
+        if sampler == "k_euler_ancestral":
+            return K.sampling.sample_euler_ancestral(
+                model_wrap, noise * sigma_max, sigmas, extra_args=extra_args, eta=eta
+            )
+        if sampler == "k_dpm_2_ancestral":
+            return K.sampling.sample_dpm_2_ancestral(
+                model_wrap, noise * sigma_max, sigmas, extra_args=extra_args, eta=eta
+            )
+        if sampler == "k_dpm_fast":
+            return K.sampling.sample_dpm_fast(
+                model_wrap,
+                noise * sigma_max,
+                sigma_min,
+                sigma_max,
+                steps,
+                extra_args=extra_args,
+                eta=eta,
+            )
+        if sampler == "k_dpm_adaptive":
+            sampler_opts = {
+                "s_noise": 1.0,
+                "rtol": tol_scale * 0.05,
+                "atol": tol_scale / 127.5,
+                "pcoeff": 0.2,
+                "icoeff": 0.4,
+                "dcoeff": 0,
+            }
+            return K.sampling.sample_dpm_adaptive(
+                model_wrap,
+                noise * sigma_max,
+                sigma_min,
+                sigma_max,
+                extra_args=extra_args,
+                eta=eta,
+                **sampler_opts,
+            )
+        raise ValueError(f"Unknown sampler {sampler}")
+
+    for _ in range((num_samples - 1) // batch_size + 1):
+        # noise_aug_level is 0 above, so both branches leave the latent values unchanged
+        if noise_aug_type == "gaussian":
+            latent_noised = low_res_latent + noise_aug_level * torch.randn_like(
+                low_res_latent
+            )
+        elif noise_aug_type == "fake":
+            latent_noised = low_res_latent * (noise_aug_level**2 + 1) ** 0.5
+        extra_args = {"low_res": latent_noised, "low_res_sigma": low_res_sigma, "c": c}
+        noise = torch.randn(x_shape, device=device)
+        up_latents = do_sample(noise, extra_args)
+        # returns on the first pass through the loop
+        return up_latents
--- a/imaginairy/img_utils.py
+++ b/imaginairy/img_utils.py
@ -76,7 +76,9 @@ def torch_img_to_pillow_img(img_t: torch.Tensor):
    elif img_t.shape[1] == 3:
        colorspace = "RGB"
    else:
-        raise ValueError("Unsupported colorspace")
+        raise ValueError(
+            f"Unsupported colorspace. {img_t.shape[1]} channels in {img_t.shape} shape"
+        )
    img_t = rearrange(img_t, "b c h w -> b h w c")
    img_t = torch.clamp((img_t + 1.0) / 2.0, min=0.0, max=1.0)
    img_np = (255.0 * img_t).cpu().numpy().astype(np.uint8)[0]
--- a/imaginairy/log_utils.py
+++ b/imaginairy/log_utils.py
@ -91,14 +91,17 @@ class ImageLoggingContext:
        self.last_progress_img_ts = 0
        self.last_progress_img_step = -1000

+        self._prev_log_context = None
+
    def __enter__(self):
        global _CURRENT_LOGGING_CONTEXT  # noqa
+        self._prev_log_context = _CURRENT_LOGGING_CONTEXT
        _CURRENT_LOGGING_CONTEXT = self
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        global _CURRENT_LOGGING_CONTEXT  # noqa
-        _CURRENT_LOGGING_CONTEXT = None
+        _CURRENT_LOGGING_CONTEXT = self._prev_log_context

    def timing(self, description):
        return TimingContext(self, description)
--- a/imaginairy/modules/autoencoder.py
+++ b/imaginairy/modules/autoencoder.py
@ -1,5 +1,6 @@
 # pylama:ignore=W0613
 import logging
+import math
 from contextlib import contextmanager

 import pytorch_lightning as pl
@ -317,3 +318,62 @@ class IdentityFirstStage(torch.nn.Module):

    def forward(self, x, *args, **kwargs):
        return x
+
+
+def chunk_latent(tensor, chunk_size=64, overlap_size=8):
+    """Split a 4-D tensor into a grid of overlapping spatial chunks.
+
+    The last two axes are tiled in steps of `chunk_size`. Returns
+    `(chunks, num_rows, num_cols)`, where `chunks` is a row-major list of
+    `(chunk, row_start, col_start)` tuples.
+
+    Each chunk starts `overlap_size` before its nominal grid position (clamped
+    at 0) and spans `chunk_size + overlap_size` from that start (clamped at the
+    tensor edge). Chunks after the first therefore overlap only their
+    predecessor, while the first chunk extends `overlap_size` past its nominal
+    end.
+    """
+    # Get the shape of the tensor
+    batch_size, num_channels, height, width = tensor.shape
+
+    # Calculate the number of chunks along each dimension
+    num_rows = int(math.ceil(height / chunk_size))
+    num_cols = int(math.ceil(width / chunk_size))
+
+    # Initialize a list to store the chunks
+    chunks = []
+
+    # Loop over the rows and columns
+    for row in range(num_rows):
+        for col in range(num_cols):
+            # Calculate the start and end indices for the chunk along each dimension
+            row_start = max(row * chunk_size - overlap_size, 0)
+            row_end = min(row_start + chunk_size + overlap_size, height)
+            col_start = max(col * chunk_size - overlap_size, 0)
+            col_end = min(col_start + chunk_size + overlap_size, width)
+
+            # Extract the chunk from the tensor and append it to the list of chunks
+            chunk = tensor[:, :, row_start:row_end, col_start:col_end]
+            chunks.append((chunk, row_start, col_start))
+
+    return chunks, num_rows, num_cols
+
+
+def merge_tensors(tensor_list, num_rows, num_cols):
+    """Assemble a row-major list of 4-D tensors into one tensor laid out on a grid.
+
+    The output width is the summed width of the first `num_cols` tensors and
+    the output height is the summed height of the first tensor of each row.
+    The first tensor must have batch size 1.
+
+    NOTE(review): elements are used as bare tensors (`.size()`), whereas
+    `chunk_latent` returns `(chunk, row_start, col_start)` tuples — confirm the
+    intended caller unpacks them first.
+    NOTE(review): chunks are placed at multiples of the FIRST tensor's height
+    and width, so differently sized or overlapping chunks overwrite each other
+    rather than being blended.
+    NOTE(review): the result is created by `torch.zeros` with no device or
+    dtype argument; the `print` calls look like leftover debugging output.
+    """
+    print(f"num_rows: {num_rows}")
+    print(f"num_cols: {num_cols}")
+    n, channel, h, w = tensor_list[0].size()
+    assert n == 1
+    final_width = 0
+    final_height = 0
+    # total width: sum of the widths across the first row
+    for col_idx in range(num_cols):
+        final_width += tensor_list[col_idx].size()[3]
+
+    # total height: sum of the heights down the first column
+    for row_idx in range(num_rows):
+        final_height += tensor_list[row_idx * num_cols].size()[2]
+
+    final_tensor = torch.zeros([1, channel, final_height, final_width])
+    print(f"final size {final_tensor.size()}")
+    for row_idx in range(num_rows):
+        for col_idx in range(num_cols):
+
+            # row-major position of this chunk in the flat list
+            list_idx = row_idx * num_cols + col_idx
+            chunk = tensor_list[list_idx]
+            print(f"chunk size: {chunk.size()}")
+            _, _, chunk_h, chunk_w = chunk.size()
+            final_tensor[
+                :,
+                :,
+                row_idx * h : row_idx * h + chunk_h,
+                col_idx * w : col_idx * w + chunk_w,
+            ] = chunk
+
+    return final_tensor
--- a/imaginairy/outpaint.py
+++ b/imaginairy/outpaint.py
@ -1,74 +1,188 @@
 import re

+import torch
 from PIL import Image, ImageDraw
+from torch import nn

+from imaginairy.img_utils import torch_img_to_pillow_img

-def prepare_image_for_outpaint(
-    img, mask=None, up=None, down=None, left=None, right=None, _all=0, snap_multiple=8
+
+def outpaint_calculations(
+    img_width,
+    img_height,
+    up=None,
+    down=None,
+    left=None,
+    right=None,
+    _all=0,
+    snap_multiple=8,
 ):
    up = up if up is not None else _all
    down = down if down is not None else _all
    left = left if left is not None else _all
    right = right if right is not None else _all

-    lft_pct = left / (left + right)
-    rgt_pct = right / (left + right)
-    up_pct = up / (up + down)
-    dwn_pct = down / (up + down)
+    lft_pct = left / (left + right) if left + right else 0
+    rgt_pct = right / (left + right) if left + right else 0
+    up_pct = up / (up + down) if up + down else 0
+    dwn_pct = down / (up + down) if up + down else 0

-    new_width = round((img.width + left + right) / snap_multiple) * snap_multiple
-    new_height = round((img.height + up + down) / snap_multiple) * snap_multiple
-    height_addition = max(new_height - img.height, 0)
-    width_addition = max(new_width - img.width, 0)
+    new_width = round((img_width + left + right) / snap_multiple) * snap_multiple
+    new_height = round((img_height + up + down) / snap_multiple) * snap_multiple
+    height_addition = max(new_height - img_height, 0)
+    width_addition = max(new_width - img_width, 0)
    up = int(round(height_addition * up_pct))
    down = int(round(height_addition * dwn_pct))
    left = int(round(width_addition * lft_pct))
    right = int(round(width_addition * rgt_pct))

-    expanded_image = Image.new(
-        "RGB", (img.width + left + right, img.height + up + down), (0, 0, 0)
+    return up, down, left, right, new_width, new_height
+
+
+def prepare_tensor_for_outpaint(
+    img, mask=None, up=None, down=None, left=None, right=None, _all=0, snap_multiple=8
+):
+    up, down, left, right, new_width, new_height = outpaint_calculations(
+        img_width=img.shape[2],
+        img_height=img.shape[1],
+        up=up,
+        down=down,
+        left=left,
+        right=right,
+        _all=_all,
+        snap_multiple=snap_multiple,
    )
-    expanded_image.paste(img, (left, up))
+
+    def resize(img_t, h, w):
+        new_size = (img_t.shape[0], h, w)
+        return nn.functional.interpolate(img_t, size=new_size, mode="nearest")
+
+    def paste(dst, src, y, x):
+        dst[:, y : y + src.shape[1], x : x + src.shape[2]] = src
+
+    expanded_img = torch.zeros(
+        img.shape[0], img.shape[1] + up + down, img.shape[2] + left + right
+    )
+    expanded_img[:, up : up + img.shape[1], left : left + img.shape[2]] = img

    # extend border pixels outward, this helps prevents lines at the boundary because masks getting reduced to
    # 64x64 latent space can cause some inaccuracies

    if up > 0:
+        top_row = img[:, 0, :]
+        paste(expanded_img, resize(top_row, h=up, w=expanded_img.shape[2]), y=0, x=0)
+        paste(expanded_img, resize(top_row, h=up, w=img.shape[2]), y=0, x=left)
+
+    if down > 0:
+        bottom_row = img[:, -1, :]
+        paste(
+            expanded_img,
+            resize(bottom_row, h=down, w=expanded_img.shape[2]),
+            y=expanded_img.shape[1] - down,
+            x=0,
+        )
+        paste(
+            expanded_img,
+            resize(bottom_row, h=down, w=img.shape[2]),
+            y=expanded_img.shape[1] - down,
+            x=left,
+        )
+
+    if left > 0:
+        left_column = img[:, :, 0]
+        paste(
+            expanded_img, resize(left_column, h=expanded_img.shape[1], w=left), y=0, x=0
+        )
+        paste(expanded_img, resize(left_column, h=img.shape[1], w=left), y=up, x=0)
+
+    if right > 0:
+        right_column = img[:, :, -1]
+        paste(
+            expanded_img,
+            resize(right_column, h=expanded_img.shape[1], w=right),
+            y=0,
+            x=expanded_img.shape[2] - right,
+        )
+        paste(
+            expanded_img,
+            resize(right_column, h=img.shape[1], w=right),
+            y=up,
+            x=expanded_img.shape[2] - right,
+        )
+
+    # create a mask for the new boundaries
+    expanded_mask = torch.zeros_like(expanded_img)
+
+    if mask is None:
+        # set to black
+        expanded_mask[:, up : up + img.shape[1], left : left + img.shape[2]] = 1
+    else:
+        expanded_mask[:, up : up + mask.shape[1], left : left + mask.shape[2]] = mask
+
+    return expanded_img, expanded_mask
+
+
+def prepare_image_for_outpaint(
+    img, mask=None, up=None, down=None, left=None, right=None, _all=0, snap_multiple=8
+):
+    up, down, left, right, new_width, new_height = outpaint_calculations(
+        img_width=img.width,
+        img_height=img.height,
+        up=up,
+        down=down,
+        left=left,
+        right=right,
+        _all=_all,
+        snap_multiple=snap_multiple,
+    )
+    ran_img_t = torch.randn((1, 3, new_height, new_width), device="cpu")
+    expanded_image = torch_img_to_pillow_img(ran_img_t)
+    # expanded_image = Image.new(
+    #     "RGB", (img.width + left + right, img.height + up + down), (0, 0, 0)
+    # )
+    expanded_image.paste(img, (left, up))
+
+    # extend border pixels outward; this helps prevent lines at the boundary, because masks that get
+    # reduced to 64x64 latent space can cause some inaccuracies
+    alpha = 20
+    if up > 0:
+        top_row = img.crop((0, 0, img.width, 1))
+        top_row.putalpha(alpha)
        expanded_image.paste(
-            img.crop((0, 0, img.width, 1)).resize((expanded_image.width, up)),
+            top_row.resize((expanded_image.width, up)),
            (0, 0),
        )
        expanded_image.paste(
-            img.crop((0, 0, img.width, 1)).resize((img.width, up)),
+            top_row.resize((img.width, up)),
            (left, 0),
        )
    if down > 0:
+        bottom_row = img.crop((0, img.height - 1, img.width, img.height))
+        bottom_row.putalpha(alpha)
        expanded_image.paste(
-            img.crop((0, img.height - 1, img.width, img.height)).resize(
-                (expanded_image.width, down)
-            ),
+            bottom_row.resize((expanded_image.width, down)),
            (0, expanded_image.height - down),
        )
        expanded_image.paste(
-            img.crop((0, img.height - 1, img.width, img.height)).resize(
-                (img.width, down)
-            ),
+            bottom_row.resize((img.width, down)),
            (left, expanded_image.height - down),
        )
    if left > 0:
+        left_column = img.crop((0, 0, 1, img.height))
+        left_column.putalpha(alpha)
        expanded_image.paste(
-            img.crop((0, 0, 1, img.height)).resize((left, expanded_image.height)),
+            left_column.resize((left, expanded_image.height)),
            (0, 0),
        )
        expanded_image.paste(
-            img.crop((0, 0, 1, img.height)).resize((left, img.height)),
+            left_column.resize((left, img.height)),
            (0, up),
        )
    if right > 0:
+        right_column = img.crop((img.width - 1, 0, img.width, img.height))
+        right_column.putalpha(alpha)
        expanded_image.paste(
-            img.crop((img.width - 1, 0, img.width, img.height)).resize(
-                (right, expanded_image.height)
-            ),
+            right_column.resize((right, expanded_image.height)),
            (expanded_image.width - right, 0),
        )
        expanded_image.paste(
--- a/imaginairy/schema.py
+++ b/imaginairy/schema.py
@ -238,7 +238,7 @@ class ImaginePrompt:
            "negative_prompt": negative_prompts,
            "init_image": str(self.init_image),
            "init_image_strength": self.init_image_strength,
-            "seed": self.seed,
+            # "seed": self.seed,
            "steps": self.steps,
            "height": self.height,
            "width": self.width,