feature: image sizes can now be multiples of 8 instead of 64

from https://github.com/CompVis/stable-diffusion/issues/60#issuecomment-1240294667
pull/205/head
Authored by Bryce 2 years ago, committed by Bryce Drennan
parent 9bef5a151a
commit 248679d8de

@@ -282,6 +282,10 @@ docker run -it --gpus all -v $HOME/.cache/huggingface:/root/.cache/huggingface -
[Example Colab](https://colab.research.google.com/drive/1rOvQNs0Cmn_yU1bKWjCOHzGVDgZkaTtO?usp=sharing)
## ChangeLog
**8.1.0**
- feature: image sizes can now be multiples of 8 instead of 64. Inputs will be silently rounded down.
**8.0.5**
- fix: bypass huggingface cache retrieval bug
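
The changelog entry above means requested dimensions that are not multiples of 8 get snapped down rather than rejected. A minimal sketch of that rounding, using an illustrative helper name rather than anything from the imaginairy API:

```python
# "Silently rounded down" behavior; `snap_down` is an illustrative name,
# not part of the imaginairy API.
def snap_down(value: int, multiple: int = 8) -> int:
    """Round value down to the nearest multiple of `multiple`."""
    return value - (value % multiple)

print(snap_down(517))  # -> 512
print(snap_down(700))  # -> 696 (previously sizes had to be multiples of 64)
```
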
@@ -483,14 +487,14 @@ would be uncorrelated to the rest of the surrounding image. It created terrible
## Todo
- Performance Optimizations
- ✅ https://github.com/huggingface/diffusers/blob/main/docs/source/optimization/fp16.mdx
- ✅ https://github.com/CompVis/stable-diffusion/compare/main...Doggettx:stable-diffusion:autocast-improvements#
- ✅ https://www.reddit.com/r/StableDiffusion/comments/xalaws/test_update_for_less_memory_usage_and_higher/
- ✅ fp16
- ✅ [Doggettx Sliced attention](https://github.com/CompVis/stable-diffusion/compare/main...Doggettx:stable-diffusion:autocast-improvements#)
- ✅ xformers support https://www.photoroom.com/tech/stable-diffusion-100-percent-faster-with-memory-efficient-attention/
- https://github.com/neonsecret/stable-diffusion
- https://github.com/CompVis/stable-diffusion/pull/177
- https://github.com/huggingface/diffusers/pull/532/files
- https://github.com/HazyResearch/flash-attention
- ✅ xformers improvements https://www.photoroom.com/tech/stable-diffusion-100-percent-faster-with-memory-efficient-attention/
- Development Environment
- ✅ add tests
- ✅ set up ci (test/lint/format)

@@ -69,7 +69,7 @@ logger = logging.getLogger(__name__)
default=None,
show_default=True,
type=int,
help="Image height. Should be multiple of 64.",
help="Image height. Should be multiple of 8.",
)
@click.option(
"-w",
@@ -77,7 +77,7 @@ logger = logging.getLogger(__name__)
default=None,
show_default=True,
type=int,
help="Image width. Should be multiple of 64.",
help="Image width. Should be multiple of 8.",
)
@click.option(
"--steps",
@@ -174,7 +174,7 @@ logger = logging.getLogger(__name__)
@click.option(
"--outpaint",
help=(
"Specify in what directions to expand the image. Values will be snapped such that output image size is multiples of 64. Examples\n"
"Specify in what directions to expand the image. Values will be snapped such that output image size is multiples of 8. Examples\n"
" `--outpaint up10,down300,left50,right50`\n"
" `--outpaint u10,d300,l50,r50`\n"
" `--outpaint all200`\n"
@@ -341,7 +341,7 @@ def imagine_cmd(
default=None,
show_default=True,
type=int,
help="Image height. Should be multiple of 64.",
help="Image height. Should be multiple of 8.",
)
@click.option(
"-w",
@@ -349,7 +349,7 @@ def imagine_cmd(
default=None,
show_default=True,
type=int,
help="Image width. Should be multiple of 64.",
help="Image width. Should be multiple of 8.",
)
@click.option(
"--steps",
@@ -446,7 +446,7 @@ def imagine_cmd(
@click.option(
"--outpaint",
help=(
"Specify in what directions to expand the image. Values will be snapped such that output image size is multiples of 64. Examples\n"
"Specify in what directions to expand the image. Values will be snapped such that output image size is multiples of 8. Examples\n"
" `--outpaint up10,down300,left50,right50`\n"
" `--outpaint u10,d300,l50,r50`\n"
" `--outpaint all200`\n"

@@ -10,7 +10,7 @@ from imaginairy.utils import get_device
def pillow_fit_image_within(
image: PIL.Image.Image, max_height=512, max_width=512, convert="RGB"
image: PIL.Image.Image, max_height=512, max_width=512, convert="RGB", snap_size=8
):
image = image.convert(convert)
w, h = image.size
@@ -23,9 +23,9 @@ def pillow_fit_image_within(
if resize_ratio != 1:
w, h = int(w * resize_ratio), int(h * resize_ratio)
# resize to integer multiple of 64
w -= w % 64
h -= h % 64
# resize to integer multiple of snap_size
w -= w % snap_size
h -= h % snap_size
if (w, h) != image.size:
image = image.resize((w, h), resample=Image.Resampling.LANCZOS)
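
For reference, a self-contained version of the fit-then-snap logic in this hunk; the resize-ratio computation is inferred from the surrounding context and may not match the library's exact code:

```python
# Hedged sketch of the resize-and-snap behavior shown above.
from PIL import Image

def fit_and_snap(image: Image.Image, max_height=512, max_width=512, snap_size=8):
    w, h = image.size
    # shrink (never enlarge) so the image fits inside max_width x max_height
    resize_ratio = min(max_width / w, max_height / h, 1)
    if resize_ratio != 1:
        w, h = int(w * resize_ratio), int(h * resize_ratio)
    # snap each side down to a multiple of snap_size (8 now, 64 before this commit)
    w -= w % snap_size
    h -= h % snap_size
    if (w, h) != image.size:
        image = image.resize((w, h), resample=Image.Resampling.LANCZOS)
    return image
```

For example, a 1000x700 input shrinks to 512x358 to fit within 512x512, then snaps down to 512x352.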

@@ -836,6 +836,10 @@ class UNetModel(nn.Module):
hs.append(h)
h = self.middle_block(h, emb, context)
for module in self.output_blocks:
# allows us to work with multiples of 8 instead of 64 for image sizes
# https://github.com/CompVis/stable-diffusion/issues/60#issuecomment-1240294667
if h.shape[-2:] != hs[-1].shape[-2:]:
h = F.interpolate(h, hs[-1].shape[-2:], mode="nearest")
h = th.cat([h, hs.pop()], dim=1)
h = module(h, emb, context)
h = h.type(x.dtype)
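
The two added lines above are the core of the change. The VAE shrinks images by 8x, and the UNet then halves the latent repeatedly; when a latent side is odd at some level, the decoder's 2x upsampling comes back one pixel larger than the stored skip connection, so the tensor is interpolated to the skip's spatial size before concatenation. An illustration with made-up tensor shapes (the channel count is arbitrary):

```python
# Shape-mismatch illustration; spatial sizes follow a hypothetical 520x392 image.
import torch
import torch.nn.functional as F

# 520x392 pixels -> 65x49 latent. Strided downsampling gives 33x25, then 17x13;
# upsampling 17x13 by 2x yields 34x26, which no longer matches the 33x25 skip.
h = torch.randn(1, 320, 34, 26)     # upsampled decoder features
skip = torch.randn(1, 320, 33, 25)  # stored encoder features at this level

if h.shape[-2:] != skip.shape[-2:]:
    h = F.interpolate(h, skip.shape[-2:], mode="nearest")
out = torch.cat([h, skip], dim=1)   # channel-wise concat now succeeds
```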

@@ -4,7 +4,7 @@ from PIL import Image, ImageDraw
def prepare_image_for_outpaint(
img, mask=None, up=None, down=None, left=None, right=None, _all=0, snap_multiple=64
img, mask=None, up=None, down=None, left=None, right=None, _all=0, snap_multiple=8
):
up = up if up is not None else _all
down = down if down is not None else _all
@@ -31,7 +31,7 @@ def prepare_image_for_outpaint(
expanded_image.paste(img, (left, up))
# extend border pixels outward, this helps prevents lines at the boundary because masks getting reduced to
# 64x64 latent space can cause som inaccuracies
# 64x64 latent space can cause some inaccuracies
if up > 0:
expanded_image.paste(
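
The outpaint helper's `snap_multiple` default drops from 64 to 8 here; the snapping itself is outside this hunk. One plausible way the requested borders could be trimmed so the final canvas lands on multiples of 8 (illustrative only, not the library's actual adjustment):

```python
# Hypothetical sketch: trim the requested expansion so each output dimension
# is a multiple of snap_multiple. The real code may distribute the correction
# across the borders differently.
def snap_expansion(width, height, left, right, up, down, snap_multiple=8):
    right -= (width + left + right) % snap_multiple
    down -= (height + up + down) % snap_multiple
    return left, right, up, down

# For a 512x512 image with `--outpaint u10,d300,l50,r50`, the borders become
# (50, 46, 10, 294), giving a 608x816 canvas (both multiples of 8).
print(snap_expansion(512, 512, 50, 50, 10, 300))
```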
