feature: Stable Diffusion 2.1

This commit is contained in:
Bryce 2022-12-07 10:16:38 -08:00 committed by Bryce Drennan
parent 26a504fef4
commit f30823e0b5
25 changed files with 61 additions and 10 deletions

View File

@ -230,6 +230,10 @@ docker run -it --gpus all -v $HOME/.cache/huggingface:/root/.cache/huggingface -
[Example Colab](https://colab.research.google.com/drive/1rOvQNs0Cmn_yU1bKWjCOHzGVDgZkaTtO?usp=sharing)
## ChangeLog
**7.1.0**
- feature: 🎉 Stable Diffusion 2.1. Generated people are no longer (completely) distorted.
Use with `--model SD-2.1` or `--model SD-2.1-v`
**7.0.0**
- feature: negative prompting. `--negative-prompt` or `ImaginePrompt(..., negative_prompt="ugly, deformed, extra arms, etc")`
- feature: a default negative prompt is added to all generations. Images in SD-2.0 don't look bad anymore. Images in 1.5 look improved as well.

View File

@ -19,6 +19,7 @@ class ModelConfig:
config_path: str
weights_url: str
default_image_size: int
forced_attn_precision: str = "default"
MODEL_CONFIGS = [
@ -52,18 +53,37 @@ MODEL_CONFIGS = [
weights_url="https://huggingface.co/stabilityai/stable-diffusion-2-inpainting/resolve/main/512-inpainting-ema.ckpt",
default_image_size=512,
),
ModelConfig(
short_name="SD-2.1",
config_path="configs/stable-diffusion-v2-inference.yaml",
weights_url="https://huggingface.co/stabilityai/stable-diffusion-2-1-base/resolve/main/v2-1_512-ema-pruned.ckpt",
default_image_size=512,
),
ModelConfig(
short_name="SD-2.1-inpaint",
config_path="configs/stable-diffusion-v2-inpainting-inference.yaml",
weights_url="https://huggingface.co/stabilityai/stable-diffusion-2-inpainting/resolve/main/512-inpainting-ema.ckpt",
default_image_size=512,
),
ModelConfig(
short_name="SD-2.1-v",
config_path="configs/stable-diffusion-v2-inference-v.yaml",
weights_url="https://huggingface.co/stabilityai/stable-diffusion-2-1/resolve/main/v2-1_768-ema-pruned.ckpt",
default_image_size=768,
forced_attn_precision="fp32",
),
ModelConfig(
short_name="SD-2.0-v",
config_path="configs/stable-diffusion-v2-inference-v.yaml",
weights_url="https://huggingface.co/stabilityai/stable-diffusion-2/resolve/main/768-v-ema.ckpt",
default_image_size=768,
),
ModelConfig(
short_name="SD-2.0-upscale",
config_path="configs/stable-diffusion-v2-upscaling.yaml",
weights_url="https://huggingface.co/stabilityai/stable-diffusion-x4-upscaler/resolve/main/x4-upscaler-ema.ckpt",
default_image_size=512,
),
# ModelConfig(
# short_name="SD-2.0-upscale",
# config_path="configs/stable-diffusion-v2-upscaling.yaml",
# weights_url="https://huggingface.co/stabilityai/stable-diffusion-x4-upscaler/resolve/main/x4-upscaler-ema.ckpt",
# default_image_size=512,
# ),
]
MODEL_CONFIG_SHORTCUTS = {m.short_name: m for m in MODEL_CONFIGS}

View File

@ -11,6 +11,7 @@ from transformers.utils.hub import TRANSFORMERS_CACHE, HfFolder
from transformers.utils.hub import url_to_filename as tf_url_to_filename
from imaginairy import config as iconfig
from imaginairy.modules import attention
from imaginairy.paths import PKG_ROOT
from imaginairy.utils import get_device, instantiate_from_config
@ -137,6 +138,7 @@ def _get_diffusion_model(
Weights location may also be shortcut name, e.g. "SD-1.5"
"""
global MOST_RECENTLY_LOADED_MODEL # noqa
model_config = None
if weights_location is None:
weights_location = iconfig.DEFAULT_MODEL
if (
@ -155,6 +157,12 @@ def _get_diffusion_model(
model_config.weights_url,
)
# some models need the attention calculated in float32
if model_config is not None:
attention.ATTENTION_PRECISION_OVERRIDE = model_config.forced_attn_precision
else:
attention.ATTENTION_PRECISION_OVERRIDE = "default"
key = (config_path, weights_location)
if key not in LOADED_MODELS:
MemoryAwareModel(

View File

@ -18,6 +18,9 @@ except ImportError:
XFORMERS_IS_AVAILBLE = False
ATTENTION_PRECISION_OVERRIDE = "default"
class GEGLU(nn.Module):
def __init__(self, dim_in, dim_out):
super().__init__()
@ -178,13 +181,20 @@ class CrossAttention(nn.Module):
q = self.to_q(x)
context = context if context is not None else x
k = self.to_k(context)
k = self.to_k(context) * self.scale
v = self.to_v(context)
q, k, v = map(lambda t: rearrange(t, "b n (h d) -> (b h) n d", h=h), (q, k, v))
sim = einsum("b i d, b j d -> b i j", q, k) * self.scale
# force cast to fp32 to avoid overflowing
if ATTENTION_PRECISION_OVERRIDE == "fp32":
with torch.autocast(enabled=False, device_type=get_device()):
q, k = q.float(), k.float()
sim = einsum("b i d, b j d -> b i j", q, k)
else:
sim = einsum("b i d, b j d -> b i j", q, k)
del q, k
# if mask is not None:
# if sim.shape[2] == 320 and False:
# mask = [mask] * 2
@ -237,7 +247,14 @@ class CrossAttention(nn.Module):
slice_size = q.shape[1] // steps if (q.shape[1] % steps) == 0 else q.shape[1]
for i in range(0, q.shape[1], slice_size):
end = i + slice_size
s1 = einsum("b i d, b j d -> b i j", q[:, i:end], k)
# force cast to fp32 to avoid overflowing
if ATTENTION_PRECISION_OVERRIDE == "fp32":
with torch.autocast(enabled=False, device_type=get_device()):
q, k = q.float(), k.float()
s1 = einsum("b i d, b j d -> b i j", q[:, i:end], k)
else:
s1 = einsum("b i d, b j d -> b i j", q[:, i:end], k)
s2 = s1.softmax(dim=-1, dtype=q.dtype)
del s1

Binary file not shown.

Before

Width:  |  Height:  |  Size: 322 KiB

After

Width:  |  Height:  |  Size: 323 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 342 KiB

After

Width:  |  Height:  |  Size: 352 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 881 KiB

After

Width:  |  Height:  |  Size: 880 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 892 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 365 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 772 KiB

After

Width:  |  Height:  |  Size: 769 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 812 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 369 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.0 MiB

After

Width:  |  Height:  |  Size: 1.0 MiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.0 MiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 411 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 820 KiB

After

Width:  |  Height:  |  Size: 820 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 833 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 400 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 809 KiB

After

Width:  |  Height:  |  Size: 809 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 962 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 282 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 934 KiB

After

Width:  |  Height:  |  Size: 935 KiB

View File

@ -39,7 +39,9 @@ compare_prompts = [
@pytest.mark.skipif(get_device() != "cuda", reason="Too slow to run on CPU or MPS")
@pytest.mark.parametrize("model_version", ["SD-1.4", "SD-1.5", "SD-2.0", "SD-2.0-v"])
@pytest.mark.parametrize(
"model_version", ["SD-1.4", "SD-1.5", "SD-2.0", "SD-2.0-v", "SD-2.1", "SD-2.1-v"]
)
def test_model_versions(filename_base_for_orig_outputs, model_version):
"""Test that we can switch between model versions"""
prompts = []