diff --git a/README.md b/README.md
index 166b56c..95ff78e 100644
--- a/README.md
+++ b/README.md
@@ -366,7 +366,7 @@ imagine --tile-x -w 1024 -h 512 "360 degree equirectangular panorama photograph
 Use depth maps for amazing "translations" of existing images.
 
 ```bash
->> imagine --model SD-2.0-depth --init-image girl_with_a_pearl_earring_large.jpg --init-image-strength 0.05 "professional headshot photo of a woman with a pearl earring" -r 4 -w 1024 -h 1024 --steps 50
+>> imagine --init-image girl_with_a_pearl_earring_large.jpg --init-image-strength 0.05 "professional headshot photo of a woman with a pearl earring" -r 4 -w 1024 -h 1024 --steps 50
 ```
 
diff --git a/imaginairy/api/generate.py b/imaginairy/api/generate.py
index e826a2c..eab401b 100755
--- a/imaginairy/api/generate.py
+++ b/imaginairy/api/generate.py
@@ -286,7 +286,7 @@ def _generate_single_image_compvis(
     if control_inputs:
         control_modes = [c.mode for c in prompt.control_inputs]
     if inpaint_method == "auto":
-        if prompt.model_weights in {"SD-1.5", "SD-2.0"}:
+        if prompt.model_weights in {"SD-1.5"}:
             inpaint_method = "finetune"
         else:
             inpaint_method = "controlnet"
diff --git a/imaginairy/configs/stable-diffusion-v2-inference-v.yaml b/imaginairy/configs/stable-diffusion-v2-inference-v.yaml
deleted file mode 100644
index ee71794..0000000
--- a/imaginairy/configs/stable-diffusion-v2-inference-v.yaml
+++ /dev/null
@@ -1,68 +0,0 @@
-model:
-  base_learning_rate: 1.0e-4
-  target: imaginairy.modules.diffusion.ddpm.LatentDiffusion
-  params:
-    parameterization: "v"
-    linear_start: 0.00085
-    linear_end: 0.0120
-    num_timesteps_cond: 1
-    log_every_t: 200
-    timesteps: 1000
-    first_stage_key: "jpg"
-    cond_stage_key: "txt"
-    image_size: 64
-    channels: 4
-    cond_stage_trainable: false
-    conditioning_key: crossattn
-    monitor: val/loss_simple_ema
-    scale_factor: 0.18215
-    use_ema: False # we set this to false because this is an inference only config
-
-    unet_config:
-      target: imaginairy.modules.diffusion.openaimodel.UNetModel
-      params:
-        use_checkpoint: True
-        use_fp16: False
-        image_size: 32 # unused
-        in_channels: 4
-        out_channels: 4
-        model_channels: 320
-        attention_resolutions: [ 4, 2, 1 ]
-        num_res_blocks: 2
-        channel_mult: [ 1, 2, 4, 4 ]
-        num_head_channels: 64 # need to fix for flash-attn
-        use_spatial_transformer: True
-        use_linear_in_transformer: True
-        transformer_depth: 1
-        context_dim: 1024
-        legacy: False
-
-    first_stage_config:
-      target: imaginairy.modules.autoencoder.AutoencoderKL
-      params:
-        embed_dim: 4
-        monitor: val/rec_loss
-        ddconfig:
-          #attn_type: "vanilla-xformers"
-          double_z: true
-          z_channels: 4
-          resolution: 256
-          in_channels: 3
-          out_ch: 3
-          ch: 128
-          ch_mult:
-          - 1
-          - 2
-          - 4
-          - 4
-          num_res_blocks: 2
-          attn_resolutions: []
-          dropout: 0.0
-        lossconfig:
-          target: torch.nn.Identity
-
-    cond_stage_config:
-      target: imaginairy.modules.encoders.FrozenOpenCLIPEmbedder
-      params:
-        freeze: True
-        layer: "penultimate"
diff --git a/imaginairy/configs/stable-diffusion-v2-inference.yaml b/imaginairy/configs/stable-diffusion-v2-inference.yaml
deleted file mode 100644
index 4acf817..0000000
--- a/imaginairy/configs/stable-diffusion-v2-inference.yaml
+++ /dev/null
@@ -1,67 +0,0 @@
-model:
-  base_learning_rate: 1.0e-4
-  target: imaginairy.modules.diffusion.ddpm.LatentDiffusion
-  params:
-    linear_start: 0.00085
-    linear_end: 0.0120
-    num_timesteps_cond: 1
-    log_every_t: 200
-    timesteps: 1000
-    first_stage_key: "jpg"
-    cond_stage_key: "txt"
-    image_size: 64
-    channels: 4
-    cond_stage_trainable: false
-    conditioning_key: crossattn
-    monitor: val/loss_simple_ema
-    scale_factor: 0.18215
-    use_ema: False # we set this to false because this is an inference only config
-
-    unet_config:
-      target: imaginairy.modules.diffusion.openaimodel.UNetModel
-      params:
-        use_checkpoint: True
-        use_fp16: False
-        image_size: 32 # unused
-        in_channels: 4
-        out_channels: 4
-        model_channels: 320
-        attention_resolutions: [ 4, 2, 1 ]
-        num_res_blocks: 2
-        channel_mult: [ 1, 2, 4, 4 ]
-        num_head_channels: 64 # need to fix for flash-attn
-        use_spatial_transformer: True
-        use_linear_in_transformer: True
-        transformer_depth: 1
-        context_dim: 1024
-        legacy: False
-
-    first_stage_config:
-      target: imaginairy.modules.autoencoder.AutoencoderKL
-      params:
-        embed_dim: 4
-        monitor: val/rec_loss
-        ddconfig:
-          #attn_type: "vanilla-xformers"
-          double_z: true
-          z_channels: 4
-          resolution: 256
-          in_channels: 3
-          out_ch: 3
-          ch: 128
-          ch_mult:
-          - 1
-          - 2
-          - 4
-          - 4
-          num_res_blocks: 2
-          attn_resolutions: []
-          dropout: 0.0
-        lossconfig:
-          target: torch.nn.Identity
-
-    cond_stage_config:
-      target: imaginairy.modules.encoders.FrozenOpenCLIPEmbedder
-      params:
-        freeze: True
-        layer: "penultimate"
diff --git a/imaginairy/configs/stable-diffusion-v2-inpainting-inference.yaml b/imaginairy/configs/stable-diffusion-v2-inpainting-inference.yaml
deleted file mode 100644
index 9873f3c..0000000
--- a/imaginairy/configs/stable-diffusion-v2-inpainting-inference.yaml
+++ /dev/null
@@ -1,158 +0,0 @@
-model:
-  base_learning_rate: 5.0e-05
-  target: imaginairy.modules.diffusion.ddpm.LatentInpaintDiffusion
-  params:
-    linear_start: 0.00085
-    linear_end: 0.0120
-    num_timesteps_cond: 1
-    log_every_t: 200
-    timesteps: 1000
-    first_stage_key: "jpg"
-    cond_stage_key: "txt"
-    image_size: 64
-    channels: 4
-    cond_stage_trainable: false
-    conditioning_key: hybrid
-    scale_factor: 0.18215
-    monitor: val/loss_simple_ema
-    finetune_keys: null
-    use_ema: False
-
-    unet_config:
-      target: imaginairy.modules.diffusion.openaimodel.UNetModel
-      params:
-        use_checkpoint: True
-        image_size: 32 # unused
-        in_channels: 9
-        out_channels: 4
-        model_channels: 320
-        attention_resolutions: [ 4, 2, 1 ]
-        num_res_blocks: 2
-        channel_mult: [ 1, 2, 4, 4 ]
-        num_head_channels: 64 # need to fix for flash-attn
-        use_spatial_transformer: True
-        use_linear_in_transformer: True
-        transformer_depth: 1
-        context_dim: 1024
-        legacy: False
-
-    first_stage_config:
-      target: imaginairy.modules.autoencoder.AutoencoderKL
-      params:
-        embed_dim: 4
-        monitor: val/rec_loss
-        ddconfig:
-          #attn_type: "vanilla-xformers"
-          double_z: true
-          z_channels: 4
-          resolution: 256
-          in_channels: 3
-          out_ch: 3
-          ch: 128
-          ch_mult:
-          - 1
-          - 2
-          - 4
-          - 4
-          num_res_blocks: 2
-          attn_resolutions: [ ]
-          dropout: 0.0
-        lossconfig:
-          target: torch.nn.Identity
-
-    cond_stage_config:
-      target: imaginairy.modules.encoders.FrozenOpenCLIPEmbedder
-      params:
-        freeze: True
-        layer: "penultimate"
-
-
-data:
-  target: ldm.data.laion.WebDataModuleFromConfig
-  params:
-    tar_base: null # for concat as in LAION-A
-    p_unsafe_threshold: 0.1
-    filter_word_list: "data/filters.yaml"
-    max_pwatermark: 0.45
-    batch_size: 8
-    num_workers: 6
-    multinode: True
-    min_size: 512
-    train:
-      shards:
-        - "pipe:aws s3 cp s3://stability-aws/laion-a-native/part-0/{00000..18699}.tar -"
-        - "pipe:aws s3 cp s3://stability-aws/laion-a-native/part-1/{00000..18699}.tar -"
-        - "pipe:aws s3 cp s3://stability-aws/laion-a-native/part-2/{00000..18699}.tar -"
-        - "pipe:aws s3 cp s3://stability-aws/laion-a-native/part-3/{00000..18699}.tar -"
-        - "pipe:aws s3 cp s3://stability-aws/laion-a-native/part-4/{00000..18699}.tar -" #{00000-94333}.tar"
-      shuffle: 10000
-      image_key: jpg
-      image_transforms:
-      - target: torchvision.transforms.Resize
-        params:
-          size: 512
-          interpolation: 3
-      - target: torchvision.transforms.RandomCrop
-        params:
-          size: 512
-      postprocess:
-        target: ldm.data.laion.AddMask
-        params:
-          mode: "512train-large"
-          p_drop: 0.25
-    # NOTE use enough shards to avoid empty validation loops in workers
-    validation:
-      shards:
-        - "pipe:aws s3 cp s3://deep-floyd-s3/datasets/laion_cleaned-part5/{93001..94333}.tar - "
-      shuffle: 0
-      image_key: jpg
-      image_transforms:
-      - target: torchvision.transforms.Resize
-        params:
-          size: 512
-          interpolation: 3
-      - target: torchvision.transforms.CenterCrop
-        params:
-          size: 512
-      postprocess:
-        target: ldm.data.laion.AddMask
-        params:
-          mode: "512train-large"
-          p_drop: 0.25
-
-lightning:
-  find_unused_parameters: True
-  modelcheckpoint:
-    params:
-      every_n_train_steps: 5000
-
-  callbacks:
-    metrics_over_trainsteps_checkpoint:
-      params:
-        every_n_train_steps: 10000
-
-    image_logger:
-      target: main.ImageLogger
-      params:
-        enable_autocast: False
-        disabled: False
-        batch_frequency: 1000
-        max_images: 4
-        increase_log_steps: False
-        log_first_step: False
-        log_images_kwargs:
-          use_ema_scope: False
-          inpaint: False
-          plot_progressive_rows: False
-          plot_diffusion_rows: False
-          N: 4
-          unconditional_guidance_scale: 5.0
-          unconditional_guidance_label: [""]
-          ddim_steps: 50 # todo check these out for depth2img,
-          ddim_eta: 0.0 # todo check these out for depth2img,
-
-  trainer:
-    benchmark: True
-    val_check_interval: 5000000
-    num_sanity_val_steps: 0
-    accumulate_grad_batches: 1
diff --git a/imaginairy/modules/diffusion/ddpm.py b/imaginairy/modules/diffusion/ddpm.py
index 982e922..f653a50 100644
--- a/imaginairy/modules/diffusion/ddpm.py
+++ b/imaginairy/modules/diffusion/ddpm.py
@@ -1981,71 +1981,6 @@ class LatentInpaintDiffusion(LatentDiffusion):
         return z, all_conds
 
 
-class LatentDepth2ImageDiffusion(LatentFinetuneDiffusion):
-    """
-    condition on monocular depth estimation.
-    """
-
-    def __init__(self, depth_stage_config, concat_keys=("midas_in",), **kwargs):
-        super().__init__(concat_keys=concat_keys, **kwargs)
-        self.depth_model = instantiate_from_config(depth_stage_config)
-        self.depth_stage_key = concat_keys[0]
-
-    @torch.no_grad()
-    def get_input(
-        self, batch, k, cond_key=None, bs=None, return_first_stage_outputs=False
-    ):
-        # note: restricted to non-trainable encoders currently
-        assert (
-            not self.cond_stage_trainable
-        ), "trainable cond stages not yet supported for depth2img"
-        z, c, x, xrec, xc = super().get_input(
-            batch,
-            self.first_stage_key,
-            return_first_stage_outputs=True,
-            force_c_encode=True,
-            return_original_cond=True,
-            bs=bs,
-        )
-
-        assert self.concat_keys is not None
-        assert len(self.concat_keys) == 1
-        c_cat = []
-        for ck in self.concat_keys:
-            cc = batch[ck]
-            if bs is not None:
-                cc = cc[:bs]
-            cc = cc.to(self.device)
-            cc = self.depth_model(cc)
-            cc = torch.nn.functional.interpolate(
-                cc,
-                size=z.shape[2:],
-                mode="bicubic",
-                align_corners=False,
-            )
-
-            depth_min, depth_max = torch.amin(
-                cc, dim=[1, 2, 3], keepdim=True
-            ), torch.amax(cc, dim=[1, 2, 3], keepdim=True)
-            cc = 2.0 * (cc - depth_min) / (depth_max - depth_min + 0.001) - 1.0
-            c_cat.append(cc)
-        c_cat = torch.cat(c_cat, dim=1)
-        all_conds = {"c_concat": [c_cat], "c_crossattn": [c]}
-        if return_first_stage_outputs:
-            return z, all_conds, x, xrec, xc
-        return z, all_conds
-
-    @torch.no_grad()
-    def log_images(self, *args, **kwargs):
-        log = super().log_images(*args, **kwargs)
-        depth = self.depth_model(args[0][self.depth_stage_key])
-        depth_min, depth_max = torch.amin(
-            depth, dim=[1, 2, 3], keepdim=True
-        ), torch.amax(depth, dim=[1, 2, 3], keepdim=True)
-        log["depth"] = 2.0 * (depth - depth_min) / (depth_max - depth_min) - 1.0
-        return log
-
-
 class LatentUpscaleFinetuneDiffusion(LatentFinetuneDiffusion):
     """
     condition on low-res image (and optionally on some spatial noise augmentation).
diff --git a/imaginairy/schema.py b/imaginairy/schema.py
index 66446b0..66093b6 100644
--- a/imaginairy/schema.py
+++ b/imaginairy/schema.py
@@ -541,9 +541,6 @@ class ImaginePrompt(BaseModel, protected_namespaces=()):
 
         v = v.lower()
 
-        if info.data.get("model") == "SD-2.0-v" and v == SolverName.PLMS:
-            raise ValueError("PLMS solvers is not supported for SD-2.0-v model.")
-
         if info.data.get("model") == "edit" and v in (
             SolverName.PLMS,
             SolverName.DDIM,
diff --git a/tests/test_utils/test_model_cache.py b/tests/test_utils/test_model_cache.py
index 21a0621..8b0ed66 100644
--- a/tests/test_utils/test_model_cache.py
+++ b/tests/test_utils/test_model_cache.py
@@ -27,12 +27,7 @@ def create_model_of_n_bytes(n):
 @pytest.mark.parametrize(
     "model_version",
     [
-        # "SD-1.4",
         "SD-1.5",
-        # "SD-2.0",
-        # "SD-2.0-v",
-        # "SD-2.1",
-        # "SD-2.1-v",
         "openjourney-v1",
         "openjourney-v2",
         "openjourney-v4",
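After this diff, `generate.py` only special-cases `SD-1.5` when resolving `inpaint_method="auto"`; every other model falls through to controlnet inpainting. The snippet below is a minimal, hedged sketch of that selection logic as a standalone function — `pick_inpaint_method` is a hypothetical helper written for illustration, not a function from imaginairy's API.

```python
# Hypothetical standalone sketch (not imaginairy's actual code) of the
# inpaint-method selection after this change: only "SD-1.5" still routes to
# the finetuned inpainting weights; everything else uses controlnet.
def pick_inpaint_method(model_weights: str, inpaint_method: str = "auto") -> str:
    if inpaint_method == "auto":
        # "SD-2.0" was removed from this set by the change in generate.py.
        if model_weights in {"SD-1.5"}:
            return "finetune"
        return "controlnet"
    return inpaint_method


if __name__ == "__main__":
    assert pick_inpaint_method("SD-1.5") == "finetune"
    assert pick_inpaint_method("SD-2.0") == "controlnet"  # no longer special-cased
    assert pick_inpaint_method("openjourney-v4") == "controlnet"
```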