# timesteps for noise conditioning (here constant, just need one)
num_classes: 1000
image_size: 128
in_channels: 7
out_channels: 4
model_channels: 256
attention_resolutions: [2, 4, 8]
num_res_blocks: 2
channel_mult: [1, 2, 2, 4]
# NOTE(review): this list has the same length as channel_mult (4 entries);
# presumably one flag per entry — confirm against the consumer.
disable_self_attentions: [true, true, true, false]
disable_middle_self_attn: false
num_heads: 8
use_spatial_transformer: true
transformer_depth: 1
context_dim: 1024
legacy: false
use_linear_in_transformer: true
first_stage_config:
target:ldm.models.autoencoder.AutoencoderKL
params:
embed_dim:4
ddconfig:
# attn_type: "vanilla-xformers"  # This model needs efficient attention to be feasible on HR data; the decoder also seems to break in half precision (the UNet is fine, though).