style: fix all the mypy typing issues

...or ignore them.
6 months ago · 012cc648d3
parent 5a636e45c5
commit 012cc648d3
17 changed files with 141 additions and 107 deletions
--- a/imaginairy/api.py
+++ b/imaginairy/api.py
@@ -1,12 +1,12 @@
 import logging
 import os
 import re
-from typing import TYPE_CHECKING, Callable
+from typing import TYPE_CHECKING, Any, Callable

 from imaginairy.utils.named_resolutions import normalize_image_size

 if TYPE_CHECKING:
-    from imaginairy.schema import ImaginePrompt
+    from imaginairy.schema import ImaginePrompt, LazyLoadingImage

 logger = logging.getLogger(__name__)

@@ -335,20 +335,24 @@ def _generate_single_image_compvis(
        ]
        SolverCls = SOLVER_LOOKUP[prompt.solver_type.lower()]
        solver = SolverCls(model)
-        mask_latent = mask_image = mask_image_orig = mask_grayscale = None
-        t_enc = init_latent = control_image = None
+        mask_image: Image.Image | LazyLoadingImage | None = None
+        mask_latent = mask_image_orig = mask_grayscale = None
+        init_latent: torch.Tensor | None = None
+        t_enc = None
        starting_image = None
        denoiser_cls = None

        c_cat = []
        c_cat_neutral = None
-        result_images = {}
+        result_images: dict[str, torch.Tensor | Image.Image | None] = {}
+        assert prompt.seed is not None
        seed_everything(prompt.seed)
        noise = randn_seeded(seed=prompt.seed, size=shape).to(get_device())
        control_strengths = []

        if prompt.init_image:
            starting_image = prompt.init_image
+            assert prompt.init_image_strength is not None
            generation_strength = 1 - prompt.init_image_strength

            if model.cond_stage_key == "edit" or generation_strength >= 1:
@@ -367,18 +371,18 @@ def _generate_single_image_compvis(
                starting_image, mask_image = prepare_image_for_outpaint(
                    starting_image, mask_image, **outpaint_kwargs
                )
-
+            assert starting_image is not None
            init_image = pillow_fit_image_within(
                starting_image,
                max_height=prompt.height,
                max_width=prompt.width,
            )
-            init_image_t = pillow_img_to_torch_image(init_image)
-            init_image_t = init_image_t.to(get_device())
+            init_image_t = pillow_img_to_torch_image(init_image).to(get_device())
            init_latent = model.get_first_stage_encoding(
                model.encode_first_stage(init_image_t)
            )
-            shape = init_latent.shape
+            assert init_latent is not None
+            shape = list(init_latent.shape)

            log_latent(init_latent, "init_latent")

@@ -405,9 +409,9 @@ def _generate_single_image_compvis(
                    control_inputs.append(
                        ControlInput(mode="inpaint", image=mask_image)
                    )
-
+            assert prompt.seed is not None
            seed_everything(prompt.seed)
-            noise = randn_seeded(seed=prompt.seed, size=init_latent.shape).to(
+            noise = randn_seeded(seed=prompt.seed, size=list(init_latent.shape)).to(
                get_device()
            )
            # noise = noise[:, :, : init_latent.shape[2], : init_latent.shape[3]]
@@ -451,8 +455,13 @@ def _generate_single_image_compvis(
                    control_image = control_input.image_raw
                elif control_input.image is not None:
                    control_image = control_input.image
+                else:
+                    raise RuntimeError("Control image must be provided")
+                assert control_image is not None
                control_image = control_image.convert("RGB")
                log_img(control_image, "control_image_input")
+                assert control_image is not None
+
                control_image_input = pillow_fit_image_within(
                    control_image,
                    max_height=prompt.height,
@@ -464,11 +473,11 @@ def _generate_single_image_compvis(
                if control_input.image_raw is None:
                    control_prep_function = CONTROL_MODES[control_input.mode]
                    if control_input.mode == "inpaint":
-                        control_image_t = control_prep_function(
+                        control_image_t = control_prep_function(  # type: ignore
                            control_image_input_t, init_image_t
                        )
                    else:
-                        control_image_t = control_prep_function(control_image_input_t)
+                        control_image_t = control_prep_function(control_image_input_t)  # type: ignore
                else:
                    control_image_t = (control_image_input_t + 1) / 2

@@ -499,6 +508,8 @@ def _generate_single_image_compvis(

        elif hasattr(model, "masked_image_key"):
            # inpainting model
+            assert mask_image_orig is not None
+            assert mask_latent is not None
            mask_t = pillow_img_to_torch_image(ImageOps.invert(mask_image_orig)).to(
                get_device()
            )
@@ -519,6 +530,7 @@ def _generate_single_image_compvis(
        elif model.cond_stage_key == "edit":
            # pix2pix model
            c_cat = [model.encode_first_stage(init_image_t)]
+            assert init_latent is not None
            c_cat_neutral = [torch.zeros_like(init_latent)]
            denoiser_cls = CFGEditingDenoiser
        if c_cat:
@@ -527,18 +539,24 @@ def _generate_single_image_compvis(
        if c_cat_neutral is None:
            c_cat_neutral = c_cat

-        positive_conditioning = {
+        positive_conditioning_d: dict[str, Any] = {
            "c_concat": c_cat,
            "c_crossattn": [positive_conditioning],
        }
-        neutral_conditioning = {
+        neutral_conditioning_d: dict[str, Any] = {
            "c_concat": c_cat_neutral,
            "c_crossattn": [neutral_conditioning],
        }
+        del neutral_conditioning
+        del positive_conditioning

        if control_strengths and is_controlnet_model:
-            positive_conditioning["control_strengths"] = torch.Tensor(control_strengths)
-            neutral_conditioning["control_strengths"] = torch.Tensor(control_strengths)
+            positive_conditioning_d["control_strengths"] = torch.Tensor(
+                control_strengths
+            )
+            neutral_conditioning_d["control_strengths"] = torch.Tensor(
+                control_strengths
+            )

        if (
            prompt.allow_compose_phase
@@ -575,8 +593,8 @@ def _generate_single_image_compvis(
        with lc.timing("sampling"):
            samples = solver.sample(
                num_steps=prompt.steps,
-                positive_conditioning=positive_conditioning,
-                neutral_conditioning=neutral_conditioning,
+                positive_conditioning=positive_conditioning_d,
+                neutral_conditioning=neutral_conditioning_d,
                guidance_scale=prompt.prompt_strength,
                t_start=t_enc,
                mask=mask_latent,
--- a/imaginairy/api_refiners.py
+++ b/imaginairy/api_refiners.py
@@ -19,7 +19,7 @@ def _generate_single_image(
    half_mode=None,
 ):
    import torch.nn
-    from PIL import ImageOps
+    from PIL import Image, ImageOps
    from pytorch_lightning import seed_everything
    from refiners.foundationals.latent_diffusion.schedulers import DDIM, DPMSolver
    from tqdm import tqdm
@@ -51,7 +51,7 @@ def _generate_single_image(
    from imaginairy.outpaint import outpaint_arg_str_parse, prepare_image_for_outpaint
    from imaginairy.safety import create_safety_score
    from imaginairy.samplers import SolverName
-    from imaginairy.schema import ImaginePrompt, ImagineResult
+    from imaginairy.schema import ImagineResult
    from imaginairy.utils import get_device, randn_seeded

    if dtype is None:
@@ -75,7 +75,6 @@ def _generate_single_image(

    mask_image = None
    mask_image_orig = None
-    prompt: ImaginePrompt = prompt.make_concrete_copy()

    def latent_logger(latents):
        progress_latents.append(latents)
@@ -101,8 +100,8 @@ def _generate_single_image(
        )
        clip_text_embedding = clip_text_embedding.to(device=sd.device, dtype=sd.dtype)

-        result_images = {}
-        progress_latents = []
+        result_images: dict[str, torch.Tensor | None | Image.Image] = {}
+        progress_latents: list[torch.Tensor] = []
        first_step = 0
        mask_grayscale = None

@@ -125,7 +124,8 @@ def _generate_single_image(

        if prompt.init_image:
            starting_image = prompt.init_image
-            first_step = int((prompt.steps) * prompt.init_image_strength)
+            assert prompt.init_image_strength is not None
+            first_step = int(prompt.steps * prompt.init_image_strength)
            # noise_step = int((prompt.steps - 1) * prompt.init_image_strength)

            if prompt.mask_prompt:
@@ -150,7 +150,7 @@ def _generate_single_image(
            init_image_t = init_image_t.to(device=sd.device, dtype=sd.dtype)
            init_latent = sd.lda.encode(init_image_t)

-            shape = init_latent.shape
+            shape = list(init_latent.shape)

            log_latent(init_latent, "init_latent")

@@ -179,6 +179,7 @@ def _generate_single_image(
                    )

        seed_everything(prompt.seed)
+        assert prompt.seed is not None

        noise = randn_seeded(seed=prompt.seed, size=shape).to(
            get_device(), dtype=sd.dtype
@@ -210,11 +211,11 @@ def _generate_single_image(
                if control_input.image_raw is None:
                    control_prep_function = CONTROL_MODES[control_input.mode]
                    if control_input.mode == "inpaint":
-                        control_image_t = control_prep_function(
+                        control_image_t = control_prep_function(  # type: ignore
                            control_image_input_t, init_image_t
                        )
                    else:
-                        control_image_t = control_prep_function(control_image_input_t)
+                        control_image_t = control_prep_function(control_image_input_t)  # type: ignore
                else:
                    control_image_t = (control_image_input_t + 1) / 2

@@ -246,9 +247,9 @@ def _generate_single_image(
                    raise ValueError(msg)
                from refiners.foundationals.latent_diffusion import SD1ControlnetAdapter

-                controlnet = SD1ControlnetAdapter(
+                controlnet = SD1ControlnetAdapter(  # type: ignore
                    name=control_input.mode,
-                    target=sd.unet,
+                    target=sd.unet,  # type: ignore
                    weights_location=control_config.weights_location,
                )
                controlnet.set_scale(control_input.strength)
--- a/imaginairy/enhancers/clip_masking.py
+++ b/imaginairy/enhancers/clip_masking.py
@@ -9,6 +9,7 @@ from torchvision import transforms

 from imaginairy.img_utils import pillow_fit_image_within
 from imaginairy.log_utils import log_img
+from imaginairy.schema import LazyLoadingImage
 from imaginairy.vendored.clipseg import CLIPDensePredT

 weights_url = "https://github.com/timojl/clipseg/raw/master/weights/rd64-uni.pth"
@@ -32,7 +33,7 @@ def clip_mask_model():


 def get_img_mask(
-    img: PIL.Image.Image,
+    img: PIL.Image.Image | LazyLoadingImage,
    mask_description_statement: str,
    threshold: Optional[float] = None,
 ):
--- a/imaginairy/img_processors/control_modes.py
+++ b/imaginairy/img_processors/control_modes.py
@@ -1,7 +1,12 @@
 """Functions to create hint images for controlnet."""
+from typing import TYPE_CHECKING, Callable, Dict, Union
+
+if TYPE_CHECKING:
+    import numpy as np
+    from torch import Tensor  # noqa


-def create_canny_edges(img):
+def create_canny_edges(img: "Tensor") -> "Tensor":
    import cv2
    import numpy as np
    import torch
@@ -33,7 +38,7 @@ def create_canny_edges(img):
    return canny_image


-def create_depth_map(img):
+def create_depth_map(img: "Tensor") -> "Tensor":
    import torch

    orig_size = img.shape[2:]
@@ -56,7 +61,7 @@ def create_depth_map(img):
    return depth_pt


-def _create_depth_map_raw(img):
+def _create_depth_map_raw(img: "Tensor") -> "Tensor":
    import torch

    from imaginairy.modules.midas.api import MiDaSInference, midas_device
@@ -83,7 +88,7 @@ def _create_depth_map_raw(img):
    return depth_pt


-def create_normal_map(img):
+def create_normal_map(img: "Tensor") -> "Tensor":
    import torch

    from imaginairy.vendored.imaginairy_normal_map.model import (
@@ -97,7 +102,7 @@ def create_normal_map(img):
    return normal_img_t


-def create_hed_edges(img_t):
+def create_hed_edges(img_t: "Tensor") -> "Tensor":
    import torch

    from imaginairy.img_processors.hed_boundary import create_hed_map
@@ -120,7 +125,7 @@ def create_hed_edges(img_t):
    return hint_t


-def create_pose_map(img_t):
+def create_pose_map(img_t: "Tensor"):
    from imaginairy.img_processors.openpose import create_body_pose_img
    from imaginairy.utils import get_device

@@ -130,7 +135,7 @@ def create_pose_map(img_t):
    return pose_t


-def make_noise_disk(H, W, C, F):
+def make_noise_disk(H: int, W: int, C: int, F: int) -> "np.ndarray":
    import cv2
    import numpy as np

@@ -144,7 +149,7 @@ def make_noise_disk(H, W, C, F):
    return noise


-def shuffle_map_np(img, h=None, w=None, f=256):
+def shuffle_map_np(img: "np.ndarray", h=None, w=None, f=256) -> "np.ndarray":
    import cv2
    import numpy as np

@@ -160,7 +165,7 @@ def shuffle_map_np(img, h=None, w=None, f=256):
    return cv2.remap(img, flow, None, cv2.INTER_LINEAR)


-def shuffle_map_torch(tensor, h=None, w=None, f=256):
+def shuffle_map_torch(tensor: "Tensor", h=None, w=None, f=256) -> "Tensor":
    import torch

    # Assuming the input tensor is in shape (B, C, H, W)
@@ -187,7 +192,7 @@ def shuffle_map_torch(tensor, h=None, w=None, f=256):
    return shuffled_tensor.to(device)


-def inpaint_prep(mask_image_t, target_image_t):
+def inpaint_prep(mask_image_t: "Tensor", target_image_t: "Tensor") -> "Tensor":
    """
    Combines the masked image and target image into a single tensor.

@@ -207,7 +212,7 @@ def inpaint_prep(mask_image_t, target_image_t):
    return output_image_t


-def to_grayscale(img):
+def to_grayscale(img: "Tensor") -> "Tensor":
    # The dimensions of input should be (batch_size, channels, height, width)
    if img.dim() != 4:
        raise ValueError("Input should be a 4d tensor")
@@ -228,11 +233,13 @@ def to_grayscale(img):
    return (gray_3_channels + 1.0) / 2.0


-def noop(img):
+def noop(img: "Tensor") -> "Tensor":
    return (img + 1.0) / 2.0


-CONTROL_MODES = {
+FunctionType = Union["Callable[[Tensor, Tensor], Tensor]", "Callable[[Tensor], Tensor]"]
+
+CONTROL_MODES: Dict[str, FunctionType] = {
    "canny": create_canny_edges,
    "depth": create_depth_map,
    "normal": create_normal_map,
--- a/imaginairy/img_utils.py
+++ b/imaginairy/img_utils.py
@@ -23,8 +23,12 @@ from imaginairy.utils import get_device


 def pillow_fit_image_within(
-    image: PIL.Image.Image, max_height=512, max_width=512, convert="RGB", snap_size=8
-):
+    image: PIL.Image.Image | LazyLoadingImage,
+    max_height=512,
+    max_width=512,
+    convert="RGB",
+    snap_size=8,
+) -> PIL.Image.Image:
    image = image.convert(convert)
    w, h = image.size
    resize_ratio = 1
@@ -45,7 +49,9 @@ def pillow_fit_image_within(
    return image


-def pillow_img_to_torch_image(img: PIL.Image.Image, convert="RGB"):
+def pillow_img_to_torch_image(
+    img: PIL.Image.Image | LazyLoadingImage, convert="RGB"
+) -> torch.Tensor:
    if convert:
        img = img.convert(convert)
    img_np = np.array(img).astype(np.float32) / 255.0
@@ -55,7 +61,9 @@ def pillow_img_to_torch_image(img: PIL.Image.Image, convert="RGB"):
    return 2.0 * img_t - 1.0


-def pillow_mask_to_latent_mask(mask_img: PIL.Image.Image, downsampling_factor):
+def pillow_mask_to_latent_mask(
+    mask_img: PIL.Image.Image | LazyLoadingImage, downsampling_factor
+) -> torch.Tensor:
    mask_img = mask_img.resize(
        (
            mask_img.width // downsampling_factor,
@@ -66,11 +74,11 @@ def pillow_mask_to_latent_mask(mask_img: PIL.Image.Image, downsampling_factor):

    mask = np.array(mask_img).astype(np.float32) / 255.0
    mask = mask[None, None]
-    mask = torch.from_numpy(mask)
-    return mask
+    mask_t = torch.from_numpy(mask)
+    return mask_t


-def pillow_img_to_opencv_img(img: PIL.Image.Image):
+def pillow_img_to_opencv_img(img: PIL.Image.Image | LazyLoadingImage):
    open_cv_image = np.array(img)
    # Convert RGB to BGR
    open_cv_image = open_cv_image[:, :, ::-1].copy()
@@ -90,7 +98,7 @@ def torch_image_to_openvcv_img(img: torch.Tensor) -> np.ndarray:
    return img_np


-def torch_img_to_pillow_img(img_t: torch.Tensor):
+def torch_img_to_pillow_img(img_t: torch.Tensor) -> PIL.Image.Image:
    img_t = img_t.to(torch.float32).detach().cpu()
    if len(img_t.shape) == 3:
        img_t = img_t.unsqueeze(0)
@@ -129,7 +137,9 @@ def model_latents_to_pillow_imgs(latents: torch.Tensor) -> Sequence[PIL.Image.Im
    return [model_latent_to_pillow_img(latent) for latent in latents]


-def pillow_img_to_model_latent(model, img, batch_size=1, half=True):
+def pillow_img_to_model_latent(
+    model, img: PIL.Image.Image | LazyLoadingImage, batch_size=1, half=True
+):
    init_image = pillow_img_to_torch_image(img).to(get_device())
    init_image = repeat(init_image, "1 ... -> b ...", b=batch_size)
    if half:
@@ -152,14 +162,18 @@ def imgpaths_to_imgs(imgpaths):


 def add_caption_to_image(
-    img, caption, font_size=16, font_path=f"{PKG_ROOT}/data/DejaVuSans.ttf"
+    img: PIL.Image.Image | LazyLoadingImage,
+    caption,
+    font_size=16,
+    font_path=f"{PKG_ROOT}/data/DejaVuSans.ttf",
 ):
-    draw = ImageDraw.Draw(img)
+    img_pil = img.as_pillow() if isinstance(img, LazyLoadingImage) else img
+    draw = ImageDraw.Draw(img_pil)

    font = ImageFont.truetype(font_path, font_size)

    x = 15
-    y = img.height - 15 - font_size
+    y = img_pil.height - 15 - font_size

    draw.text(
        (x, y),
--- a/imaginairy/model_manager.py
+++ b/imaginairy/model_manager.py
@@ -266,6 +266,7 @@ def _get_diffusion_model_refiners(
        text_encoder_weights,
    ) = load_stable_diffusion_compvis_weights(weights_location)

+    StableDiffusionCls: type[LatentDiffusionModel]
    if for_inpainting:
        unet = SD1UNet(in_channels=9)
        StableDiffusionCls = StableDiffusion_1_Inpainting
@@ -390,9 +391,9 @@ def resolve_model_weights_config(
    return model_weights_config


-def get_model_default_image_size(model_architecture: str | ModelArchitecture):
+def get_model_default_image_size(model_architecture: str | ModelArchitecture | None):
    if isinstance(model_architecture, str):
-        model_architecture = iconfig.MODEL_WEIGHT_CONFIG_LOOKUP.get(
+        model_architecture = iconfig.MODEL_ARCHITECTURE_LOOKUP.get(
            model_architecture, None
        )
    default_size = None
--- a/imaginairy/modules/cldm.py
+++ b/imaginairy/modules/cldm.py
@@ -2,7 +2,7 @@ import torch
 from torch import nn

 from imaginairy.modules.attention import SpatialTransformer
-from imaginairy.modules.diffusion.ddpm import LatentDiffusion
+from imaginairy.modules.diffusion.ddpm import LatentDiffusion  # type: ignore
 from imaginairy.modules.diffusion.openaimodel import (
    AttentionBlock,
    Downsample,
--- a/imaginairy/modules/diffusion/ddpm.py
+++ b/imaginairy/modules/diffusion/ddpm.py
@@ -890,7 +890,7 @@ class LatentDiffusion(DDPM):
        denoise_grid = make_grid(denoise_grid, nrow=n_imgs_per_row)
        return denoise_grid

-    def get_first_stage_encoding(self, encoder_posterior):
+    def get_first_stage_encoding(self, encoder_posterior) -> torch.Tensor:
        if isinstance(encoder_posterior, DiagonalGaussianDistribution):
            z = encoder_posterior.mode()
        elif isinstance(encoder_posterior, torch.Tensor):
--- a/imaginairy/modules/diffusion/model.py
+++ b/imaginairy/modules/diffusion/model.py
@@ -1,7 +1,6 @@
 # pytorch_diffusion + derived encoder decoder
 import gc
 import math
-from typing import Any, Optional

 import numpy as np
 import torch
@@ -300,7 +299,7 @@ class MemoryEfficientAttnBlock(nn.Module):
        self.proj_out = torch.nn.Conv2d(
            in_channels, in_channels, kernel_size=1, stride=1, padding=0
        )
-        self.attention_op: Optional[Any] = None
+        self.attention_op = None

    def forward(self, x):
        h_ = x
--- a/imaginairy/modules/refiners_sd.py
+++ b/imaginairy/modules/refiners_sd.py
@@ -62,13 +62,13 @@ class TileModeMixin(nn.Module):
            if isinstance(m, nn.Conv2d):
                if not hasattr(m, "_orig_conv_forward"):
                    # patch with a function that can handle tiling in a single direction
-                    m._initial_padding_mode = m.padding_mode
-                    m._orig_conv_forward = m._conv_forward
-                    m._conv_forward = _tile_mode_conv2d_conv_forward.__get__(
+                    m._initial_padding_mode = m.padding_mode  # type: ignore
+                    m._orig_conv_forward = m._conv_forward  # type: ignore
+                    m._conv_forward = _tile_mode_conv2d_conv_forward.__get__(  # type: ignore
                        m, nn.Conv2d
                    )
-                m.padding_modeX = "circular" if tile_x else "constant"
-                m.padding_modeY = "circular" if tile_y else "constant"
+                m.padding_modeX = "circular" if tile_x else "constant"  # type: ignore
+                m.padding_modeY = "circular" if tile_y else "constant"  # type: ignore
                if m.padding_modeY == m.padding_modeX:
                    m.padding_mode = m.padding_modeX
                m.paddingX = (
@@ -76,13 +76,13 @@ class TileModeMixin(nn.Module):
                    m._reversed_padding_repeated_twice[1],
                    0,
                    0,
-                )
+                )  # type: ignore
                m.paddingY = (
                    0,
                    0,
                    m._reversed_padding_repeated_twice[2],
                    m._reversed_padding_repeated_twice[3],
-                )
+                )  # type: ignore


 class StableDiffusion_1(TileModeMixin, RefinerStableDiffusion_1):
@@ -291,7 +291,9 @@ def monkeypatch_sd1controlnetadapter():
            dtype=target.dtype,
        )

-        self._controlnet: list[Controlnet] = [controlnet]  # not registered by PyTorch
+        self._controlnet: list[Controlnet] = [  # type: ignore
+            controlnet
+        ]  # not registered by PyTorch

        with self.setup_adapter(target):
            super(SD1ControlnetAdapter, self).__init__(target)
--- a/imaginairy/safety.py
+++ b/imaginairy/safety.py
@@ -21,7 +21,7 @@ class SafetyResult:
        self.special_care_scores = {}
        self.is_filtered = False

-    def add_special_care_score(self, concept_idx, abs_score, threshold):
+    def add_special_care_score(self, concept_idx: int, abs_score, threshold):
        adjustment = self._default_adjustment
        adjusted_score = round(abs_score - threshold + adjustment, 3)
        try:
@@ -138,8 +138,8 @@ def monkeypatch_safety_cosine_distance():
    safety_checker_mod.cosine_distance = cosine_distance_float32


-_CONCEPT_DESCRIPTIONS = []
-_SPECIAL_CARE_DESCRIPTIONS = []
+_CONCEPT_DESCRIPTIONS: list[str] = []
+_SPECIAL_CARE_DESCRIPTIONS: list[str] = []


 def create_safety_score(img, safety_mode=SafetyMode.STRICT):
--- a/imaginairy/samplers/kdiff.py
+++ b/imaginairy/samplers/kdiff.py
@@ -1,5 +1,6 @@
 # pylama:ignore=W0613
 from abc import ABC
+from typing import Callable

 import torch
 from torch import nn
@@ -58,7 +59,7 @@ def sample_dpm_fast(model, x, sigmas, extra_args=None, disable=False, callback=N


 class KDiffusionSolver(ImageSolver, ABC):
-    sampler_func: callable
+    sampler_func: Callable

    def __init__(self, model):
        super().__init__(model)
--- a/imaginairy/schema.py
+++ b/imaginairy/schema.py
@@ -279,7 +279,7 @@ class ImaginePrompt(BaseModel, protected_namespaces=()):
    mask_mode: MaskMode = MaskMode.REPLACE
    mask_modify_original: bool = True
    outpaint: str | None = ""
-    model_weights: config.ModelWeightsConfig = Field(
+    model_weights: config.ModelWeightsConfig = Field(  # type: ignore
        default=config.DEFAULT_MODEL_WEIGHTS, validate_default=True
    )
    solver_type: str = Field(default=config.DEFAULT_SOLVER, validate_default=True)
@@ -504,7 +504,7 @@ class ImaginePrompt(BaseModel, protected_namespaces=()):
            model_weights = config.DEFAULT_MODEL_WEIGHTS
        from imaginairy.model_manager import resolve_model_weights_config

-        should_use_inpainting = (
+        should_use_inpainting = bool(
            data.get("mask_image") or data.get("mask_prompt") or data.get("outpaint")
        )
        should_use_inpainting_weights = (
--- a/imaginairy/train.py
+++ b/imaginairy/train.py
@@ -19,7 +19,7 @@ try:
 except ImportError:
    # let's not break all of imaginairy just because a training import doesn't exist in an older version of PL
    # Use >= 1.6.0 to make this work
-    DDPStrategy = None
+    DDPStrategy = None  # type: ignore
 import contextlib

 from pytorch_lightning.trainer import Trainer
--- a/imaginairy/video_sample.py
+++ b/imaginairy/video_sample.py
@@ -6,7 +6,7 @@ import re
 import time
 from glob import glob
 from pathlib import Path
-from typing import Optional
+from typing import Any, Optional

 import cv2
 import numpy as np
@@ -31,9 +31,10 @@ logger = logging.getLogger(__name__)


 def generate_video(
-    input_path: str = "other/images/sound-music.jpg",  # Can either be image file or folder with image files
-    num_frames: Optional[int] = None,
-    num_steps: Optional[int] = None,
+    input_path: str,  # Can either be image file or folder with image files
+    output_folder: str | None = None,
+    num_frames: int = 6,
+    num_steps: int = 30,
    model_name: str = "svd_xt",
    fps_id: int = 6,
    output_fps: int = 6,
@@ -42,7 +43,6 @@ def generate_video(
    seed: Optional[int] = None,
    decoding_t: int = 1,  # Number of frames decoded at a time! This eats most VRAM. Reduce if necessary.
    device: Optional[str] = None,
-    output_folder: Optional[str] = None,
    repetitions=1,
 ):
    """
@@ -77,7 +77,8 @@ def generate_video(

    num_frames = default(num_frames, video_model_config.defaults.get("frames", 12))
    num_steps = default(num_steps, video_model_config.defaults.get("steps", 30))
-    output_folder = default(output_folder, "outputs/video/")
+    output_folder_str = default(output_folder, "outputs/video/")
+    del output_folder
    video_config_path = f"{PKG_ROOT}/{video_model_config.architecture.config_path}"

    logger.info(
@@ -119,11 +120,10 @@ def generate_video(
    for _ in range(repetitions):
        for input_path in all_img_paths:
            if input_path.startswith("http"):
-                image = LazyLoadingImage(url=input_path)
+                image = LazyLoadingImage(url=input_path).as_pillow()
            else:
-                image = LazyLoadingImage(filepath=input_path)
+                image = LazyLoadingImage(filepath=input_path).as_pillow()
            crop_coords = None
-            image = image.as_pillow()
            if image.mode == "RGBA":
                image = image.convert("RGB")
            if image.size != expected_size:
@@ -180,7 +180,7 @@ def generate_video(
                    "Large fps value! This may lead to suboptimal performance."
                )

-            value_dict = {}
+            value_dict: dict[str, Any] = {}
            value_dict["motion_bucket_id"] = motion_bucket_id
            value_dict["fps_id"] = fps_id
            value_dict["cond_aug"] = cond_aug
@@ -250,14 +250,14 @@ def generate_video(
                    left, upper, right, lower = crop_coords
                    samples = samples[:, :, upper:lower, left:right]

-                os.makedirs(output_folder, exist_ok=True)
-                base_count = len(glob(os.path.join(output_folder, "*.mp4"))) + 1
+                os.makedirs(output_folder_str, exist_ok=True)
+                base_count = len(glob(os.path.join(output_folder_str, "*.mp4"))) + 1
                source_slug = make_safe_filename(input_path)
                video_filename = f"{base_count:06d}_{model_name}_{seed}_{fps_id}fps_{source_slug}.mp4"
-                video_path = os.path.join(output_folder, video_filename)
+                video_path = os.path.join(output_folder_str, video_filename)
                writer = cv2.VideoWriter(
                    video_path,
-                    cv2.VideoWriter_fourcc(*"MP4V"),
+                    cv2.VideoWriter_fourcc(*"MP4V"),  # type: ignore
                    output_fps,
                    (samples.shape[-1], samples.shape[-2]),
                )
@@ -332,7 +332,7 @@ def load_model(
 ):
    oconfig = OmegaConf.load(config)
    ckpt_path = get_cached_url_path(weights_url)
-    oconfig["model"]["params"]["ckpt_path"] = ckpt_path
+    oconfig["model"]["params"]["ckpt_path"] = ckpt_path  # type: ignore
    if device == "cuda":
        oconfig.model.params.conditioner_config.params.emb_models[
            0
@@ -407,13 +407,3 @@ def make_safe_filename(input_string):
    safe_name = re.sub(r"[^a-zA-Z0-9\-]", "", name_without_extension)

    return safe_name
-
-
-if __name__ == "__main__":
-    # configure logging
-    logging.basicConfig(
-        level=logging.INFO,
-        format="%(asctime)s %(levelname)s %(name)s: %(message)s",
-        datefmt="%Y-%m-%d %H:%M:%S",
-    )
-    generate_video()
--- a/imaginairy/weight_management/conversion.py
+++ b/imaginairy/weight_management/conversion.py
@@ -6,7 +6,7 @@ from typing import TYPE_CHECKING
 from imaginairy.weight_management import utils

 if TYPE_CHECKING:
-    from torch import Tensor
+    from torch import Tensor  # noqa


@dataclass
@@ -69,8 +69,8 @@ class WeightMap:
        source_keys = set(source_weights.keys())
        return source_keys.issubset(self.all_valid_prefixes)

-    def cast_weights(self, source_weights):
-        converted_state_dict: dict[str, Tensor] = {}
+    def cast_weights(self, source_weights) -> dict[str, "Tensor"]:
+        converted_state_dict: dict[str, "Tensor"] = {}
        for source_key in source_weights:
            source_prefix, suffix = source_key.rsplit(sep=".", maxsplit=1)
            # handle aliases
@@ -96,7 +96,7 @@ def load_state_dict_conversion_maps() -> dict[str, dict]:
    from importlib.resources import files

    for file in files("imaginairy").joinpath("weight_conversion/maps").iterdir():
-        if file.is_file() and file.suffix == ".json":
+        if file.is_file() and file.suffix == ".json":  # type: ignore
            conversion_maps[file.name] = json.loads(file.read_text())
    return conversion_maps

--- a/setup.py
+++ b/setup.py
@@ -8,7 +8,7 @@ is_for_windows = len(sys.argv) >= 3 and sys.argv[2].startswith("--plat-name=win"

 if is_for_windows:
    scripts = None
-    entry_points = {
+    entry_points: dict | None = {
        "console_scripts": [
            "imagine=imaginairy.cli.main:imagine_cmd",
            "aimg=imaginairy.cli.main:aimg",