fix: remove padding approach

- padding didn't make animations better
- some changes to better support CPU generation (not yet working)
- better log output coloring
- better log messages when CUDA is not found
6 months ago · 8267482aad
parent 6e1c44dae7
commit 8267482aad
3 changed files with 54 additions and 19 deletions
--- a/imaginairy/log_utils.py
+++ b/imaginairy/log_utils.py
@@ -207,6 +207,7 @@ def conditioning_to_img(conditioning):
 class ColorIndentingFormatter(logging.Formatter):
    RED = "\033[31m"
    GREEN = "\033[32m"
+    YELLOW = "\033[33m"
    RESET = "\033[0m"

    def format(self, record):
@@ -215,11 +216,13 @@ class ColorIndentingFormatter(logging.Formatter):
        reset = ""
        if record.levelno >= logging.ERROR:
            color = self.RED
+        elif record.levelno >= logging.WARNING:
+            color = self.YELLOW

        if _CURRENT_LOGGING_CONTEXT is not None:
            s = f"    {s}"

-        if not s.startswith("    "):
+        if color is None and not s.startswith("    "):
            color = self.GREEN

        if color:
--- a/imaginairy/modules/sgm/diffusionmodules/sampling.py
+++ b/imaginairy/modules/sgm/diffusionmodules/sampling.py
@@ -3,7 +3,7 @@
 """


-from typing import Dict, Optional, Union
+from typing import Dict, Union

 import torch
 from omegaconf import ListConfig, OmegaConf
@@ -16,7 +16,7 @@ from imaginairy.modules.sgm.diffusionmodules.sampling_utils import (
    to_neg_log_sigma,
    to_sigma,
 )
-from imaginairy.utils import default, get_device, instantiate_from_config
+from imaginairy.utils import default, instantiate_from_config
 from imaginairy.vendored.k_diffusion.utils import append_dims

 DEFAULT_GUIDER = {
@@ -31,9 +31,8 @@ class BaseDiffusionSampler:
        num_steps: Union[int, None] = None,
        guider_config: Union[Dict, ListConfig, OmegaConf, None] = None,
        verbose: bool = False,
-        device: Optional[str] = None,
+        # device: Optional[str] = None,
    ):
-        device = default(device, get_device)
        self.num_steps = num_steps
        self.discretization = instantiate_from_config(discretization_config)
        self.guider = instantiate_from_config(
@@ -43,11 +42,11 @@ class BaseDiffusionSampler:
            )
        )
        self.verbose = verbose
-        self.device = device
+        # self.device = device

    def prepare_sampling_loop(self, x, cond, uc=None, num_steps=None):
        sigmas = self.discretization(
-            self.num_steps if num_steps is None else num_steps, device=self.device
+            self.num_steps if num_steps is None else num_steps, device=x.device
        )
        uc = default(uc, cond)

--- a/imaginairy/video_sample.py
+++ b/imaginairy/video_sample.py
@@ -48,8 +48,24 @@ def generate_video(
    Simple script to generate a single sample conditioned on an image `input_path` or multiple images, one for each
    image file in folder `input_path`. If you run out of VRAM, try decreasing `decoding_t`.
    """
-    start_time = time.perf_counter()
    device = default(device, get_device)
+
+    if device == "mps":
+        msg = "Apple Silicon MPS (M1, M2, etc) is not currently supported for video generation. Switching to cpu generation."
+        logger.warning(msg)
+        device = "cpu"
+
+    elif not torch.cuda.is_available():
+        msg = (
+            "CUDA is not available. This will be verrrry slow or not work at all.\n"
+            "If you have a GPU, make sure you have CUDA installed and PyTorch is compiled with CUDA support.\n"
+            "Unfortunately, we cannot automatically install the proper version.\n\n"
+            "You can install the proper version by following these directions:\n"
+            "https://pytorch.org/get-started/locally/"
+        )
+        logger.warning(msg)
+
+    start_time = time.perf_counter()
    seed = default(seed, random.randint(0, 1000000))
    output_fps = default(output_fps, fps_id)

@@ -64,7 +80,7 @@ def generate_video(
    video_config_path = f"{PKG_ROOT}/{video_model_config['config_path']}"

    logger.info(
-        f"Generating {num_frames} frame video from {input_path}. Device: {device} seed: {seed}"
+        f"Generating a {num_frames} frame video from {input_path}. Device:{device} seed:{seed}"
    )
    model, safety_filter = load_model(
        config=video_config_path,
@@ -122,9 +138,18 @@ def generate_video(
                x = (background.width - image.width) // 2
                y = (background.height - image.height) // 2
                background.paste(image, (x, y))
-                crop_coords = (x, y, x + image.width, y + image.height)
-
-                image = background
+                # crop_coords = (x, y, x + image.width, y + image.height)
+
+                # image = background
+            w, h = image.size
+            snap_to = 64
+            if h % snap_to != 0 or w % snap_to != 0:
+                width = w - w % snap_to
+                height = h - h % snap_to
+                image = image.resize((width, height))
+                logger.warning(
+                    f"Your image is of size {h}x{w} which is not divisible by 64. We are resizing to {height}x{width}!"
+                )

            image = ToTensor()(image)
            image = image * 2.0 - 1.0
@@ -163,7 +188,14 @@ def generate_video(
            value_dict["cond_aug"] = cond_aug

            with torch.no_grad(), platform_appropriate_autocast():
-                reload_model(model.conditioner)
+                reload_model(model.conditioner, device=device)
+                if device == "cpu":
+                    model.conditioner.to(torch.float32)
+                for k in value_dict:
+                    if isinstance(value_dict[k], torch.Tensor):
+                        value_dict[k] = value_dict[k].to(
+                            next(model.conditioner.parameters()).dtype
+                        )
                batch, batch_uc = get_batch(
                    get_unique_embedder_keys_from_conditioner(model.conditioner),
                    value_dict,
@@ -196,18 +228,18 @@ def generate_video(
                additional_model_inputs["num_video_frames"] = batch["num_video_frames"]

                def denoiser(_input, sigma, c):
-                    _input = _input.half()
+                    _input = _input.half().to(device)
                    return model.denoiser(
                        model.model, _input, sigma, c, **additional_model_inputs
                    )

-                reload_model(model.denoiser)
-                reload_model(model.model)
+                reload_model(model.denoiser, device=device)
+                reload_model(model.model, device=device)
                samples_z = model.sampler(denoiser, randn, cond=c, uc=uc)
                unload_model(model.model)
                unload_model(model.denoiser)

-                reload_model(model.first_stage_model)
+                reload_model(model.first_stage_model, device=device)
                model.en_and_decode_n_samples_a_time = decoding_t
                samples_x = model.decode_first_stage(samples_z)
                samples = torch.clamp((samples_x + 1.0) / 2.0, min=0.0, max=1.0)
@@ -332,8 +364,9 @@ def unload_model(model):
            torch.cuda.empty_cache()


-def reload_model(model):
-    model.to(get_device())
+def reload_model(model, device=None):
+    device = default(device, get_device)
+    model.to(device)


 def pillow_fit_image_within(