feature: auto-resize input images and crop output videos.

This means any image can be fed to the video generator without worrying about its size: the image is resized to fit within 1024x576, centered on a white background, and the finished video is cropped back to the original image's region.

- better filenames for generated videos (now include the fps and a slug of the source image name)
- also output an H.264-encoded copy of each video
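
As a rough illustration (not part of the diff), the updated API could be driven like this; the module path and image filename are assumptions, while the keyword names all appear in the diff below:

# hedged sketch: module path and input file are hypothetical
from imaginairy.api.video_sample import generate_video

generate_video(
    input_path="my_photo.png",  # any size; it is fit within 1024x576 and padded
    num_frames=14,
    model_name="svd",
    fps_id=7,
    repetitions=2,  # new parameter introduced by this commit
)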
pull/404/head
Bryce 6 months ago
parent f6c9927d0c
commit aa91d0a9c9

@@ -12,7 +12,9 @@ logger = logging.getLogger(__name__)
help="Input path for image file.",
)
@click.option("--num-frames", default=None, type=int, help="Number of frames.")
@click.option("--num-steps", default=None, type=int, help="Number of steps.")
@click.option(
"-s", "--steps", default=None, type=int, help="Number of diffusion steps."
)
@click.option(
"--model",
default="svd",
@@ -47,7 +49,7 @@ logger = logging.getLogger(__name__)
def videogen_cmd(
start_image,
num_frames,
num_steps,
steps,
model,
fps,
output_fps,
@@ -72,18 +74,18 @@ def videogen_cmd(
configure_logging()
output_fps = output_fps or fps
for i in range(repeats):
logger.info(f"Generating video from image {start_image}")
generate_video(
input_path=start_image,
num_frames=num_frames,
num_steps=num_steps,
model_name=model,
fps_id=fps,
output_fps=output_fps,
motion_bucket_id=motion_amount,
cond_aug=cond_aug,
seed=seed,
decoding_t=decoding_t,
output_folder=output_folder,
)
generate_video(
input_path=start_image,
num_frames=num_frames,
num_steps=steps,
model_name=model,
fps_id=fps,
output_fps=output_fps,
motion_bucket_id=motion_amount,
cond_aug=cond_aug,
seed=seed,
decoding_t=decoding_t,
output_folder=output_folder,
repetitions=repeats,
)

@@ -692,7 +692,9 @@ def open_weights(filepath, device=None):
from refiners.fluxion.utils import safe_open
with safe_open(path=filepath, framework="pytorch", device=device) as tensors:
state_dict = {key: tensors.get_tensor(key) for key in tensors}
state_dict = {
key: tensors.get_tensor(key) for key in tensors.keys() # noqa
}
else:
import torch

@@ -13,7 +13,7 @@ from imaginairy.modules.sgm.autoencoding.lpips.vqperceptual import (
hinge_d_loss,
vanilla_d_loss,
)
from imaginairy.modules.util import default, instantiate_from_config
from imaginairy.utils import default, instantiate_from_config
class GeneralLPIPSWithDiscriminator(nn.Module):

@@ -2,7 +2,7 @@ import torch
import torch.nn as nn
from imaginairy.modules.sgm.autoencoding.lpips.loss.lpips import LPIPS
from imaginairy.modules.util import default, instantiate_from_config
from imaginairy.utils import default, instantiate_from_config
class LatentLPIPS(nn.Module):

@@ -2,6 +2,7 @@ import logging
import math
import os
import random
import re
import time
from glob import glob
from pathlib import Path
@@ -12,6 +13,7 @@ import numpy as np
import torch
from einops import rearrange, repeat
from omegaconf import OmegaConf
from PIL import Image
from torchvision.transforms import ToTensor
from imaginairy import LazyLoadingImage, config
@@ -40,6 +42,7 @@ def generate_video(
decoding_t: int = 1, # Number of frames decoded at a time! This eats most VRAM. Reduce if necessary.
device: Optional[str] = None,
output_folder: Optional[str] = None,
repetitions=1,
):
"""
Simple script to generate a single sample conditioned on an image `input_path` or multiple images, one for each
@@ -50,8 +53,6 @@ def generate_video(
seed = default(seed, random.randint(0, 1000000))
output_fps = default(output_fps, fps_id)
logger.info(f"Device: {device} seed: {seed}")
torch.cuda.reset_peak_memory_stats()
video_model_config = config.video_models.get(model_name, None)
if video_model_config is None:
@@ -63,6 +64,9 @@ def generate_video(
output_folder = default(output_folder, "outputs/video/")
video_config_path = f"{PKG_ROOT}/{video_model_config['config_path']}"
logger.info(
f"Generating {num_frames} frame video from {input_path}. Device: {device} seed: {seed}"
)
model, safety_filter = load_model(
config=video_config_path,
device="cpu",
@@ -71,11 +75,11 @@ def generate_video(
weights_url=video_model_config["weights_url"],
)
torch.manual_seed(seed)
if input_path.startswith("http"):
input_images = [LazyLoadingImage(url=input_path)]
all_img_paths = [input_path]
else:
path = Path(input_path)
all_img_paths = []
if path.is_file():
if any(input_path.endswith(x) for x in ["jpg", "jpeg", "png"]):
all_img_paths = [input_path]
@@ -84,7 +88,7 @@ def generate_video(
elif path.is_dir():
all_img_paths = sorted(
[
f
str(f)
for f in path.iterdir()
if f.is_file() and f.suffix.lower() in [".jpg", ".jpeg", ".png"]
]
@@ -93,134 +97,159 @@ def generate_video(
raise ValueError("Folder does not contain any images.")
else:
raise ValueError
input_images = [LazyLoadingImage(filepath=str(x)) for x in all_img_paths]
for image in input_images:
image = image.as_pillow()
if image.mode == "RGBA":
image = image.convert("RGB")
w, h = image.size
if h % 64 != 0 or w % 64 != 0:
width, height = (x - x % 64 for x in (w, h))
image = image.resize((width, height))
logger.info(
f"Your image is of size {h}x{w} which is not divisible by 64. We are resizing to {height}x{width}!"
)
expected_size = (1024, 576)
for _ in range(repetitions):
for input_path in all_img_paths:
if input_path.startswith("http"):
image = LazyLoadingImage(url=input_path)
else:
image = LazyLoadingImage(filepath=input_path)
crop_coords = None
image = image.as_pillow()
if image.mode == "RGBA":
image = image.convert("RGB")
if image.size != expected_size:
logger.info(
f"Resizing image from {image.size} to {expected_size}. (w, h)"
)
image = pillow_fit_image_within(
image, max_height=expected_size[1], max_width=expected_size[0]
)
logger.debug(f"Image is now of size: {image.size}")
background = Image.new("RGB", expected_size, "white")
# Calculate the position to center the original image
x = (background.width - image.width) // 2
y = (background.height - image.height) // 2
background.paste(image, (x, y))
crop_coords = (x, y, x + image.width, y + image.height)
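# remember the pasted region so the finished frames can be cropped back to the source image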
image = background
image = ToTensor()(image)
image = image * 2.0 - 1.0
image = image.unsqueeze(0).to(device)
H, W = image.shape[2:]
assert image.shape[1] == 3
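# F: VAE spatial downsampling factor; C: number of latent channels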
F = 8
C = 4
shape = (num_frames, C, H // F, W // F)
if expected_size != (W, H):
logger.warning(
f"The {W, H} image you provided is not {expected_size}. This leads to suboptimal performance as model was only trained on 576x1024. Consider increasing `cond_aug`."
)
if motion_bucket_id > 255:
logger.warning(
"High motion bucket! This may lead to suboptimal performance."
)
image = ToTensor()(image)
image = image * 2.0 - 1.0
image = image.unsqueeze(0).to(device)
H, W = image.shape[2:]
assert image.shape[1] == 3
F = 8
C = 4
shape = (num_frames, C, H // F, W // F)
if (H, W) != (576, 1024):
logger.warning(
"The image you provided is not 576x1024. This leads to suboptimal performance as model was only trained on 576x1024. Consider increasing `cond_aug`."
)
if motion_bucket_id > 255:
logger.warning(
"High motion bucket! This may lead to suboptimal performance."
)
if fps_id < 5:
logger.warning(
"Small fps value! This may lead to suboptimal performance."
)
if fps_id < 5:
logger.warning("Small fps value! This may lead to suboptimal performance.")
if fps_id > 30:
logger.warning("Large fps value! This may lead to suboptimal performance.")
value_dict = {}
value_dict["motion_bucket_id"] = motion_bucket_id
value_dict["fps_id"] = fps_id
value_dict["cond_aug"] = cond_aug
value_dict["cond_frames_without_noise"] = image
value_dict["cond_frames"] = image + cond_aug * torch.randn_like(image)
value_dict["cond_aug"] = cond_aug
with torch.no_grad(), platform_appropriate_autocast():
reload_model(model.conditioner)
batch, batch_uc = get_batch(
get_unique_embedder_keys_from_conditioner(model.conditioner),
value_dict,
[1, num_frames],
T=num_frames,
device=device,
)
c, uc = model.conditioner.get_unconditional_conditioning(
batch,
batch_uc=batch_uc,
force_uc_zero_embeddings=[
"cond_frames",
"cond_frames_without_noise",
],
)
unload_model(model.conditioner)
for k in ["crossattn", "concat"]:
uc[k] = repeat(uc[k], "b ... -> b t ...", t=num_frames)
uc[k] = rearrange(uc[k], "b t ... -> (b t) ...", t=num_frames)
c[k] = repeat(c[k], "b ... -> b t ...", t=num_frames)
c[k] = rearrange(c[k], "b t ... -> (b t) ...", t=num_frames)
randn = torch.randn(shape, device=device)
additional_model_inputs = {}
additional_model_inputs["image_only_indicator"] = torch.zeros(
2, num_frames
).to(device)
additional_model_inputs["num_video_frames"] = batch["num_video_frames"]
def denoiser(_input, sigma, c):
_input = _input.half()
return model.denoiser(
model.model, _input, sigma, c, **additional_model_inputs
if fps_id > 30:
logger.warning(
"Large fps value! This may lead to suboptimal performance."
)
reload_model(model.denoiser)
reload_model(model.model)
samples_z = model.sampler(denoiser, randn, cond=c, uc=uc)
unload_model(model.model)
unload_model(model.denoiser)
reload_model(model.first_stage_model)
model.en_and_decode_n_samples_a_time = decoding_t
samples_x = model.decode_first_stage(samples_z)
samples = torch.clamp((samples_x + 1.0) / 2.0, min=0.0, max=1.0)
unload_model(model.first_stage_model)
os.makedirs(output_folder, exist_ok=True)
base_count = len(glob(os.path.join(output_folder, "*.mp4"))) + 1
video_filename = f"{base_count:06d}_{model_name}_{seed}.mp4"
video_path = os.path.join(output_folder, video_filename)
writer = cv2.VideoWriter(
video_path,
cv2.VideoWriter_fourcc(*"MP4V"),
output_fps,
(samples.shape[-1], samples.shape[-2]),
)
value_dict = {}
value_dict["motion_bucket_id"] = motion_bucket_id
value_dict["fps_id"] = fps_id
value_dict["cond_aug"] = cond_aug
value_dict["cond_frames_without_noise"] = image
value_dict["cond_frames"] = image + cond_aug * torch.randn_like(image)
value_dict["cond_aug"] = cond_aug
with torch.no_grad(), platform_appropriate_autocast():
reload_model(model.conditioner)
batch, batch_uc = get_batch(
get_unique_embedder_keys_from_conditioner(model.conditioner),
value_dict,
[1, num_frames],
T=num_frames,
device=device,
)
c, uc = model.conditioner.get_unconditional_conditioning(
batch,
batch_uc=batch_uc,
force_uc_zero_embeddings=[
"cond_frames",
"cond_frames_without_noise",
],
)
unload_model(model.conditioner)
for k in ["crossattn", "concat"]:
uc[k] = repeat(uc[k], "b ... -> b t ...", t=num_frames)
uc[k] = rearrange(uc[k], "b t ... -> (b t) ...", t=num_frames)
c[k] = repeat(c[k], "b ... -> b t ...", t=num_frames)
c[k] = rearrange(c[k], "b t ... -> (b t) ...", t=num_frames)
randn = torch.randn(shape, device=device)
additional_model_inputs = {}
additional_model_inputs["image_only_indicator"] = torch.zeros(
2, num_frames
).to(device)
additional_model_inputs["num_video_frames"] = batch["num_video_frames"]
def denoiser(_input, sigma, c):
_input = _input.half()
return model.denoiser(
model.model, _input, sigma, c, **additional_model_inputs
)
reload_model(model.denoiser)
reload_model(model.model)
samples_z = model.sampler(denoiser, randn, cond=c, uc=uc)
unload_model(model.model)
unload_model(model.denoiser)
reload_model(model.first_stage_model)
model.en_and_decode_n_samples_a_time = decoding_t
samples_x = model.decode_first_stage(samples_z)
samples = torch.clamp((samples_x + 1.0) / 2.0, min=0.0, max=1.0)
unload_model(model.first_stage_model)
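# crop the white padding back off so the video matches the source image's region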
if crop_coords:
left, upper, right, lower = crop_coords
samples = samples[:, :, upper:lower, left:right]
os.makedirs(output_folder, exist_ok=True)
base_count = len(glob(os.path.join(output_folder, "*.mp4"))) + 1
source_slug = make_safe_filename(input_path)
video_filename = f"{base_count:06d}_{model_name}_{seed}_{fps_id}fps_{source_slug}.mp4"
video_path = os.path.join(output_folder, video_filename)
writer = cv2.VideoWriter(
video_path,
cv2.VideoWriter_fourcc(*"MP4V"),
output_fps,
(samples.shape[-1], samples.shape[-2]),
)
samples = safety_filter(samples)
vid = (
(rearrange(samples, "t c h w -> t h w c") * 255)
.cpu()
.numpy()
.astype(np.uint8)
samples = safety_filter(samples)
vid = (
(rearrange(samples, "t c h w -> t h w c") * 255)
.cpu()
.numpy()
.astype(np.uint8)
)
for frame in vid:
frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
writer.write(frame)
writer.release()
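# MP4V output is poorly supported by browsers; also write an H.264 copy via ffmpeg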
video_path_h264 = video_path[:-4] + "_h264.mp4"
os.system(f"ffmpeg -i {video_path} -c:v libx264 {video_path_h264}")
if torch.cuda.is_available():
peak_memory_usage = torch.cuda.max_memory_allocated()
msg = f"Peak memory usage: {peak_memory_usage / (1024 ** 2)} MB"
logger.info(msg)
duration = time.perf_counter() - start_time
logger.info(
f"Video of {num_frames} frames generated in {duration:.2f} seconds and saved to {video_path}\n"
)
for frame in vid:
frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
writer.write(frame)
writer.release()
if torch.cuda.is_available():
peak_memory_usage = torch.cuda.max_memory_allocated()
msg = f"Peak memory usage: {peak_memory_usage / (1024 ** 2)} MB"
logger.info(msg)
duration = time.perf_counter() - start_time
logger.info(
f"Video of {num_frames} frames generated in {duration:.2f} seconds and saved to {video_path}\n"
)
def get_unique_embedder_keys_from_conditioner(conditioner):
@@ -310,6 +339,45 @@ def reload_model(model):
model.to(get_device())
def pillow_fit_image_within(
image: Image.Image, max_height=512, max_width=512, convert="RGB", snap_size=8
):
image = image.convert(convert)
w, h = image.size
resize_ratio = 1
if w > max_width or h > max_height:
resize_ratio = min(max_width / w, max_height / h)
elif w < max_width and h < max_height:
# it's smaller than our target image, enlarge
resize_ratio = min(max_width / w, max_height / h)
if resize_ratio != 1:
w, h = int(w * resize_ratio), int(h * resize_ratio)
# resize to integer multiple of snap_size
w -= w % snap_size
h -= h % snap_size
if (w, h) != image.size:
image = image.resize((w, h), resample=Image.Resampling.LANCZOS)
return image
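
For intuition, a small sketch of the fit logic above (values follow from the ratio and snapping math; the example image is hypothetical):

# a 4000x3000 input against max 1024x576:
# resize_ratio = min(1024/4000, 576/3000) = 0.192 -> 768x576, already a multiple of 8
from PIL import Image
img = Image.new("RGB", (4000, 3000))
fitted = pillow_fit_image_within(img, max_height=576, max_width=1024)
assert fitted.size == (768, 576)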
def make_safe_filename(input_string):
stripped_url = re.sub(r"^https?://[^/]+/", "", input_string)
# Remove directory path if present
base_name = os.path.basename(stripped_url)
# Remove file extension
name_without_extension = os.path.splitext(base_name)[0]
# Keep only alphanumeric characters and dashes
safe_name = re.sub(r"[^a-zA-Z0-9\-]", "", name_without_extension)
return safe_name
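
Illustrative inputs and outputs for make_safe_filename, traced through the regexes above:

make_safe_filename("https://example.com/photos/my photo (1).png")  # -> "myphoto1"
make_safe_filename("outputs/rocket-launch.jpg")  # -> "rocket-launch"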
if __name__ == "__main__":
# configure logging
logging.basicConfig(
