|
|
|
import os
|
|
|
|
import os.path
|
|
|
|
from functools import lru_cache
|
|
|
|
|
|
|
|
import torch
|
|
|
|
from torchvision import transforms
|
|
|
|
from torchvision.transforms.functional import InterpolationMode
|
|
|
|
|
|
|
|
from imaginairy.model_manager import get_cached_url_path
|
|
|
|
from imaginairy.utils import get_device
|
|
|
|
from imaginairy.vendored.blip.blip import BLIP_Decoder, load_checkpoint
|
|
|
|
|
|
|
|
# Resolve the compute device once at import time.
device = get_device()

# Fall back to CPU when MPS would be selected.
# NOTE(review): presumably BLIP uses ops unsupported on Apple's MPS
# backend — confirm before changing.
if "mps" in device:
    device = "cpu"

# Square edge length (pixels) that images are resized to before BLIP
# evaluation; matches the image_size the decoder is constructed with below.
BLIP_EVAL_SIZE = 384
|
|
|
|
|
|
|
|
|
|
|
|
@lru_cache
def blip_model():
    """Build, load, and return the BLIP caption decoder, ready for inference.

    The checkpoint is fetched via ``get_cached_url_path`` (downloaded once
    and reused from the local cache), the model is switched to eval mode,
    and it is moved to the module-level ``device``.  ``lru_cache`` makes
    this a lazy singleton: the expensive load happens on first call only.
    """
    # Imported inside the function, presumably to avoid a circular import
    # at module load time — TODO confirm.
    from imaginairy.paths import PKG_ROOT

    config_path = os.path.join(
        PKG_ROOT, "vendored", "blip", "configs", "med_config.json"
    )
    url = "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model*_base_caption.pth"

    model = BLIP_Decoder(image_size=BLIP_EVAL_SIZE, vit="base", med_config=config_path)
    cached_url_path = get_cached_url_path(url)
    # load_checkpoint also returns a status message; it is not used here.
    model, _ = load_checkpoint(model, cached_url_path)
    model.eval()
    model = model.to(device)
    return model
|
|
|
|
|
|
|
|
|
feature: prompt expansion (#51)
You can use `{}` to randomly pull values from lists. A list of values separated by `|` and enclosed in `{ }` will be randomly drawn from in a non-repeating fashion. Values that are surrounded by `_ _` will pull from a phrase list of the same name. Folders containing .txt phraselist files may be specified via
`--prompt_library_path`. The option may be specified multiple times. Built-in categories:
3d-term, adj-architecture, adj-beauty, adj-detailed, adj-emotion, adj-general, adj-horror, animal, art-movement,
art-site, artist, artist-botanical, artist-surreal, aspect-ratio, bird, body-of-water, body-pose, camera-brand,
camera-model, color, cosmic-galaxy, cosmic-nebula, cosmic-star, cosmic-term, dinosaur, eyecolor, f-stop,
fantasy-creature, fantasy-setting, fish, flower, focal-length, food, fruit, games, gen-modifier, hair, hd,
iso-stop, landscape-type, national-park, nationality, neg-weight, noun-beauty, noun-fantasy, noun-general,
noun-horror, occupation, photo-term, pop-culture, pop-location, punk-style, quantity, rpg-item, scenario-desc,
skin-color, spaceship, style, tree-species, trippy, world-heritage-site
Examples:
`imagine "a {red|black} dog" -r 2 --seed 0` will generate both "a red dog" and "a black dog"
`imagine "a {_color_} dog" -r 4 --seed 0` will generate four different-colored dogs. The colors will be pulled from an included
phraselist of colors.
`imagine "a {_spaceship_|_fruit_|hot air balloon}. low-poly" -r 4 --seed 0` will generate images of spaceships or fruits or a hot air balloon
Credit to [noodle-soup-prompts](https://github.com/WASasquatch/noodle-soup-prompts/) where most, but not all, of the wordlists originate.
2 years ago
|
|
|
def generate_caption(image, min_length=30):
    """Given an image, return a caption."""
    rgb_image = image.convert("RGB")

    # Preprocessing pipeline: resize to BLIP's expected square input,
    # convert to a tensor, then normalize with CLIP's channel statistics.
    preprocess = transforms.Compose(
        [
            transforms.Resize(
                (BLIP_EVAL_SIZE, BLIP_EVAL_SIZE),
                interpolation=InterpolationMode.BICUBIC,
            ),
            transforms.ToTensor(),
            transforms.Normalize(
                (0.48145466, 0.4578275, 0.40821073),
                (0.26862954, 0.26130258, 0.27577711),
            ),
        ]
    )

    # Add a batch dimension and move the tensor to the module-level device.
    image_batch = preprocess(rgb_image).unsqueeze(0).to(device)

    # Inference only — no gradients needed.
    with torch.no_grad():
        captions = blip_model().generate(
            image_batch, sample=True, num_beams=3, max_length=80, min_length=min_length
        )

    # generate() returns one caption per batch item; we passed a single image.
    return captions[0]
|