feature: better masking segmentation from clipseg

pull/37/head
Authored by Bryce 2 years ago; committed by Bryce Drennan
parent c92e5c443f
commit f21c979f08

@@ -89,14 +89,19 @@ revendorize: vendorize_kdiffusion
make af
vendorize_clipseg:
-make download_repo REPO=git@github.com:timojl/clipseg.git PKG=clipseg COMMIT=664ee94393491cdd7ad422f67eb1ce670d3d00e6
+make download_repo REPO=git@github.com:timojl/clipseg.git PKG=clipseg COMMIT=ea54753df1e444c4445bac6e023546b6a41951d8
rm -rf ./imaginairy/vendored/clipseg
mkdir -p ./imaginairy/vendored/clipseg
cp -R ./downloads/clipseg/models/* ./imaginairy/vendored/clipseg/
sed -i '' -e 's#import clip#from imaginairy.vendored import clip#g' ./imaginairy/vendored/clipseg/clipseg.py
rm ./imaginairy/vendored/clipseg/vitseg.py
mv ./imaginairy/vendored/clipseg/clipseg.py ./imaginairy/vendored/clipseg/__init__.py
-wget https://github.com/timojl/clipseg/raw/master/weights/rd64-uni.pth -P ./imaginairy/vendored/clipseg
+# download weights
+rm -rf ./downloads/clipseg-weights
+mkdir -p ./downloads/clipseg-weights
+wget https://owncloud.gwdg.de/index.php/s/ioHbRzFx6th32hn/download -O ./downloads/clipseg-weights/weights.tar
+cd downloads/clipseg-weights && unzip -d weights -j weights.tar
+cp ./downloads/clipseg-weights/weights/rd64-uni-refined.pth ./imaginairy/vendored/clipseg/
vendorize_blip:
make download_repo REPO=git@github.com:salesforce/BLIP.git PKG=blip COMMIT=48211a1594f1321b00f14c9f7a5b4813144b2fb9
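For reference, a rough Python equivalent of the new clipseg weight-download steps above (a sketch, not part of the commit: the helper name and local paths are illustrative, and it assumes the owncloud "weights.tar" archive is zip-formatted, which is why the Makefile extracts it with `unzip`):

```python
import urllib.request
import zipfile
from pathlib import Path

WEIGHTS_URL = "https://owncloud.gwdg.de/index.php/s/ioHbRzFx6th32hn/download"
DOWNLOAD_DIR = Path("downloads/clipseg-weights")
VENDOR_DIR = Path("imaginairy/vendored/clipseg")


def fetch_refined_clipseg_weights():
    """Download the clipseg weight archive and copy rd64-uni-refined.pth into the vendored package."""
    DOWNLOAD_DIR.mkdir(parents=True, exist_ok=True)
    VENDOR_DIR.mkdir(parents=True, exist_ok=True)
    archive_path = DOWNLOAD_DIR / "weights.tar"
    urllib.request.urlretrieve(WEIGHTS_URL, archive_path)

    with zipfile.ZipFile(archive_path) as archive:
        for name in archive.namelist():
            # mirror `unzip -j`: ignore any directory prefix inside the archive
            if name.endswith("rd64-uni-refined.pth"):
                target = VENDOR_DIR / "rd64-uni-refined.pth"
                target.write_bytes(archive.read(name))
                return target
    raise FileNotFoundError("rd64-uni-refined.pth not found in the downloaded archive")
```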
@@ -131,6 +136,7 @@ vendorize: ## vendorize a github repo. `make vendorize REPO=git@github.com:ope
download_repo:
mkdir -p ./downloads
rm -rf ./downloads/$(PKG)
-cd ./downloads && git clone $(REPO) $(PKG)
cd ./downloads/$(PKG) && git pull

@@ -185,6 +185,7 @@ docker run -it --gpus all -v $HOME/.cache/huggingface:/root/.cache/huggingface -
[Example Colab](https://colab.research.google.com/drive/1rOvQNs0Cmn_yU1bKWjCOHzGVDgZkaTtO?usp=sharing)
## ChangeLog
+- add improved masking update from clipseg
**2.0.3**
- fix memory leak in face enhancer
@@ -250,6 +251,7 @@ would be uncorrelated to the rest of the surrounding image. It created terrible
- ✅ https://www.reddit.com/r/StableDiffusion/comments/xalaws/test_update_for_less_memory_usage_and_higher/
- https://github.com/neonsecret/stable-diffusion https://github.com/CompVis/stable-diffusion/pull/177
- https://github.com/huggingface/diffusers/pull/532/files
- https://github.com/HazyResearch/flash-attention
- ✅ deploy to pypi
- find similar images https://knn5.laion.ai/?back=https%3A%2F%2Fknn5.laion.ai%2F&index=laion5B&useMclip=false
- Development Environment
@@ -291,6 +293,7 @@ would be uncorrelated to the rest of the surrounding image. It created terrible
- https://www.reddit.com/r/StableDiffusion/comments/xboy90/a_better_way_of_doing_img2img_by_finding_the/
- https://gist.github.com/trygvebw/c71334dd127d537a15e9d59790f7f5e1
- https://github.com/pesser/stable-diffusion/commit/bbb52981460707963e2a62160890d7ecbce00e79
- https://github.com/SHI-Labs/FcF-Inpainting https://praeclarumjj3.github.io/fcf-inpainting/
- CPU support
- ✅ img2img for plms
- img2img for kdiff functions
@@ -304,7 +307,8 @@ would be uncorrelated to the rest of the surrounding image. It created terrible
- https://www.reddit.com/r/MachineLearning/comments/x6k5bm/n_stable_diffusion_image_variations_released/
- animations
- https://github.com/francislabountyjr/stable-diffusion/blob/main/inferencing_notebook.ipynb
-- https://www.youtube.com/watch?v=E7aAFEhdngI
+- https://www.youtube.com/watch?v=E7aAFEhdngI
+- https://github.com/pytti-tools/frame-interpolation
- cross-attention control:
- https://github.com/bloc97/CrossAttentionControl/blob/main/CrossAttention_Release_NoImages.ipynb
- guided generation

@@ -5,7 +5,6 @@ import cv2
import numpy as np
import PIL.Image
import torch
-from kornia.filters import median_blur
from torchvision import transforms
from imaginairy.img_log import log_img
@@ -19,12 +18,12 @@ weights_url = "https://github.com/timojl/clipseg/raw/master/weights/rd64-uni.pth"
def clip_mask_model():
from imaginairy import PKG_ROOT # noqa
-    model = CLIPDensePredT(version="ViT-B/16", reduce_dim=64)
+    model = CLIPDensePredT(version="ViT-B/16", reduce_dim=64, complex_trans_conv=True)
model.eval()
model.load_state_dict(
torch.load(
f"{PKG_ROOT}/vendored/clipseg/rd64-uni.pth",
f"{PKG_ROOT}/vendored/clipseg/rd64-uni-refined.pth",
map_location=torch.device("cpu"),
),
strict=False,
@@ -48,10 +47,6 @@ def get_img_mask(
mask = parsed_mask.apply_masks(mask_cache)
log_img(mask, "combined mask")
-    # try to blur the square shaped artifacts somewhat
-    mask = median_blur(mask.unsqueeze(dim=0).unsqueeze(dim=0), (11, 11)).squeeze()
-    log_img(mask, "median blurred")
kernel = np.ones((3, 3), np.uint8)
mask_g = mask.clone()

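Downstream of these changes, mask generation follows the usual clipseg inference pattern with the refined model. A minimal sketch (not the exact imaginairy code: the helper name is made up, and the preprocessing values are the ImageNet normalization and 352px resize commonly used with clipseg):

```python
import torch
from torchvision import transforms


def sketch_clip_masks(model, pil_img, prompts):
    """Return one soft mask in [0, 1] per text prompt for the given PIL image."""
    preprocess = transforms.Compose(
        [
            transforms.ToTensor(),
            transforms.Normalize(
                mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
            ),
            transforms.Resize((352, 352)),
        ]
    )
    img_t = preprocess(pil_img).unsqueeze(0)
    with torch.no_grad():
        # CLIPDensePredT returns the dense per-prompt logits as the first tuple element
        preds = model(img_t.repeat(len(prompts), 1, 1, 1), prompts)[0]
    # sigmoid maps logits to soft masks; result shape is (len(prompts), 352, 352)
    return torch.sigmoid(preds[:, 0])
```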
@@ -423,6 +423,7 @@ class CLIPDensePredT(CLIPDenseBase):
rev_activations=False,
trans_conv=None,
n_tokens=None,
+        complex_trans_conv=False,
):
super().__init__(version, reduce_cond, reduce_dim, prompt, n_tokens)
@@ -465,9 +466,31 @@ class CLIPDensePredT(CLIPDenseBase):
# explicitly define transposed conv kernel size
trans_conv_ks = (trans_conv, trans_conv)
-        self.trans_conv = nn.ConvTranspose2d(
-            reduce_dim, 1, trans_conv_ks, stride=trans_conv_ks
-        )
+        if not complex_trans_conv:
+            self.trans_conv = nn.ConvTranspose2d(
+                reduce_dim, 1, trans_conv_ks, stride=trans_conv_ks
+            )
+        else:
+            assert trans_conv_ks[0] == trans_conv_ks[1]
+            tp_kernels = (trans_conv_ks[0] // 4, trans_conv_ks[0] // 4)
+            self.trans_conv = nn.Sequential(
+                nn.Conv2d(reduce_dim, reduce_dim, kernel_size=3, padding=1),
+                nn.ReLU(),
+                nn.ConvTranspose2d(
+                    reduce_dim,
+                    reduce_dim // 2,
+                    kernel_size=tp_kernels[0],
+                    stride=tp_kernels[0],
+                ),
+                nn.ReLU(),
+                nn.ConvTranspose2d(
+                    reduce_dim // 2, 1, kernel_size=tp_kernels[1], stride=tp_kernels[1]
+                ),
+            )
# self.trans_conv = nn.ConvTranspose2d(reduce_dim, 1, trans_conv_ks, stride=trans_conv_ks)
assert len(self.extract_layers) == depth

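The two-stage decoder keeps the overall upsampling factor of the single transposed convolution it replaces: two transposed convs with stride `trans_conv_ks // 4` give x16 for the ViT-B/16 patch size, just with more capacity. A standalone shape check (a sketch with an assumed token-grid size, not repo code):

```python
import torch
from torch import nn

reduce_dim, ks = 64, 16  # rd64 model width and ViT-B/16 patch size
tp = ks // 4

simple_head = nn.ConvTranspose2d(reduce_dim, 1, (ks, ks), stride=(ks, ks))
complex_head = nn.Sequential(
    nn.Conv2d(reduce_dim, reduce_dim, kernel_size=3, padding=1),
    nn.ReLU(),
    nn.ConvTranspose2d(reduce_dim, reduce_dim // 2, kernel_size=tp, stride=tp),
    nn.ReLU(),
    nn.ConvTranspose2d(reduce_dim // 2, 1, kernel_size=tp, stride=tp),
)

x = torch.randn(1, reduce_dim, 22, 22)  # 352 / 16 = 22 tokens per side
print(simple_head(x).shape)   # torch.Size([1, 1, 352, 352])
print(complex_head(x).shape)  # torch.Size([1, 1, 352, 352])
```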
@@ -247,7 +247,7 @@ def test_cliptext_inpainting_pearl_doctor(filename_base_for_outputs):
prompt_strength=12,
init_image=img,
init_image_strength=0.2,
mask_prompt="face AND NOT (bandana OR hair OR blue fabric){*6}",
mask_prompt="face AND NOT (bandana OR hair OR blue fabric){*5}",
mask_mode=ImaginePrompt.MaskMode.KEEP,
width=512,
height=512,
@@ -259,13 +259,4 @@ def test_cliptext_inpainting_pearl_doctor(filename_base_for_outputs):
img = pillow_fit_image_within(img)
img.save(f"{filename_base_for_outputs}__orig.jpg")
-    result.img.save(f"{filename_base_for_outputs}_{prompt.seed}.jpg")
-    found_match = result.md5() in set(
-        [
-            "84868e7477a7375f7089160ac6adc064",
-            "c5c0166185c284fc849901123e78d608",
-            "6ef63037f5a1bd8bce6aec1c7ad46880",
-        ] # mps
-    )
-    assert found_match
+    result.img.save(f"{filename_base_for_outputs}_{prompt.seed}_01.jpg")

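For context on the mask_prompt syntax the test exercises: boolean operators combine per-phrase clipseg masks, modifiers such as `{*5}` scale the strength of the selected region, and `MaskMode.KEEP` preserves the masked area while the rest of the image is repainted. A hedged usage sketch (the prompt text, input file, and output path are illustrative; the mask expression and parameters mirror the test above):

```python
from PIL import Image

from imaginairy import ImaginePrompt, imagine

init_img = Image.open("pearl_girl.jpg")  # illustrative input image
prompt = ImaginePrompt(
    "a female doctor in a hospital",  # illustrative prompt text
    prompt_strength=12,
    init_image=init_img,
    init_image_strength=0.2,
    mask_prompt="face AND NOT (bandana OR hair OR blue fabric){*5}",
    mask_mode=ImaginePrompt.MaskMode.KEEP,
    width=512,
    height=512,
)
result = list(imagine([prompt]))[0]
result.img.save("pearl_doctor.jpg")
```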
@@ -26,7 +26,10 @@ def test_fix_faces():
assert img_hash(img) == "a75991307eda675a26eeb7073f828e93"
else:
# probably different based on whether first run or not. looks the same either way
assert img_hash(img) in ["c840cf3bfe5a7760734f425a3f8941cf", "e56c1205bbc8f251be05773f2ba7fa24"]
assert img_hash(img) in [
"c840cf3bfe5a7760734f425a3f8941cf",
"e56c1205bbc8f251be05773f2ba7fa24",
]
def img_hash(img):
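The body of `img_hash` is not shown in this diff; a plausible implementation (an assumption, not the repo's code) hashes the raw pixel bytes so the assertions above compare runs deterministically:

```python
import hashlib


def img_hash(img):
    # hypothetical: md5 over the decoded pixel data of a PIL image
    return hashlib.md5(img.tobytes()).hexdigest()
```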
