perf: mps perf improvement

attention slicing wasn't working
pull/256/head
Authored by Bryce 1 year ago; committed by Bryce Drennan
parent 66b28c80fc
commit 003a512dc8

@@ -1,4 +1,5 @@
 import math
+from functools import lru_cache

 import psutil
 import torch
@@ -151,6 +152,11 @@ def get_mem_free_total(device):
     return mem_free_total


+@lru_cache(maxsize=1)
+def get_mps_gb_ram():
+    return psutil.virtual_memory().total / (1024**3)
+
+
 class CrossAttention(nn.Module):
     def __init__(self, query_dim, context_dim=None, heads=8, dim_head=64, dropout=0.0):
         super().__init__()
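The new get_mps_gb_ram helper memoizes the psutil lookup with lru_cache(maxsize=1), so only the first call inside the attention hot path actually queries the OS. A minimal standalone sketch of that caching behavior (the demo calls are illustrative, not part of the commit):

from functools import lru_cache

import psutil


@lru_cache(maxsize=1)
def get_mps_gb_ram():
    # psutil.virtual_memory() asks the OS; lru_cache keeps the first result
    # so later calls return the cached float for free.
    return psutil.virtual_memory().total / (1024**3)


get_mps_gb_ram()  # first call: queries the OS (cache miss)
get_mps_gb_ram()  # second call: served from the cache (cache hit)
print(get_mps_gb_ram.cache_info())  # CacheInfo(hits=1, misses=1, maxsize=1, currsize=1)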
@@ -235,23 +241,34 @@ class CrossAttention(nn.Module):
         tensor_size = q.shape[0] * q.shape[1] * k.shape[1] * q.element_size()
         modifier = 3 if q.element_size() == 2 else 2.5
         mem_required = tensor_size * modifier
-        steps = 1
-        if mem_required > mem_free_total:
-            steps = 2 ** (math.ceil(math.log(mem_required / mem_free_total, 2)))
-        if steps > 64:
-            max_res = math.floor(math.sqrt(math.sqrt(mem_free_total / 2.5)) / 8) * 64
-            raise RuntimeError(
-                f"Not enough memory, use lower resolution (max approx. {max_res}x{max_res}). "
-                f"Need: {mem_required / 64 / gb:0.1f}GB free, Have:{mem_free_total / gb:0.1f}GB free"
-            )
-        slice_size = q.shape[1] // steps if (q.shape[1] % steps) == 0 else q.shape[1]
-        if get_device() == "mps":
-            # https://github.com/brycedrennan/imaginAIry/issues/175
-            # https://github.com/invoke-ai/InvokeAI/issues/1244
-            slice_size = min(slice_size, 2**30)
+        if "mps" in get_device():
+            # https://github.com/brycedrennan/imaginAIry/issues/175
+            # https://github.com/invoke-ai/InvokeAI/issues/1244
+            mps_gb = get_mps_gb_ram()
+            factor = 32 / mps_gb
+            slice_size = math.floor(2**30 / (q.shape[0] * q.shape[1] * 16 * factor))
+        else:
+            steps = 1
+            if mem_required > mem_free_total:
+                steps = 2 ** (math.ceil(math.log(mem_required / mem_free_total, 2)))
+            if steps > 64:
+                max_res = (
+                    math.floor(math.sqrt(math.sqrt(mem_free_total / 2.5)) / 8) * 64
+                )
+                raise RuntimeError(
+                    f"Not enough memory, use lower resolution (max approx. {max_res}x{max_res}). "
+                    f"Need: {mem_required / 64 / gb:0.1f}GB free, Have:{mem_free_total / gb:0.1f}GB free"
+                )
+            slice_size = (
+                q.shape[1] // steps if (q.shape[1] % steps) == 0 else q.shape[1]
+            )
         # steps = len(range(0, q.shape[1], slice_size))
         # print(f"Splitting attention into {steps} steps of {slice_size} slices")
         for i in range(0, q.shape[1], slice_size):
             end = i + slice_size
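This hunk is the fix the commit message refers to: the old MPS branch only clamped with min(slice_size, 2**30), which is a no-op since slice_size never approaches 2**30, so no extra slicing ever happened. The new branch scales the slice size to system RAM: on a 32GB machine factor = 32 / 32 = 1, giving slice_size ≈ 2**30 / (q.shape[0] * q.shape[1] * 16), with proportionally smaller slices on machines with less RAM. For context, a self-contained sketch of the slicing technique the loop implements, with illustrative names (the real CrossAttention applies the scale to q earlier but uses the same einsum pattern):

import torch


def sliced_attention(q, k, v, slice_size):
    # q: (batch*heads, q_len, dim_head); k, v: (batch*heads, kv_len, dim_head).
    # Only slice_size rows of the (q_len x kv_len) attention matrix exist at
    # any moment, which bounds peak memory.
    scale = q.shape[-1] ** -0.5
    out = torch.zeros_like(q)
    for i in range(0, q.shape[1], slice_size):
        end = i + slice_size
        attn = torch.einsum("b i d, b j d -> b i j", q[:, i:end], k) * scale
        attn = attn.softmax(dim=-1)
        out[:, i:end] = torch.einsum("b i j, b j d -> b i d", attn, v)
    return out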

@@ -7,7 +7,7 @@ from imaginairy.utils import get_device

 def assess_memory_usage():
     assert get_device() == "cuda"
-    img_size = 1664
+    img_size = 3048
     prompt = ImaginePrompt("strawberries", width=64, height=64, seed=1)
     imagine_image_files([prompt], outdir="outputs")
     datalog = []
@@ -18,6 +18,7 @@ def assess_memory_usage():
             width=img_size,
             height=img_size,
             seed=1,
+            steps=2,
         )
         try:
             imagine_image_files([prompt], outdir="outputs")
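The test tweak raises the probe ceiling to 3048px and adds steps=2: peak memory depends on resolution rather than sampler step count, so two steps are enough to tell whether a size fits. A hypothetical probe loop in the same spirit (assumes the public imaginairy API shown in the diff; names and ranges are illustrative):

from imaginairy import ImaginePrompt, imagine_image_files

# Walk up through resolutions until generation runs out of memory; two
# sampler steps keep each probe cheap without changing its memory peak.
for img_size in range(512, 3072, 512):
    prompt = ImaginePrompt(
        "strawberries", width=img_size, height=img_size, seed=1, steps=2
    )
    try:
        imagine_image_files([prompt], outdir="outputs")
    except RuntimeError:
        print(f"out of memory at {img_size}x{img_size}")
        break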

@@ -274,4 +274,4 @@ def test_tile_mode(filename_base_for_outputs):
     result = next(imagine(prompt))
     img_path = f"{filename_base_for_outputs}.png"
-    assert_image_similar_to_expectation(result.img, img_path=img_path, threshold=22000)
+    assert_image_similar_to_expectation(result.img, img_path=img_path, threshold=24000)
