From 6b991a5269afd0683f72a4833d8d782cd3b79125 Mon Sep 17 00:00:00 2001 From: Lincoln Stein Date: Mon, 19 Feb 2024 15:21:55 -0500 Subject: [PATCH 01/30] add draft multi-gpu support --- .../services/model_load/model_load_base.py | 5 ++ .../services/model_load/model_load_default.py | 6 +++ .../model_manager/model_manager_base.py | 3 +- .../model_manager/model_manager_default.py | 7 +-- .../session_processor_default.py | 6 +-- .../load/model_cache/model_cache_base.py | 22 ++++++-- .../load/model_cache/model_cache_default.py | 50 ++++++++++++++++--- .../load/model_cache/model_locker.py | 13 ++++- .../model_manager/model_manager_fixtures.py | 2 +- 9 files changed, 94 insertions(+), 20 deletions(-) diff --git a/invokeai/app/services/model_load/model_load_base.py b/invokeai/app/services/model_load/model_load_base.py index cc80333e932..cdd59f4e749 100644 --- a/invokeai/app/services/model_load/model_load_base.py +++ b/invokeai/app/services/model_load/model_load_base.py @@ -38,3 +38,8 @@ def ram_cache(self) -> ModelCacheBase[AnyModel]: @abstractmethod def convert_cache(self) -> ModelConvertCacheBase: """Return the checkpoint convert cache used by this loader.""" + + @property + @abstractmethod + def gpu_count(self) -> int: + """Return the number of GPUs we are configured to use.""" diff --git a/invokeai/app/services/model_load/model_load_default.py b/invokeai/app/services/model_load/model_load_default.py index 21d3c56f36b..42d1e745430 100644 --- a/invokeai/app/services/model_load/model_load_default.py +++ b/invokeai/app/services/model_load/model_load_default.py @@ -39,6 +39,7 @@ def __init__( self._registry = registry def start(self, invoker: Invoker) -> None: + """Start the service.""" self._invoker = invoker @property @@ -46,6 +47,11 @@ def ram_cache(self) -> ModelCacheBase[AnyModel]: """Return the RAM cache used by this loader.""" return self._ram_cache + @property + def gpu_count(self) -> int: + """Return the number of GPUs available for our uses.""" + return len(self._ram_cache.execution_devices) + @property def convert_cache(self) -> ModelConvertCacheBase: """Return the checkpoint convert cache used by this loader.""" diff --git a/invokeai/app/services/model_manager/model_manager_base.py b/invokeai/app/services/model_manager/model_manager_base.py index af1b68e1ec3..d20aefd4f3a 100644 --- a/invokeai/app/services/model_manager/model_manager_base.py +++ b/invokeai/app/services/model_manager/model_manager_base.py @@ -1,6 +1,7 @@ # Copyright (c) 2023 Lincoln D. Stein and the InvokeAI Team from abc import ABC, abstractmethod +from typing import Optional, Set import torch from typing_extensions import Self @@ -31,7 +32,7 @@ def build_model_manager( model_record_service: ModelRecordServiceBase, download_queue: DownloadQueueServiceBase, events: EventServiceBase, - execution_device: torch.device, + execution_devices: Optional[Set[torch.device]] = None, ) -> Self: """ Construct the model manager service instance. diff --git a/invokeai/app/services/model_manager/model_manager_default.py b/invokeai/app/services/model_manager/model_manager_default.py index b160ff6fede..e9035a3a809 100644 --- a/invokeai/app/services/model_manager/model_manager_default.py +++ b/invokeai/app/services/model_manager/model_manager_default.py @@ -1,12 +1,13 @@ # Copyright (c) 2023 Lincoln D. 
Stein and the InvokeAI Team """Implementation of ModelManagerServiceBase.""" +from typing import Optional, Set + import torch from typing_extensions import Self from invokeai.app.services.invoker import Invoker from invokeai.backend.model_manager.load import ModelCache, ModelConvertCache, ModelLoaderRegistry -from invokeai.backend.util.devices import choose_torch_device from invokeai.backend.util.logging import InvokeAILogger from ..config import InvokeAIAppConfig @@ -67,7 +68,7 @@ def build_model_manager( model_record_service: ModelRecordServiceBase, download_queue: DownloadQueueServiceBase, events: EventServiceBase, - execution_device: torch.device = choose_torch_device(), + execution_devices: Optional[Set[torch.device]] = None, ) -> Self: """ Construct the model manager service instance. @@ -81,7 +82,7 @@ def build_model_manager( max_cache_size=app_config.ram, max_vram_cache_size=app_config.vram, logger=logger, - execution_device=execution_device, + execution_devices=execution_devices, ) convert_cache = ModelConvertCache(cache_path=app_config.convert_cache_path, max_size=app_config.convert_cache) loader = ModelLoadService( diff --git a/invokeai/app/services/session_processor/session_processor_default.py b/invokeai/app/services/session_processor/session_processor_default.py index a9039e24815..9484395780e 100644 --- a/invokeai/app/services/session_processor/session_processor_default.py +++ b/invokeai/app/services/session_processor/session_processor_default.py @@ -21,7 +21,7 @@ class DefaultSessionProcessor(SessionProcessorBase): - def start(self, invoker: Invoker, thread_limit: int = 1, polling_interval: int = 1) -> None: + def start(self, invoker: Invoker, polling_interval: int = 1) -> None: self._invoker: Invoker = invoker self._queue_item: Optional[SessionQueueItem] = None self._invocation: Optional[BaseInvocation] = None @@ -33,8 +33,8 @@ def start(self, invoker: Invoker, thread_limit: int = 1, polling_interval: int = local_handler.register(event_name=EventServiceBase.queue_event, _func=self._on_queue_event) - self._thread_limit = thread_limit - self._thread_semaphore = BoundedSemaphore(thread_limit) + self._thread_limit = self._invoker.services.model_manager.load.gpu_count + self._thread_semaphore = BoundedSemaphore(self._thread_limit) self._polling_interval = polling_interval # If profiling is enabled, create a profiler. The same profiler will be used for all sessions. Internally, diff --git a/invokeai/backend/model_manager/load/model_cache/model_cache_base.py b/invokeai/backend/model_manager/load/model_cache/model_cache_base.py index eb82f87cb22..c54a35f15aa 100644 --- a/invokeai/backend/model_manager/load/model_cache/model_cache_base.py +++ b/invokeai/backend/model_manager/load/model_cache/model_cache_base.py @@ -10,7 +10,7 @@ from abc import ABC, abstractmethod from dataclasses import dataclass, field from logging import Logger -from typing import Dict, Generic, Optional, TypeVar +from typing import Dict, Generic, Optional, Set, TypeVar import torch @@ -89,8 +89,24 @@ def storage_device(self) -> torch.device: @property @abstractmethod - def execution_device(self) -> torch.device: - """Return the exection device (e.g. "cuda" for VRAM).""" + def execution_devices(self) -> Set[torch.device]: + """Return the set of available execution devices.""" + pass + + @abstractmethod + def acquire_execution_device(self, timeout: int = 0) -> torch.device: + """ + Pick the next available execution device. 
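# A minimal usage sketch (not part of this patch) of how a caller might pair these two
# methods. The helper name, the `cache` and `work` parameters, and the 30-second
# timeout are assumptions for illustration; only acquire/release come from the
# interface above.
def run_on_free_gpu(cache, work):
    device = cache.acquire_execution_device(timeout=30)  # may raise TimeoutError
    try:
        return work(device)  # run the caller's GPU work on the device we were handed
    finally:
        # Always release, otherwise the device stays marked busy and its semaphore
        # slot is never returned to the pool.
        cache.release_execution_device(device)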
+ + If all devices are currently engaged (locked), then + block until timeout seconds have passed and raise a + TimeoutError if no devices are available. + """ + pass + + @abstractmethod + def release_execution_device(self, device: torch.device) -> None: + """Release a previously-acquired execution device.""" pass @property diff --git a/invokeai/backend/model_manager/load/model_cache/model_cache_default.py b/invokeai/backend/model_manager/load/model_cache/model_cache_default.py index 3bcd1840353..26185b2fba6 100644 --- a/invokeai/backend/model_manager/load/model_cache/model_cache_default.py +++ b/invokeai/backend/model_manager/load/model_cache/model_cache_default.py @@ -24,7 +24,8 @@ import time from contextlib import suppress from logging import Logger -from typing import Dict, List, Optional +from threading import BoundedSemaphore, Lock +from typing import Dict, List, Optional, Set import torch @@ -60,8 +61,8 @@ def __init__( self, max_cache_size: float = DEFAULT_MAX_CACHE_SIZE, max_vram_cache_size: float = DEFAULT_MAX_VRAM_CACHE_SIZE, - execution_device: torch.device = torch.device("cuda"), storage_device: torch.device = torch.device("cpu"), + execution_devices: Optional[Set[torch.device]] = None, precision: torch.dtype = torch.float16, sequential_offload: bool = False, lazy_offloading: bool = True, @@ -73,7 +74,7 @@ def __init__( Initialize the model RAM cache. :param max_cache_size: Maximum size of the RAM cache [6.0 GB] - :param execution_device: Torch device to load active model into [torch.device('cuda')] + :param execution_devices: Set of torch device to load active model into [calculated] :param storage_device: Torch device to save inactive model in [torch.device('cpu')] :param precision: Precision for loaded models [torch.float16] :param lazy_offloading: Keep model in VRAM until another model needs to be loaded @@ -88,7 +89,7 @@ def __init__( self._precision: torch.dtype = precision self._max_cache_size: float = max_cache_size self._max_vram_cache_size: float = max_vram_cache_size - self._execution_device: torch.device = execution_device + self._execution_devices: Set[torch.device] = execution_devices or self._get_execution_devices() self._storage_device: torch.device = storage_device self._logger = logger or InvokeAILogger.get_logger(self.__class__.__name__) self._log_memory_usage = log_memory_usage @@ -97,6 +98,12 @@ def __init__( self._cached_models: Dict[str, CacheRecord[AnyModel]] = {} self._cache_stack: List[str] = [] + self._lock = Lock() + self._free_execution_device = BoundedSemaphore(len(self._execution_devices)) + self._busy_execution_devices: Set[torch.device] = set() + + self.logger.info(f"Using rendering device(s) {[self._device_name(x) for x in self._execution_devices]}") + @property def logger(self) -> Logger: """Return the logger used by the cache.""" @@ -113,9 +120,24 @@ def storage_device(self) -> torch.device: return self._storage_device @property - def execution_device(self) -> torch.device: - """Return the exection device (e.g. "cuda" for VRAM).""" - return self._execution_device + def execution_devices(self) -> Set[torch.device]: + """Return the set of available execution devices.""" + return self._execution_devices + + def acquire_execution_device(self, timeout: int = 0) -> torch.device: + """Acquire and return an execution device (e.g. 
"cuda" for VRAM).""" + with self._lock: + self._free_execution_device.acquire(timeout=timeout) + free_devices = self.execution_devices - self._busy_execution_devices + chosen_device = list(free_devices)[0] + self._busy_execution_devices.add(chosen_device) + return chosen_device + + def release_execution_device(self, device: torch.device) -> None: + """Mark this execution device as unused.""" + with self._lock: + self._free_execution_device.release() + self._busy_execution_devices.remove(device) @property def max_cache_size(self) -> float: @@ -422,3 +444,17 @@ def _check_free_vram(self, target_device: torch.device, needed_size: int) -> Non free_mem, _ = torch.cuda.mem_get_info(torch.device(vram_device)) if needed_size > free_mem: raise torch.cuda.OutOfMemoryError + + @staticmethod + def _get_execution_devices() -> Set[torch.device]: + default_device = choose_torch_device() + if default_device != torch.device("cuda"): + return {default_device} + + # we get here if the default device is cuda, and return each of the + # cuda devices. + return {torch.device(f"cuda:{x}") for x in range(0, torch.cuda.device_count())} + + @staticmethod + def _device_name(device: torch.device) -> str: + return f"{device.type}:{device.index}" diff --git a/invokeai/backend/model_manager/load/model_cache/model_locker.py b/invokeai/backend/model_manager/load/model_cache/model_locker.py index 81dca346e52..fa4eb1d5bec 100644 --- a/invokeai/backend/model_manager/load/model_cache/model_locker.py +++ b/invokeai/backend/model_manager/load/model_cache/model_locker.py @@ -2,12 +2,16 @@ Base class and implementation of a class that moves models in and out of VRAM. """ +from typing import Optional + import torch from invokeai.backend.model_manager import AnyModel from .model_cache_base import CacheRecord, ModelCacheBase, ModelLockerBase +MAX_GPU_WAIT = 600 # wait up to 10 minutes for a GPU to become free + class ModelLocker(ModelLockerBase): """Internal class that mediates movement in and out of GPU.""" @@ -21,6 +25,7 @@ def __init__(self, cache: ModelCacheBase[AnyModel], cache_entry: CacheRecord[Any """ self._cache = cache self._cache_entry = cache_entry + self._execution_device: Optional[torch.device] = None @property def model(self) -> AnyModel: @@ -39,10 +44,12 @@ def lock(self) -> AnyModel: if self._cache.lazy_offloading: self._cache.offload_unlocked_models(self._cache_entry.size) - self._cache.move_model_to_device(self._cache_entry, self._cache.execution_device) + # We wait for a gpu to be free - may raise a TimeoutError + self._execution_device = self._cache.acquire_execution_device(MAX_GPU_WAIT) + self._cache.move_model_to_device(self._cache_entry, self._execution_device) self._cache_entry.loaded = True - self._cache.logger.debug(f"Locking {self._cache_entry.key} in {self._cache.execution_device}") + self._cache.logger.debug(f"Locking {self._cache_entry.key} in {self._execution_device}") self._cache.print_cuda_stats() except torch.cuda.OutOfMemoryError: self._cache.logger.warning("Insufficient GPU memory to load model. 
Aborting") @@ -59,6 +66,8 @@ def unlock(self) -> None: return self._cache_entry.unlock() + if self._execution_device: + self._cache.release_execution_device(self._execution_device) if not self._cache.lazy_offloading: self._cache.offload_unlocked_models(self._cache_entry.size) self._cache.print_cuda_stats() diff --git a/tests/backend/model_manager/model_manager_fixtures.py b/tests/backend/model_manager/model_manager_fixtures.py index 8d4ccf196cd..1f89b7e6a6f 100644 --- a/tests/backend/model_manager/model_manager_fixtures.py +++ b/tests/backend/model_manager/model_manager_fixtures.py @@ -111,7 +111,7 @@ def stop_queue() -> None: @pytest.fixture -def mm2_loader(mm2_app_config: InvokeAIAppConfig, mm2_record_store: ModelRecordServiceBase) -> ModelLoadServiceBase: +def mm2_loader(mm2_app_config: InvokeAIAppConfig) -> ModelLoadServiceBase: ram_cache = ModelCache( logger=InvokeAILogger.get_logger(), max_cache_size=mm2_app_config.ram, From eaa2c686935ba87406e56570836c2bec80ff12ac Mon Sep 17 00:00:00 2001 From: Lincoln Stein Date: Sun, 31 Mar 2024 16:37:13 -0400 Subject: [PATCH 02/30] remove vram_cache and don't move VRAM models back into CPU --- docs/contributing/MODEL_MANAGER.md | 2 +- invokeai/app/invocations/compel.py | 170 +++++++++--------- .../app/services/config/config_default.py | 8 - .../backend/model_manager/load/load_base.py | 3 +- .../load/model_cache/model_cache_base.py | 18 +- .../load/model_cache/model_cache_default.py | 103 +---------- .../load/model_cache/model_locker.py | 23 +-- 7 files changed, 102 insertions(+), 225 deletions(-) diff --git a/docs/contributing/MODEL_MANAGER.md b/docs/contributing/MODEL_MANAGER.md index 98e8702c8fc..2b843fe0426 100644 --- a/docs/contributing/MODEL_MANAGER.md +++ b/docs/contributing/MODEL_MANAGER.md @@ -1345,7 +1345,7 @@ from invokeai.app.services.model_load import ModelLoadService, ModelLoaderRegist config = InvokeAIAppConfig.get_config() ram_cache = ModelCache( - max_cache_size=config.ram_cache_size, max_vram_cache_size=config.vram_cache_size, logger=logger + max_cache_size=config.ram_cache_size, logger=logger ) convert_cache = ModelConvertCache( cache_path=config.models_convert_cache_path, max_size=config.convert_cache_size diff --git a/invokeai/app/invocations/compel.py b/invokeai/app/invocations/compel.py index c23dd3d908e..a0928f37acc 100644 --- a/invokeai/app/invocations/compel.py +++ b/invokeai/app/invocations/compel.py @@ -58,65 +58,62 @@ def invoke(self, context: InvocationContext) -> ConditioningOutput: tokenizer_model = tokenizer_info.model assert isinstance(tokenizer_model, CLIPTokenizer) text_encoder_info = context.models.load(self.clip.text_encoder) - text_encoder_model = text_encoder_info.model - assert isinstance(text_encoder_model, CLIPTextModel) def _lora_loader() -> Iterator[Tuple[LoRAModelRaw, float]]: for lora in self.clip.loras: lora_info = context.models.load(lora.lora) assert isinstance(lora_info.model, LoRAModelRaw) - yield (lora_info.model, lora.weight) + with lora_info as model: + yield (model, lora.weight) del lora_info return - # loras = [(context.models.get(**lora.dict(exclude={"weight"})).context.model, lora.weight) for lora in self.clip.loras] - ti_list = generate_ti_list(self.prompt, text_encoder_info.config.base, context) - with ( - ModelPatcher.apply_ti(tokenizer_model, text_encoder_model, ti_list) as ( - tokenizer, - ti_manager, - ), - text_encoder_info as text_encoder, - # Apply the LoRA after text_encoder has been moved to its target device for faster patching. 
- ModelPatcher.apply_lora_text_encoder(text_encoder, _lora_loader()), - # Apply CLIP Skip after LoRA to prevent LoRA application from failing on skipped layers. - ModelPatcher.apply_clip_skip(text_encoder_model, self.clip.skipped_layers), - ): - assert isinstance(text_encoder, CLIPTextModel) - compel = Compel( - tokenizer=tokenizer, - text_encoder=text_encoder, - textual_inversion_manager=ti_manager, - dtype_for_device_getter=torch_dtype, - truncate_long_prompts=False, - ) + with text_encoder_info as text_encoder: + with ( + ModelPatcher.apply_ti(tokenizer_model, text_encoder, ti_list) as ( + tokenizer, + ti_manager, + ), + # Apply the LoRA after text_encoder has been moved to its target device for faster patching. + ModelPatcher.apply_lora_text_encoder(text_encoder, _lora_loader()), + # Apply CLIP Skip after LoRA to prevent LoRA application from failing on skipped layers. + ModelPatcher.apply_clip_skip(text_encoder, self.clip.skipped_layers), + ): + assert isinstance(text_encoder, CLIPTextModel) + compel = Compel( + tokenizer=tokenizer, + text_encoder=text_encoder, + textual_inversion_manager=ti_manager, + dtype_for_device_getter=torch_dtype, + truncate_long_prompts=False, + ) - conjunction = Compel.parse_prompt_string(self.prompt) + conjunction = Compel.parse_prompt_string(self.prompt) - if context.config.get().log_tokenization: - log_tokenization_for_conjunction(conjunction, tokenizer) + if context.config.get().log_tokenization: + log_tokenization_for_conjunction(conjunction, tokenizer) - c, options = compel.build_conditioning_tensor_for_conjunction(conjunction) + c, options = compel.build_conditioning_tensor_for_conjunction(conjunction) - ec = ExtraConditioningInfo( - tokens_count_including_eos_bos=get_max_token_count(tokenizer, conjunction), - cross_attention_control_args=options.get("cross_attention_control", None), - ) + ec = ExtraConditioningInfo( + tokens_count_including_eos_bos=get_max_token_count(tokenizer, conjunction), + cross_attention_control_args=options.get("cross_attention_control", None), + ) - c = c.detach().to("cpu") + c = c.detach().to("cpu") - conditioning_data = ConditioningFieldData( - conditionings=[ - BasicConditioningInfo( - embeds=c, - extra_conditioning=ec, - ) - ] - ) + conditioning_data = ConditioningFieldData( + conditionings=[ + BasicConditioningInfo( + embeds=c, + extra_conditioning=ec, + ) + ] + ) - conditioning_name = context.conditioning.save(conditioning_data) + conditioning_name = context.conditioning.save(conditioning_data) return ConditioningOutput.build(conditioning_name) @@ -137,8 +134,7 @@ def run_clip_compel( tokenizer_model = tokenizer_info.model assert isinstance(tokenizer_model, CLIPTokenizer) text_encoder_info = context.models.load(clip_field.text_encoder) - text_encoder_model = text_encoder_info.model - assert isinstance(text_encoder_model, (CLIPTextModel, CLIPTextModelWithProjection)) + assert isinstance(text_encoder_info.model, (CLIPTextModel, CLIPTextModelWithProjection)) # return zero on empty if prompt == "" and zero_on_empty: @@ -174,55 +170,55 @@ def _lora_loader() -> Iterator[Tuple[LoRAModelRaw, float]]: ti_list = generate_ti_list(prompt, text_encoder_info.config.base, context) - with ( - ModelPatcher.apply_ti(tokenizer_model, text_encoder_model, ti_list) as ( - tokenizer, - ti_manager, - ), - text_encoder_info as text_encoder, - # Apply the LoRA after text_encoder has been moved to its target device for faster patching. 
- ModelPatcher.apply_lora(text_encoder, _lora_loader(), lora_prefix), - # Apply CLIP Skip after LoRA to prevent LoRA application from failing on skipped layers. - ModelPatcher.apply_clip_skip(text_encoder_model, clip_field.skipped_layers), - ): - assert isinstance(text_encoder, (CLIPTextModel, CLIPTextModelWithProjection)) - text_encoder = cast(CLIPTextModel, text_encoder) - compel = Compel( - tokenizer=tokenizer, - text_encoder=text_encoder, - textual_inversion_manager=ti_manager, - dtype_for_device_getter=torch_dtype, - truncate_long_prompts=False, # TODO: - returned_embeddings_type=ReturnedEmbeddingsType.PENULTIMATE_HIDDEN_STATES_NON_NORMALIZED, # TODO: clip skip - requires_pooled=get_pooled, - ) + with text_encoder_info as text_encoder: + with ( + ModelPatcher.apply_ti(tokenizer_model, text_encoder, ti_list) as ( + tokenizer, + ti_manager, + ), + # Apply the LoRA after text_encoder has been moved to its target device for faster patching. + ModelPatcher.apply_lora(text_encoder, _lora_loader(), lora_prefix), + # Apply CLIP Skip after LoRA to prevent LoRA application from failing on skipped layers. + ModelPatcher.apply_clip_skip(text_encoder, clip_field.skipped_layers), + ): + assert isinstance(text_encoder, (CLIPTextModel, CLIPTextModelWithProjection)) + text_encoder = cast(CLIPTextModel, text_encoder) + compel = Compel( + tokenizer=tokenizer, + text_encoder=text_encoder, + textual_inversion_manager=ti_manager, + dtype_for_device_getter=torch_dtype, + truncate_long_prompts=False, # TODO: + returned_embeddings_type=ReturnedEmbeddingsType.PENULTIMATE_HIDDEN_STATES_NON_NORMALIZED, # TODO: clip skip + requires_pooled=get_pooled, + ) - conjunction = Compel.parse_prompt_string(prompt) + conjunction = Compel.parse_prompt_string(prompt) - if context.config.get().log_tokenization: - # TODO: better logging for and syntax - log_tokenization_for_conjunction(conjunction, tokenizer) + if context.config.get().log_tokenization: + # TODO: better logging for and syntax + log_tokenization_for_conjunction(conjunction, tokenizer) - # TODO: ask for optimizations? to not run text_encoder twice - c, options = compel.build_conditioning_tensor_for_conjunction(conjunction) - if get_pooled: - c_pooled = compel.conditioning_provider.get_pooled_embeddings([prompt]) - else: - c_pooled = None + # TODO: ask for optimizations? 
to not run text_encoder twice + c, options = compel.build_conditioning_tensor_for_conjunction(conjunction) + if get_pooled: + c_pooled = compel.conditioning_provider.get_pooled_embeddings([prompt]) + else: + c_pooled = None - ec = ExtraConditioningInfo( - tokens_count_including_eos_bos=get_max_token_count(tokenizer, conjunction), - cross_attention_control_args=options.get("cross_attention_control", None), - ) + ec = ExtraConditioningInfo( + tokens_count_including_eos_bos=get_max_token_count(tokenizer, conjunction), + cross_attention_control_args=options.get("cross_attention_control", None), + ) - del tokenizer - del text_encoder - del tokenizer_info - del text_encoder_info + del tokenizer + del text_encoder + del tokenizer_info + del text_encoder_info - c = c.detach().to("cpu") - if c_pooled is not None: - c_pooled = c_pooled.detach().to("cpu") + c = c.detach().to("cpu") + if c_pooled is not None: + c_pooled = c_pooled.detach().to("cpu") return c, c_pooled, ec diff --git a/invokeai/app/services/config/config_default.py b/invokeai/app/services/config/config_default.py index ee579f4bc42..c56fae2c4fb 100644 --- a/invokeai/app/services/config/config_default.py +++ b/invokeai/app/services/config/config_default.py @@ -23,7 +23,6 @@ DB_FILE = Path("invokeai.db") LEGACY_INIT_FILE = Path("invokeai.init") DEFAULT_RAM_CACHE = 10.0 -DEFAULT_VRAM_CACHE = 0.25 DEFAULT_CONVERT_CACHE = 20.0 DEVICE = Literal["auto", "cpu", "cuda", "cuda:1", "mps"] PRECISION = Literal["auto", "float16", "bfloat16", "float32", "autocast"] @@ -100,9 +99,7 @@ class InvokeAIAppConfig(BaseSettings): profile_prefix: An optional prefix for profile output files. profiles_dir: Path to profiles output directory. ram: Maximum memory amount used by memory model cache for rapid switching (GB). - vram: Amount of VRAM reserved for model storage (GB). convert_cache: Maximum size of on-disk converted models cache (GB). - lazy_offload: Keep models in VRAM until their space is needed. log_memory_usage: If True, a memory snapshot will be captured before and after every model cache operation, and the result will be logged (at debug level). There is a time cost to capturing the memory snapshots, so it is recommended to only enable this feature if you are actively inspecting the model cache's behaviour. device: Preferred execution device. `auto` will choose the device depending on the hardware platform and the installed torch capabilities.
Valid values: `auto`, `cpu`, `cuda`, `cuda:1`, `mps` precision: Floating point precision. `float16` will consume half the memory of `float32` but produce slightly lower-quality images. The `auto` setting will guess the proper precision based on your video card and operating system.
Valid values: `auto`, `float16`, `bfloat16`, `float32`, `autocast` @@ -168,9 +165,7 @@ class InvokeAIAppConfig(BaseSettings): # CACHE ram: float = Field(default_factory=get_default_ram_cache_size, gt=0, description="Maximum memory amount used by memory model cache for rapid switching (GB).") - vram: float = Field(default=DEFAULT_VRAM_CACHE, ge=0, description="Amount of VRAM reserved for model storage (GB).") convert_cache: float = Field(default=DEFAULT_CONVERT_CACHE, ge=0, description="Maximum size of on-disk converted models cache (GB).") - lazy_offload: bool = Field(default=True, description="Keep models in VRAM until their space is needed.") log_memory_usage: bool = Field(default=False, description="If True, a memory snapshot will be captured before and after every model cache operation, and the result will be logged (at debug level). There is a time cost to capturing the memory snapshots, so it is recommended to only enable this feature if you are actively inspecting the model cache's behaviour.") # DEVICE @@ -372,9 +367,6 @@ def migrate_v3_config_dict(config_dict: dict[str, Any]) -> InvokeAIAppConfig: # `max_cache_size` was renamed to `ram` some time in v3, but both names were used if k == "max_cache_size" and "ram" not in category_dict: parsed_config_dict["ram"] = v - # `max_vram_cache_size` was renamed to `vram` some time in v3, but both names were used - if k == "max_vram_cache_size" and "vram" not in category_dict: - parsed_config_dict["vram"] = v if k == "conf_path": parsed_config_dict["legacy_models_yaml_path"] = v if k == "legacy_conf_dir": diff --git a/invokeai/backend/model_manager/load/load_base.py b/invokeai/backend/model_manager/load/load_base.py index b8ce56eb16d..95a681f7d2a 100644 --- a/invokeai/backend/model_manager/load/load_base.py +++ b/invokeai/backend/model_manager/load/load_base.py @@ -28,8 +28,7 @@ class LoadedModel: def __enter__(self) -> AnyModel: """Context entry.""" - self._locker.lock() - return self.model + return self._locker.lock() def __exit__(self, *args: Any, **kwargs: Any) -> None: """Context exit.""" diff --git a/invokeai/backend/model_manager/load/model_cache/model_cache_base.py b/invokeai/backend/model_manager/load/model_cache/model_cache_base.py index c54a35f15aa..1d6a4f15dbc 100644 --- a/invokeai/backend/model_manager/load/model_cache/model_cache_base.py +++ b/invokeai/backend/model_manager/load/model_cache/model_cache_base.py @@ -45,8 +45,8 @@ class CacheRecord(Generic[T]): """Elements of the cache.""" key: str - model: T size: int + model: T loaded: bool = False _locks: int = 0 @@ -109,28 +109,12 @@ def release_execution_device(self, device: torch.device) -> None: """Release a previously-acquired execution device.""" pass - @property - @abstractmethod - def lazy_offloading(self) -> bool: - """Return true if the cache is configured to lazily offload models in VRAM.""" - pass - @property @abstractmethod def max_cache_size(self) -> float: """Return true if the cache is configured to lazily offload models in VRAM.""" pass - @abstractmethod - def offload_unlocked_models(self, size_required: int) -> None: - """Offload from VRAM any models not actively in use.""" - pass - - @abstractmethod - def move_model_to_device(self, cache_entry: CacheRecord[AnyModel], target_device: torch.device) -> None: - """Move model into the indicated device.""" - pass - @property @abstractmethod def stats(self) -> CacheStats: diff --git a/invokeai/backend/model_manager/load/model_cache/model_cache_default.py 
b/invokeai/backend/model_manager/load/model_cache/model_cache_default.py index 26185b2fba6..82935ef7869 100644 --- a/invokeai/backend/model_manager/load/model_cache/model_cache_default.py +++ b/invokeai/backend/model_manager/load/model_cache/model_cache_default.py @@ -19,9 +19,7 @@ """ import gc -import math import sys -import time from contextlib import suppress from logging import Logger from threading import BoundedSemaphore, Lock @@ -30,7 +28,7 @@ import torch from invokeai.backend.model_manager import AnyModel, SubModelType -from invokeai.backend.model_manager.load.memory_snapshot import MemorySnapshot, get_pretty_snapshot_diff +from invokeai.backend.model_manager.load.memory_snapshot import MemorySnapshot from invokeai.backend.util.devices import choose_torch_device from invokeai.backend.util.logging import InvokeAILogger @@ -44,9 +42,6 @@ # Default is roughly enough to hold three fp16 diffusers models in RAM simultaneously DEFAULT_MAX_CACHE_SIZE = 6.0 -# amount of GPU memory to hold in reserve for use by generations (GB) -DEFAULT_MAX_VRAM_CACHE_SIZE = 2.75 - # actual size of a gig GIG = 1073741824 @@ -60,12 +55,10 @@ class ModelCache(ModelCacheBase[AnyModel]): def __init__( self, max_cache_size: float = DEFAULT_MAX_CACHE_SIZE, - max_vram_cache_size: float = DEFAULT_MAX_VRAM_CACHE_SIZE, storage_device: torch.device = torch.device("cpu"), execution_devices: Optional[Set[torch.device]] = None, precision: torch.dtype = torch.float16, sequential_offload: bool = False, - lazy_offloading: bool = True, sha_chunksize: int = 16777216, log_memory_usage: bool = False, logger: Optional[Logger] = None, @@ -77,18 +70,14 @@ def __init__( :param execution_devices: Set of torch device to load active model into [calculated] :param storage_device: Torch device to save inactive model in [torch.device('cpu')] :param precision: Precision for loaded models [torch.float16] - :param lazy_offloading: Keep model in VRAM until another model needs to be loaded :param sequential_offload: Conserve VRAM by loading and unloading each stage of the pipeline sequentially :param log_memory_usage: If True, a memory snapshot will be captured before and after every model cache operation, and the result will be logged (at debug level). There is a time cost to capturing the memory snapshots, so it is recommended to disable this feature unless you are actively inspecting the model cache's behaviour. 
""" - # allow lazy offloading only when vram cache enabled - self._lazy_offloading = lazy_offloading and max_vram_cache_size > 0 self._precision: torch.dtype = precision self._max_cache_size: float = max_cache_size - self._max_vram_cache_size: float = max_vram_cache_size self._execution_devices: Set[torch.device] = execution_devices or self._get_execution_devices() self._storage_device: torch.device = storage_device self._logger = logger or InvokeAILogger.get_logger(self.__class__.__name__) @@ -101,7 +90,7 @@ def __init__( self._lock = Lock() self._free_execution_device = BoundedSemaphore(len(self._execution_devices)) self._busy_execution_devices: Set[torch.device] = set() - + self.logger.info(f"Using rendering device(s) {[self._device_name(x) for x in self._execution_devices]}") @property @@ -109,11 +98,6 @@ def logger(self) -> Logger: """Return the logger used by the cache.""" return self._logger - @property - def lazy_offloading(self) -> bool: - """Return true if the cache is configured to lazily offload models in VRAM.""" - return self._lazy_offloading - @property def storage_device(self) -> torch.device: """Return the storage device (e.g. "CPU" for RAM).""" @@ -181,7 +165,7 @@ def put( key = self._make_cache_key(key, submodel_type) assert key not in self._cached_models - cache_record = CacheRecord(key, model, size) + cache_record = CacheRecord(key=key, model=model, size=size) self._cached_models[key] = cache_record self._cache_stack.append(key) @@ -242,87 +226,6 @@ def _make_cache_key(self, model_key: str, submodel_type: Optional[SubModelType] else: return model_key - def offload_unlocked_models(self, size_required: int) -> None: - """Move any unused models from VRAM.""" - reserved = self._max_vram_cache_size * GIG - vram_in_use = torch.cuda.memory_allocated() + size_required - self.logger.debug(f"{(vram_in_use/GIG):.2f}GB VRAM needed for models; max allowed={(reserved/GIG):.2f}GB") - for _, cache_entry in sorted(self._cached_models.items(), key=lambda x: x[1].size): - if vram_in_use <= reserved: - break - if not cache_entry.loaded: - continue - if not cache_entry.locked: - self.move_model_to_device(cache_entry, self.storage_device) - cache_entry.loaded = False - vram_in_use = torch.cuda.memory_allocated() + size_required - self.logger.debug( - f"Removing {cache_entry.key} from VRAM to free {(cache_entry.size/GIG):.2f}GB; vram free = {(torch.cuda.memory_allocated()/GIG):.2f}GB" - ) - - torch.cuda.empty_cache() - if choose_torch_device() == torch.device("mps"): - mps.empty_cache() - - def move_model_to_device(self, cache_entry: CacheRecord[AnyModel], target_device: torch.device) -> None: - """Move model into the indicated device. - - :param cache_entry: The CacheRecord for the model - :param target_device: The torch.device to move the model into - - May raise a torch.cuda.OutOfMemoryError - """ - # These attributes are not in the base ModelMixin class but in various derived classes. - # Some models don't have these attributes, in which case they run in RAM/CPU. - self.logger.debug(f"Called to move {cache_entry.key} to {target_device}") - if not (hasattr(cache_entry.model, "device") and hasattr(cache_entry.model, "to")): - return - - source_device = cache_entry.model.device - - # Note: We compare device types only so that 'cuda' == 'cuda:0'. - # This would need to be revised to support multi-GPU. 
- if torch.device(source_device).type == torch.device(target_device).type: - return - - # may raise an exception here if insufficient GPU VRAM - self._check_free_vram(target_device, cache_entry.size) - - start_model_to_time = time.time() - snapshot_before = self._capture_memory_snapshot() - cache_entry.model.to(target_device) - snapshot_after = self._capture_memory_snapshot() - end_model_to_time = time.time() - self.logger.debug( - f"Moved model '{cache_entry.key}' from {source_device} to" - f" {target_device} in {(end_model_to_time-start_model_to_time):.2f}s." - f"Estimated model size: {(cache_entry.size/GIG):.3f} GB." - f"{get_pretty_snapshot_diff(snapshot_before, snapshot_after)}" - ) - - if ( - snapshot_before is not None - and snapshot_after is not None - and snapshot_before.vram is not None - and snapshot_after.vram is not None - ): - vram_change = abs(snapshot_before.vram - snapshot_after.vram) - - # If the estimated model size does not match the change in VRAM, log a warning. - if not math.isclose( - vram_change, - cache_entry.size, - rel_tol=0.1, - abs_tol=10 * MB, - ): - self.logger.debug( - f"Moving model '{cache_entry.key}' from {source_device} to" - f" {target_device} caused an unexpected change in VRAM usage. The model's" - " estimated size may be incorrect. Estimated model size:" - f" {(cache_entry.size/GIG):.3f} GB.\n" - f"{get_pretty_snapshot_diff(snapshot_before, snapshot_after)}" - ) - def print_cuda_stats(self) -> None: """Log CUDA diagnostics.""" vram = "%4.2fG" % (torch.cuda.memory_allocated() / GIG) diff --git a/invokeai/backend/model_manager/load/model_cache/model_locker.py b/invokeai/backend/model_manager/load/model_cache/model_locker.py index fa4eb1d5bec..30c5dfa8c89 100644 --- a/invokeai/backend/model_manager/load/model_cache/model_locker.py +++ b/invokeai/backend/model_manager/load/model_cache/model_locker.py @@ -2,6 +2,7 @@ Base class and implementation of a class that moves models in and out of VRAM. """ +import copy from typing import Optional import torch @@ -41,15 +42,13 @@ def lock(self) -> AnyModel: self._cache_entry.lock() try: - if self._cache.lazy_offloading: - self._cache.offload_unlocked_models(self._cache_entry.size) - # We wait for a gpu to be free - may raise a TimeoutError self._execution_device = self._cache.acquire_execution_device(MAX_GPU_WAIT) - self._cache.move_model_to_device(self._cache_entry, self._execution_device) - self._cache_entry.loaded = True - self._cache.logger.debug(f"Locking {self._cache_entry.key} in {self._execution_device}") + model_in_gpu = copy.deepcopy(self._cache_entry.model) + if hasattr(model_in_gpu, "to"): + model_in_gpu.to(self._execution_device) + self._cache_entry.loaded = True self._cache.print_cuda_stats() except torch.cuda.OutOfMemoryError: self._cache.logger.warning("Insufficient GPU memory to load model. 
Aborting") @@ -58,7 +57,7 @@ def lock(self) -> AnyModel: except Exception: self._cache_entry.unlock() raise - return self.model + return model_in_gpu def unlock(self) -> None: """Call upon exit from context.""" @@ -68,6 +67,10 @@ def unlock(self) -> None: self._cache_entry.unlock() if self._execution_device: self._cache.release_execution_device(self._execution_device) - if not self._cache.lazy_offloading: - self._cache.offload_unlocked_models(self._cache_entry.size) - self._cache.print_cuda_stats() + + try: + torch.cuda.empty_cache() + torch.mps.empty_cache() + except Exception: + pass + self._cache.print_cuda_stats() From bd9b00a6bffa479cfb954dbea756dfc96169b6eb Mon Sep 17 00:00:00 2001 From: psychedelicious <4822129+psychedelicious@users.noreply.github.com> Date: Mon, 1 Apr 2024 07:45:36 +1100 Subject: [PATCH 03/30] fix(nodes): 100% cpu usage when processor paused Should be waiting on the resume event instead of checking it in a loop --- .../session_processor_default.py | 267 +++++++++--------- 1 file changed, 132 insertions(+), 135 deletions(-) diff --git a/invokeai/app/services/session_processor/session_processor_default.py b/invokeai/app/services/session_processor/session_processor_default.py index 2487a122de0..f921d1fdfdb 100644 --- a/invokeai/app/services/session_processor/session_processor_default.py +++ b/invokeai/app/services/session_processor/session_processor_default.py @@ -122,152 +122,149 @@ def _process( # Middle processor try block; any unhandled exception is a non-fatal processor error try: # If we are paused, wait for resume event - if resume_event.is_set(): - # Get the next session to process - self._queue_item = self._invoker.services.session_queue.dequeue() - - if self._queue_item is not None: - self._invoker.services.logger.debug(f"Executing queue item {self._queue_item.item_id}") - cancel_event.clear() - - # If profiling is enabled, start the profiler - if self._profiler is not None: - self._profiler.start(profile_id=self._queue_item.session_id) - - # Prepare invocations and take the first - self._invocation = self._queue_item.session.next() - - # Loop over invocations until the session is complete or canceled - while self._invocation is not None and not cancel_event.is_set(): - # get the source node id to provide to clients (the prepared node id is not as useful) - source_invocation_id = self._queue_item.session.prepared_source_mapping[ - self._invocation.id - ] - - # Send starting event - self._invoker.services.events.emit_invocation_started( - queue_batch_id=self._queue_item.batch_id, - queue_item_id=self._queue_item.item_id, - queue_id=self._queue_item.queue_id, - graph_execution_state_id=self._queue_item.session_id, - node=self._invocation.model_dump(), - source_node_id=source_invocation_id, - ) + resume_event.wait() - # Innermost processor try block; any unhandled exception is an invocation error & will fail the graph - try: - with self._invoker.services.performance_statistics.collect_stats( - self._invocation, self._queue_item.session.id - ): - # Build invocation context (the node-facing API) - data = InvocationContextData( - invocation=self._invocation, - source_invocation_id=source_invocation_id, - queue_item=self._queue_item, - ) - context = build_invocation_context( - data=data, - services=self._invoker.services, - cancel_event=self._cancel_event, - ) - - # Invoke the node - outputs = self._invocation.invoke_internal( - context=context, services=self._invoker.services - ) - - # Save outputs and history - 
self._queue_item.session.complete(self._invocation.id, outputs) - - # Send complete event - self._invoker.services.events.emit_invocation_complete( - queue_batch_id=self._queue_item.batch_id, - queue_item_id=self._queue_item.item_id, - queue_id=self._queue_item.queue_id, - graph_execution_state_id=self._queue_item.session.id, - node=self._invocation.model_dump(), - source_node_id=source_invocation_id, - result=outputs.model_dump(), - ) - - except KeyboardInterrupt: - # TODO(MM2): Create an event for this - pass - - except CanceledException: - # When the user cancels the graph, we first set the cancel event. The event is checked - # between invocations, in this loop. Some invocations are long-running, and we need to - # be able to cancel them mid-execution. - # - # For example, denoising is a long-running invocation with many steps. A step callback - # is executed after each step. This step callback checks if the canceled event is set, - # then raises a CanceledException to stop execution immediately. - # - # When we get a CanceledException, we don't need to do anything - just pass and let the - # loop go to its next iteration, and the cancel event will be handled correctly. - pass - - except Exception as e: - error = traceback.format_exc() - - # Save error - self._queue_item.session.set_node_error(self._invocation.id, error) - self._invoker.services.logger.error( - f"Error while invoking session {self._queue_item.session_id}, invocation {self._invocation.id} ({self._invocation.get_type()}):\n{e}" + # Get the next session to process + self._queue_item = self._invoker.services.session_queue.dequeue() + + if self._queue_item is not None: + self._invoker.services.logger.debug(f"Executing queue item {self._queue_item.item_id}") + cancel_event.clear() + + # If profiling is enabled, start the profiler + if self._profiler is not None: + self._profiler.start(profile_id=self._queue_item.session_id) + + # Prepare invocations and take the first + self._invocation = self._queue_item.session.next() + + # Loop over invocations until the session is complete or canceled + while self._invocation is not None and not cancel_event.is_set(): + # get the source node id to provide to clients (the prepared node id is not as useful) + source_invocation_id = self._queue_item.session.prepared_source_mapping[self._invocation.id] + + # Send starting event + self._invoker.services.events.emit_invocation_started( + queue_batch_id=self._queue_item.batch_id, + queue_item_id=self._queue_item.item_id, + queue_id=self._queue_item.queue_id, + graph_execution_state_id=self._queue_item.session_id, + node=self._invocation.model_dump(), + source_node_id=source_invocation_id, + ) + + # Innermost processor try block; any unhandled exception is an invocation error & will fail the graph + try: + with self._invoker.services.performance_statistics.collect_stats( + self._invocation, self._queue_item.session.id + ): + # Build invocation context (the node-facing API) + data = InvocationContextData( + invocation=self._invocation, + source_invocation_id=source_invocation_id, + queue_item=self._queue_item, + ) + context = build_invocation_context( + data=data, + services=self._invoker.services, + cancel_event=self._cancel_event, ) - self._invoker.services.logger.error(error) - # Send error event - self._invoker.services.events.emit_invocation_error( - queue_batch_id=self._queue_item.session_id, - queue_item_id=self._queue_item.item_id, - queue_id=self._queue_item.queue_id, - graph_execution_state_id=self._queue_item.session.id, - 
node=self._invocation.model_dump(), - source_node_id=source_invocation_id, - error_type=e.__class__.__name__, - error=error, + # Invoke the node + outputs = self._invocation.invoke_internal( + context=context, services=self._invoker.services ) - pass - # The session is complete if the all invocations are complete or there was an error - if self._queue_item.session.is_complete() or cancel_event.is_set(): + # Save outputs and history + self._queue_item.session.complete(self._invocation.id, outputs) + # Send complete event - self._invoker.services.events.emit_graph_execution_complete( + self._invoker.services.events.emit_invocation_complete( queue_batch_id=self._queue_item.batch_id, queue_item_id=self._queue_item.item_id, queue_id=self._queue_item.queue_id, graph_execution_state_id=self._queue_item.session.id, + node=self._invocation.model_dump(), + source_node_id=source_invocation_id, + result=outputs.model_dump(), + ) + + except KeyboardInterrupt: + # TODO(MM2): Create an event for this + pass + + except CanceledException: + # When the user cancels the graph, we first set the cancel event. The event is checked + # between invocations, in this loop. Some invocations are long-running, and we need to + # be able to cancel them mid-execution. + # + # For example, denoising is a long-running invocation with many steps. A step callback + # is executed after each step. This step callback checks if the canceled event is set, + # then raises a CanceledException to stop execution immediately. + # + # When we get a CanceledException, we don't need to do anything - just pass and let the + # loop go to its next iteration, and the cancel event will be handled correctly. + pass + + except Exception as e: + error = traceback.format_exc() + + # Save error + self._queue_item.session.set_node_error(self._invocation.id, error) + self._invoker.services.logger.error( + f"Error while invoking session {self._queue_item.session_id}, invocation {self._invocation.id} ({self._invocation.get_type()}):\n{e}" + ) + self._invoker.services.logger.error(error) + + # Send error event + self._invoker.services.events.emit_invocation_error( + queue_batch_id=self._queue_item.session_id, + queue_item_id=self._queue_item.item_id, + queue_id=self._queue_item.queue_id, + graph_execution_state_id=self._queue_item.session.id, + node=self._invocation.model_dump(), + source_node_id=source_invocation_id, + error_type=e.__class__.__name__, + error=error, + ) + pass + + # The session is complete if the all invocations are complete or there was an error + if self._queue_item.session.is_complete() or cancel_event.is_set(): + # Send complete event + self._invoker.services.events.emit_graph_execution_complete( + queue_batch_id=self._queue_item.batch_id, + queue_item_id=self._queue_item.item_id, + queue_id=self._queue_item.queue_id, + graph_execution_state_id=self._queue_item.session.id, + ) + # If we are profiling, stop the profiler and dump the profile & stats + if self._profiler: + profile_path = self._profiler.stop() + stats_path = profile_path.with_suffix(".json") + self._invoker.services.performance_statistics.dump_stats( + graph_execution_state_id=self._queue_item.session.id, output_path=stats_path ) - # If we are profiling, stop the profiler and dump the profile & stats - if self._profiler: - profile_path = self._profiler.stop() - stats_path = profile_path.with_suffix(".json") - self._invoker.services.performance_statistics.dump_stats( - graph_execution_state_id=self._queue_item.session.id, output_path=stats_path - ) - # We'll get a 
GESStatsNotFoundError if we try to log stats for an untracked graph, but in the processor - # we don't care about that - suppress the error. - with suppress(GESStatsNotFoundError): - self._invoker.services.performance_statistics.log_stats( - self._queue_item.session.id - ) - self._invoker.services.performance_statistics.reset_stats() - - # Set the invocation to None to prepare for the next session - self._invocation = None - else: - # Prepare the next invocation - self._invocation = self._queue_item.session.next() - - # The session is complete, immediately poll for next session - self._queue_item = None - poll_now_event.set() - else: - # The queue was empty, wait for next polling interval or event to try again - self._invoker.services.logger.debug("Waiting for next polling interval or event") - poll_now_event.wait(self._polling_interval) - continue + # We'll get a GESStatsNotFoundError if we try to log stats for an untracked graph, but in the processor + # we don't care about that - suppress the error. + with suppress(GESStatsNotFoundError): + self._invoker.services.performance_statistics.log_stats(self._queue_item.session.id) + self._invoker.services.performance_statistics.reset_stats() + + # Set the invocation to None to prepare for the next session + self._invocation = None + else: + # Prepare the next invocation + self._invocation = self._queue_item.session.next() + + # The session is complete, immediately poll for next session + self._queue_item = None + poll_now_event.set() + else: + # The queue was empty, wait for next polling interval or event to try again + self._invoker.services.logger.debug("Waiting for next polling interval or event") + poll_now_event.wait(self._polling_interval) + continue except Exception: # Non-fatal error in processor self._invoker.services.logger.error( From a1dcab9c38567b5102d288f60c4d5ad5542b272b Mon Sep 17 00:00:00 2001 From: Lincoln Stein Date: Sun, 31 Mar 2024 16:52:01 -0400 Subject: [PATCH 04/30] remove references to vram_cache in tests --- invokeai/app/services/model_manager/model_manager_default.py | 1 - tests/backend/model_manager/model_manager_fixtures.py | 1 - tests/test_config.py | 1 - 3 files changed, 3 deletions(-) diff --git a/invokeai/app/services/model_manager/model_manager_default.py b/invokeai/app/services/model_manager/model_manager_default.py index e9035a3a809..e2da8e2712a 100644 --- a/invokeai/app/services/model_manager/model_manager_default.py +++ b/invokeai/app/services/model_manager/model_manager_default.py @@ -80,7 +80,6 @@ def build_model_manager( ram_cache = ModelCache( max_cache_size=app_config.ram, - max_vram_cache_size=app_config.vram, logger=logger, execution_devices=execution_devices, ) diff --git a/tests/backend/model_manager/model_manager_fixtures.py b/tests/backend/model_manager/model_manager_fixtures.py index 9ee563bc983..5c0cae5e4b2 100644 --- a/tests/backend/model_manager/model_manager_fixtures.py +++ b/tests/backend/model_manager/model_manager_fixtures.py @@ -109,7 +109,6 @@ def mm2_loader(mm2_app_config: InvokeAIAppConfig) -> ModelLoadServiceBase: ram_cache = ModelCache( logger=InvokeAILogger.get_logger(), max_cache_size=mm2_app_config.ram, - max_vram_cache_size=mm2_app_config.vram, ) convert_cache = ModelConvertCache(mm2_app_config.convert_cache_path) return ModelLoadService( diff --git a/tests/test_config.py b/tests/test_config.py index f779df6a02f..7e28925aafc 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -92,7 +92,6 @@ def test_migrate_v3_config_from_file(tmp_path: Path, patch_rootdir: None): 
assert config.host == "192.168.1.1" assert config.port == 8080 assert config.ram == 100 - assert config.vram == 50 assert config.legacy_models_yaml_path == Path("/custom/models.yaml") # This should be stripped out assert not hasattr(config, "esrgan") From 32d3e4dc5cd4a5ed6810da60fb9f6417bcb5058b Mon Sep 17 00:00:00 2001 From: psychedelicious <4822129+psychedelicious@users.noreply.github.com> Date: Mon, 1 Apr 2024 07:55:42 +1100 Subject: [PATCH 05/30] feat(nodes): simplify processor loop with an early continue Prefer an early return/continue to reduce the indentation of the processor loop. Easier to read. There are other ways to improve its structure but at first glance, they seem to involve changing the logic in scarier ways. --- .../session_processor_default.py | 241 +++++++++--------- 1 file changed, 121 insertions(+), 120 deletions(-) diff --git a/invokeai/app/services/session_processor/session_processor_default.py b/invokeai/app/services/session_processor/session_processor_default.py index f921d1fdfdb..e7caabdf903 100644 --- a/invokeai/app/services/session_processor/session_processor_default.py +++ b/invokeai/app/services/session_processor/session_processor_default.py @@ -127,139 +127,140 @@ def _process( # Get the next session to process self._queue_item = self._invoker.services.session_queue.dequeue() - if self._queue_item is not None: - self._invoker.services.logger.debug(f"Executing queue item {self._queue_item.item_id}") - cancel_event.clear() - - # If profiling is enabled, start the profiler - if self._profiler is not None: - self._profiler.start(profile_id=self._queue_item.session_id) - - # Prepare invocations and take the first - self._invocation = self._queue_item.session.next() + if self._queue_item is None: + # The queue was empty, wait for next polling interval or event to try again + self._invoker.services.logger.debug("Waiting for next polling interval or event") + poll_now_event.wait(self._polling_interval) + continue - # Loop over invocations until the session is complete or canceled - while self._invocation is not None and not cancel_event.is_set(): - # get the source node id to provide to clients (the prepared node id is not as useful) - source_invocation_id = self._queue_item.session.prepared_source_mapping[self._invocation.id] + self._invoker.services.logger.debug(f"Executing queue item {self._queue_item.item_id}") + cancel_event.clear() + + # If profiling is enabled, start the profiler + if self._profiler is not None: + self._profiler.start(profile_id=self._queue_item.session_id) + + # Prepare invocations and take the first + self._invocation = self._queue_item.session.next() + + # Loop over invocations until the session is complete or canceled + while self._invocation is not None and not cancel_event.is_set(): + # get the source node id to provide to clients (the prepared node id is not as useful) + source_invocation_id = self._queue_item.session.prepared_source_mapping[self._invocation.id] + + # Send starting event + self._invoker.services.events.emit_invocation_started( + queue_batch_id=self._queue_item.batch_id, + queue_item_id=self._queue_item.item_id, + queue_id=self._queue_item.queue_id, + graph_execution_state_id=self._queue_item.session_id, + node=self._invocation.model_dump(), + source_node_id=source_invocation_id, + ) - # Send starting event - self._invoker.services.events.emit_invocation_started( - queue_batch_id=self._queue_item.batch_id, - queue_item_id=self._queue_item.item_id, - queue_id=self._queue_item.queue_id, - 
graph_execution_state_id=self._queue_item.session_id, - node=self._invocation.model_dump(), - source_node_id=source_invocation_id, - ) + # Innermost processor try block; any unhandled exception is an invocation error & will fail the graph + try: + with self._invoker.services.performance_statistics.collect_stats( + self._invocation, self._queue_item.session.id + ): + # Build invocation context (the node-facing API) + data = InvocationContextData( + invocation=self._invocation, + source_invocation_id=source_invocation_id, + queue_item=self._queue_item, + ) + context = build_invocation_context( + data=data, + services=self._invoker.services, + cancel_event=self._cancel_event, + ) - # Innermost processor try block; any unhandled exception is an invocation error & will fail the graph - try: - with self._invoker.services.performance_statistics.collect_stats( - self._invocation, self._queue_item.session.id - ): - # Build invocation context (the node-facing API) - data = InvocationContextData( - invocation=self._invocation, - source_invocation_id=source_invocation_id, - queue_item=self._queue_item, - ) - context = build_invocation_context( - data=data, - services=self._invoker.services, - cancel_event=self._cancel_event, - ) - - # Invoke the node - outputs = self._invocation.invoke_internal( - context=context, services=self._invoker.services - ) - - # Save outputs and history - self._queue_item.session.complete(self._invocation.id, outputs) - - # Send complete event - self._invoker.services.events.emit_invocation_complete( - queue_batch_id=self._queue_item.batch_id, - queue_item_id=self._queue_item.item_id, - queue_id=self._queue_item.queue_id, - graph_execution_state_id=self._queue_item.session.id, - node=self._invocation.model_dump(), - source_node_id=source_invocation_id, - result=outputs.model_dump(), - ) - - except KeyboardInterrupt: - # TODO(MM2): Create an event for this - pass - - except CanceledException: - # When the user cancels the graph, we first set the cancel event. The event is checked - # between invocations, in this loop. Some invocations are long-running, and we need to - # be able to cancel them mid-execution. - # - # For example, denoising is a long-running invocation with many steps. A step callback - # is executed after each step. This step callback checks if the canceled event is set, - # then raises a CanceledException to stop execution immediately. - # - # When we get a CanceledException, we don't need to do anything - just pass and let the - # loop go to its next iteration, and the cancel event will be handled correctly. 
- pass - - except Exception as e: - error = traceback.format_exc() - - # Save error - self._queue_item.session.set_node_error(self._invocation.id, error) - self._invoker.services.logger.error( - f"Error while invoking session {self._queue_item.session_id}, invocation {self._invocation.id} ({self._invocation.get_type()}):\n{e}" + # Invoke the node + outputs = self._invocation.invoke_internal( + context=context, services=self._invoker.services ) - self._invoker.services.logger.error(error) - # Send error event - self._invoker.services.events.emit_invocation_error( - queue_batch_id=self._queue_item.session_id, + # Save outputs and history + self._queue_item.session.complete(self._invocation.id, outputs) + + # Send complete event + self._invoker.services.events.emit_invocation_complete( + queue_batch_id=self._queue_item.batch_id, queue_item_id=self._queue_item.item_id, queue_id=self._queue_item.queue_id, graph_execution_state_id=self._queue_item.session.id, node=self._invocation.model_dump(), source_node_id=source_invocation_id, - error_type=e.__class__.__name__, - error=error, + result=outputs.model_dump(), ) - pass - # The session is complete if the all invocations are complete or there was an error - if self._queue_item.session.is_complete() or cancel_event.is_set(): - # Send complete event - self._invoker.services.events.emit_graph_execution_complete( - queue_batch_id=self._queue_item.batch_id, - queue_item_id=self._queue_item.item_id, - queue_id=self._queue_item.queue_id, - graph_execution_state_id=self._queue_item.session.id, + except KeyboardInterrupt: + # TODO(MM2): Create an event for this + pass + + except CanceledException: + # When the user cancels the graph, we first set the cancel event. The event is checked + # between invocations, in this loop. Some invocations are long-running, and we need to + # be able to cancel them mid-execution. + # + # For example, denoising is a long-running invocation with many steps. A step callback + # is executed after each step. This step callback checks if the canceled event is set, + # then raises a CanceledException to stop execution immediately. + # + # When we get a CanceledException, we don't need to do anything - just pass and let the + # loop go to its next iteration, and the cancel event will be handled correctly. 
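The comment above describes the cooperative-cancellation contract: long-running nodes poll the processor's cancel event from their step callbacks and bail out by raising CanceledException. A minimal sketch of such a callback (hypothetical; the real denoising callback carries more state than a step counter):

    from threading import Event

    from invokeai.app.services.session_processor.session_processor_common import CanceledException

    cancel_event = Event()  # stands in for the processor's self._cancel_event

    def on_step(step: int, total_steps: int) -> None:
        """Hypothetical per-step callback for a long-running invocation."""
        if cancel_event.is_set():
            # Raising here unwinds the invocation immediately; the loop above
            # catches CanceledException and simply moves on.
            raise CanceledException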
+ pass + + except Exception as e: + error = traceback.format_exc() + + # Save error + self._queue_item.session.set_node_error(self._invocation.id, error) + self._invoker.services.logger.error( + f"Error while invoking session {self._queue_item.session_id}, invocation {self._invocation.id} ({self._invocation.get_type()}):\n{e}" + ) + self._invoker.services.logger.error(error) + + # Send error event + self._invoker.services.events.emit_invocation_error( + queue_batch_id=self._queue_item.session_id, + queue_item_id=self._queue_item.item_id, + queue_id=self._queue_item.queue_id, + graph_execution_state_id=self._queue_item.session.id, + node=self._invocation.model_dump(), + source_node_id=source_invocation_id, + error_type=e.__class__.__name__, + error=error, + ) + pass + + # The session is complete if the all invocations are complete or there was an error + if self._queue_item.session.is_complete() or cancel_event.is_set(): + # Send complete event + self._invoker.services.events.emit_graph_execution_complete( + queue_batch_id=self._queue_item.batch_id, + queue_item_id=self._queue_item.item_id, + queue_id=self._queue_item.queue_id, + graph_execution_state_id=self._queue_item.session.id, + ) + # If we are profiling, stop the profiler and dump the profile & stats + if self._profiler: + profile_path = self._profiler.stop() + stats_path = profile_path.with_suffix(".json") + self._invoker.services.performance_statistics.dump_stats( + graph_execution_state_id=self._queue_item.session.id, output_path=stats_path ) - # If we are profiling, stop the profiler and dump the profile & stats - if self._profiler: - profile_path = self._profiler.stop() - stats_path = profile_path.with_suffix(".json") - self._invoker.services.performance_statistics.dump_stats( - graph_execution_state_id=self._queue_item.session.id, output_path=stats_path - ) - # We'll get a GESStatsNotFoundError if we try to log stats for an untracked graph, but in the processor - # we don't care about that - suppress the error. - with suppress(GESStatsNotFoundError): - self._invoker.services.performance_statistics.log_stats(self._queue_item.session.id) - self._invoker.services.performance_statistics.reset_stats() - - # Set the invocation to None to prepare for the next session - self._invocation = None - else: - # Prepare the next invocation - self._invocation = self._queue_item.session.next() - - # The session is complete, immediately poll for next session - self._queue_item = None - poll_now_event.set() + # We'll get a GESStatsNotFoundError if we try to log stats for an untracked graph, but in the processor + # we don't care about that - suppress the error. 
+ with suppress(GESStatsNotFoundError): + self._invoker.services.performance_statistics.log_stats(self._queue_item.session.id) + self._invoker.services.performance_statistics.reset_stats() + + # Set the invocation to None to prepare for the next session + self._invocation = None + else: + # Prepare the next invocation + self._invocation = self._queue_item.session.next() else: # The queue was empty, wait for next polling interval or event to try again self._invoker.services.logger.debug("Waiting for next polling interval or event") From 9336a076deb2b9632f3b29c478e10ea6922e652b Mon Sep 17 00:00:00 2001 From: Lincoln Stein Date: Sun, 31 Mar 2024 16:58:56 -0400 Subject: [PATCH 06/30] add locking around thread-critical sections --- .../load/model_cache/model_cache_default.py | 68 ++++++++++--------- 1 file changed, 36 insertions(+), 32 deletions(-) diff --git a/invokeai/backend/model_manager/load/model_cache/model_cache_default.py b/invokeai/backend/model_manager/load/model_cache/model_cache_default.py index 82935ef7869..90090b522d0 100644 --- a/invokeai/backend/model_manager/load/model_cache/model_cache_default.py +++ b/invokeai/backend/model_manager/load/model_cache/model_cache_default.py @@ -20,6 +20,7 @@ import gc import sys +import threading from contextlib import suppress from logging import Logger from threading import BoundedSemaphore, Lock @@ -80,6 +81,7 @@ def __init__( self._max_cache_size: float = max_cache_size self._execution_devices: Set[torch.device] = execution_devices or self._get_execution_devices() self._storage_device: torch.device = storage_device + self._lock = threading.Lock() self._logger = logger or InvokeAILogger.get_logger(self.__class__.__name__) self._log_memory_usage = log_memory_usage self._stats: Optional[CacheStats] = None @@ -162,12 +164,13 @@ def put( submodel_type: Optional[SubModelType] = None, ) -> None: """Store model under key and optional submodel_type.""" - key = self._make_cache_key(key, submodel_type) - assert key not in self._cached_models + with self._lock: + key = self._make_cache_key(key, submodel_type) + assert key not in self._cached_models - cache_record = CacheRecord(key=key, model=model, size=size) - self._cached_models[key] = cache_record - self._cache_stack.append(key) + cache_record = CacheRecord(key=key, model=model, size=size) + self._cached_models[key] = cache_record + self._cache_stack.append(key) def get( self, @@ -185,35 +188,36 @@ def get( This may raise an IndexError if the model is not in the cache. 
""" - key = self._make_cache_key(key, submodel_type) - if key in self._cached_models: - if self.stats: - self.stats.hits += 1 - else: + with self._lock: + key = self._make_cache_key(key, submodel_type) + if key in self._cached_models: + if self.stats: + self.stats.hits += 1 + else: + if self.stats: + self.stats.misses += 1 + raise IndexError(f"The model with key {key} is not in the cache.") + + cache_entry = self._cached_models[key] + + # more stats if self.stats: - self.stats.misses += 1 - raise IndexError(f"The model with key {key} is not in the cache.") - - cache_entry = self._cached_models[key] - - # more stats - if self.stats: - stats_name = stats_name or key - self.stats.cache_size = int(self._max_cache_size * GIG) - self.stats.high_watermark = max(self.stats.high_watermark, self.cache_size()) - self.stats.in_cache = len(self._cached_models) - self.stats.loaded_model_sizes[stats_name] = max( - self.stats.loaded_model_sizes.get(stats_name, 0), cache_entry.size - ) + stats_name = stats_name or key + self.stats.cache_size = int(self._max_cache_size * GIG) + self.stats.high_watermark = max(self.stats.high_watermark, self.cache_size()) + self.stats.in_cache = len(self._cached_models) + self.stats.loaded_model_sizes[stats_name] = max( + self.stats.loaded_model_sizes.get(stats_name, 0), cache_entry.size + ) - # this moves the entry to the top (right end) of the stack - with suppress(Exception): - self._cache_stack.remove(key) - self._cache_stack.append(key) - return ModelLocker( - cache=self, - cache_entry=cache_entry, - ) + # this moves the entry to the top (right end) of the stack + with suppress(Exception): + self._cache_stack.remove(key) + self._cache_stack.append(key) + return ModelLocker( + cache=self, + cache_entry=cache_entry, + ) def _capture_memory_snapshot(self) -> Optional[MemorySnapshot]: if self._log_memory_usage: From 9df0980c4612f20bf3a52998a1ec8d7887d1a4fc Mon Sep 17 00:00:00 2001 From: Lincoln Stein Date: Mon, 1 Apr 2024 00:07:47 -0400 Subject: [PATCH 07/30] parallel processing working on single-GPU, not tested on multi --- invokeai/app/api/dependencies.py | 3 - invokeai/app/invocations/latent.py | 1 - .../app/services/config/config_default.py | 4 +- invokeai/app/services/invocation_services.py | 3 - .../model_manager/model_manager_default.py | 10 +- .../session_processor_default.py | 387 ++++++++++-------- .../load/model_cache/model_cache_default.py | 77 +++- tests/conftest.py | 2 - 8 files changed, 278 insertions(+), 209 deletions(-) diff --git a/invokeai/app/api/dependencies.py b/invokeai/app/api/dependencies.py index 9a6c7416f69..7332b35c086 100644 --- a/invokeai/app/api/dependencies.py +++ b/invokeai/app/api/dependencies.py @@ -23,7 +23,6 @@ from ..services.images.images_default import ImageService from ..services.invocation_cache.invocation_cache_memory import MemoryInvocationCache from ..services.invocation_services import InvocationServices -from ..services.invocation_stats.invocation_stats_default import InvocationStatsService from ..services.invoker import Invoker from ..services.model_images.model_images_default import ModelImageFileStorageDisk from ..services.model_manager.model_manager_default import ModelManagerService @@ -102,7 +101,6 @@ def initialize(config: InvokeAIAppConfig, event_handler_id: int, logger: Logger events=events, ) names = SimpleNameService() - performance_statistics = InvocationStatsService() session_processor = DefaultSessionProcessor() session_queue = SqliteSessionQueue(db=db) urls = LocalUrlService() @@ -125,7 +123,6 @@ def 
initialize(config: InvokeAIAppConfig, event_handler_id: int, logger: Logger model_manager=model_manager, download_queue=download_queue_service, names=names, - performance_statistics=performance_statistics, session_processor=session_processor, session_queue=session_queue, urls=urls, diff --git a/invokeai/app/invocations/latent.py b/invokeai/app/invocations/latent.py index bc79efdeba4..7845cbba03a 100644 --- a/invokeai/app/invocations/latent.py +++ b/invokeai/app/invocations/latent.py @@ -749,7 +749,6 @@ def _lora_loader() -> Iterator[Tuple[LoRAModelRaw, float]]: mask = mask.to(device=unet.device, dtype=unet.dtype) if masked_latents is not None: masked_latents = masked_latents.to(device=unet.device, dtype=unet.dtype) - scheduler = get_scheduler( context=context, scheduler_info=self.unet.scheduler, diff --git a/invokeai/app/services/config/config_default.py b/invokeai/app/services/config/config_default.py index 2e9578a56a2..258cd58e8da 100644 --- a/invokeai/app/services/config/config_default.py +++ b/invokeai/app/services/config/config_default.py @@ -24,7 +24,7 @@ LEGACY_INIT_FILE = Path("invokeai.init") DEFAULT_RAM_CACHE = 10.0 DEFAULT_CONVERT_CACHE = 20.0 -DEVICE = Literal["auto", "cpu", "cuda", "cuda:1", "mps"] +DEVICE = Literal["auto", "cpu", "cuda:0", "cuda:1", "cuda:2", "cuda:3", "cuda:4", "cuda:5", "mps"] PRECISION = Literal["auto", "float16", "bfloat16", "float32", "autocast"] ATTENTION_TYPE = Literal["auto", "normal", "xformers", "sliced", "torch-sdp"] ATTENTION_SLICE_SIZE = Literal["auto", "balanced", "max", 1, 2, 3, 4, 5, 6, 7, 8] @@ -169,6 +169,7 @@ class InvokeAIAppConfig(BaseSettings): # DEVICE device: DEVICE = Field(default="auto", description="Preferred execution device. `auto` will choose the device depending on the hardware platform and the installed torch capabilities.") + devices: Optional[list[DEVICE]] = Field(default=None, description="List of execution devices; will override default device selected.") precision: PRECISION = Field(default="auto", description="Floating point precision. `float16` will consume half the memory of `float32` but produce slightly lower-quality images. The `auto` setting will guess the proper precision based on your video card and operating system.") # GENERATION @@ -178,6 +179,7 @@ class InvokeAIAppConfig(BaseSettings): force_tiled_decode: bool = Field(default=False, description="Whether to enable tiled VAE decode (reduces memory consumption with some performance penalty).") pil_compress_level: int = Field(default=1, description="The compress_level setting of PIL.Image.save(), used for PNG encoding. All settings are lossless. 0 = no compression, 1 = fastest with slightly larger filesize, 9 = slowest with smallest filesize. 1 is typically the best setting.") max_queue_size: int = Field(default=10000, gt=0, description="Maximum number of items in the session queue.") + max_threads: int = Field(default=4, description="Maximum number of session queue execution threads.") # NODES allow_nodes: Optional[list[str]] = Field(default=None, description="List of nodes to allow. 
Omit to allow all.") diff --git a/invokeai/app/services/invocation_services.py b/invokeai/app/services/invocation_services.py index f4fce6098f3..0e1ec123ca5 100644 --- a/invokeai/app/services/invocation_services.py +++ b/invokeai/app/services/invocation_services.py @@ -24,7 +24,6 @@ from .image_records.image_records_base import ImageRecordStorageBase from .images.images_base import ImageServiceABC from .invocation_cache.invocation_cache_base import InvocationCacheBase - from .invocation_stats.invocation_stats_base import InvocationStatsServiceBase from .model_images.model_images_base import ModelImageFileStorageBase from .model_manager.model_manager_base import ModelManagerServiceBase from .names.names_base import NameServiceBase @@ -53,7 +52,6 @@ def __init__( model_images: "ModelImageFileStorageBase", model_manager: "ModelManagerServiceBase", download_queue: "DownloadQueueServiceBase", - performance_statistics: "InvocationStatsServiceBase", session_queue: "SessionQueueBase", session_processor: "SessionProcessorBase", invocation_cache: "InvocationCacheBase", @@ -77,7 +75,6 @@ def __init__( self.model_images = model_images self.model_manager = model_manager self.download_queue = download_queue - self.performance_statistics = performance_statistics self.session_queue = session_queue self.session_processor = session_processor self.invocation_cache = invocation_cache diff --git a/invokeai/app/services/model_manager/model_manager_default.py b/invokeai/app/services/model_manager/model_manager_default.py index e2da8e2712a..241259c803b 100644 --- a/invokeai/app/services/model_manager/model_manager_default.py +++ b/invokeai/app/services/model_manager/model_manager_default.py @@ -1,8 +1,6 @@ # Copyright (c) 2023 Lincoln D. Stein and the InvokeAI Team """Implementation of ModelManagerServiceBase.""" -from typing import Optional, Set - import torch from typing_extensions import Self @@ -68,7 +66,6 @@ def build_model_manager( model_record_service: ModelRecordServiceBase, download_queue: DownloadQueueServiceBase, events: EventServiceBase, - execution_devices: Optional[Set[torch.device]] = None, ) -> Self: """ Construct the model manager service instance. 
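The two new settings work together: `devices` pins the model cache to an explicit set of GPUs (or falls back to automatic selection), while `max_threads` caps the number of session worker threads. A small sketch of the device-resolution rule that the next hunk applies; the values here are made up for illustration:

    import torch

    # Hypothetical settings as they might appear in invokeai.yaml
    configured_devices = ["cuda:0", "cuda:1"]   # InvokeAIAppConfig.devices
    max_threads = 2                             # InvokeAIAppConfig.max_threads

    # Equivalent to the resolution logic in build_model_manager():
    execution_devices = (
        None  # None means "let the model cache enumerate devices itself"
        if configured_devices is None or "auto" in configured_devices
        else {torch.device(d) for d in configured_devices}
    )

The cache then creates its BoundedSemaphore with one slot per execution device, so the number of threads that can hold a GPU at any moment is bounded by the device count rather than by `max_threads`.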
@@ -78,6 +75,13 @@ def build_model_manager( logger = InvokeAILogger.get_logger(cls.__name__) logger.setLevel(app_config.log_level.upper()) + execution_devices = ( + None + if app_config.devices is None + else None + if "auto" in app_config.devices + else {torch.device(x) for x in app_config.devices} + ) ram_cache = ModelCache( max_cache_size=app_config.ram, logger=logger, diff --git a/invokeai/app/services/session_processor/session_processor_default.py b/invokeai/app/services/session_processor/session_processor_default.py index d6791fbd57d..3088d99c5de 100644 --- a/invokeai/app/services/session_processor/session_processor_default.py +++ b/invokeai/app/services/session_processor/session_processor_default.py @@ -1,8 +1,9 @@ import traceback from contextlib import suppress -from threading import BoundedSemaphore, Thread +from queue import Queue +from threading import BoundedSemaphore, Lock, Thread from threading import Event as ThreadEvent -from typing import Optional +from typing import Optional, Set from fastapi_events.handlers.local import local_handler from fastapi_events.typing import Event as FastAPIEvent @@ -10,6 +11,7 @@ from invokeai.app.invocations.baseinvocation import BaseInvocation from invokeai.app.services.events.events_base import EventServiceBase from invokeai.app.services.invocation_stats.invocation_stats_common import GESStatsNotFoundError +from invokeai.app.services.invocation_stats.invocation_stats_default import InvocationStatsService from invokeai.app.services.session_processor.session_processor_common import CanceledException from invokeai.app.services.session_queue.session_queue_common import SessionQueueItem from invokeai.app.services.shared.invocation_context import InvocationContextData, build_invocation_context @@ -23,7 +25,8 @@ class DefaultSessionProcessor(SessionProcessorBase): def start(self, invoker: Invoker, polling_interval: int = 1) -> None: self._invoker: Invoker = invoker - self._queue_item: Optional[SessionQueueItem] = None + self._queue_items: Set[int] = set() + self._sessions_to_cancel: Set[int] = set() self._invocation: Optional[BaseInvocation] = None self._resume_event = ThreadEvent() @@ -33,10 +36,14 @@ def start(self, invoker: Invoker, polling_interval: int = 1) -> None: local_handler.register(event_name=EventServiceBase.queue_event, _func=self._on_queue_event) - self._thread_limit = self._invoker.services.model_manager.load.gpu_count + self._thread_limit = 1 self._thread_semaphore = BoundedSemaphore(self._thread_limit) self._polling_interval = polling_interval + self._worker_thread_count = self._invoker.services.configuration.max_threads + self._session_worker_queue: Queue[SessionQueueItem] = Queue() + self._process_lock = Lock() + # If profiling is enabled, create a profiler. The same profiler will be used for all sessions. Internally, # the profiler will create a new profile for each session. 
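A few lines up, `start()` now sizes a worker pool from `max_threads` and creates a shared `Queue` of session queue items. Stripped of the service plumbing, the dispatch shape is the familiar producer/consumer pattern (illustrative names only, not the service's own):

    from queue import Queue
    from threading import Thread

    work_queue: Queue[str] = Queue()

    def worker() -> None:
        while True:
            item = work_queue.get()   # blocks until the polling thread enqueues something
            try:
                ...                   # run the session graph for this item
            finally:
                work_queue.task_done()

    for _ in range(4):                # the patch uses configuration.max_threads (default 4)
        Thread(target=worker, daemon=True).start()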
self._profiler = ( @@ -49,6 +56,7 @@ def start(self, invoker: Invoker, polling_interval: int = 1) -> None: else None ) + # main session processor loop - single thread self._thread = Thread( name="session_processor", target=self._process, @@ -61,6 +69,16 @@ def start(self, invoker: Invoker, polling_interval: int = 1) -> None: ) self._thread.start() + # Session processor workers - multithreaded + self._invoker.services.logger.debug(f"Starting {self._worker_thread_count} session processing threads.") + for _i in range(0, self._worker_thread_count): + worker = Thread( + name="session_worker", + target=self._process_next_session, + daemon=True, + ) + worker.start() + def stop(self, *args, **kwargs) -> None: self._stop_event.set() @@ -70,18 +88,12 @@ def _poll_now(self) -> None: async def _on_queue_event(self, event: FastAPIEvent) -> None: event_name = event[1]["event"] - if ( - event_name == "session_canceled" - and self._queue_item - and self._queue_item.item_id == event[1]["data"]["queue_item_id"] - ): + if event_name == "session_canceled" and event[1]["data"]["queue_item_id"] in self._queue_items: + self._sessions_to_cancel.add(event[1]["data"]["queue_item_id"]) self._cancel_event.set() self._poll_now() - elif ( - event_name == "queue_cleared" - and self._queue_item - and self._queue_item.queue_id == event[1]["data"]["queue_id"] - ): + elif event_name == "queue_cleared" and event[1]["data"]["queue_id"] in self._queue_items: + self._sessions_to_cancel.add(event[1]["data"]["queue_item_id"]) self._cancel_event.set() self._poll_now() elif event_name == "batch_enqueued": @@ -100,7 +112,7 @@ def pause(self) -> SessionProcessorStatus: def get_status(self) -> SessionProcessorStatus: return SessionProcessorStatus( is_started=self._resume_event.is_set(), - is_processing=self._queue_item is not None, + is_processing=len(self._queue_items) > 0, ) def _process( @@ -109,7 +121,7 @@ def _process( poll_now_event: ThreadEvent, resume_event: ThreadEvent, cancel_event: ThreadEvent, - ): + ) -> None: # Outermost processor try block; any unhandled exception is a fatal processor error try: self._thread_semaphore.acquire() @@ -119,168 +131,21 @@ def _process( while not stop_event.is_set(): poll_now_event.clear() - # Middle processor try block; any unhandled exception is a non-fatal processor error - try: - # If we are paused, wait for resume event - resume_event.wait() - - # Get the next session to process - self._queue_item = self._invoker.services.session_queue.dequeue() - - if self._queue_item is None: - # The queue was empty, wait for next polling interval or event to try again - self._invoker.services.logger.debug("Waiting for next polling interval or event") - poll_now_event.wait(self._polling_interval) - continue + resume_event.wait() - self._invoker.services.logger.debug(f"Executing queue item {self._queue_item.item_id}") - cancel_event.clear() - - # If profiling is enabled, start the profiler - if self._profiler is not None: - self._profiler.start(profile_id=self._queue_item.session_id) - - # Prepare invocations and take the first - self._invocation = self._queue_item.session.next() - - # Loop over invocations until the session is complete or canceled - while self._invocation is not None and not cancel_event.is_set(): - # get the source node id to provide to clients (the prepared node id is not as useful) - source_invocation_id = self._queue_item.session.prepared_source_mapping[self._invocation.id] - - # Send starting event - self._invoker.services.events.emit_invocation_started( - 
queue_batch_id=self._queue_item.batch_id, - queue_item_id=self._queue_item.item_id, - queue_id=self._queue_item.queue_id, - graph_execution_state_id=self._queue_item.session_id, - node=self._invocation.model_dump(), - source_node_id=source_invocation_id, - ) + # Get the next session to process + session = self._invoker.services.session_queue.dequeue() - # Innermost processor try block; any unhandled exception is an invocation error & will fail the graph - try: - with self._invoker.services.performance_statistics.collect_stats( - self._invocation, self._queue_item.session.id - ): - # Build invocation context (the node-facing API) - data = InvocationContextData( - invocation=self._invocation, - source_invocation_id=source_invocation_id, - queue_item=self._queue_item, - ) - context = build_invocation_context( - data=data, - services=self._invoker.services, - cancel_event=self._cancel_event, - ) - - # Invoke the node - outputs = self._invocation.invoke_internal( - context=context, services=self._invoker.services - ) - - # Save outputs and history - self._queue_item.session.complete(self._invocation.id, outputs) - - # Send complete event - self._invoker.services.events.emit_invocation_complete( - queue_batch_id=self._queue_item.batch_id, - queue_item_id=self._queue_item.item_id, - queue_id=self._queue_item.queue_id, - graph_execution_state_id=self._queue_item.session.id, - node=self._invocation.model_dump(), - source_node_id=source_invocation_id, - result=outputs.model_dump(), - ) - - except KeyboardInterrupt: - # TODO(MM2): Create an event for this - pass - - except CanceledException: - # When the user cancels the graph, we first set the cancel event. The event is checked - # between invocations, in this loop. Some invocations are long-running, and we need to - # be able to cancel them mid-execution. - # - # For example, denoising is a long-running invocation with many steps. A step callback - # is executed after each step. This step callback checks if the canceled event is set, - # then raises a CanceledException to stop execution immediately. - # - # When we get a CanceledException, we don't need to do anything - just pass and let the - # loop go to its next iteration, and the cancel event will be handled correctly. 
- pass - - except Exception as e: - error = traceback.format_exc() - - # Save error - self._queue_item.session.set_node_error(self._invocation.id, error) - self._invoker.services.logger.error( - f"Error while invoking session {self._queue_item.session_id}, invocation {self._invocation.id} ({self._invocation.get_type()}):\n{e}" - ) - self._invoker.services.logger.error(error) - - # Send error event - self._invoker.services.events.emit_invocation_error( - queue_batch_id=self._queue_item.session_id, - queue_item_id=self._queue_item.item_id, - queue_id=self._queue_item.queue_id, - graph_execution_state_id=self._queue_item.session.id, - node=self._invocation.model_dump(), - source_node_id=source_invocation_id, - error_type=e.__class__.__name__, - error=error, - ) - pass - - # The session is complete if the all invocations are complete or there was an error - if self._queue_item.session.is_complete() or cancel_event.is_set(): - # Send complete event - self._invoker.services.events.emit_graph_execution_complete( - queue_batch_id=self._queue_item.batch_id, - queue_item_id=self._queue_item.item_id, - queue_id=self._queue_item.queue_id, - graph_execution_state_id=self._queue_item.session.id, - ) - # If we are profiling, stop the profiler and dump the profile & stats - if self._profiler: - profile_path = self._profiler.stop() - stats_path = profile_path.with_suffix(".json") - self._invoker.services.performance_statistics.dump_stats( - graph_execution_state_id=self._queue_item.session.id, output_path=stats_path - ) - # We'll get a GESStatsNotFoundError if we try to log stats for an untracked graph, but in the processor - # we don't care about that - suppress the error. - with suppress(GESStatsNotFoundError): - self._invoker.services.performance_statistics.log_stats(self._queue_item.session.id) - self._invoker.services.performance_statistics.reset_stats() - - # Set the invocation to None to prepare for the next session - self._invocation = None - else: - # Prepare the next invocation - self._invocation = self._queue_item.session.next() - else: - # The queue was empty, wait for next polling interval or event to try again - self._invoker.services.logger.debug("Waiting for next polling interval or event") - poll_now_event.wait(self._polling_interval) - continue - except Exception: - # Non-fatal error in processor - self._invoker.services.logger.error( - f"Non-fatal error in session processor:\n{traceback.format_exc()}" - ) - # Cancel the queue item - if self._queue_item is not None: - self._invoker.services.session_queue.cancel_queue_item( - self._queue_item.item_id, error=traceback.format_exc() - ) - # Reset the invocation to None to prepare for the next session - self._invocation = None - # Immediately poll for next queue item + if session is None: + # The queue was empty, wait for next polling interval or event to try again + self._invoker.services.logger.debug("Waiting for next polling interval or event") poll_now_event.wait(self._polling_interval) continue + + self._queue_items.add(session.item_id) + self._session_worker_queue.put(session) + self._invoker.services.logger.debug(f"Executing queue item {session.item_id}") + cancel_event.clear() except Exception: # Fatal error in processor, log and pass - we're done here self._invoker.services.logger.error(f"Fatal Error in session processor:\n{traceback.format_exc()}") @@ -288,5 +153,177 @@ def _process( finally: stop_event.clear() poll_now_event.clear() - self._queue_item = None + self._queue_items.clear() self._thread_semaphore.release() + + def 
_process_next_session(self) -> None: + profiler = ( + Profiler( + logger=self._invoker.services.logger, + output_dir=self._invoker.services.configuration.profiles_path, + prefix=self._invoker.services.configuration.profile_prefix, + ) + if self._invoker.services.configuration.profile_graphs + else None + ) + stats_service = InvocationStatsService() + stats_service.start(self._invoker) + + while True: + # Outer try block. Any error here is a fatal processor error + try: + session = self._session_worker_queue.get() + if self._cancel_event.is_set(): + if session.item_id in self._sessions_to_cancel: + print("DEBUG: CANCEL") + continue + + if profiler is not None: + profiler.start(profile_id=session.session_id) + + # Prepare invocations and take the first + with self._process_lock: + invocation = session.session.next() + + # Loop over invocations until the session is complete or canceled + while invocation is not None: + if self._stop_event.is_set(): + break + self._resume_event.wait() + + self._process_next_invocation(session, invocation, stats_service) + + # The session is complete if all invocations are complete or there was an error + if session.session.is_complete(): + # Send complete event + self._invoker.services.events.emit_graph_execution_complete( + queue_batch_id=session.batch_id, + queue_item_id=session.item_id, + queue_id=session.queue_id, + graph_execution_state_id=session.session.id, + ) + # Log stats + # We'll get a GESStatsNotFoundError if we try to log stats for an untracked graph, but in the processor + # we don't care about that - suppress the error. + with suppress(GESStatsNotFoundError): + stats_service.log_stats(session.session.id) + stats_service.reset_stats() + + # If we are profiling, stop the profiler and dump the profile & stats + if self._profiler: + profile_path = self._profiler.stop() + stats_path = profile_path.with_suffix(".json") + stats_service.dump_stats( + graph_execution_state_id=session.session.id, output_path=stats_path + ) + self._queue_items.remove(session.item_id) + invocation = None + else: + # Prepare the next invocation + with self._process_lock: + invocation = session.session.next() + + except Exception: + # Non-fatal error in processor + self._invoker.services.logger.error(f"Non-fatal error in session processor:\n{traceback.format_exc()}") + + # Cancel the queue item + if session is not None: + self._invoker.services.session_queue.cancel_queue_item( + session.item_id, error=traceback.format_exc() + ) + finally: + self._session_worker_queue.task_done() + + def _process_next_invocation( + self, + session: SessionQueueItem, + invocation: BaseInvocation, + stats_service: InvocationStatsService, + ) -> None: + # get the source node id to provide to clients (the prepared node id is not as useful) + source_invocation_id = session.session.prepared_source_mapping[invocation.id] + + self._invoker.services.logger.debug(f"Executing invocation {session.session.id}:{source_invocation_id}") + + # Send starting event + self._invoker.services.events.emit_invocation_started( + queue_batch_id=session.batch_id, + queue_item_id=session.item_id, + queue_id=session.queue_id, + graph_execution_state_id=session.session_id, + node=invocation.model_dump(), + source_node_id=source_invocation_id, + ) + + # Innermost processor try block; any unhandled exception is an invocation error & will fail the graph + try: + # Build invocation context (the node-facing API) + data = InvocationContextData( + invocation=invocation, + source_invocation_id=source_invocation_id, + 
queue_item=session, + ) + context = build_invocation_context( + data=data, + services=self._invoker.services, + cancel_event=self._cancel_event, + ) + + # Invoke the node + # title = invocation.UIConfig.title + with stats_service.collect_stats(invocation, session.session.id): + outputs = invocation.invoke_internal(context=context, services=self._invoker.services) + + # Save outputs and history + session.session.complete(invocation.id, outputs) + + # Send complete event + self._invoker.services.events.emit_invocation_complete( + queue_batch_id=session.batch_id, + queue_item_id=session.item_id, + queue_id=session.queue_id, + graph_execution_state_id=session.session.id, + node=invocation.model_dump(), + source_node_id=source_invocation_id, + result=outputs.model_dump(), + ) + + except KeyboardInterrupt: + # TODO(MM2): Create an event for this + pass + + except CanceledException: + # When the user cancels the graph, we first set the cancel event. The event is checked + # between invocations, in this loop. Some invocations are long-running, and we need to + # be able to cancel them mid-execution. + # + # For example, denoising is a long-running invocation with many steps. A step callback + # is executed after each step. This step callback checks if the canceled event is set, + # then raises a CanceledException to stop execution immediately. + # + # When we get a CanceledException, we don't need to do anything - just pass and let the + # loop go to its next iteration, and the cancel event will be handled correctly. + pass + + except Exception as e: + error = traceback.format_exc() + + # Save error + session.session.set_node_error(invocation.id, error) + self._invoker.services.logger.error( + f"Error while invoking session {session.session_id}, invocation {invocation.id} ({invocation.get_type()}):\n{e}" + ) + self._invoker.services.logger.error(error) + + # Send error event + self._invoker.services.events.emit_invocation_error( + queue_batch_id=session.session_id, + queue_item_id=session.item_id, + queue_id=session.queue_id, + graph_execution_state_id=session.session.id, + node=invocation.model_dump(), + source_node_id=source_invocation_id, + error_type=e.__class__.__name__, + error=error, + ) diff --git a/invokeai/backend/model_manager/load/model_cache/model_cache_default.py b/invokeai/backend/model_manager/load/model_cache/model_cache_default.py index 519a45c237f..4478360dfed 100644 --- a/invokeai/backend/model_manager/load/model_cache/model_cache_default.py +++ b/invokeai/backend/model_manager/load/model_cache/model_cache_default.py @@ -27,6 +27,7 @@ from typing import Dict, List, Optional, Set import torch +from pydantic import BaseModel from invokeai.backend.model_manager import AnyModel, SubModelType from invokeai.backend.model_manager.load.memory_snapshot import MemorySnapshot @@ -50,6 +51,26 @@ MB = 2**20 +# GPU device can only be used by one thread at a time. +# The refcount indicates the number of models stored +# in it. 
+class GPUDeviceStatus(BaseModel): + """Track of which threads are using the GPU(s) on this system.""" + + device: torch.device + thread_id: int = 0 + refcount: int = 0 + + class Config: + """Configure the base model.""" + + arbitrary_types_allowed = True + + def __hash__(self) -> int: + """Allow to be added to a set.""" + return hash(str(torch.device)) + + class ModelCache(ModelCacheBase[AnyModel]): """Implementation of ModelCacheBase.""" @@ -79,7 +100,7 @@ def __init__( """ self._precision: torch.dtype = precision self._max_cache_size: float = max_cache_size - self._execution_devices: Set[torch.device] = execution_devices or self._get_execution_devices() + self._execution_devices: Set[GPUDeviceStatus] = self._get_execution_devices(execution_devices) self._storage_device: torch.device = storage_device self._lock = threading.Lock() self._logger = logger or InvokeAILogger.get_logger(self.__class__.__name__) @@ -91,9 +112,10 @@ def __init__( self._lock = Lock() self._free_execution_device = BoundedSemaphore(len(self._execution_devices)) - self._busy_execution_devices: Set[torch.device] = set() - self.logger.info(f"Using rendering device(s) {[self._device_name(x) for x in self._execution_devices]}") + self.logger.info( + f"Using rendering device(s): {', '.join(sorted([str(x.device) for x in self._execution_devices]))}" + ) @property def logger(self) -> Logger: @@ -108,22 +130,34 @@ def storage_device(self) -> torch.device: @property def execution_devices(self) -> Set[torch.device]: """Return the set of available execution devices.""" - return self._execution_devices + return {x.device for x in self._execution_devices} def acquire_execution_device(self, timeout: int = 0) -> torch.device: """Acquire and return an execution device (e.g. "cuda" for VRAM).""" - with self._lock: + current_thread = threading.current_thread().ident + assert current_thread is not None + + # first try to assign a device that is already executing on this thread + if claimed_devices := [x for x in self._execution_devices if x.thread_id == current_thread]: + claimed_devices[0].refcount += 1 + return claimed_devices[0].device + else: + # this thread is not currently using any gpu. 
Wait for a free one self._free_execution_device.acquire(timeout=timeout) - free_devices = self.execution_devices - self._busy_execution_devices - chosen_device = list(free_devices)[0] - self._busy_execution_devices.add(chosen_device) - return chosen_device + unclaimed_devices = [x for x in self._execution_devices if x.refcount == 0] + unclaimed_devices[0].thread_id = current_thread + unclaimed_devices[0].refcount += 1 + return unclaimed_devices[0].device def release_execution_device(self, device: torch.device) -> None: """Mark this execution device as unused.""" - with self._lock: - self._free_execution_device.release() - self._busy_execution_devices.remove(device) + current_thread = threading.current_thread().ident + for x in self._execution_devices: + if x.thread_id == current_thread and x.device == device: + x.refcount -= 1 + if x.refcount == 0: + x.thread_id = 0 + self._free_execution_device.release() @property def max_cache_size(self) -> float: @@ -174,7 +208,7 @@ def put( if key in self._cached_models: return self.make_room(size) - cache_record = CacheRecord(key, model, size) + cache_record = CacheRecord(key, model=model, size=size) self._cached_models[key] = cache_record self._cache_stack.append(key) @@ -361,14 +395,15 @@ def _check_free_vram(self, target_device: torch.device, needed_size: int) -> Non raise torch.cuda.OutOfMemoryError @staticmethod - def _get_execution_devices() -> Set[torch.device]: - default_device = choose_torch_device() - if default_device != torch.device("cuda"): - return {default_device} - - # we get here if the default device is cuda, and return each of the - # cuda devices. - return {torch.device(f"cuda:{x}") for x in range(0, torch.cuda.device_count())} + def _get_execution_devices(devices: Optional[Set[torch.device]] = None) -> Set[GPUDeviceStatus]: + if not devices: + default_device = choose_torch_device() + if default_device != torch.device("cuda"): + devices = {default_device} + else: + # we get here if the default device is cuda, and return each of the cuda devices. 
+ devices = {torch.device(f"cuda:{x}") for x in range(0, torch.cuda.device_count())} + return {GPUDeviceStatus(device=x) for x in devices} @staticmethod def _device_name(device: torch.device) -> str: diff --git a/tests/conftest.py b/tests/conftest.py index 7a7fdf32bbf..97fd46de9b8 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -17,7 +17,6 @@ from invokeai.app.services.images.images_default import ImageService from invokeai.app.services.invocation_cache.invocation_cache_memory import MemoryInvocationCache from invokeai.app.services.invocation_services import InvocationServices -from invokeai.app.services.invocation_stats.invocation_stats_default import InvocationStatsService from invokeai.app.services.invoker import Invoker from invokeai.backend.util.logging import InvokeAILogger from tests.fixtures.sqlite_database import create_mock_sqlite_database # noqa: F401 @@ -48,7 +47,6 @@ def mock_services() -> InvocationServices: model_manager=None, # type: ignore download_queue=None, # type: ignore names=None, # type: ignore - performance_statistics=InvocationStatsService(), session_processor=None, # type: ignore session_queue=None, # type: ignore urls=None, # type: ignore From eca29c41d072bca3125247771e5d38ddc5ca1420 Mon Sep 17 00:00:00 2001 From: Lincoln Stein Date: Mon, 1 Apr 2024 13:30:02 -0400 Subject: [PATCH 08/30] added notes --- .../model_manager/load/model_cache/model_locker.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/invokeai/backend/model_manager/load/model_cache/model_locker.py b/invokeai/backend/model_manager/load/model_cache/model_locker.py index 30c5dfa8c89..0ea87fbe063 100644 --- a/invokeai/backend/model_manager/load/model_cache/model_locker.py +++ b/invokeai/backend/model_manager/load/model_cache/model_locker.py @@ -33,6 +33,20 @@ def model(self) -> AnyModel: """Return the model without moving it around.""" return self._cache_entry.model + # ---------------------------- NOTE ----------------- + # Ryan suggests keeping a copy of the model's state dict in CPU and copying it + # into the GPU with code like this: + # + # def state_dict_to(state_dict: dict[str, torch.Tensor], device: torch.device) -> dict[str, torch.Tensor]: + # new_state_dict: dict[str, torch.Tensor] = {} + # for k, v in state_dict.items(): + # new_state_dict[k] = v.to(device=device, copy=True, non_blocking=True) + # return new_state_dict + # + # I believe we'd then use load_state_dict() to inject the state dict into the model. 
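One way the second half of that suggestion might look, as a sketch rather than anything this patch implements (it assumes the cached model is a torch.nn.Module and that a CPU copy of its state dict is kept alongside the cache record):

    import torch

    def state_dict_to(state_dict: dict[str, torch.Tensor], device: torch.device) -> dict[str, torch.Tensor]:
        # Copy each tensor to the target device; the CPU-resident originals stay untouched.
        return {k: v.to(device=device, copy=True, non_blocking=True) for k, v in state_dict.items()}

    def inject(model: torch.nn.Module, cpu_state_dict: dict[str, torch.Tensor], device: torch.device) -> None:
        # assign=True (newer torch releases) makes the module adopt the GPU tensors outright;
        # without it, load_state_dict would copy the values back into the existing CPU parameters.
        model.load_state_dict(state_dict_to(cpu_state_dict, device), assign=True)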
+ # See: https://pytorch.org/tutorials/beginner/saving_loading_models.html + # ---------------------------- NOTE ----------------- + def lock(self) -> AnyModel: """Move the model into the execution device (GPU) and lock it.""" if not hasattr(self.model, "to"): From 3d6937278574b212186ecdba2cd1093fc6470777 Mon Sep 17 00:00:00 2001 From: Lincoln Stein Date: Mon, 1 Apr 2024 16:01:43 -0400 Subject: [PATCH 09/30] implement session-level reservation of gpus --- .../app/services/config/config_default.py | 4 +- .../session_processor_default.py | 84 ++++++------ .../load/model_cache/model_cache_base.py | 26 ++-- .../load/model_cache/model_cache_default.py | 121 ++++++++++-------- .../load/model_cache/model_locker.py | 13 +- invokeai/backend/util/devices.py | 10 +- 6 files changed, 138 insertions(+), 120 deletions(-) diff --git a/invokeai/app/services/config/config_default.py b/invokeai/app/services/config/config_default.py index 258cd58e8da..8ab41052847 100644 --- a/invokeai/app/services/config/config_default.py +++ b/invokeai/app/services/config/config_default.py @@ -100,7 +100,8 @@ class InvokeAIAppConfig(BaseSettings): ram: Maximum memory amount used by memory model cache for rapid switching (GB). convert_cache: Maximum size of on-disk converted models cache (GB). log_memory_usage: If True, a memory snapshot will be captured before and after every model cache operation, and the result will be logged (at debug level). There is a time cost to capturing the memory snapshots, so it is recommended to only enable this feature if you are actively inspecting the model cache's behaviour. - device: Preferred execution device. `auto` will choose the device depending on the hardware platform and the installed torch capabilities.
Valid values: `auto`, `cpu`, `cuda`, `cuda:1`, `mps` + device: Preferred execution device. `auto` will choose the device depending on the hardware platform and the installed torch capabilities.
Valid values: `auto`, `cpu`, `cuda:0`, `cuda:1`, `cuda:2`, `cuda:3`, `cuda:4`, `cuda:5`, `mps` + devices: List of execution devices; will override default device selected. precision: Floating point precision. `float16` will consume half the memory of `float32` but produce slightly lower-quality images. The `auto` setting will guess the proper precision based on your video card and operating system.
Valid values: `auto`, `float16`, `bfloat16`, `float32`, `autocast` sequential_guidance: Whether to calculate guidance in serial instead of in parallel, lowering memory requirements. attention_type: Attention type.
Valid values: `auto`, `normal`, `xformers`, `sliced`, `torch-sdp` @@ -108,6 +109,7 @@ class InvokeAIAppConfig(BaseSettings): force_tiled_decode: Whether to enable tiled VAE decode (reduces memory consumption with some performance penalty). pil_compress_level: The compress_level setting of PIL.Image.save(), used for PNG encoding. All settings are lossless. 0 = no compression, 1 = fastest with slightly larger filesize, 9 = slowest with smallest filesize. 1 is typically the best setting. max_queue_size: Maximum number of items in the session queue. + max_threads: Maximum number of session queue execution threads. allow_nodes: List of nodes to allow. Omit to allow all. deny_nodes: List of nodes to deny. Omit to deny none. node_cache_size: How many cached nodes to keep in memory. diff --git a/invokeai/app/services/session_processor/session_processor_default.py b/invokeai/app/services/session_processor/session_processor_default.py index 3088d99c5de..fd198d0ff99 100644 --- a/invokeai/app/services/session_processor/session_processor_default.py +++ b/invokeai/app/services/session_processor/session_processor_default.py @@ -181,47 +181,51 @@ def _process_next_session(self) -> None: if profiler is not None: profiler.start(profile_id=session.session_id) - # Prepare invocations and take the first - with self._process_lock: - invocation = session.session.next() - - # Loop over invocations until the session is complete or canceled - while invocation is not None: - if self._stop_event.is_set(): - break - self._resume_event.wait() - - self._process_next_invocation(session, invocation, stats_service) - - # The session is complete if all invocations are complete or there was an error - if session.session.is_complete(): - # Send complete event - self._invoker.services.events.emit_graph_execution_complete( - queue_batch_id=session.batch_id, - queue_item_id=session.item_id, - queue_id=session.queue_id, - graph_execution_state_id=session.session.id, - ) - # Log stats - # We'll get a GESStatsNotFoundError if we try to log stats for an untracked graph, but in the processor - # we don't care about that - suppress the error. 
- with suppress(GESStatsNotFoundError): - stats_service.log_stats(session.session.id) - stats_service.reset_stats() - - # If we are profiling, stop the profiler and dump the profile & stats - if self._profiler: - profile_path = self._profiler.stop() - stats_path = profile_path.with_suffix(".json") - stats_service.dump_stats( - graph_execution_state_id=session.session.id, output_path=stats_path + # reserve a GPU for this session - may block + with self._invoker.services.model_manager.load.ram_cache.reserve_execution_device() as gpu: + print(f"DEBUG: session {session.item_id} has reserved gpu {gpu}") + + # Prepare invocations and take the first + with self._process_lock: + invocation = session.session.next() + + # Loop over invocations until the session is complete or canceled + while invocation is not None: + if self._stop_event.is_set(): + break + self._resume_event.wait() + + self._process_next_invocation(session, invocation, stats_service) + + # The session is complete if all invocations are complete or there was an error + if session.session.is_complete(): + # Send complete event + self._invoker.services.events.emit_graph_execution_complete( + queue_batch_id=session.batch_id, + queue_item_id=session.item_id, + queue_id=session.queue_id, + graph_execution_state_id=session.session.id, ) - self._queue_items.remove(session.item_id) - invocation = None - else: - # Prepare the next invocation - with self._process_lock: - invocation = session.session.next() + # Log stats + # We'll get a GESStatsNotFoundError if we try to log stats for an untracked graph, but in the processor + # we don't care about that - suppress the error. + with suppress(GESStatsNotFoundError): + stats_service.log_stats(session.session.id) + stats_service.reset_stats() + + # If we are profiling, stop the profiler and dump the profile & stats + if self._profiler: + profile_path = self._profiler.stop() + stats_path = profile_path.with_suffix(".json") + stats_service.dump_stats( + graph_execution_state_id=session.session.id, output_path=stats_path + ) + self._queue_items.remove(session.item_id) + invocation = None + else: + # Prepare the next invocation + with self._process_lock: + invocation = session.session.next() except Exception: # Non-fatal error in processor diff --git a/invokeai/backend/model_manager/load/model_cache/model_cache_base.py b/invokeai/backend/model_manager/load/model_cache/model_cache_base.py index 1d6a4f15dbc..45640aff427 100644 --- a/invokeai/backend/model_manager/load/model_cache/model_cache_base.py +++ b/invokeai/backend/model_manager/load/model_cache/model_cache_base.py @@ -8,9 +8,10 @@ """ from abc import ABC, abstractmethod +from contextlib import contextmanager from dataclasses import dataclass, field from logging import Logger -from typing import Dict, Generic, Optional, Set, TypeVar +from typing import Dict, Generator, Generic, Optional, Set, TypeVar import torch @@ -93,20 +94,23 @@ def execution_devices(self) -> Set[torch.device]: """Return the set of available execution devices.""" pass + @contextmanager @abstractmethod - def acquire_execution_device(self, timeout: int = 0) -> torch.device: - """ - Pick the next available execution device. - - If all devices are currently engaged (locked), then - block until timeout seconds have passed and raise a - TimeoutError if no devices are available. 
- """ + def reserve_execution_device(self, timeout: int = 0) -> Generator[torch.device, None, None]: + """Reserve an execution device (GPU) under the current thread id.""" pass @abstractmethod - def release_execution_device(self, device: torch.device) -> None: - """Release a previously-acquired execution device.""" + def get_execution_device(self) -> torch.device: + """ + Return an execution device that has been reserved for current thread. + + Note that reservations are done using the current thread's TID. + It would be better to do this using the session ID, but that involves + too many detailed changes to model manager calls. + + May generate a ValueError if no GPU has been reserved. + """ pass @property diff --git a/invokeai/backend/model_manager/load/model_cache/model_cache_default.py b/invokeai/backend/model_manager/load/model_cache/model_cache_default.py index 4478360dfed..04cac01092e 100644 --- a/invokeai/backend/model_manager/load/model_cache/model_cache_default.py +++ b/invokeai/backend/model_manager/load/model_cache/model_cache_default.py @@ -21,13 +21,12 @@ import gc import sys import threading -from contextlib import suppress +from contextlib import contextmanager, suppress from logging import Logger -from threading import BoundedSemaphore, Lock -from typing import Dict, List, Optional, Set +from threading import BoundedSemaphore +from typing import Dict, Generator, List, Optional, Set import torch -from pydantic import BaseModel from invokeai.backend.model_manager import AnyModel, SubModelType from invokeai.backend.model_manager.load.memory_snapshot import MemorySnapshot @@ -51,26 +50,6 @@ MB = 2**20 -# GPU device can only be used by one thread at a time. -# The refcount indicates the number of models stored -# in it. -class GPUDeviceStatus(BaseModel): - """Track of which threads are using the GPU(s) on this system.""" - - device: torch.device - thread_id: int = 0 - refcount: int = 0 - - class Config: - """Configure the base model.""" - - arbitrary_types_allowed = True - - def __hash__(self) -> int: - """Allow to be added to a set.""" - return hash(str(torch.device)) - - class ModelCache(ModelCacheBase[AnyModel]): """Implementation of ModelCacheBase.""" @@ -100,9 +79,8 @@ def __init__( """ self._precision: torch.dtype = precision self._max_cache_size: float = max_cache_size - self._execution_devices: Set[GPUDeviceStatus] = self._get_execution_devices(execution_devices) self._storage_device: torch.device = storage_device - self._lock = threading.Lock() + self._ram_lock = threading.Lock() self._logger = logger or InvokeAILogger.get_logger(self.__class__.__name__) self._log_memory_usage = log_memory_usage self._stats: Optional[CacheStats] = None @@ -110,11 +88,15 @@ def __init__( self._cached_models: Dict[str, CacheRecord[AnyModel]] = {} self._cache_stack: List[str] = [] - self._lock = Lock() + # device to thread id + self._device_lock = threading.Lock() + self._execution_devices: Dict[torch.device, int] = { + x: 0 for x in execution_devices or self._get_execution_devices() + } self._free_execution_device = BoundedSemaphore(len(self._execution_devices)) self.logger.info( - f"Using rendering device(s): {', '.join(sorted([str(x.device) for x in self._execution_devices]))}" + f"Using rendering device(s): {', '.join(sorted([str(x) for x in self._execution_devices.keys()]))}" ) @property @@ -130,34 +112,61 @@ def storage_device(self) -> torch.device: @property def execution_devices(self) -> Set[torch.device]: """Return the set of available execution devices.""" - return {x.device for 
x in self._execution_devices} + devices = self._execution_devices.keys() + return set(devices) - def acquire_execution_device(self, timeout: int = 0) -> torch.device: - """Acquire and return an execution device (e.g. "cuda" for VRAM).""" - current_thread = threading.current_thread().ident - assert current_thread is not None + def get_execution_device(self) -> torch.device: + """ + Return an execution device that has been reserved for current thread. - # first try to assign a device that is already executing on this thread - if claimed_devices := [x for x in self._execution_devices if x.thread_id == current_thread]: - claimed_devices[0].refcount += 1 - return claimed_devices[0].device - else: - # this thread is not currently using any gpu. Wait for a free one - self._free_execution_device.acquire(timeout=timeout) - unclaimed_devices = [x for x in self._execution_devices if x.refcount == 0] - unclaimed_devices[0].thread_id = current_thread - unclaimed_devices[0].refcount += 1 - return unclaimed_devices[0].device + Note that reservations are done using the current thread's TID. + It would be better to do this using the session ID, but that involves + too many detailed changes to model manager calls. - def release_execution_device(self, device: torch.device) -> None: - """Mark this execution device as unused.""" + May generate a ValueError if no GPU has been reserved. + """ current_thread = threading.current_thread().ident - for x in self._execution_devices: - if x.thread_id == current_thread and x.device == device: - x.refcount -= 1 - if x.refcount == 0: - x.thread_id = 0 - self._free_execution_device.release() + assert current_thread is not None + assigned = [x for x, tid in self._execution_devices.items() if current_thread == tid] + if not assigned: + raise ValueError("No GPU has been reserved for the use of thread {current_thread}") + return assigned[0] + + @contextmanager + def reserve_execution_device(self, timeout: Optional[int] = None) -> Generator[torch.device, None, None]: + """Reserve an execution device (e.g. GPU) for exclusive use by a generation thread. + + Note that the reservation is done using the current thread's TID. + It would be better to do this using the session ID, but that involves + too many detailed changes to model manager calls. + """ + device = None + with self._device_lock: + current_thread = threading.current_thread().ident + assert current_thread is not None + + # look for a device that has already been assigned to this thread + assigned = [x for x, tid in self._execution_devices.items() if current_thread == tid] + if assigned: + device = assigned[0] + + # no device already assigned. Get one. 
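Seen from a session worker thread, the two halves of this API pair up as follows (usage sketch only; `ram_cache` stands for the model manager's cache instance used in the session processor diff above):

    # Worker thread, one session at a time:
    with ram_cache.reserve_execution_device() as gpu:   # blocks while every device is claimed
        # Any model locked on this thread now resolves to the same device:
        assert ram_cache.get_execution_device() == gpu
        ...  # run all of the session's invocations here
    # Leaving the block clears the thread's claim and releases a semaphore slot,
    # letting another session's thread reserve the device.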
+ if device is None: + self._free_execution_device.acquire(timeout=timeout) + with self._device_lock: + free_device = [x for x, tid in self._execution_devices.items() if tid == 0] + print(f"DEBUG: execution devices = {self._execution_devices}") + self._execution_devices[free_device[0]] = current_thread + device = free_device[0] + + # we are outside the lock region now + try: + yield device + finally: + with self._device_lock: + self._execution_devices[device] = 0 + self._free_execution_device.release() + torch.cuda.empty_cache() @property def max_cache_size(self) -> float: @@ -203,7 +212,7 @@ def put( submodel_type: Optional[SubModelType] = None, ) -> None: """Store model under key and optional submodel_type.""" - with self._lock: + with self._ram_lock: key = self._make_cache_key(key, submodel_type) if key in self._cached_models: return @@ -228,7 +237,7 @@ def get( This may raise an IndexError if the model is not in the cache. """ - with self._lock: + with self._ram_lock: key = self._make_cache_key(key, submodel_type) if key in self._cached_models: if self.stats: @@ -395,7 +404,7 @@ def _check_free_vram(self, target_device: torch.device, needed_size: int) -> Non raise torch.cuda.OutOfMemoryError @staticmethod - def _get_execution_devices(devices: Optional[Set[torch.device]] = None) -> Set[GPUDeviceStatus]: + def _get_execution_devices(devices: Optional[Set[torch.device]] = None) -> Set[torch.device]: if not devices: default_device = choose_torch_device() if default_device != torch.device("cuda"): @@ -403,7 +412,7 @@ def _get_execution_devices(devices: Optional[Set[torch.device]] = None) -> Set[G else: # we get here if the default device is cuda, and return each of the cuda devices. devices = {torch.device(f"cuda:{x}") for x in range(0, torch.cuda.device_count())} - return {GPUDeviceStatus(device=x) for x in devices} + return devices @staticmethod def _device_name(device: torch.device) -> str: diff --git a/invokeai/backend/model_manager/load/model_cache/model_locker.py b/invokeai/backend/model_manager/load/model_cache/model_locker.py index 0ea87fbe063..fd2465b5178 100644 --- a/invokeai/backend/model_manager/load/model_cache/model_locker.py +++ b/invokeai/backend/model_manager/load/model_cache/model_locker.py @@ -56,8 +56,8 @@ def lock(self) -> AnyModel: self._cache_entry.lock() try: - # We wait for a gpu to be free - may raise a TimeoutError - self._execution_device = self._cache.acquire_execution_device(MAX_GPU_WAIT) + # We wait for a gpu to be free - may raise a ValueError + self._execution_device = self._cache.get_execution_device() self._cache.logger.debug(f"Locking {self._cache_entry.key} in {self._execution_device}") model_in_gpu = copy.deepcopy(self._cache_entry.model) if hasattr(model_in_gpu, "to"): @@ -77,14 +77,5 @@ def unlock(self) -> None: """Call upon exit from context.""" if not hasattr(self.model, "to"): return - self._cache_entry.unlock() - if self._execution_device: - self._cache.release_execution_device(self._execution_device) - - try: - torch.cuda.empty_cache() - torch.mps.empty_cache() - except Exception: - pass self._cache.print_cuda_stats() diff --git a/invokeai/backend/util/devices.py b/invokeai/backend/util/devices.py index 0be53c842a7..e0fd0b1c9e7 100644 --- a/invokeai/backend/util/devices.py +++ b/invokeai/backend/util/devices.py @@ -15,7 +15,15 @@ def choose_torch_device() -> torch.device: - """Convenience routine for guessing which GPU device to run model on""" + """Convenience routine for guessing which GPU device to run model on.""" + # """Temporarily modified 
to use the model manager's get_execution_device()""" + # try: + # from invokeai.app.api.dependencies import ApiDependencies + # model_manager = ApiDependencies.invoker.services.model_manager + # device = model_manager.load.ram_cache.acquire_execution_device() + # print(f'DEBUG choose_torch_device returning {device}') + # return device + # except Exception: config = get_config() if config.device == "auto": if torch.cuda.is_available(): From 9adb15f86c4ac6da1b3f6a41e37e9834b21d2286 Mon Sep 17 00:00:00 2001 From: Lincoln Stein Date: Mon, 1 Apr 2024 18:44:24 -0400 Subject: [PATCH 10/30] working but filled with debug statements --- invokeai/app/api/dependencies.py | 5 +++ invokeai/app/invocations/latent.py | 7 +++- .../object_serializer_forward_cache.py | 32 ++++++++++++----- .../session_processor_default.py | 2 -- .../load/model_cache/model_cache_default.py | 23 +++++-------- .../stable_diffusion/diffusers_pipeline.py | 10 ++++++ .../diffusion/shared_invokeai_diffusion.py | 3 ++ invokeai/backend/util/devices.py | 34 +++++++++---------- 8 files changed, 72 insertions(+), 44 deletions(-) diff --git a/invokeai/app/api/dependencies.py b/invokeai/app/api/dependencies.py index 7332b35c086..995d08106a5 100644 --- a/invokeai/app/api/dependencies.py +++ b/invokeai/app/api/dependencies.py @@ -4,6 +4,8 @@ import torch +import invokeai.backend.util.devices # horrible hack + from invokeai.app.services.object_serializer.object_serializer_disk import ObjectSerializerDisk from invokeai.app.services.object_serializer.object_serializer_forward_cache import ObjectSerializerForwardCache from invokeai.app.services.shared.sqlite.sqlite_util import init_db @@ -100,6 +102,9 @@ def initialize(config: InvokeAIAppConfig, event_handler_id: int, logger: Logger download_queue=download_queue_service, events=events, ) + # horrible hack - remove + invokeai.backend.util.devices.RAM_CACHE = model_manager.load.ram_cache + names = SimpleNameService() session_processor = DefaultSessionProcessor() session_queue = SqliteSessionQueue(db=db) diff --git a/invokeai/app/invocations/latent.py b/invokeai/app/invocations/latent.py index 7845cbba03a..9d73601d718 100644 --- a/invokeai/app/invocations/latent.py +++ b/invokeai/app/invocations/latent.py @@ -4,7 +4,7 @@ from contextlib import ExitStack from functools import singledispatchmethod from typing import Any, Iterator, List, Literal, Optional, Tuple, Union - +import threading import einops import numpy as np import numpy.typing as npt @@ -393,6 +393,11 @@ def get_conditioning_data( # flip all bits to have noise different from initial generator=torch.Generator(device=unet.device).manual_seed(seed ^ 0xFFFFFFFF), ) + + if conditioning_data.unconditioned_embeddings.embeds.device != conditioning_data.text_embeddings.embeds.device: + print(f'DEBUG; ERROR uc={conditioning_data.unconditioned_embeddings.embeds.device} c={conditioning_data.text_embeddings.embeds.device} unet={unet.device}, tid={threading.current_thread().ident}') + + return conditioning_data def create_pipeline( diff --git a/invokeai/app/services/object_serializer/object_serializer_forward_cache.py b/invokeai/app/services/object_serializer/object_serializer_forward_cache.py index b361259a4b1..7d04d47d5c3 100644 --- a/invokeai/app/services/object_serializer/object_serializer_forward_cache.py +++ b/invokeai/app/services/object_serializer/object_serializer_forward_cache.py @@ -1,5 +1,6 @@ from queue import Queue from typing import TYPE_CHECKING, Optional, TypeVar +import threading from 
invokeai.app.services.object_serializer.object_serializer_base import ObjectSerializerBase @@ -18,8 +19,8 @@ class ObjectSerializerForwardCache(ObjectSerializerBase[T]): def __init__(self, underlying_storage: ObjectSerializerBase[T], max_cache_size: int = 20): super().__init__() self._underlying_storage = underlying_storage - self._cache: dict[str, T] = {} - self._cache_ids = Queue[str]() + self._cache: dict[int, dict[str, T]] = {} + self._cache_ids: dict[int, Queue[str]] = {} self._max_cache_size = max_cache_size def start(self, invoker: "Invoker") -> None: @@ -54,12 +55,27 @@ def delete(self, name: str) -> None: del self._cache[name] self._on_deleted(name) + def _get_tid_cache(self) -> dict[str, T]: + tid = threading.current_thread().ident + if tid not in self._cache: + self._cache[tid] = {} + return self._cache[tid] + + def _get_tid_cache_ids(self) -> Queue[str]: + tid = threading.current_thread().ident + if tid not in self._cache_ids: + self._cache_ids[tid] = Queue[str]() + return self._cache_ids[tid] + def _get_cache(self, name: str) -> Optional[T]: - return None if name not in self._cache else self._cache[name] + cache = self._get_tid_cache() + return None if name not in cache else cache[name] def _set_cache(self, name: str, data: T): - if name not in self._cache: - self._cache[name] = data - self._cache_ids.put(name) - if self._cache_ids.qsize() > self._max_cache_size: - self._cache.pop(self._cache_ids.get()) + cache = self._get_tid_cache() + if name not in cache: + cache[name] = data + cache_ids = self._get_tid_cache_ids() + cache_ids.put(name) + if cache_ids.qsize() > self._max_cache_size: + cache.pop(cache_ids.get()) diff --git a/invokeai/app/services/session_processor/session_processor_default.py b/invokeai/app/services/session_processor/session_processor_default.py index fd198d0ff99..b1f3203fe0a 100644 --- a/invokeai/app/services/session_processor/session_processor_default.py +++ b/invokeai/app/services/session_processor/session_processor_default.py @@ -175,7 +175,6 @@ def _process_next_session(self) -> None: session = self._session_worker_queue.get() if self._cancel_event.is_set(): if session.item_id in self._sessions_to_cancel: - print("DEBUG: CANCEL") continue if profiler is not None: @@ -183,7 +182,6 @@ def _process_next_session(self) -> None: # reserve a GPU for this session - may block with self._invoker.services.model_manager.load.ram_cache.reserve_execution_device() as gpu: - print(f"DEBUG: session {session.item_id} has reserved gpu {gpu}") # Prepare invocations and take the first with self._process_lock: diff --git a/invokeai/backend/model_manager/load/model_cache/model_cache_default.py b/invokeai/backend/model_manager/load/model_cache/model_cache_default.py index 04cac01092e..a6b7f9524dc 100644 --- a/invokeai/backend/model_manager/load/model_cache/model_cache_default.py +++ b/invokeai/backend/model_manager/load/model_cache/model_cache_default.py @@ -30,15 +30,11 @@ from invokeai.backend.model_manager import AnyModel, SubModelType from invokeai.backend.model_manager.load.memory_snapshot import MemorySnapshot -from invokeai.backend.util.devices import choose_torch_device from invokeai.backend.util.logging import InvokeAILogger from .model_cache_base import CacheRecord, CacheStats, ModelCacheBase, ModelLockerBase from .model_locker import ModelLocker -if choose_torch_device() == torch.device("mps"): - from torch import mps - # Maximum size of the cache, in gigs # Default is roughly enough to hold three fp16 diffusers models in RAM simultaneously DEFAULT_MAX_CACHE_SIZE = 
6.0 @@ -130,6 +126,7 @@ def get_execution_device(self) -> torch.device: assigned = [x for x, tid in self._execution_devices.items() if current_thread == tid] if not assigned: raise ValueError("No GPU has been reserved for the use of thread {current_thread}") + print(f'DEBUG: TID={current_thread}; owns {assigned[0]}') return assigned[0] @contextmanager @@ -155,15 +152,16 @@ def reserve_execution_device(self, timeout: Optional[int] = None) -> Generator[t self._free_execution_device.acquire(timeout=timeout) with self._device_lock: free_device = [x for x, tid in self._execution_devices.items() if tid == 0] - print(f"DEBUG: execution devices = {self._execution_devices}") self._execution_devices[free_device[0]] = current_thread device = free_device[0] # we are outside the lock region now + print(f'DEBUG: RESERVED {device} for TID {current_thread}') try: yield device finally: with self._device_lock: + print(f'DEBUG: RELEASED {device} for TID {current_thread}') self._execution_devices[device] = 0 self._free_execution_device.release() torch.cuda.empty_cache() @@ -386,11 +384,6 @@ def make_room(self, model_size: int) -> None: if self.stats: self.stats.cleared = models_cleared gc.collect() - - torch.cuda.empty_cache() - if choose_torch_device() == torch.device("mps"): - mps.empty_cache() - self.logger.debug(f"After making room: cached_models={len(self._cached_models)}") def _check_free_vram(self, target_device: torch.device, needed_size: int) -> None: @@ -406,12 +399,12 @@ def _check_free_vram(self, target_device: torch.device, needed_size: int) -> Non @staticmethod def _get_execution_devices(devices: Optional[Set[torch.device]] = None) -> Set[torch.device]: if not devices: - default_device = choose_torch_device() - if default_device != torch.device("cuda"): - devices = {default_device} - else: - # we get here if the default device is cuda, and return each of the cuda devices. + if torch.cuda.is_available(): devices = {torch.device(f"cuda:{x}") for x in range(0, torch.cuda.device_count())} + elif torch.backends.mps.is_available(): + devices = {torch.device('mps')} + else: + devices = {torch.device('cpu')} return devices @staticmethod diff --git a/invokeai/backend/stable_diffusion/diffusers_pipeline.py b/invokeai/backend/stable_diffusion/diffusers_pipeline.py index dae55a07517..34e40daa1c9 100644 --- a/invokeai/backend/stable_diffusion/diffusers_pipeline.py +++ b/invokeai/backend/stable_diffusion/diffusers_pipeline.py @@ -414,6 +414,11 @@ def generate_latents_from_embeddings( else: attn_ctx = nullcontext() + # NOTE error is not here! 
+ if conditioning_data.unconditioned_embeddings.embeds.device != \ + conditioning_data.text_embeddings.embeds.device: + print('DEBUG; HERE IS THE ERROR 1') + with attn_ctx: if callback is not None: callback( @@ -428,6 +433,10 @@ def generate_latents_from_embeddings( # print("timesteps:", timesteps) for i, t in enumerate(self.progress_bar(timesteps)): + if conditioning_data.unconditioned_embeddings.embeds.device != \ + conditioning_data.text_embeddings.embeds.device: + print('DEBUG; HERE IS THE ERROR 2') + batched_t = t.expand(batch_size) step_output = self.step( batched_t, @@ -472,6 +481,7 @@ def step( t2i_adapter_data: Optional[list[T2IAdapterData]] = None, ip_adapter_unet_patcher: Optional[UNetPatcher] = None, ): + # invokeai_diffuser has batched timesteps, but diffusers schedulers expect a single value timestep = t[0] if additional_guidance is None: diff --git a/invokeai/backend/stable_diffusion/diffusion/shared_invokeai_diffusion.py b/invokeai/backend/stable_diffusion/diffusion/shared_invokeai_diffusion.py index f55876623cd..f4ca8d54184 100644 --- a/invokeai/backend/stable_diffusion/diffusion/shared_invokeai_diffusion.py +++ b/invokeai/backend/stable_diffusion/diffusion/shared_invokeai_diffusion.py @@ -5,6 +5,7 @@ from typing import Any, Callable, Optional, Union import torch +import threading from diffusers import UNet2DConditionModel from typing_extensions import TypeAlias @@ -288,6 +289,8 @@ def _pad_conditioning(cond, target_len, encoder_attention_mask): unconditioning, encoder_attention_mask = _pad_conditioning(unconditioning, max_len, encoder_attention_mask) conditioning, encoder_attention_mask = _pad_conditioning(conditioning, max_len, encoder_attention_mask) + if unconditioning.device != conditioning.device: + print(f'DEBUG: TID={threading.current_thread().ident}: Unconditioning device = {unconditioning.device}, conditioning device={conditioning.device}') return torch.cat([unconditioning, conditioning]), encoder_attention_mask # methods below are called from do_diffusion_step and should be considered private to this class. 
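The device-mismatch prints above are debugging aids for the per-thread GPU reservation that ModelCache.reserve_execution_device() implements earlier in this patch. Stripped of the cache bookkeeping, the reservation scheme reduces to the sketch below: a bounded semaphore caps concurrent reservations at the number of GPUs, and a device-to-thread map records which worker currently owns which device. The names here are illustrative stand-ins, not the real ModelCache API.

import threading
from contextlib import contextmanager
from typing import Dict, Generator, Optional, Set


class DeviceReservations:
    """Hand out one execution device per worker thread (illustrative sketch only)."""

    def __init__(self, devices: Set[str]) -> None:
        self._lock = threading.Lock()
        self._owner: Dict[str, int] = {d: 0 for d in devices}  # 0 means "free"
        self._free = threading.BoundedSemaphore(len(devices))

    @contextmanager
    def reserve(self, timeout: Optional[float] = None) -> Generator[str, None, None]:
        tid = threading.current_thread().ident or 0
        if not self._free.acquire(timeout=timeout):
            raise TimeoutError("no execution device became free in time")
        with self._lock:
            # the semaphore guarantees at least one entry is still marked free
            device = next(d for d, owner in self._owner.items() if owner == 0)
            self._owner[device] = tid
        try:
            yield device
        finally:
            with self._lock:
                self._owner[device] = 0
            self._free.release()

A worker wraps its whole session in "with reservations.reserve() as device:", which is what the session processor does with the real cache's reserve_execution_device() context manager.
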
diff --git a/invokeai/backend/util/devices.py b/invokeai/backend/util/devices.py index e0fd0b1c9e7..5aa81306894 100644 --- a/invokeai/backend/util/devices.py +++ b/invokeai/backend/util/devices.py @@ -12,28 +12,26 @@ CPU_DEVICE = torch.device("cpu") CUDA_DEVICE = torch.device("cuda") MPS_DEVICE = torch.device("mps") - +RAM_CACHE = None # horrible hack def choose_torch_device() -> torch.device: """Convenience routine for guessing which GPU device to run model on.""" - # """Temporarily modified to use the model manager's get_execution_device()""" - # try: - # from invokeai.app.api.dependencies import ApiDependencies - # model_manager = ApiDependencies.invoker.services.model_manager - # device = model_manager.load.ram_cache.acquire_execution_device() - # print(f'DEBUG choose_torch_device returning {device}') - # return device - # except Exception: - config = get_config() - if config.device == "auto": - if torch.cuda.is_available(): - return torch.device("cuda") - if hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): - return torch.device("mps") + """Temporarily modified to use the model manager's get_execution_device()""" + global RAM_CACHE + try: + device = RAM_CACHE.get_execution_device() + return device + except (ValueError, AttributeError): + config = get_config() + if config.device == "auto": + if torch.cuda.is_available(): + return torch.device("cuda") + if hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): + return torch.device("mps") + else: + return CPU_DEVICE else: - return CPU_DEVICE - else: - return torch.device(config.device) + return torch.device(config.device) def get_torch_device_name() -> str: From f7436f3baed95a7fe77dec69a0dea03070c0fbb7 Mon Sep 17 00:00:00 2001 From: Lincoln Stein Date: Mon, 15 Apr 2024 22:15:50 -0400 Subject: [PATCH 11/30] fixup config_default; patch TorchDevice to work dynamically --- invokeai/app/api/dependencies.py | 3 +- invokeai/app/invocations/compel.py | 18 +++++---- invokeai/app/invocations/latent.py | 8 +--- .../app/services/config/config_default.py | 37 +++++++++++++++---- .../model_manager/model_manager_default.py | 5 --- .../object_serializer_forward_cache.py | 2 +- .../session_processor_default.py | 3 +- .../app/services/shared/invocation_context.py | 34 ++++++++++++++++- .../load/model_cache/model_cache_base.py | 2 +- .../load/model_cache/model_cache_default.py | 13 ++++--- .../stable_diffusion/diffusers_pipeline.py | 10 ----- .../diffusion/shared_invokeai_diffusion.py | 3 -- invokeai/backend/util/devices.py | 16 +++++++- 13 files changed, 102 insertions(+), 52 deletions(-) diff --git a/invokeai/app/api/dependencies.py b/invokeai/app/api/dependencies.py index 995d08106a5..f492da90f3e 100644 --- a/invokeai/app/api/dependencies.py +++ b/invokeai/app/api/dependencies.py @@ -5,7 +5,6 @@ import torch import invokeai.backend.util.devices # horrible hack - from invokeai.app.services.object_serializer.object_serializer_disk import ObjectSerializerDisk from invokeai.app.services.object_serializer.object_serializer_forward_cache import ObjectSerializerForwardCache from invokeai.app.services.shared.sqlite.sqlite_util import init_db @@ -104,7 +103,7 @@ def initialize(config: InvokeAIAppConfig, event_handler_id: int, logger: Logger ) # horrible hack - remove invokeai.backend.util.devices.RAM_CACHE = model_manager.load.ram_cache - + names = SimpleNameService() session_processor = DefaultSessionProcessor() session_queue = SqliteSessionQueue(db=db) diff --git a/invokeai/app/invocations/compel.py 
b/invokeai/app/invocations/compel.py index 0d5024a9c53..158f11a58e8 100644 --- a/invokeai/app/invocations/compel.py +++ b/invokeai/app/invocations/compel.py @@ -68,16 +68,19 @@ def invoke(self, context: InvocationContext) -> ConditioningOutput: tokenizer_model = tokenizer_info.model assert isinstance(tokenizer_model, CLIPTokenizer) text_encoder_info = context.models.load(self.clip.text_encoder) + text_encoder_model = text_encoder_info.model + assert isinstance(text_encoder_model, CLIPTextModel) def _lora_loader() -> Iterator[Tuple[LoRAModelRaw, float]]: for lora in self.clip.loras: lora_info = context.models.load(lora.lora) assert isinstance(lora_info.model, LoRAModelRaw) - with lora_info as model: - yield (model, lora.weight) + yield (lora_info.model, lora.weight) del lora_info return + # loras = [(context.models.get(**lora.dict(exclude={"weight"})).context.model, lora.weight) for lora in self.clip.loras] + ti_list = generate_ti_list(self.prompt, text_encoder_info.config.base, context) with ( @@ -136,7 +139,8 @@ def run_clip_compel( tokenizer_model = tokenizer_info.model assert isinstance(tokenizer_model, CLIPTokenizer) text_encoder_info = context.models.load(clip_field.text_encoder) - assert isinstance(text_encoder_info.model, (CLIPTextModel, CLIPTextModelWithProjection)) + text_encoder_model = text_encoder_info.model + assert isinstance(text_encoder_model, (CLIPTextModel, CLIPTextModelWithProjection)) # return zero on empty if prompt == "" and zero_on_empty: @@ -195,11 +199,11 @@ def _lora_loader() -> Iterator[Tuple[LoRAModelRaw, float]]: requires_pooled=get_pooled, ) - conjunction = Compel.parse_prompt_string(prompt) + conjunction = Compel.parse_prompt_string(prompt) - if context.config.get().log_tokenization: - # TODO: better logging for and syntax - log_tokenization_for_conjunction(conjunction, tokenizer) + if context.config.get().log_tokenization: + # TODO: better logging for and syntax + log_tokenization_for_conjunction(conjunction, tokenizer) # TODO: ask for optimizations? 
to not run text_encoder twice c, _options = compel.build_conditioning_tensor_for_conjunction(conjunction) diff --git a/invokeai/app/invocations/latent.py b/invokeai/app/invocations/latent.py index 1bdceff9da5..a8ead96f3ac 100644 --- a/invokeai/app/invocations/latent.py +++ b/invokeai/app/invocations/latent.py @@ -4,7 +4,7 @@ from contextlib import ExitStack from functools import singledispatchmethod from typing import Any, Iterator, List, Literal, Optional, Tuple, Union -import threading + import einops import numpy as np import numpy.typing as npt @@ -525,11 +525,6 @@ def get_conditioning_data( guidance_scale=self.cfg_scale, guidance_rescale_multiplier=self.cfg_rescale_multiplier, ) - - if conditioning_data.unconditioned_embeddings.embeds.device != conditioning_data.text_embeddings.embeds.device: - print(f'DEBUG; ERROR uc={conditioning_data.unconditioned_embeddings.embeds.device} c={conditioning_data.text_embeddings.embeds.device} unet={unet.device}, tid={threading.current_thread().ident}') - - return conditioning_data def create_pipeline( @@ -899,6 +894,7 @@ def _lora_loader() -> Iterator[Tuple[LoRAModelRaw, float]]: mask = mask.to(device=unet.device, dtype=unet.dtype) if masked_latents is not None: masked_latents = masked_latents.to(device=unet.device, dtype=unet.dtype) + scheduler = get_scheduler( context=context, scheduler_info=self.unet.scheduler, diff --git a/invokeai/app/services/config/config_default.py b/invokeai/app/services/config/config_default.py index 908a0de1d81..39dc1fe83be 100644 --- a/invokeai/app/services/config/config_default.py +++ b/invokeai/app/services/config/config_default.py @@ -31,7 +31,7 @@ ATTENTION_SLICE_SIZE = Literal["auto", "balanced", "max", 1, 2, 3, 4, 5, 6, 7, 8] LOG_FORMAT = Literal["plain", "color", "syslog", "legacy"] LOG_LEVEL = Literal["debug", "info", "warning", "error", "critical"] -CONFIG_SCHEMA_VERSION = "4.0.1" +CONFIG_SCHEMA_VERSION = "4.0.2" def get_default_ram_cache_size() -> float: @@ -101,9 +101,9 @@ class InvokeAIAppConfig(BaseSettings): ram: Maximum memory amount used by memory model cache for rapid switching (GB). convert_cache: Maximum size of on-disk converted models cache (GB). log_memory_usage: If True, a memory snapshot will be captured before and after every model cache operation, and the result will be logged (at debug level). There is a time cost to capturing the memory snapshots, so it is recommended to only enable this feature if you are actively inspecting the model cache's behaviour. - device: Preferred execution device. `auto` will choose the device depending on the hardware platform and the installed torch capabilities.
Valid values: `auto`, `cpu`, `cuda:0`, `cuda:1`, `cuda:2`, `cuda:3`, `cuda:4`, `cuda:5`, `cuda:6`, `cuda:7`, `cuda:8`, `mps` - devices: List of execution devices to use in a multi-GPU environment; will override default device selected. - precision: Floating point precision. `float16` will consume half the memory of `float32` but produce slightly lower-quality images. The `auto` setting will guess the proper precision based on your video card and operating system.
Valid values: `auto`, `float16`, `bfloat16`, `float32` + device: Preferred execution device. `auto` will choose the device depending on the hardware platform and the installed torch capabilities.
Valid values: `auto`, `cpu`, `cuda:0`, `cuda:1`, `cuda:2`, `cuda:3`, `cuda:4`, `cuda:5`, `cuda:6`, `cuda:7`, `mps` + devices: List of execution devices; will override default device selected. + precision: Floating point precision. `float16` will consume half the memory of `float32` but produce slightly lower-quality images. The `auto` setting will guess the proper precision based on your video card and operating system.
Valid values: `auto`, `float16`, `bfloat16`, `float32`, `autocast` sequential_guidance: Whether to calculate guidance in serial instead of in parallel, lowering memory requirements. attention_type: Attention type.
Valid values: `auto`, `normal`, `xformers`, `sliced`, `torch-sdp` attention_slice_size: Slice size, valid when attention_type=="sliced".
Valid values: `auto`, `balanced`, `max`, `1`, `2`, `3`, `4`, `5`, `6`, `7`, `8` @@ -366,9 +366,9 @@ def migrate_v3_config_dict(config_dict: dict[str, Any]) -> InvokeAIAppConfig: # `max_cache_size` was renamed to `ram` some time in v3, but both names were used if k == "max_cache_size" and "ram" not in category_dict: parsed_config_dict["ram"] = v - # `max_vram_cache_size` was renamed to `vram` some time in v3, but both names were used - if k == "max_vram_cache_size" and "vram" not in category_dict: - parsed_config_dict["vram"] = v + # vram was removed in v4.0.2 + if k in ["vram", "max_vram_cache_size", "lazy_offload"]: + continue # autocast was removed in v4.0.1 if k == "precision" and v == "autocast": parsed_config_dict["precision"] = "auto" @@ -416,6 +416,25 @@ def migrate_v4_0_0_config_dict(config_dict: dict[str, Any]) -> InvokeAIAppConfig return config +def migrate_v4_0_1_config_dict(config_dict: dict[str, Any]) -> InvokeAIAppConfig: + """Migrate v4.0.1 config dictionary to a current config object. + + Args: + config_dict: A dictionary of settings from a v4.0.1 config file. + + Returns: + An instance of `InvokeAIAppConfig` with the migrated settings. + """ + parsed_config_dict: dict[str, Any] = {} + for k, v in config_dict.items(): + if k not in ["vram", "lazy_offload"]: + parsed_config_dict[k] = v + if k == "schema_version": + parsed_config_dict[k] = CONFIG_SCHEMA_VERSION + config = DefaultInvokeAIAppConfig.model_validate(parsed_config_dict) + return config + + def load_and_migrate_config(config_path: Path) -> InvokeAIAppConfig: """Load and migrate a config file to the latest version. @@ -447,6 +466,10 @@ def load_and_migrate_config(config_path: Path) -> InvokeAIAppConfig: loaded_config_dict = migrate_v4_0_0_config_dict(loaded_config_dict) loaded_config_dict.write_file(config_path) + elif loaded_config_dict["schema_version"] == "4.0.1": + loaded_config_dict = migrate_v4_0_1_config_dict(loaded_config_dict) + loaded_config_dict.write_file(config_path) + # Attempt to load as a v4 config file try: # Meta is not included in the model fields, so we need to validate it separately diff --git a/invokeai/app/services/model_manager/model_manager_default.py b/invokeai/app/services/model_manager/model_manager_default.py index 4d595835d1c..241259c803b 100644 --- a/invokeai/app/services/model_manager/model_manager_default.py +++ b/invokeai/app/services/model_manager/model_manager_default.py @@ -1,14 +1,11 @@ # Copyright (c) 2023 Lincoln D. 
Stein and the InvokeAI Team """Implementation of ModelManagerServiceBase.""" -from typing import Optional - import torch from typing_extensions import Self from invokeai.app.services.invoker import Invoker from invokeai.backend.model_manager.load import ModelCache, ModelConvertCache, ModelLoaderRegistry -from invokeai.backend.util.devices import TorchDevice from invokeai.backend.util.logging import InvokeAILogger from ..config import InvokeAIAppConfig @@ -89,8 +86,6 @@ def build_model_manager( max_cache_size=app_config.ram, logger=logger, execution_devices=execution_devices, - max_vram_cache_size=app_config.vram, - lazy_offloading=app_config.lazy_offload, ) convert_cache = ModelConvertCache(cache_path=app_config.convert_cache_path, max_size=app_config.convert_cache) loader = ModelLoadService( diff --git a/invokeai/app/services/object_serializer/object_serializer_forward_cache.py b/invokeai/app/services/object_serializer/object_serializer_forward_cache.py index 7d04d47d5c3..bf16bfe242d 100644 --- a/invokeai/app/services/object_serializer/object_serializer_forward_cache.py +++ b/invokeai/app/services/object_serializer/object_serializer_forward_cache.py @@ -1,6 +1,6 @@ +import threading from queue import Queue from typing import TYPE_CHECKING, Optional, TypeVar -import threading from invokeai.app.services.object_serializer.object_serializer_base import ObjectSerializerBase diff --git a/invokeai/app/services/session_processor/session_processor_default.py b/invokeai/app/services/session_processor/session_processor_default.py index d65f5ba86ca..eb00caba5b1 100644 --- a/invokeai/app/services/session_processor/session_processor_default.py +++ b/invokeai/app/services/session_processor/session_processor_default.py @@ -187,8 +187,7 @@ def _process_next_session(self) -> None: profiler.start(profile_id=session.session_id) # reserve a GPU for this session - may block - with self._invoker.services.model_manager.load.ram_cache.reserve_execution_device() as gpu: - + with self._invoker.services.model_manager.load.ram_cache.reserve_execution_device(): # Prepare invocations and take the first with self._process_lock: invocation = session.session.next() diff --git a/invokeai/app/services/shared/invocation_context.py b/invokeai/app/services/shared/invocation_context.py index 9994d663e5e..c2c37de78da 100644 --- a/invokeai/app/services/shared/invocation_context.py +++ b/invokeai/app/services/shared/invocation_context.py @@ -3,6 +3,7 @@ from pathlib import Path from typing import TYPE_CHECKING, Optional, Union +import torch from PIL.Image import Image from torch import Tensor @@ -15,15 +16,24 @@ from invokeai.app.services.invocation_services import InvocationServices from invokeai.app.services.model_records.model_records_base import UnknownModelException from invokeai.app.util.step_callback import stable_diffusion_step_callback -from invokeai.backend.model_manager.config import AnyModelConfig, BaseModelType, ModelFormat, ModelType, SubModelType +from invokeai.backend.model_manager.config import ( + AnyModel, + AnyModelConfig, + BaseModelType, + ModelFormat, + ModelType, + SubModelType, +) from invokeai.backend.model_manager.load.load_base import LoadedModel from invokeai.backend.stable_diffusion.diffusers_pipeline import PipelineIntermediateState from invokeai.backend.stable_diffusion.diffusion.conditioning_data import ConditioningFieldData +from invokeai.backend.util.devices import TorchDevice if TYPE_CHECKING: from invokeai.app.invocations.baseinvocation import BaseInvocation from invokeai.app.invocations.model 
import ModelIdentifierField from invokeai.app.services.session_queue.session_queue_common import SessionQueueItem + from invokeai.backend.model_manager.load.model_cache.model_cache_base import ModelCacheBase """ The InvocationContext provides access to various services and data about the current invocation. @@ -473,6 +483,28 @@ def sd_step_callback(self, intermediate_state: PipelineIntermediateState, base_m is_canceled=self.is_canceled, ) + def torch_device(self) -> torch.device: + """ + Return a torch device to use in the current invocation. + + Returns: + A torch.device not currently in use by the system. + """ + ram_cache: "ModelCacheBase[AnyModel]" = self._services.model_manager.load.ram_cache + return ram_cache.get_execution_device() + + def torch_dtype(self, device: Optional[torch.device] = None) -> torch.dtype: + """ + Return a precision type to use with the current invocation and torch device. + + Args: + device: Optional device. + + Returns: + A torch.dtype suited for the current device. + """ + return TorchDevice.choose_torch_dtype(device) + class InvocationContext: """Provides access to various services and data for the current invocation. diff --git a/invokeai/backend/model_manager/load/model_cache/model_cache_base.py b/invokeai/backend/model_manager/load/model_cache/model_cache_base.py index 469d51e4e5f..c86ec5ddda3 100644 --- a/invokeai/backend/model_manager/load/model_cache/model_cache_base.py +++ b/invokeai/backend/model_manager/load/model_cache/model_cache_base.py @@ -106,7 +106,7 @@ def get_execution_device(self) -> torch.device: Return an execution device that has been reserved for current thread. Note that reservations are done using the current thread's TID. - It would be better to do this using the session ID, but that involves + It might be better to do this using the session ID, but that involves too many detailed changes to model manager calls. May generate a ValueError if no GPU has been reserved. diff --git a/invokeai/backend/model_manager/load/model_cache/model_cache_default.py b/invokeai/backend/model_manager/load/model_cache/model_cache_default.py index fa13b8c6270..f7f466f2b09 100644 --- a/invokeai/backend/model_manager/load/model_cache/model_cache_default.py +++ b/invokeai/backend/model_manager/load/model_cache/model_cache_default.py @@ -127,7 +127,6 @@ def get_execution_device(self) -> torch.device: assigned = [x for x, tid in self._execution_devices.items() if current_thread == tid] if not assigned: raise ValueError("No GPU has been reserved for the use of thread {current_thread}") - print(f'DEBUG: TID={current_thread}; owns {assigned[0]}') return assigned[0] @contextmanager @@ -157,12 +156,15 @@ def reserve_execution_device(self, timeout: Optional[int] = None) -> Generator[t device = free_device[0] # we are outside the lock region now - print(f'DEBUG: RESERVED {device} for TID {current_thread}') + self.logger.info("Reserved torch device {device} for execution thread {current_thread}") + + # Tell TorchDevice to use this object to get the torch device. 
+ TorchDevice.set_model_cache(self) try: yield device finally: with self._device_lock: - print(f'DEBUG: RELEASED {device} for TID {current_thread}') + self.logger.info("Released torch device {device}") self._execution_devices[device] = 0 self._free_execution_device.release() torch.cuda.empty_cache() @@ -407,12 +409,11 @@ def _get_execution_devices(devices: Optional[Set[torch.device]] = None) -> Set[t if torch.cuda.is_available(): devices = {torch.device(f"cuda:{x}") for x in range(0, torch.cuda.device_count())} elif torch.backends.mps.is_available(): - devices = {torch.device('mps')} + devices = {torch.device("mps")} else: - devices = {torch.device('cpu')} + devices = {torch.device("cpu")} return devices @staticmethod def _device_name(device: torch.device) -> str: return f"{device.type}:{device.index}" - diff --git a/invokeai/backend/stable_diffusion/diffusers_pipeline.py b/invokeai/backend/stable_diffusion/diffusers_pipeline.py index 548bbea3ce6..bd60b0b8c7f 100644 --- a/invokeai/backend/stable_diffusion/diffusers_pipeline.py +++ b/invokeai/backend/stable_diffusion/diffusers_pipeline.py @@ -399,11 +399,6 @@ def generate_latents_from_embeddings( unet_attention_patcher = UNetAttentionPatcher(ip_adapters) attn_ctx = unet_attention_patcher.apply_ip_adapter_attention(self.invokeai_diffuser.model) - # NOTE error is not here! - if conditioning_data.unconditioned_embeddings.embeds.device != \ - conditioning_data.text_embeddings.embeds.device: - print('DEBUG; HERE IS THE ERROR 1') - with attn_ctx: if callback is not None: callback( @@ -418,10 +413,6 @@ def generate_latents_from_embeddings( # print("timesteps:", timesteps) for i, t in enumerate(self.progress_bar(timesteps)): - if conditioning_data.unconditioned_embeddings.embeds.device != \ - conditioning_data.text_embeddings.embeds.device: - print('DEBUG; HERE IS THE ERROR 2') - batched_t = t.expand(batch_size) step_output = self.step( batched_t, @@ -466,7 +457,6 @@ def step( ip_adapter_data: Optional[list[IPAdapterData]] = None, t2i_adapter_data: Optional[list[T2IAdapterData]] = None, ): - # invokeai_diffuser has batched timesteps, but diffusers schedulers expect a single value timestep = t[0] if additional_guidance is None: diff --git a/invokeai/backend/stable_diffusion/diffusion/shared_invokeai_diffusion.py b/invokeai/backend/stable_diffusion/diffusion/shared_invokeai_diffusion.py index 137c84c14c9..f418133e49f 100644 --- a/invokeai/backend/stable_diffusion/diffusion/shared_invokeai_diffusion.py +++ b/invokeai/backend/stable_diffusion/diffusion/shared_invokeai_diffusion.py @@ -4,7 +4,6 @@ from typing import Any, Callable, Optional, Union import torch -import threading from typing_extensions import TypeAlias from invokeai.app.services.config.config_default import get_config @@ -256,8 +255,6 @@ def _pad_conditioning(cond, target_len, encoder_attention_mask): unconditioning, encoder_attention_mask = _pad_conditioning(unconditioning, max_len, encoder_attention_mask) conditioning, encoder_attention_mask = _pad_conditioning(conditioning, max_len, encoder_attention_mask) - if unconditioning.device != conditioning.device: - print(f'DEBUG: TID={threading.current_thread().ident}: Unconditioning device = {unconditioning.device}, conditioning device={conditioning.device}') return torch.cat([unconditioning, conditioning]), encoder_attention_mask # methods below are called from do_diffusion_step and should be considered private to this class. 
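The devices.py hunk that follows adds a class-level hook on TorchDevice (set_model_cache) so that device selection can consult the model cache's per-thread reservations. As far as these hunks show it, the intended flow is roughly the sketch below (simplified; the real reserve_execution_device() also handles the semaphore, locking, and logging shown above, and run_one_session() is a hypothetical caller, not project code):

from invokeai.backend.util.devices import TorchDevice


def run_one_session(ram_cache) -> None:
    # reserve_execution_device() registers the cache with TorchDevice before yielding,
    # so code running in this worker thread that asks TorchDevice for a device gets
    # back the GPU reserved for the thread instead of the static config value.
    with ram_cache.reserve_execution_device() as device:
        assert TorchDevice.choose_torch_device() == device
        # ... run the session graph on `device` ...

When no cache has been registered, choose_torch_device() falls back to the configuration-driven probe.
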
diff --git a/invokeai/backend/util/devices.py b/invokeai/backend/util/devices.py index c5a4def281d..b8cdec2ac35 100644 --- a/invokeai/backend/util/devices.py +++ b/invokeai/backend/util/devices.py @@ -1,16 +1,21 @@ -from typing import Dict, Literal, Optional, Union +from typing import TYPE_CHECKING, Dict, Literal, Optional, Union import torch from deprecated import deprecated from invokeai.app.services.config.config_default import get_config +if TYPE_CHECKING: + from invokeai.backend.model_manager.config import AnyModel + from invokeai.backend.model_manager.load.model_cache.model_cache_base import ModelCacheBase + # legacy APIs TorchPrecisionNames = Literal["float32", "float16", "bfloat16"] CPU_DEVICE = torch.device("cpu") CUDA_DEVICE = torch.device("cuda") MPS_DEVICE = torch.device("mps") + @deprecated("Use TorchDevice.choose_torch_dtype() instead.") # type: ignore def choose_precision(device: torch.device) -> TorchPrecisionNames: """Return the string representation of the recommended torch device.""" @@ -41,9 +46,18 @@ def torch_dtype(device: torch.device) -> torch.dtype: class TorchDevice: """Abstraction layer for torch devices.""" + _model_cache: Optional["ModelCacheBase[AnyModel]"] = None + + @classmethod + def set_model_cache(cls, cache: "ModelCacheBase[AnyModel]"): + """Set the current model cache.""" + cls._model_cache = cache + @classmethod def choose_torch_device(cls) -> torch.device: """Return the torch.device to use for accelerated inference.""" + if cls._model_cache: + return cls._model_cache.get_execution_device() app_config = get_config() if app_config.device != "auto": device = torch.device(app_config.device) From a84f3058e2119b7b54ea16ae6730ada655fb0427 Mon Sep 17 00:00:00 2001 From: Lincoln Stein Date: Mon, 15 Apr 2024 22:28:48 -0400 Subject: [PATCH 12/30] revert object_serializer_forward_cache.py --- .../object_serializer_forward_cache.py | 32 +++++-------------- .../load/model_cache/model_cache_default.py | 4 +-- 2 files changed, 10 insertions(+), 26 deletions(-) diff --git a/invokeai/app/services/object_serializer/object_serializer_forward_cache.py b/invokeai/app/services/object_serializer/object_serializer_forward_cache.py index bf16bfe242d..b361259a4b1 100644 --- a/invokeai/app/services/object_serializer/object_serializer_forward_cache.py +++ b/invokeai/app/services/object_serializer/object_serializer_forward_cache.py @@ -1,4 +1,3 @@ -import threading from queue import Queue from typing import TYPE_CHECKING, Optional, TypeVar @@ -19,8 +18,8 @@ class ObjectSerializerForwardCache(ObjectSerializerBase[T]): def __init__(self, underlying_storage: ObjectSerializerBase[T], max_cache_size: int = 20): super().__init__() self._underlying_storage = underlying_storage - self._cache: dict[int, dict[str, T]] = {} - self._cache_ids: dict[int, Queue[str]] = {} + self._cache: dict[str, T] = {} + self._cache_ids = Queue[str]() self._max_cache_size = max_cache_size def start(self, invoker: "Invoker") -> None: @@ -55,27 +54,12 @@ def delete(self, name: str) -> None: del self._cache[name] self._on_deleted(name) - def _get_tid_cache(self) -> dict[str, T]: - tid = threading.current_thread().ident - if tid not in self._cache: - self._cache[tid] = {} - return self._cache[tid] - - def _get_tid_cache_ids(self) -> Queue[str]: - tid = threading.current_thread().ident - if tid not in self._cache_ids: - self._cache_ids[tid] = Queue[str]() - return self._cache_ids[tid] - def _get_cache(self, name: str) -> Optional[T]: - cache = self._get_tid_cache() - return None if name not in cache else 
cache[name] + return None if name not in self._cache else self._cache[name] def _set_cache(self, name: str, data: T): - cache = self._get_tid_cache() - if name not in cache: - cache[name] = data - cache_ids = self._get_tid_cache_ids() - cache_ids.put(name) - if cache_ids.qsize() > self._max_cache_size: - cache.pop(cache_ids.get()) + if name not in self._cache: + self._cache[name] = data + self._cache_ids.put(name) + if self._cache_ids.qsize() > self._max_cache_size: + self._cache.pop(self._cache_ids.get()) diff --git a/invokeai/backend/model_manager/load/model_cache/model_cache_default.py b/invokeai/backend/model_manager/load/model_cache/model_cache_default.py index f7f466f2b09..026bb8aec53 100644 --- a/invokeai/backend/model_manager/load/model_cache/model_cache_default.py +++ b/invokeai/backend/model_manager/load/model_cache/model_cache_default.py @@ -156,7 +156,7 @@ def reserve_execution_device(self, timeout: Optional[int] = None) -> Generator[t device = free_device[0] # we are outside the lock region now - self.logger.info("Reserved torch device {device} for execution thread {current_thread}") + self.logger.info(f"Reserved torch device {device} for execution thread {current_thread}") # Tell TorchDevice to use this object to get the torch device. TorchDevice.set_model_cache(self) @@ -164,7 +164,7 @@ def reserve_execution_device(self, timeout: Optional[int] = None) -> Generator[t yield device finally: with self._device_lock: - self.logger.info("Released torch device {device}") + self.logger.info(f"Released torch device {device}") self._execution_devices[device] = 0 self._free_execution_device.release() torch.cuda.empty_cache() From bd833900a33de58fcc050d3f956233da7567d510 Mon Sep 17 00:00:00 2001 From: Lincoln Stein Date: Tue, 16 Apr 2024 15:02:06 -0400 Subject: [PATCH 13/30] add tid to cache name to avoid non-safe uuid4 on windows --- .../app/services/object_serializer/object_serializer_disk.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/invokeai/app/services/object_serializer/object_serializer_disk.py b/invokeai/app/services/object_serializer/object_serializer_disk.py index 935fec30605..7e28320ff2c 100644 --- a/invokeai/app/services/object_serializer/object_serializer_disk.py +++ b/invokeai/app/services/object_serializer/object_serializer_disk.py @@ -1,3 +1,4 @@ +import threading import tempfile import typing from dataclasses import dataclass @@ -70,7 +71,8 @@ def _get_path(self, name: str) -> Path: return self._output_dir / name def _new_name(self) -> str: - return f"{self._obj_class_name}_{uuid_string()}" + tid = threading.current_thread().ident + return f"{self._obj_class_name}_{tid}_{uuid_string()}" def _tempdir_cleanup(self) -> None: """Calls `cleanup` on the temporary directory, if it exists.""" From fb9b7fb63ad5aee1da347a1d3ce0793551234f6b Mon Sep 17 00:00:00 2001 From: Lincoln Stein Date: Tue, 16 Apr 2024 15:23:49 -0400 Subject: [PATCH 14/30] make object_serializer._new_name() thread-safe; add max_threads config --- invokeai/app/services/config/config_default.py | 4 ++-- .../object_serializer/object_serializer_disk.py | 6 ++++-- .../session_processor/session_processor_default.py | 5 ++++- invokeai/backend/util/devices.py | 12 +++++++++++- 4 files changed, 21 insertions(+), 6 deletions(-) diff --git a/invokeai/app/services/config/config_default.py b/invokeai/app/services/config/config_default.py index 39dc1fe83be..895008b7530 100644 --- a/invokeai/app/services/config/config_default.py +++ b/invokeai/app/services/config/config_default.py @@ -110,7 +110,7 
@@ class InvokeAIAppConfig(BaseSettings): force_tiled_decode: Whether to enable tiled VAE decode (reduces memory consumption with some performance penalty). pil_compress_level: The compress_level setting of PIL.Image.save(), used for PNG encoding. All settings are lossless. 0 = no compression, 1 = fastest with slightly larger filesize, 9 = slowest with smallest filesize. 1 is typically the best setting. max_queue_size: Maximum number of items in the session queue. - max_threads: Maximum number of session queue execution threads. + max_threads: Maximum number of session queue execution threads. Autocalculated from number of GPUs if not set. allow_nodes: List of nodes to allow. Omit to allow all. deny_nodes: List of nodes to deny. Omit to deny none. node_cache_size: How many cached nodes to keep in memory. @@ -182,7 +182,7 @@ class InvokeAIAppConfig(BaseSettings): force_tiled_decode: bool = Field(default=False, description="Whether to enable tiled VAE decode (reduces memory consumption with some performance penalty).") pil_compress_level: int = Field(default=1, description="The compress_level setting of PIL.Image.save(), used for PNG encoding. All settings are lossless. 0 = no compression, 1 = fastest with slightly larger filesize, 9 = slowest with smallest filesize. 1 is typically the best setting.") max_queue_size: int = Field(default=10000, gt=0, description="Maximum number of items in the session queue.") - max_threads: int = Field(default=4, description="Maximum number of session queue execution threads.") + max_threads: Optional[int] = Field(default=None, description="Maximum number of session queue execution threads. Autocalculated from number of GPUs if not set.") # NODES allow_nodes: Optional[list[str]] = Field(default=None, description="List of nodes to allow. 
Omit to allow all.") diff --git a/invokeai/app/services/object_serializer/object_serializer_disk.py b/invokeai/app/services/object_serializer/object_serializer_disk.py index 7e28320ff2c..354a9b0c049 100644 --- a/invokeai/app/services/object_serializer/object_serializer_disk.py +++ b/invokeai/app/services/object_serializer/object_serializer_disk.py @@ -1,5 +1,5 @@ -import threading import tempfile +import threading import typing from dataclasses import dataclass from pathlib import Path @@ -72,7 +72,9 @@ def _get_path(self, name: str) -> Path: def _new_name(self) -> str: tid = threading.current_thread().ident - return f"{self._obj_class_name}_{tid}_{uuid_string()}" + # Add tid to the object name because uuid4 not thread-safe on windows + # See https://stackoverflow.com/questions/2759644/python-multiprocessing-doesnt-play-nicely-with-uuid-uuid4 + return f"{self._obj_class_name}_{tid}-{uuid_string()}" def _tempdir_cleanup(self) -> None: """Calls `cleanup` on the temporary directory, if it exists.""" diff --git a/invokeai/app/services/session_processor/session_processor_default.py b/invokeai/app/services/session_processor/session_processor_default.py index eb00caba5b1..1860e4a5b3f 100644 --- a/invokeai/app/services/session_processor/session_processor_default.py +++ b/invokeai/app/services/session_processor/session_processor_default.py @@ -16,6 +16,7 @@ from invokeai.app.services.session_queue.session_queue_common import SessionQueueItem from invokeai.app.services.shared.invocation_context import InvocationContextData, build_invocation_context from invokeai.app.util.profiler import Profiler +from invokeai.backend.util.devices import TorchDevice from ..invoker import Invoker from .session_processor_base import SessionProcessorBase @@ -40,7 +41,9 @@ def start(self, invoker: Invoker, polling_interval: int = 1) -> None: self._thread_semaphore = BoundedSemaphore(self._thread_limit) self._polling_interval = polling_interval - self._worker_thread_count = self._invoker.services.configuration.max_threads + self._worker_thread_count = self._invoker.services.configuration.max_threads or len( + TorchDevice.execution_devices() + ) self._session_worker_queue: Queue[SessionQueueItem] = Queue() self._process_lock = Lock() diff --git a/invokeai/backend/util/devices.py b/invokeai/backend/util/devices.py index b8cdec2ac35..b88206c5f72 100644 --- a/invokeai/backend/util/devices.py +++ b/invokeai/backend/util/devices.py @@ -1,4 +1,6 @@ -from typing import TYPE_CHECKING, Dict, Literal, Optional, Union +"""Torch Device class provides torch device selection services.""" + +from typing import TYPE_CHECKING, Dict, List, Literal, Optional, Union import torch from deprecated import deprecated @@ -69,6 +71,14 @@ def choose_torch_device(cls) -> torch.device: device = CPU_DEVICE return cls.normalize(device) + @classmethod + def execution_devices(cls) -> List[torch.device]: + """Return a list of torch.devices that can be used for accelerated inference.""" + if cls._model_cache: + return cls._model_cache.execution_devices + else: + return [cls.choose_torch_device] + @classmethod def choose_torch_dtype(cls, device: Optional[torch.device] = None) -> torch.dtype: """Return the precision to use for accelerated inference.""" From 371f5bc782a364de4eea3389a7bed2d387934186 Mon Sep 17 00:00:00 2001 From: Lincoln Stein Date: Tue, 16 Apr 2024 15:52:03 -0400 Subject: [PATCH 15/30] simplify logic for retrieving execution devices --- .../model_manager/model_manager_default.py | 1 - .../load/model_cache/model_cache_default.py | 14 
+---------- invokeai/backend/util/devices.py | 23 ++++++++++++++----- 3 files changed, 18 insertions(+), 20 deletions(-) diff --git a/invokeai/app/services/model_manager/model_manager_default.py b/invokeai/app/services/model_manager/model_manager_default.py index 241259c803b..902501c1f9e 100644 --- a/invokeai/app/services/model_manager/model_manager_default.py +++ b/invokeai/app/services/model_manager/model_manager_default.py @@ -85,7 +85,6 @@ def build_model_manager( ram_cache = ModelCache( max_cache_size=app_config.ram, logger=logger, - execution_devices=execution_devices, ) convert_cache = ModelConvertCache(cache_path=app_config.convert_cache_path, max_size=app_config.convert_cache) loader = ModelLoadService( diff --git a/invokeai/backend/model_manager/load/model_cache/model_cache_default.py b/invokeai/backend/model_manager/load/model_cache/model_cache_default.py index 026bb8aec53..9eab509e64d 100644 --- a/invokeai/backend/model_manager/load/model_cache/model_cache_default.py +++ b/invokeai/backend/model_manager/load/model_cache/model_cache_default.py @@ -65,7 +65,6 @@ def __init__( Initialize the model RAM cache. :param max_cache_size: Maximum size of the RAM cache [6.0 GB] - :param execution_devices: Set of torch device to load active model into [calculated] :param storage_device: Torch device to save inactive model in [torch.device('cpu')] :param precision: Precision for loaded models [torch.float16] :param sequential_offload: Conserve VRAM by loading and unloading each stage of the pipeline sequentially @@ -88,7 +87,7 @@ def __init__( # device to thread id self._device_lock = threading.Lock() self._execution_devices: Dict[torch.device, int] = { - x: 0 for x in execution_devices or self._get_execution_devices() + x: 0 for x in TorchDevice.execution_devices() } self._free_execution_device = BoundedSemaphore(len(self._execution_devices)) @@ -403,17 +402,6 @@ def _delete_cache_entry(self, cache_entry: CacheRecord[AnyModel]) -> None: self._cache_stack.remove(cache_entry.key) del self._cached_models[cache_entry.key] - @staticmethod - def _get_execution_devices(devices: Optional[Set[torch.device]] = None) -> Set[torch.device]: - if not devices: - if torch.cuda.is_available(): - devices = {torch.device(f"cuda:{x}") for x in range(0, torch.cuda.device_count())} - elif torch.backends.mps.is_available(): - devices = {torch.device("mps")} - else: - devices = {torch.device("cpu")} - return devices - @staticmethod def _device_name(device: torch.device) -> str: return f"{device.type}:{device.index}" diff --git a/invokeai/backend/util/devices.py b/invokeai/backend/util/devices.py index b88206c5f72..d1c432a53f1 100644 --- a/invokeai/backend/util/devices.py +++ b/invokeai/backend/util/devices.py @@ -1,6 +1,6 @@ """Torch Device class provides torch device selection services.""" -from typing import TYPE_CHECKING, Dict, List, Literal, Optional, Union +from typing import TYPE_CHECKING, Dict, List, Literal, Optional, Union, Set import torch from deprecated import deprecated @@ -72,12 +72,12 @@ def choose_torch_device(cls) -> torch.device: return cls.normalize(device) @classmethod - def execution_devices(cls) -> List[torch.device]: + def execution_devices(cls) -> Set[torch.device]: """Return a list of torch.devices that can be used for accelerated inference.""" - if cls._model_cache: - return cls._model_cache.execution_devices - else: - return [cls.choose_torch_device] + app_config = get_config() + if app_config.devices is None: + return cls._lookup_execution_devices() + return {torch.device(x) for x in 
app_config.devices} @classmethod def choose_torch_dtype(cls, device: Optional[torch.device] = None) -> torch.dtype: @@ -131,3 +131,14 @@ def empty_cache(cls) -> None: @classmethod def _to_dtype(cls, precision_name: TorchPrecisionNames) -> torch.dtype: return NAME_TO_PRECISION[precision_name] + + @classmethod + def _lookup_execution_devices(cls) -> Set[torch.device]: + if torch.cuda.is_available(): + devices = {torch.device(f"cuda:{x}") for x in range(0, torch.cuda.device_count())} + elif torch.backends.mps.is_available(): + devices = {torch.device("mps")} + else: + devices = {torch.device("cpu")} + return devices + From 99558de17863196cd9e8d874d046a4f917a3c22f Mon Sep 17 00:00:00 2001 From: Lincoln Stein Date: Tue, 16 Apr 2024 16:26:58 -0400 Subject: [PATCH 16/30] device selection calls go through TorchDevice --- .../app/services/model_manager/model_manager_default.py | 8 -------- .../model_manager/load/model_cache/model_cache_default.py | 4 +--- invokeai/backend/util/devices.py | 3 +-- 3 files changed, 2 insertions(+), 13 deletions(-) diff --git a/invokeai/app/services/model_manager/model_manager_default.py b/invokeai/app/services/model_manager/model_manager_default.py index 902501c1f9e..ccb68f783b1 100644 --- a/invokeai/app/services/model_manager/model_manager_default.py +++ b/invokeai/app/services/model_manager/model_manager_default.py @@ -1,7 +1,6 @@ # Copyright (c) 2023 Lincoln D. Stein and the InvokeAI Team """Implementation of ModelManagerServiceBase.""" -import torch from typing_extensions import Self from invokeai.app.services.invoker import Invoker @@ -75,13 +74,6 @@ def build_model_manager( logger = InvokeAILogger.get_logger(cls.__name__) logger.setLevel(app_config.log_level.upper()) - execution_devices = ( - None - if app_config.devices is None - else None - if "auto" in app_config.devices - else {torch.device(x) for x in app_config.devices} - ) ram_cache = ModelCache( max_cache_size=app_config.ram, logger=logger, diff --git a/invokeai/backend/model_manager/load/model_cache/model_cache_default.py b/invokeai/backend/model_manager/load/model_cache/model_cache_default.py index 9eab509e64d..551412d66a5 100644 --- a/invokeai/backend/model_manager/load/model_cache/model_cache_default.py +++ b/invokeai/backend/model_manager/load/model_cache/model_cache_default.py @@ -86,9 +86,7 @@ def __init__( # device to thread id self._device_lock = threading.Lock() - self._execution_devices: Dict[torch.device, int] = { - x: 0 for x in TorchDevice.execution_devices() - } + self._execution_devices: Dict[torch.device, int] = {x: 0 for x in TorchDevice.execution_devices()} self._free_execution_device = BoundedSemaphore(len(self._execution_devices)) self.logger.info( diff --git a/invokeai/backend/util/devices.py b/invokeai/backend/util/devices.py index d1c432a53f1..745c128099b 100644 --- a/invokeai/backend/util/devices.py +++ b/invokeai/backend/util/devices.py @@ -1,6 +1,6 @@ """Torch Device class provides torch device selection services.""" -from typing import TYPE_CHECKING, Dict, List, Literal, Optional, Union, Set +from typing import TYPE_CHECKING, Dict, Literal, Optional, Set, Union import torch from deprecated import deprecated @@ -141,4 +141,3 @@ def _lookup_execution_devices(cls) -> Set[torch.device]: else: devices = {torch.device("cpu")} return devices - From eaadc55c7dc133903469dbc0c995b753a23f6c98 Mon Sep 17 00:00:00 2001 From: Lincoln Stein Date: Tue, 16 Apr 2024 16:55:56 -0400 Subject: [PATCH 17/30] make pause/resume work in multithreaded environment --- 
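The small change in this patch makes each worker honor the processor's resume event before pulling the next session off the queue, so pausing now stops every GPU worker rather than only the main loop. In isolation the pattern is a shared threading.Event gating the top of each worker loop, as in the sketch below (illustrative only; process() and the plain Queue are stand-ins for the session processor's real machinery):

import threading
from queue import Queue

resume_event = threading.Event()
resume_event.set()  # the processor starts in the "running" state


def process(job: str) -> None:
    print(f"processing {job}")  # stand-in for running a session graph


def worker(jobs: "Queue[str]") -> None:
    while True:
        resume_event.wait()  # every worker blocks here while paused
        process(jobs.get())


def pause() -> None:
    resume_event.clear()  # workers finish their current job, then wait


def resume() -> None:
    resume_event.set()
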
.../app/services/session_processor/session_processor_default.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/invokeai/app/services/session_processor/session_processor_default.py b/invokeai/app/services/session_processor/session_processor_default.py index 1860e4a5b3f..02860f46f11 100644 --- a/invokeai/app/services/session_processor/session_processor_default.py +++ b/invokeai/app/services/session_processor/session_processor_default.py @@ -181,7 +181,9 @@ def _process_next_session(self) -> None: while True: # Outer try block. Any error here is a fatal processor error try: + self._resume_event.wait() session = self._session_worker_queue.get() + if self._cancel_event.is_set(): if session.item_id in self._sessions_to_cancel: continue From 763a2e26324b9c9bb9d0a37ae1dbb3ec87f6cb2e Mon Sep 17 00:00:00 2001 From: Lincoln Stein Date: Tue, 16 Apr 2024 17:18:51 -0400 Subject: [PATCH 18/30] added more unit tests --- tests/backend/util/test_devices.py | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/tests/backend/util/test_devices.py b/tests/backend/util/test_devices.py index 8e810e43678..f4faea5d98d 100644 --- a/tests/backend/util/test_devices.py +++ b/tests/backend/util/test_devices.py @@ -8,6 +8,7 @@ import torch from invokeai.app.services.config import get_config +from invokeai.backend.model_manager.load import ModelCache from invokeai.backend.util.devices import TorchDevice, choose_precision, choose_torch_device, torch_dtype devices = ["cpu", "cuda:0", "cuda:1", "mps"] @@ -130,3 +131,32 @@ def test_legacy_precision_name(): assert "float16" == choose_precision(torch.device("cuda")) assert "float16" == choose_precision(torch.device("mps")) assert "float32" == choose_precision(torch.device("cpu")) + + +def test_multi_device_support_1(): + config = get_config() + config.devices = ["cuda:0", "cuda:1"] + assert TorchDevice.execution_devices() == {torch.device("cuda:0"), torch.device("cuda:1")} + + +def test_multi_device_support_2(): + config = get_config() + config.devices = None + with ( + patch("torch.cuda.device_count", return_value=3), + patch("torch.cuda.is_available", return_value=True), + ): + assert TorchDevice.execution_devices() == { + torch.device("cuda:0"), + torch.device("cuda:1"), + torch.device("cuda:2"), + } + + +def test_multi_device_support_3(): + config = get_config() + config.devices = ["cuda:0", "cuda:1"] + cache = ModelCache() + with cache.reserve_execution_device() as gpu: + assert gpu in [torch.device(x) for x in config.devices] + assert TorchDevice.choose_torch_device() == gpu From d04c880cce7cc7bada48dcf72129ce25a8fa7582 Mon Sep 17 00:00:00 2001 From: Lincoln Stein Date: Tue, 16 Apr 2024 17:57:40 -0400 Subject: [PATCH 19/30] fix ValueError on model manager install --- .../model_manager/load/model_cache/model_cache_default.py | 2 +- invokeai/backend/util/devices.py | 8 +++++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/invokeai/backend/model_manager/load/model_cache/model_cache_default.py b/invokeai/backend/model_manager/load/model_cache/model_cache_default.py index 551412d66a5..910087c4bb6 100644 --- a/invokeai/backend/model_manager/load/model_cache/model_cache_default.py +++ b/invokeai/backend/model_manager/load/model_cache/model_cache_default.py @@ -123,7 +123,7 @@ def get_execution_device(self) -> torch.device: assert current_thread is not None assigned = [x for x, tid in self._execution_devices.items() if current_thread == tid] if not assigned: - raise ValueError("No GPU has been reserved for the use of thread 
{current_thread}") + raise ValueError(f"No GPU has been reserved for the use of thread {current_thread}") return assigned[0] @contextmanager diff --git a/invokeai/backend/util/devices.py b/invokeai/backend/util/devices.py index 745c128099b..c7db33a667a 100644 --- a/invokeai/backend/util/devices.py +++ b/invokeai/backend/util/devices.py @@ -59,7 +59,13 @@ def set_model_cache(cls, cache: "ModelCacheBase[AnyModel]"): def choose_torch_device(cls) -> torch.device: """Return the torch.device to use for accelerated inference.""" if cls._model_cache: - return cls._model_cache.get_execution_device() + try: + return cls._model_cache.get_execution_device() + except ValueError as e: # May happen if no gpu was reserved. Return a generic device. + if str(e).startswith("No GPU has been reserved"): + pass + else: + raise e app_config = get_config() if app_config.device != "auto": device = torch.device(app_config.device) From edac01d4fb8e921b620147efb5bb067c87422229 Mon Sep 17 00:00:00 2001 From: Lincoln Stein Date: Tue, 16 Apr 2024 18:13:59 -0400 Subject: [PATCH 20/30] reverse stupid hack --- .../app/services/model_install/model_install_default.py | 3 +-- invokeai/backend/util/devices.py | 8 +------- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/invokeai/app/services/model_install/model_install_default.py b/invokeai/app/services/model_install/model_install_default.py index 6a3117bcb87..32b31f744c6 100644 --- a/invokeai/app/services/model_install/model_install_default.py +++ b/invokeai/app/services/model_install/model_install_default.py @@ -43,7 +43,6 @@ from invokeai.backend.model_manager.probe import ModelProbe from invokeai.backend.model_manager.search import ModelSearch from invokeai.backend.util import InvokeAILogger -from invokeai.backend.util.devices import TorchDevice from .model_install_base import ( MODEL_SOURCE_TO_TYPE_MAP, @@ -637,7 +636,7 @@ def _next_id(self) -> int: def _guess_variant(self) -> Optional[ModelRepoVariant]: """Guess the best HuggingFace variant type to download.""" - precision = TorchDevice.choose_torch_dtype() + precision = torch.float16 if self._app_config.precision == "auto" else torch.dtype(self._app_config.precision) return ModelRepoVariant.FP16 if precision == torch.float16 else None def _import_local_model(self, source: LocalModelSource, config: Optional[Dict[str, Any]]) -> ModelInstallJob: diff --git a/invokeai/backend/util/devices.py b/invokeai/backend/util/devices.py index c7db33a667a..745c128099b 100644 --- a/invokeai/backend/util/devices.py +++ b/invokeai/backend/util/devices.py @@ -59,13 +59,7 @@ def set_model_cache(cls, cache: "ModelCacheBase[AnyModel]"): def choose_torch_device(cls) -> torch.device: """Return the torch.device to use for accelerated inference.""" if cls._model_cache: - try: - return cls._model_cache.get_execution_device() - except ValueError as e: # May happen if no gpu was reserved. Return a generic device. 
- if str(e).startswith("No GPU has been reserved"): - pass - else: - raise e + return cls._model_cache.get_execution_device() app_config = get_config() if app_config.device != "auto": device = torch.device(app_config.device) From 84f5cbdd9775b2b05acfbad954fff4a476ed92ba Mon Sep 17 00:00:00 2001 From: Lincoln Stein Date: Tue, 16 Apr 2024 19:19:19 -0400 Subject: [PATCH 21/30] make choose_torch_dtype() usable outside an invocation context --- .../app/services/model_install/model_install_default.py | 3 ++- invokeai/backend/util/devices.py | 7 ++++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/invokeai/app/services/model_install/model_install_default.py b/invokeai/app/services/model_install/model_install_default.py index 32b31f744c6..6a3117bcb87 100644 --- a/invokeai/app/services/model_install/model_install_default.py +++ b/invokeai/app/services/model_install/model_install_default.py @@ -43,6 +43,7 @@ from invokeai.backend.model_manager.probe import ModelProbe from invokeai.backend.model_manager.search import ModelSearch from invokeai.backend.util import InvokeAILogger +from invokeai.backend.util.devices import TorchDevice from .model_install_base import ( MODEL_SOURCE_TO_TYPE_MAP, @@ -636,7 +637,7 @@ def _next_id(self) -> int: def _guess_variant(self) -> Optional[ModelRepoVariant]: """Guess the best HuggingFace variant type to download.""" - precision = torch.float16 if self._app_config.precision == "auto" else torch.dtype(self._app_config.precision) + precision = TorchDevice.choose_torch_dtype() return ModelRepoVariant.FP16 if precision == torch.float16 else None def _import_local_model(self, source: LocalModelSource, config: Optional[Dict[str, Any]]) -> ModelInstallJob: diff --git a/invokeai/backend/util/devices.py b/invokeai/backend/util/devices.py index 745c128099b..dc2bafaa9c4 100644 --- a/invokeai/backend/util/devices.py +++ b/invokeai/backend/util/devices.py @@ -60,6 +60,11 @@ def choose_torch_device(cls) -> torch.device: """Return the torch.device to use for accelerated inference.""" if cls._model_cache: return cls._model_cache.get_execution_device() + else: + return cls._choose_device() + + @classmethod + def _choose_device(cls) -> torch.device: app_config = get_config() if app_config.device != "auto": device = torch.device(app_config.device) @@ -82,8 +87,8 @@ def execution_devices(cls) -> Set[torch.device]: @classmethod def choose_torch_dtype(cls, device: Optional[torch.device] = None) -> torch.dtype: """Return the precision to use for accelerated inference.""" - device = device or cls.choose_torch_device() config = get_config() + device = device or cls._choose_device() if device.type == "cuda" and torch.cuda.is_available(): device_name = torch.cuda.get_device_name(device) if "GeForce GTX 1660" in device_name or "GeForce GTX 1650" in device_name: From c3d125289200fb7b45c904c4d47ba925b5c8ab81 Mon Sep 17 00:00:00 2001 From: Lincoln Stein Date: Wed, 17 Apr 2024 09:51:57 -0400 Subject: [PATCH 22/30] revert to old system for doing RAM <-> VRAM transfers; new way leaks memory --- .../app/services/config/config_default.py | 16 +-- .../model_manager/model_manager_default.py | 2 + .../load/model_cache/model_cache_base.py | 16 +++ .../load/model_cache/model_cache_default.py | 101 +++++++++++++++++- .../load/model_cache/model_locker.py | 22 ++-- 5 files changed, 141 insertions(+), 16 deletions(-) diff --git a/invokeai/app/services/config/config_default.py b/invokeai/app/services/config/config_default.py index 895008b7530..4f4d4850da8 100644 --- 
a/invokeai/app/services/config/config_default.py +++ b/invokeai/app/services/config/config_default.py @@ -24,6 +24,7 @@ DB_FILE = Path("invokeai.db") LEGACY_INIT_FILE = Path("invokeai.init") DEFAULT_RAM_CACHE = 10.0 +DEFAULT_VRAM_CACHE = 0.25 DEFAULT_CONVERT_CACHE = 20.0 DEVICE = Literal["auto", "cpu", "cuda:0", "cuda:1", "cuda:2", "cuda:3", "cuda:4", "cuda:5", "cuda:6", "cuda:7", "mps"] PRECISION = Literal["auto", "float16", "bfloat16", "float32", "autocast"] @@ -99,7 +100,9 @@ class InvokeAIAppConfig(BaseSettings): profile_prefix: An optional prefix for profile output files. profiles_dir: Path to profiles output directory. ram: Maximum memory amount used by memory model cache for rapid switching (GB). + vram: Amount of VRAM reserved for model storage (GB). convert_cache: Maximum size of on-disk converted models cache (GB). + lazy_offload: Keep models in VRAM until their space is needed. log_memory_usage: If True, a memory snapshot will be captured before and after every model cache operation, and the result will be logged (at debug level). There is a time cost to capturing the memory snapshots, so it is recommended to only enable this feature if you are actively inspecting the model cache's behaviour. device: Preferred execution device. `auto` will choose the device depending on the hardware platform and the installed torch capabilities.
Valid values: `auto`, `cpu`, `cuda:0`, `cuda:1`, `cuda:2`, `cuda:3`, `cuda:4`, `cuda:5`, `cuda:6`, `cuda:7`, `mps` devices: List of execution devices; will override default device selected. @@ -167,7 +170,9 @@ class InvokeAIAppConfig(BaseSettings): # CACHE ram: float = Field(default_factory=get_default_ram_cache_size, gt=0, description="Maximum memory amount used by memory model cache for rapid switching (GB).") + vram: float = Field(default=DEFAULT_VRAM_CACHE, ge=0, description="Amount of VRAM reserved for model storage (GB).") convert_cache: float = Field(default=DEFAULT_CONVERT_CACHE, ge=0, description="Maximum size of on-disk converted models cache (GB).") + lazy_offload: bool = Field(default=True, description="Keep models in VRAM until their space is needed.") log_memory_usage: bool = Field(default=False, description="If True, a memory snapshot will be captured before and after every model cache operation, and the result will be logged (at debug level). There is a time cost to capturing the memory snapshots, so it is recommended to only enable this feature if you are actively inspecting the model cache's behaviour.") # DEVICE @@ -366,9 +371,6 @@ def migrate_v3_config_dict(config_dict: dict[str, Any]) -> InvokeAIAppConfig: # `max_cache_size` was renamed to `ram` some time in v3, but both names were used if k == "max_cache_size" and "ram" not in category_dict: parsed_config_dict["ram"] = v - # vram was removed in v4.0.2 - if k in ["vram", "max_vram_cache_size", "lazy_offload"]: - continue # autocast was removed in v4.0.1 if k == "precision" and v == "autocast": parsed_config_dict["precision"] = "auto" @@ -419,6 +421,9 @@ def migrate_v4_0_0_config_dict(config_dict: dict[str, Any]) -> InvokeAIAppConfig def migrate_v4_0_1_config_dict(config_dict: dict[str, Any]) -> InvokeAIAppConfig: """Migrate v4.0.1 config dictionary to a current config object. + A few new multi-GPU options were added in 4.0.2, and this simply + updates the schema label. + Args: config_dict: A dictionary of settings from a v4.0.1 config file. @@ -426,15 +431,14 @@ def migrate_v4_0_1_config_dict(config_dict: dict[str, Any]) -> InvokeAIAppConfig An instance of `InvokeAIAppConfig` with the migrated settings. """ parsed_config_dict: dict[str, Any] = {} - for k, v in config_dict.items(): - if k not in ["vram", "lazy_offload"]: - parsed_config_dict[k] = v + for k, _ in config_dict.items(): if k == "schema_version": parsed_config_dict[k] = CONFIG_SCHEMA_VERSION config = DefaultInvokeAIAppConfig.model_validate(parsed_config_dict) return config +# TO DO: replace this with a formal registration and migration system def load_and_migrate_config(config_path: Path) -> InvokeAIAppConfig: """Load and migrate a config file to the latest version. 
diff --git a/invokeai/app/services/model_manager/model_manager_default.py b/invokeai/app/services/model_manager/model_manager_default.py index ccb68f783b1..fbb19ab5275 100644 --- a/invokeai/app/services/model_manager/model_manager_default.py +++ b/invokeai/app/services/model_manager/model_manager_default.py @@ -76,6 +76,8 @@ def build_model_manager( ram_cache = ModelCache( max_cache_size=app_config.ram, + max_vram_cache_size=app_config.vram, + lazy_offloading=app_config.lazy_offload, logger=logger, ) convert_cache = ModelConvertCache(cache_path=app_config.convert_cache_path, max_size=app_config.convert_cache) diff --git a/invokeai/backend/model_manager/load/model_cache/model_cache_base.py b/invokeai/backend/model_manager/load/model_cache/model_cache_base.py index c86ec5ddda3..6e6553db47a 100644 --- a/invokeai/backend/model_manager/load/model_cache/model_cache_base.py +++ b/invokeai/backend/model_manager/load/model_cache/model_cache_base.py @@ -113,12 +113,28 @@ def get_execution_device(self) -> torch.device: """ pass + @property + @abstractmethod + def lazy_offloading(self) -> bool: + """Return true if the cache is configured to lazily offload models in VRAM.""" + pass + @property @abstractmethod def max_cache_size(self) -> float: """Return true if the cache is configured to lazily offload models in VRAM.""" pass + @abstractmethod + def offload_unlocked_models(self, size_required: int) -> None: + """Offload from VRAM any models not actively in use.""" + pass + + @abstractmethod + def move_model_to_device(self, cache_entry: CacheRecord[AnyModel], target_device: torch.device) -> None: + """Move model into the indicated device.""" + pass + @property @abstractmethod def stats(self) -> Optional[CacheStats]: diff --git a/invokeai/backend/model_manager/load/model_cache/model_cache_default.py b/invokeai/backend/model_manager/load/model_cache/model_cache_default.py index 910087c4bb6..3cebfb8820f 100644 --- a/invokeai/backend/model_manager/load/model_cache/model_cache_default.py +++ b/invokeai/backend/model_manager/load/model_cache/model_cache_default.py @@ -19,8 +19,10 @@ """ import gc +import math import sys import threading +import time from contextlib import contextmanager, suppress from logging import Logger from threading import BoundedSemaphore @@ -29,7 +31,7 @@ import torch from invokeai.backend.model_manager import AnyModel, SubModelType -from invokeai.backend.model_manager.load.memory_snapshot import MemorySnapshot +from invokeai.backend.model_manager.load.memory_snapshot import MemorySnapshot, get_pretty_snapshot_diff from invokeai.backend.util.devices import TorchDevice from invokeai.backend.util.logging import InvokeAILogger @@ -40,6 +42,11 @@ # Default is roughly enough to hold three fp16 diffusers models in RAM simultaneously DEFAULT_MAX_CACHE_SIZE = 6.0 +# amount of GPU memory to hold in reserve for use by generations (GB) +# Empirically this value seems to improve performance without starving other +# processes. 
+DEFAULT_MAX_VRAM_CACHE_SIZE = 0.25 + # actual size of a gig GIG = 1073741824 @@ -53,10 +60,12 @@ class ModelCache(ModelCacheBase[AnyModel]): def __init__( self, max_cache_size: float = DEFAULT_MAX_CACHE_SIZE, + max_vram_cache_size: float = DEFAULT_MAX_VRAM_CACHE_SIZE, storage_device: torch.device = torch.device("cpu"), execution_devices: Optional[Set[torch.device]] = None, precision: torch.dtype = torch.float16, sequential_offload: bool = False, + lazy_offloading: bool = True, sha_chunksize: int = 16777216, log_memory_usage: bool = False, logger: Optional[Logger] = None, @@ -67,14 +76,18 @@ def __init__( :param max_cache_size: Maximum size of the RAM cache [6.0 GB] :param storage_device: Torch device to save inactive model in [torch.device('cpu')] :param precision: Precision for loaded models [torch.float16] + :param lazy_offloading: Keep model in VRAM until another model needs to be loaded :param sequential_offload: Conserve VRAM by loading and unloading each stage of the pipeline sequentially :param log_memory_usage: If True, a memory snapshot will be captured before and after every model cache operation, and the result will be logged (at debug level). There is a time cost to capturing the memory snapshots, so it is recommended to disable this feature unless you are actively inspecting the model cache's behaviour. """ + # allow lazy offloading only when vram cache enabled + self._lazy_offloading = lazy_offloading and max_vram_cache_size > 0 self._precision: torch.dtype = precision self._max_cache_size: float = max_cache_size + self._max_vram_cache_size: float = max_vram_cache_size self._storage_device: torch.device = storage_device self._ram_lock = threading.Lock() self._logger = logger or InvokeAILogger.get_logger(self.__class__.__name__) @@ -98,6 +111,11 @@ def logger(self) -> Logger: """Return the logger used by the cache.""" return self._logger + @property + def lazy_offloading(self) -> bool: + """Return true if the cache is configured to lazily offload models in VRAM.""" + return self._lazy_offloading + @property def storage_device(self) -> torch.device: """Return the storage device (e.g. "CPU" for RAM).""" @@ -277,6 +295,87 @@ def _make_cache_key(self, model_key: str, submodel_type: Optional[SubModelType] else: return model_key + def offload_unlocked_models(self, size_required: int) -> None: + """Move any unused models from VRAM.""" + reserved = self._max_vram_cache_size * GIG + vram_in_use = torch.cuda.memory_allocated() + size_required + self.logger.debug(f"{(vram_in_use/GIG):.2f}GB VRAM needed for models; max allowed={(reserved/GIG):.2f}GB") + for _, cache_entry in sorted(self._cached_models.items(), key=lambda x: x[1].size): + if vram_in_use <= reserved: + break + if not cache_entry.loaded: + continue + if not cache_entry.locked: + self.move_model_to_device(cache_entry, self.storage_device) + cache_entry.loaded = False + vram_in_use = torch.cuda.memory_allocated() + size_required + self.logger.debug( + f"Removing {cache_entry.key} from VRAM to free {(cache_entry.size/GIG):.2f}GB; vram free = {(torch.cuda.memory_allocated()/GIG):.2f}GB" + ) + + TorchDevice.empty_cache() + + def move_model_to_device(self, cache_entry: CacheRecord[AnyModel], target_device: torch.device) -> None: + """Move model into the indicated device. + + :param cache_entry: The CacheRecord for the model + :param target_device: The torch.device to move the model into + + May raise a torch.cuda.OutOfMemoryError + """ + # These attributes are not in the base ModelMixin class but in various derived classes. 
+ # Some models don't have these attributes, in which case they run in RAM/CPU. + self.logger.debug(f"Called to move {cache_entry.key} to {target_device}") + if not (hasattr(cache_entry.model, "device") and hasattr(cache_entry.model, "to")): + return + + source_device = cache_entry.model.device + + # Note: We compare device types only so that 'cuda' == 'cuda:0'. + # This would need to be revised to support multi-GPU. + if torch.device(source_device).type == torch.device(target_device).type: + return + + start_model_to_time = time.time() + snapshot_before = self._capture_memory_snapshot() + try: + cache_entry.model.to(target_device) + except Exception as e: # blow away cache entry + self._delete_cache_entry(cache_entry) + raise e + + snapshot_after = self._capture_memory_snapshot() + end_model_to_time = time.time() + self.logger.debug( + f"Moved model '{cache_entry.key}' from {source_device} to" + f" {target_device} in {(end_model_to_time-start_model_to_time):.2f}s." + f"Estimated model size: {(cache_entry.size/GIG):.3f} GB." + f"{get_pretty_snapshot_diff(snapshot_before, snapshot_after)}" + ) + + if ( + snapshot_before is not None + and snapshot_after is not None + and snapshot_before.vram is not None + and snapshot_after.vram is not None + ): + vram_change = abs(snapshot_before.vram - snapshot_after.vram) + + # If the estimated model size does not match the change in VRAM, log a warning. + if not math.isclose( + vram_change, + cache_entry.size, + rel_tol=0.1, + abs_tol=10 * MB, + ): + self.logger.debug( + f"Moving model '{cache_entry.key}' from {source_device} to" + f" {target_device} caused an unexpected change in VRAM usage. The model's" + " estimated size may be incorrect. Estimated model size:" + f" {(cache_entry.size/GIG):.3f} GB.\n" + f"{get_pretty_snapshot_diff(snapshot_before, snapshot_after)}" + ) + def print_cuda_stats(self) -> None: """Log CUDA diagnostics.""" vram = "%4.2fG" % (torch.cuda.memory_allocated() / GIG) diff --git a/invokeai/backend/model_manager/load/model_cache/model_locker.py b/invokeai/backend/model_manager/load/model_cache/model_locker.py index c7685fc8f72..b9349ea3dd3 100644 --- a/invokeai/backend/model_manager/load/model_cache/model_locker.py +++ b/invokeai/backend/model_manager/load/model_cache/model_locker.py @@ -2,7 +2,6 @@ Base class and implementation of a class that moves models in and out of VRAM. """ -import copy from typing import Optional import torch @@ -55,13 +54,14 @@ def lock(self) -> AnyModel: # NOTE that the model has to have the to() method in order for this code to move it into GPU! self._cache_entry.lock() try: - # We wait for a gpu to be free - may raise a ValueError - self._execution_device = self._cache.get_execution_device() - self._cache.logger.debug(f"Locking {self._cache_entry.key} in {self._execution_device}") - model_in_gpu = copy.deepcopy(self._cache_entry.model) - if hasattr(model_in_gpu, "to"): - model_in_gpu.to(self._execution_device) + if self._cache.lazy_offloading: + self._cache.offload_unlocked_models(self._cache_entry.size) + + execution_device = self._cache.get_execution_device() + self._cache.move_model_to_device(self._cache_entry, execution_device) self._cache_entry.loaded = True + + self._cache.logger.debug(f"Locking {self._cache_entry.key} in {execution_device}") self._cache.print_cuda_stats() except torch.cuda.OutOfMemoryError: self._cache.logger.warning("Insufficient GPU memory to load model. 
Aborting") @@ -70,11 +70,15 @@ def lock(self) -> AnyModel: except Exception: self._cache_entry.unlock() raise - return model_in_gpu + + return self.model def unlock(self) -> None: """Call upon exit from context.""" if not hasattr(self.model, "to"): return + self._cache_entry.unlock() - self._cache.print_cuda_stats() + if not self._cache.lazy_offloading: + self._cache.offload_unlocked_models(self._cache_entry.size) + self._cache.print_cuda_stats() From 589a7959c019bc56e23ad3d989d015e443ffa20b Mon Sep 17 00:00:00 2001 From: Lincoln Stein Date: Sun, 2 Jun 2024 18:19:29 -0400 Subject: [PATCH 23/30] fixup unit tests and remove debugging statements --- invokeai/app/api/dependencies.py | 1 - invokeai/app/invocations/compel.py | 2 + .../invocation_stats_default.py | 6 +- .../model_manager/model_manager_default.py | 2 - .../session_processor_default.py | 13 +- .../session_queue/session_queue_common.py | 1 + .../app/services/shared/invocation_context.py | 1 - .../load/model_cache/model_cache_base.py | 35 +--- .../load/model_cache/model_cache_default.py | 163 ++++-------------- .../load/model_cache/model_locker.py | 22 +-- tests/conftest.py | 1 + 11 files changed, 61 insertions(+), 186 deletions(-) diff --git a/invokeai/app/api/dependencies.py b/invokeai/app/api/dependencies.py index f2274d8f462..4e8103d8d36 100644 --- a/invokeai/app/api/dependencies.py +++ b/invokeai/app/api/dependencies.py @@ -4,7 +4,6 @@ import torch -import invokeai.backend.util.devices # horrible hack from invokeai.app.services.object_serializer.object_serializer_disk import ObjectSerializerDisk from invokeai.app.services.object_serializer.object_serializer_forward_cache import ObjectSerializerForwardCache from invokeai.app.services.shared.sqlite.sqlite_util import init_db diff --git a/invokeai/app/invocations/compel.py b/invokeai/app/invocations/compel.py index 766b44fdc8a..252e00ecab6 100644 --- a/invokeai/app/invocations/compel.py +++ b/invokeai/app/invocations/compel.py @@ -99,6 +99,7 @@ def _lora_loader() -> Iterator[Tuple[LoRAModelRaw, float]]: textual_inversion_manager=ti_manager, dtype_for_device_getter=TorchDevice.choose_torch_dtype, truncate_long_prompts=False, + device=TorchDevice.choose_torch_device(), ) conjunction = Compel.parse_prompt_string(self.prompt) @@ -113,6 +114,7 @@ def _lora_loader() -> Iterator[Tuple[LoRAModelRaw, float]]: conditioning_data = ConditioningFieldData(conditionings=[BasicConditioningInfo(embeds=c)]) conditioning_name = context.conditioning.save(conditioning_data) + return ConditioningOutput( conditioning=ConditioningField( conditioning_name=conditioning_name, diff --git a/invokeai/app/services/invocation_stats/invocation_stats_default.py b/invokeai/app/services/invocation_stats/invocation_stats_default.py index 5a41f1f5d6b..2aa6f28f658 100644 --- a/invokeai/app/services/invocation_stats/invocation_stats_default.py +++ b/invokeai/app/services/invocation_stats/invocation_stats_default.py @@ -74,9 +74,9 @@ def collect_stats(self, invocation: BaseInvocation, graph_execution_state_id: st ) self._stats[graph_execution_state_id].add_node_execution_stats(node_stats) - def reset_stats(self): - self._stats = {} - self._cache_stats = {} + def reset_stats(self, graph_execution_state_id: str): + self._stats.pop(graph_execution_state_id) + self._cache_stats.pop(graph_execution_state_id) def get_stats(self, graph_execution_state_id: str) -> InvocationStatsSummary: graph_stats_summary = self._get_graph_summary(graph_execution_state_id) diff --git 
a/invokeai/app/services/model_manager/model_manager_default.py b/invokeai/app/services/model_manager/model_manager_default.py index fbb19ab5275..ccb68f783b1 100644 --- a/invokeai/app/services/model_manager/model_manager_default.py +++ b/invokeai/app/services/model_manager/model_manager_default.py @@ -76,8 +76,6 @@ def build_model_manager( ram_cache = ModelCache( max_cache_size=app_config.ram, - max_vram_cache_size=app_config.vram, - lazy_offloading=app_config.lazy_offload, logger=logger, ) convert_cache = ModelConvertCache(cache_path=app_config.convert_cache_path, max_size=app_config.convert_cache) diff --git a/invokeai/app/services/session_processor/session_processor_default.py b/invokeai/app/services/session_processor/session_processor_default.py index 916df832612..6ca4e164e06 100644 --- a/invokeai/app/services/session_processor/session_processor_default.py +++ b/invokeai/app/services/session_processor/session_processor_default.py @@ -1,7 +1,7 @@ import traceback from contextlib import suppress from queue import Queue -from threading import BoundedSemaphore, Thread, Lock +from threading import BoundedSemaphore, Lock, Thread from threading import Event as ThreadEvent from typing import Optional, Set @@ -61,7 +61,9 @@ def __init__( self._on_after_run_session_callbacks = on_after_run_session_callbacks or [] self._process_lock = Lock() - def start(self, services: InvocationServices, cancel_event: ThreadEvent, profiler: Optional[Profiler] = None) -> None: + def start( + self, services: InvocationServices, cancel_event: ThreadEvent, profiler: Optional[Profiler] = None + ) -> None: self._services = services self._cancel_event = cancel_event self._profiler = profiler @@ -214,7 +216,7 @@ def _on_after_run_session(self, queue_item: SessionQueueItem) -> None: # we don't care about that - suppress the error. 
with suppress(GESStatsNotFoundError): self._services.performance_statistics.log_stats(queue_item.session.id) - self._services.performance_statistics.reset_stats() + self._services.performance_statistics.reset_stats(queue_item.session.id) for callback in self._on_after_run_session_callbacks: callback(queue_item=queue_item) @@ -384,7 +386,6 @@ def start(self, invoker: Invoker) -> None: ) worker.start() - def stop(self, *args, **kwargs) -> None: self._stop_event.set() @@ -465,7 +466,7 @@ def _process( # Run the graph # self.session_runner.run(queue_item=self._queue_item) - except Exception as e: + except Exception: # Wait for next polling interval or event to try again poll_now_event.wait(self._polling_interval) continue @@ -494,7 +495,7 @@ def _process_next_session(self) -> None: with self._invoker.services.model_manager.load.ram_cache.reserve_execution_device(): # Run the session on the reserved GPU self.session_runner.run(queue_item=queue_item) - except Exception as e: + except Exception: continue finally: self._active_queue_items.remove(queue_item) diff --git a/invokeai/app/services/session_queue/session_queue_common.py b/invokeai/app/services/session_queue/session_queue_common.py index f043248d275..3cff330cff7 100644 --- a/invokeai/app/services/session_queue/session_queue_common.py +++ b/invokeai/app/services/session_queue/session_queue_common.py @@ -239,6 +239,7 @@ def queue_item_dto_from_dict(cls, queue_item_dict: dict) -> "SessionQueueItemDTO def __hash__(self) -> int: return self.item_id + class SessionQueueItemDTO(SessionQueueItemWithoutGraph): pass diff --git a/invokeai/app/services/shared/invocation_context.py b/invokeai/app/services/shared/invocation_context.py index e3be2035c67..53b9027e029 100644 --- a/invokeai/app/services/shared/invocation_context.py +++ b/invokeai/app/services/shared/invocation_context.py @@ -325,7 +325,6 @@ def load(self, name: str) -> ConditioningFieldData: Returns: The loaded conditioning data. """ - return self._services.conditioning.load(name) diff --git a/invokeai/backend/model_manager/load/model_cache/model_cache_base.py b/invokeai/backend/model_manager/load/model_cache/model_cache_base.py index 33e0c6683ee..c86ec5ddda3 100644 --- a/invokeai/backend/model_manager/load/model_cache/model_cache_base.py +++ b/invokeai/backend/model_manager/load/model_cache/model_cache_base.py @@ -43,26 +43,9 @@ def model(self) -> AnyModel: @dataclass class CacheRecord(Generic[T]): - """ - Elements of the cache: - - key: Unique key for each model, same as used in the models database. - model: Model in memory. - state_dict: A read-only copy of the model's state dict in RAM. It will be - used as a template for creating a copy in the VRAM. - size: Size of the model - loaded: True if the model's state dict is currently in VRAM - - Before a model is executed, the state_dict template is copied into VRAM, - and then injected into the model. When the model is finished, the VRAM - copy of the state dict is deleted, and the RAM version is reinjected - into the model. 
- """ + """Elements of the cache.""" key: str - model: T - device: torch.device - state_dict: Optional[Dict[str, torch.Tensor]] size: int model: T loaded: bool = False @@ -130,28 +113,12 @@ def get_execution_device(self) -> torch.device: """ pass - @property - @abstractmethod - def lazy_offloading(self) -> bool: - """Return true if the cache is configured to lazily offload models in VRAM.""" - pass - @property @abstractmethod def max_cache_size(self) -> float: """Return true if the cache is configured to lazily offload models in VRAM.""" pass - @abstractmethod - def offload_unlocked_models(self, size_required: int) -> None: - """Offload from VRAM any models not actively in use.""" - pass - - @abstractmethod - def move_model_to_device(self, cache_entry: CacheRecord[AnyModel], target_device: torch.device) -> None: - """Move model into the indicated device.""" - pass - @property @abstractmethod def stats(self) -> Optional[CacheStats]: diff --git a/invokeai/backend/model_manager/load/model_cache/model_cache_default.py b/invokeai/backend/model_manager/load/model_cache/model_cache_default.py index 65420d233bc..910087c4bb6 100644 --- a/invokeai/backend/model_manager/load/model_cache/model_cache_default.py +++ b/invokeai/backend/model_manager/load/model_cache/model_cache_default.py @@ -19,10 +19,8 @@ """ import gc -import math import sys import threading -import time from contextlib import contextmanager, suppress from logging import Logger from threading import BoundedSemaphore @@ -31,7 +29,7 @@ import torch from invokeai.backend.model_manager import AnyModel, SubModelType -from invokeai.backend.model_manager.load.memory_snapshot import MemorySnapshot, get_pretty_snapshot_diff +from invokeai.backend.model_manager.load.memory_snapshot import MemorySnapshot from invokeai.backend.util.devices import TorchDevice from invokeai.backend.util.logging import InvokeAILogger @@ -42,11 +40,6 @@ # Default is roughly enough to hold three fp16 diffusers models in RAM simultaneously DEFAULT_MAX_CACHE_SIZE = 6.0 -# amount of GPU memory to hold in reserve for use by generations (GB) -# Empirically this value seems to improve performance without starving other -# processes. -DEFAULT_MAX_VRAM_CACHE_SIZE = 0.25 - # actual size of a gig GIG = 1073741824 @@ -60,12 +53,10 @@ class ModelCache(ModelCacheBase[AnyModel]): def __init__( self, max_cache_size: float = DEFAULT_MAX_CACHE_SIZE, - max_vram_cache_size: float = DEFAULT_MAX_VRAM_CACHE_SIZE, storage_device: torch.device = torch.device("cpu"), execution_devices: Optional[Set[torch.device]] = None, precision: torch.dtype = torch.float16, sequential_offload: bool = False, - lazy_offloading: bool = True, sha_chunksize: int = 16777216, log_memory_usage: bool = False, logger: Optional[Logger] = None, @@ -76,18 +67,14 @@ def __init__( :param max_cache_size: Maximum size of the RAM cache [6.0 GB] :param storage_device: Torch device to save inactive model in [torch.device('cpu')] :param precision: Precision for loaded models [torch.float16] - :param lazy_offloading: Keep model in VRAM until another model needs to be loaded :param sequential_offload: Conserve VRAM by loading and unloading each stage of the pipeline sequentially :param log_memory_usage: If True, a memory snapshot will be captured before and after every model cache operation, and the result will be logged (at debug level). There is a time cost to capturing the memory snapshots, so it is recommended to disable this feature unless you are actively inspecting the model cache's behaviour. 
""" - # allow lazy offloading only when vram cache enabled - self._lazy_offloading = lazy_offloading and max_vram_cache_size > 0 self._precision: torch.dtype = precision self._max_cache_size: float = max_cache_size - self._max_vram_cache_size: float = max_vram_cache_size self._storage_device: torch.device = storage_device self._ram_lock = threading.Lock() self._logger = logger or InvokeAILogger.get_logger(self.__class__.__name__) @@ -111,11 +98,6 @@ def logger(self) -> Logger: """Return the logger used by the cache.""" return self._logger - @property - def lazy_offloading(self) -> bool: - """Return true if the cache is configured to lazily offload models in VRAM.""" - return self._lazy_offloading - @property def storage_device(self) -> torch.device: """Return the storage device (e.g. "CPU" for RAM).""" @@ -233,10 +215,9 @@ def put( if key in self._cached_models: return self.make_room(size) - state_dict = model.state_dict() if isinstance(model, torch.nn.Module) else None - cache_record = CacheRecord(key=key, model=model, device=self.storage_device, state_dict=state_dict, size=size) - self._cached_models[key] = cache_record - self._cache_stack.append(key) + cache_record = CacheRecord(key, model=model, size=size) + self._cached_models[key] = cache_record + self._cache_stack.append(key) def get( self, @@ -296,107 +277,6 @@ def _make_cache_key(self, model_key: str, submodel_type: Optional[SubModelType] else: return model_key - def offload_unlocked_models(self, size_required: int) -> None: - """Move any unused models from VRAM.""" - reserved = self._max_vram_cache_size * GIG - vram_in_use = torch.cuda.memory_allocated() + size_required - self.logger.debug(f"{(vram_in_use/GIG):.2f}GB VRAM needed for models; max allowed={(reserved/GIG):.2f}GB") - for _, cache_entry in sorted(self._cached_models.items(), key=lambda x: x[1].size): - if vram_in_use <= reserved: - break - if not cache_entry.loaded: - continue - if not cache_entry.locked: - self.move_model_to_device(cache_entry, self.storage_device) - cache_entry.loaded = False - vram_in_use = torch.cuda.memory_allocated() + size_required - self.logger.debug( - f"Removing {cache_entry.key} from VRAM to free {(cache_entry.size/GIG):.2f}GB; vram free = {(torch.cuda.memory_allocated()/GIG):.2f}GB" - ) - - TorchDevice.empty_cache() - - def move_model_to_device(self, cache_entry: CacheRecord[AnyModel], target_device: torch.device) -> None: - """Move model into the indicated device. - - :param cache_entry: The CacheRecord for the model - :param target_device: The torch.device to move the model into - - May raise a torch.cuda.OutOfMemoryError - """ - # These attributes are not in the base ModelMixin class but in various derived classes. - # Some models don't have these attributes, in which case they run in RAM/CPU. - self.logger.debug(f"Called to move {cache_entry.key} to {target_device}") - if not (hasattr(cache_entry.model, "device") and hasattr(cache_entry.model, "to")): - return - - source_device = cache_entry.device - - # Note: We compare device types only so that 'cuda' == 'cuda:0'. - # This would need to be revised to support multi-GPU. - if torch.device(source_device).type == torch.device(target_device).type: - return - - # This roundabout method for moving the model around is done to avoid - # the cost of moving the model from RAM to VRAM and then back from VRAM to RAM. - # When moving to VRAM, we copy (not move) each element of the state dict from - # RAM to a new state dict in VRAM, and then inject it into the model. 
- # This operation is slightly faster than running `to()` on the whole model. - # - # When the model needs to be removed from VRAM we simply delete the copy - # of the state dict in VRAM, and reinject the state dict that is cached - # in RAM into the model. So this operation is very fast. - start_model_to_time = time.time() - snapshot_before = self._capture_memory_snapshot() - - try: - if cache_entry.state_dict is not None: - assert hasattr(cache_entry.model, "load_state_dict") - if target_device == self.storage_device: - cache_entry.model.load_state_dict(cache_entry.state_dict, assign=True) - else: - new_dict: Dict[str, torch.Tensor] = {} - for k, v in cache_entry.state_dict.items(): - new_dict[k] = v.to(torch.device(target_device), copy=True) - cache_entry.model.load_state_dict(new_dict, assign=True) - cache_entry.model.to(target_device) - cache_entry.device = target_device - except Exception as e: # blow away cache entry - self._delete_cache_entry(cache_entry) - raise e - - snapshot_after = self._capture_memory_snapshot() - end_model_to_time = time.time() - self.logger.debug( - f"Moved model '{cache_entry.key}' from {source_device} to" - f" {target_device} in {(end_model_to_time-start_model_to_time):.2f}s." - f"Estimated model size: {(cache_entry.size/GIG):.3f} GB." - f"{get_pretty_snapshot_diff(snapshot_before, snapshot_after)}" - ) - - if ( - snapshot_before is not None - and snapshot_after is not None - and snapshot_before.vram is not None - and snapshot_after.vram is not None - ): - vram_change = abs(snapshot_before.vram - snapshot_after.vram) - - # If the estimated model size does not match the change in VRAM, log a warning. - if not math.isclose( - vram_change, - cache_entry.size, - rel_tol=0.1, - abs_tol=10 * MB, - ): - self.logger.debug( - f"Moving model '{cache_entry.key}' from {source_device} to" - f" {target_device} caused an unexpected change in VRAM usage. The model's" - " estimated size may be incorrect. Estimated model size:" - f" {(cache_entry.size/GIG):.3f} GB.\n" - f"{get_pretty_snapshot_diff(snapshot_before, snapshot_after)}" - ) - def print_cuda_stats(self) -> None: """Log CUDA diagnostics.""" vram = "%4.2fG" % (torch.cuda.memory_allocated() / GIG) @@ -440,12 +320,43 @@ def make_room(self, size: int) -> None: while current_size + bytes_needed > maximum_size and pos < len(self._cache_stack): model_key = self._cache_stack[pos] cache_entry = self._cached_models[model_key] + + refs = sys.getrefcount(cache_entry.model) + + # HACK: This is a workaround for a memory-management issue that we haven't tracked down yet. 
We are directly + # going against the advice in the Python docs by using `gc.get_referrers(...)` in this way: + # https://docs.python.org/3/library/gc.html#gc.get_referrers + + # manualy clear local variable references of just finished function calls + # for some reason python don't want to collect it even by gc.collect() immidiately + if refs > 2: + while True: + cleared = False + for referrer in gc.get_referrers(cache_entry.model): + if type(referrer).__name__ == "frame": + # RuntimeError: cannot clear an executing frame + with suppress(RuntimeError): + referrer.clear() + cleared = True + # break + + # repeat if referrers changes(due to frame clear), else exit loop + if cleared: + gc.collect() + else: + break + device = cache_entry.model.device if hasattr(cache_entry.model, "device") else None self.logger.debug( - f"Model: {model_key}, locks: {cache_entry._locks}, device: {device}, loaded: {cache_entry.loaded}" + f"Model: {model_key}, locks: {cache_entry._locks}, device: {device}, loaded: {cache_entry.loaded}," + f" refs: {refs}" ) - if not cache_entry.locked: + # Expected refs: + # 1 from cache_entry + # 1 from getrefcount function + # 1 from onnx runtime object + if not cache_entry.locked and refs <= (3 if "onnx" in model_key else 2): self.logger.debug( f"Removing {model_key} from RAM cache to free at least {(size/GIG):.2f} GB (-{(cache_entry.size/GIG):.2f} GB)" ) diff --git a/invokeai/backend/model_manager/load/model_cache/model_locker.py b/invokeai/backend/model_manager/load/model_cache/model_locker.py index 7bb411dfda3..c7685fc8f72 100644 --- a/invokeai/backend/model_manager/load/model_cache/model_locker.py +++ b/invokeai/backend/model_manager/load/model_cache/model_locker.py @@ -2,6 +2,7 @@ Base class and implementation of a class that moves models in and out of VRAM. """ +import copy from typing import Optional import torch @@ -54,14 +55,13 @@ def lock(self) -> AnyModel: # NOTE that the model has to have the to() method in order for this code to move it into GPU! self._cache_entry.lock() try: - if self._cache.lazy_offloading: - self._cache.offload_unlocked_models(self._cache_entry.size) - - execution_device = self._cache.get_execution_device() - self._cache.move_model_to_device(self._cache_entry, execution_device) + # We wait for a gpu to be free - may raise a ValueError + self._execution_device = self._cache.get_execution_device() + self._cache.logger.debug(f"Locking {self._cache_entry.key} in {self._execution_device}") + model_in_gpu = copy.deepcopy(self._cache_entry.model) + if hasattr(model_in_gpu, "to"): + model_in_gpu.to(self._execution_device) self._cache_entry.loaded = True - - self._cache.logger.debug(f"Locking {self._cache_entry.key} in {execution_device}") self._cache.print_cuda_stats() except torch.cuda.OutOfMemoryError: self._cache.logger.warning("Insufficient GPU memory to load model. 
Aborting") @@ -70,15 +70,11 @@ def lock(self) -> AnyModel: except Exception: self._cache_entry.unlock() raise - - return self.model + return model_in_gpu def unlock(self) -> None: """Call upon exit from context.""" if not hasattr(self.model, "to"): return - self._cache_entry.unlock() - if not self._cache.lazy_offloading: - self._cache.offload_unlocked_models(0) - self._cache.print_cuda_stats() + self._cache.print_cuda_stats() diff --git a/tests/conftest.py b/tests/conftest.py index 6b3fc49b9b9..e140bcd7df4 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -54,6 +54,7 @@ def mock_services() -> InvocationServices: workflow_records=None, # type: ignore tensors=None, # type: ignore conditioning=None, # type: ignore + performance_statistics=None, # type: ignore ) From 7088d5610b6a7eb192f10842596d8d8f506757f9 Mon Sep 17 00:00:00 2001 From: Lincoln Stein Date: Sun, 16 Jun 2024 19:50:49 -0400 Subject: [PATCH 24/30] add script to sync models db with models.yaml --- .../model_install/model_install_default.py | 10 +++- scripts/populate_model_db_from_yaml.py | 54 +++++++++++++++++++ 2 files changed, 62 insertions(+), 2 deletions(-) create mode 100755 scripts/populate_model_db_from_yaml.py diff --git a/invokeai/app/services/model_install/model_install_default.py b/invokeai/app/services/model_install/model_install_default.py index df060caff30..1590282e998 100644 --- a/invokeai/app/services/model_install/model_install_default.py +++ b/invokeai/app/services/model_install/model_install_default.py @@ -305,9 +305,14 @@ def prune_jobs(self) -> None: unfinished_jobs = [x for x in self._install_jobs if not x.in_terminal_state] self._install_jobs = unfinished_jobs - def _migrate_yaml(self) -> None: + def _migrate_yaml(self, rename_yaml: Optional[bool] = True, overwrite_db: Optional[bool] = False) -> None: db_models = self.record_store.all_models() + if overwrite_db: + for model in db_models: + self.record_store.del_model(model.key) + db_models = self.record_store.all_models() + legacy_models_yaml_path = ( self._app_config.legacy_models_yaml_path or self._app_config.root_path / "configs" / "models.yaml" ) @@ -357,7 +362,8 @@ def _migrate_yaml(self) -> None: self._logger.warning(f"Model at {model_path} could not be migrated: {e}") # Rename `models.yaml` to `models.yaml.bak` to prevent re-migration - legacy_models_yaml_path.rename(legacy_models_yaml_path.with_suffix(".yaml.bak")) + if rename_yaml: + legacy_models_yaml_path.rename(legacy_models_yaml_path.with_suffix(".yaml.bak")) # Unset the path - we are done with it either way self._app_config.legacy_models_yaml_path = None diff --git a/scripts/populate_model_db_from_yaml.py b/scripts/populate_model_db_from_yaml.py new file mode 100755 index 00000000000..80e5bcfc5c6 --- /dev/null +++ b/scripts/populate_model_db_from_yaml.py @@ -0,0 +1,54 @@ +#!/bin/env python + +from argparse import ArgumentParser, Namespace +from pathlib import Path + +from invokeai.app.services.config import InvokeAIAppConfig, get_config +from invokeai.app.services.download import DownloadQueueService +from invokeai.app.services.model_install import ModelInstallService +from invokeai.app.services.model_records import ModelRecordServiceSQL +from invokeai.app.services.shared.sqlite.sqlite_database import SqliteDatabase +from invokeai.backend.util.logging import InvokeAILogger + + +def get_args() -> Namespace: + parser = ArgumentParser(description="Update models database from yaml file") + parser.add_argument("--root", type=Path, required=False, default=None) + 
parser.add_argument("--yaml_file", type=Path, required=False, default=None) + return parser.parse_args() + + +def populate_config() -> InvokeAIAppConfig: + args = get_args() + config = get_config() + if args.root: + config._root = args.root + if args.yaml_file: + config.legacy_models_yaml_path = args.yaml_file + else: + config.legacy_models_yaml_path = config.root_path / "configs/models.yaml" + return config + + +def initialize_installer(config: InvokeAIAppConfig) -> ModelInstallService: + logger = InvokeAILogger.get_logger(config=config) + db = SqliteDatabase(config.db_path, logger) + record_store = ModelRecordServiceSQL(db) + queue = DownloadQueueService() + queue.start() + installer = ModelInstallService(app_config=config, record_store=record_store, download_queue=queue) + return installer + + +def main() -> None: + config = populate_config() + installer = initialize_installer(config) + installer._migrate_yaml(rename_yaml=False, overwrite_db=True) + print("\n") + print("\t".join(["key", "name", "type", "path"])) + for model in installer.record_store.all_models(): + print("\t".join([model.key, model.name, model.type, (config.models_path / model.path).as_posix()])) + + +if __name__ == "__main__": + main() From 6932f27b4303517678072a1336da5097b76d23fc Mon Sep 17 00:00:00 2001 From: Lincoln Stein Date: Sun, 23 Jun 2024 12:17:16 -0400 Subject: [PATCH 25/30] fixup code broken by merge with main --- .../app/services/model_load/model_load_base.py | 1 - .../model_manager/model_manager_default.py | 1 + .../load/model_cache/model_cache_default.py | 10 +++++++++- .../load/model_cache/model_locker.py | 8 ++++---- .../model_loading/test_model_load.py | 17 +++++++++-------- tests/backend/util/test_devices.py | 2 ++ 6 files changed, 25 insertions(+), 14 deletions(-) diff --git a/invokeai/app/services/model_load/model_load_base.py b/invokeai/app/services/model_load/model_load_base.py index 4a838c25674..990f8ca207e 100644 --- a/invokeai/app/services/model_load/model_load_base.py +++ b/invokeai/app/services/model_load/model_load_base.py @@ -60,4 +60,3 @@ def load_model_from_path( Returns: A LoadedModel object. 
""" - diff --git a/invokeai/app/services/model_manager/model_manager_default.py b/invokeai/app/services/model_manager/model_manager_default.py index ccb68f783b1..6ff1b7de675 100644 --- a/invokeai/app/services/model_manager/model_manager_default.py +++ b/invokeai/app/services/model_manager/model_manager_default.py @@ -76,6 +76,7 @@ def build_model_manager( ram_cache = ModelCache( max_cache_size=app_config.ram, + max_vram_cache_size=app_config.vram, logger=logger, ) convert_cache = ModelConvertCache(cache_path=app_config.convert_cache_path, max_size=app_config.convert_cache) diff --git a/invokeai/backend/model_manager/load/model_cache/model_cache_default.py b/invokeai/backend/model_manager/load/model_cache/model_cache_default.py index c95abe2bc07..a00c8fcb87d 100644 --- a/invokeai/backend/model_manager/load/model_cache/model_cache_default.py +++ b/invokeai/backend/model_manager/load/model_cache/model_cache_default.py @@ -19,8 +19,10 @@ """ import gc +import math import sys import threading +import time from contextlib import contextmanager, suppress from logging import Logger from threading import BoundedSemaphore @@ -40,6 +42,7 @@ # Maximum size of the cache, in gigs # Default is roughly enough to hold three fp16 diffusers models in RAM simultaneously DEFAULT_MAX_CACHE_SIZE = 6.0 +DEFAULT_MAX_VRAM_CACHE_SIZE = 0.25 # actual size of a gig GIG = 1073741824 @@ -54,6 +57,7 @@ class ModelCache(ModelCacheBase[AnyModel]): def __init__( self, max_cache_size: float = DEFAULT_MAX_CACHE_SIZE, + max_vram_cache_size: float = DEFAULT_MAX_VRAM_CACHE_SIZE, storage_device: torch.device = torch.device("cpu"), execution_devices: Optional[Set[torch.device]] = None, precision: torch.dtype = torch.float16, @@ -76,6 +80,7 @@ def __init__( """ self._precision: torch.dtype = precision self._max_cache_size: float = max_cache_size + self._max_vram_cache_size: float = max_vram_cache_size self._storage_device: torch.device = storage_device self._ram_lock = threading.Lock() self._logger = logger or InvokeAILogger.get_logger(self.__class__.__name__) @@ -281,14 +286,17 @@ def _make_cache_key(self, model_key: str, submodel_type: Optional[SubModelType] def offload_unlocked_models(self, size_required: int) -> None: """Move any unused models from VRAM.""" + device = self.get_execution_device() reserved = self._max_vram_cache_size * GIG - vram_in_use = torch.cuda.memory_allocated() + size_required + vram_in_use = torch.cuda.memory_allocated(device) + size_required self.logger.debug(f"{(vram_in_use/GIG):.2f}GB VRAM needed for models; max allowed={(reserved/GIG):.2f}GB") for _, cache_entry in sorted(self._cached_models.items(), key=lambda x: x[1].size): if vram_in_use <= reserved: break if not cache_entry.loaded: continue + if cache_entry.device is not device: + continue if not cache_entry.locked: self.move_model_to_device(cache_entry, self.storage_device) cache_entry.loaded = False diff --git a/invokeai/backend/model_manager/load/model_cache/model_locker.py b/invokeai/backend/model_manager/load/model_cache/model_locker.py index 36ec6610932..9f9c05bce56 100644 --- a/invokeai/backend/model_manager/load/model_cache/model_locker.py +++ b/invokeai/backend/model_manager/load/model_cache/model_locker.py @@ -39,11 +39,11 @@ def lock(self) -> AnyModel: """Move the model into the execution device (GPU) and lock it.""" self._cache_entry.lock() try: - if self._cache.lazy_offloading: - self._cache.offload_unlocked_models(self._cache_entry.size) - self._cache.move_model_to_device(self._cache_entry, self._cache.get_execution_device()) + 
device = self._cache.get_execution_device() + self._cache.offload_unlocked_models(self._cache_entry.size) + self._cache.move_model_to_device(self._cache_entry, device) self._cache_entry.loaded = True - self._cache.logger.debug(f"Locking {self._cache_entry.key} in {self._cache.execution_device}") + self._cache.logger.debug(f"Locking {self._cache_entry.key} in {device}") self._cache.print_cuda_stats() except torch.cuda.OutOfMemoryError: self._cache.logger.warning("Insufficient GPU memory to load model. Aborting") diff --git a/tests/backend/model_manager/model_loading/test_model_load.py b/tests/backend/model_manager/model_loading/test_model_load.py index 3f12f7f8ee9..ff80d49397f 100644 --- a/tests/backend/model_manager/model_loading/test_model_load.py +++ b/tests/backend/model_manager/model_loading/test_model_load.py @@ -14,13 +14,14 @@ def test_loading(mm2_model_manager: ModelManagerServiceBase, embedding_file: Pat matches = store.search_by_attr(model_name="test_embedding") assert len(matches) == 0 key = mm2_model_manager.install.register_path(embedding_file) - loaded_model = mm2_model_manager.load.load_model(store.get_model(key)) - assert loaded_model is not None - assert loaded_model.config.key == key - with loaded_model as model: - assert isinstance(model, TextualInversionModelRaw) + with mm2_model_manager.load.ram_cache.reserve_execution_device(): + loaded_model = mm2_model_manager.load.load_model(store.get_model(key)) + assert loaded_model is not None + assert loaded_model.config.key == key + with loaded_model as model: + assert isinstance(model, TextualInversionModelRaw) - config = mm2_model_manager.store.get_model(key) - loaded_model_2 = mm2_model_manager.load.load_model(config) + config = mm2_model_manager.store.get_model(key) + loaded_model_2 = mm2_model_manager.load.load_model(config) - assert loaded_model.config.key == loaded_model_2.config.key + assert loaded_model.config.key == loaded_model_2.config.key diff --git a/tests/backend/util/test_devices.py b/tests/backend/util/test_devices.py index f4faea5d98d..d854a82e622 100644 --- a/tests/backend/util/test_devices.py +++ b/tests/backend/util/test_devices.py @@ -10,6 +10,7 @@ from invokeai.app.services.config import get_config from invokeai.backend.model_manager.load import ModelCache from invokeai.backend.util.devices import TorchDevice, choose_precision, choose_torch_device, torch_dtype +from tests.backend.model_manager.model_manager_fixtures import * # noqa F403 devices = ["cpu", "cuda:0", "cuda:1", "mps"] device_types_cpu = [("cpu", torch.float32), ("cuda:0", torch.float32), ("mps", torch.float32)] @@ -21,6 +22,7 @@ def test_device_choice(device_name): config = get_config() config.device = device_name + TorchDevice.set_model_cache(None) # disable dynamic selection of GPU device torch_device = TorchDevice.choose_torch_device() assert torch_device == torch.device(device_name) From 2219e3643a5392f778afeff8194d758d3fb96ae5 Mon Sep 17 00:00:00 2001 From: Lincoln Stein Date: Mon, 24 Jun 2024 10:55:15 -0400 Subject: [PATCH 26/30] copy model from a meta device template - temporarily disable vram cache --- .../load/model_cache/model_cache_base.py | 21 +--- .../load/model_cache/model_cache_default.py | 119 +++++------------- .../load/model_cache/model_locker.py | 15 +-- invokeai/backend/model_patcher.py | 7 +- 4 files changed, 40 insertions(+), 122 deletions(-) diff --git a/invokeai/backend/model_manager/load/model_cache/model_cache_base.py b/invokeai/backend/model_manager/load/model_cache/model_cache_base.py index 
b3e4e3ac12c..9f9dd263622 100644 --- a/invokeai/backend/model_manager/load/model_cache/model_cache_base.py +++ b/invokeai/backend/model_manager/load/model_cache/model_cache_base.py @@ -52,11 +52,10 @@ class CacheRecord(Generic[T]): Elements of the cache: key: Unique key for each model, same as used in the models database. - model: Model in memory. + model: Read-only copy of the model *without weights* residing in the "meta device" state_dict: A read-only copy of the model's state dict in RAM. It will be used as a template for creating a copy in the VRAM. size: Size of the model - loaded: True if the model's state dict is currently in VRAM Before a model is executed, the state_dict template is copied into VRAM, and then injected into the model. When the model is finished, the VRAM @@ -72,25 +71,7 @@ class CacheRecord(Generic[T]): key: str size: int model: T - device: torch.device state_dict: Optional[Dict[str, torch.Tensor]] - size: int - loaded: bool = False - _locks: int = 0 - - def lock(self) -> None: - """Lock this record.""" - self._locks += 1 - - def unlock(self) -> None: - """Unlock this record.""" - self._locks -= 1 - assert self._locks >= 0 - - @property - def locked(self) -> bool: - """Return true if record is locked.""" - return self._locks > 0 @dataclass diff --git a/invokeai/backend/model_manager/load/model_cache/model_cache_default.py b/invokeai/backend/model_manager/load/model_cache/model_cache_default.py index a00c8fcb87d..6357ada241d 100644 --- a/invokeai/backend/model_manager/load/model_cache/model_cache_default.py +++ b/invokeai/backend/model_manager/load/model_cache/model_cache_default.py @@ -36,6 +36,7 @@ from invokeai.backend.util.devices import TorchDevice from invokeai.backend.util.logging import InvokeAILogger +from ..optimizations import skip_torch_weight_init from .model_cache_base import CacheRecord, CacheStats, ModelCacheBase, ModelLockerBase from .model_locker import ModelLocker @@ -221,8 +222,12 @@ def put( size = calc_model_size_by_data(model) self.make_room(size) - state_dict = model.state_dict() if isinstance(model, torch.nn.Module) else None - cache_record = CacheRecord(key=key, model=model, device=self.storage_device, state_dict=state_dict, size=size) + if isinstance(model, torch.nn.Module): + state_dict = model.state_dict() # keep a master copy of the state dict + model = model.to(device="meta") # and keep a template in the meta device + else: + state_dict = None + cache_record = CacheRecord(key=key, model=model, state_dict=state_dict, size=size) self._cached_models[key] = cache_record self._cache_stack.append(key) @@ -284,48 +289,20 @@ def _make_cache_key(self, model_key: str, submodel_type: Optional[SubModelType] else: return model_key - def offload_unlocked_models(self, size_required: int) -> None: - """Move any unused models from VRAM.""" - device = self.get_execution_device() - reserved = self._max_vram_cache_size * GIG - vram_in_use = torch.cuda.memory_allocated(device) + size_required - self.logger.debug(f"{(vram_in_use/GIG):.2f}GB VRAM needed for models; max allowed={(reserved/GIG):.2f}GB") - for _, cache_entry in sorted(self._cached_models.items(), key=lambda x: x[1].size): - if vram_in_use <= reserved: - break - if not cache_entry.loaded: - continue - if cache_entry.device is not device: - continue - if not cache_entry.locked: - self.move_model_to_device(cache_entry, self.storage_device) - cache_entry.loaded = False - vram_in_use = torch.cuda.memory_allocated() + size_required - self.logger.debug( - f"Removing {cache_entry.key} from VRAM to free 
{(cache_entry.size/GIG):.2f}GB; vram free = {(torch.cuda.memory_allocated()/GIG):.2f}GB" - ) - - TorchDevice.empty_cache() - - def move_model_to_device(self, cache_entry: CacheRecord[AnyModel], target_device: torch.device) -> None: - """Move model into the indicated device. + def model_to_device(self, cache_entry: CacheRecord[AnyModel], target_device: torch.device) -> AnyModel: + """Move a copy of the model into the indicated device and return it. :param cache_entry: The CacheRecord for the model :param target_device: The torch.device to move the model into May raise a torch.cuda.OutOfMemoryError """ - self.logger.debug(f"Called to move {cache_entry.key} to {target_device}") - source_device = cache_entry.device - - # Note: We compare device types only so that 'cuda' == 'cuda:0'. - # This would need to be revised to support multi-GPU. - if torch.device(source_device).type == torch.device(target_device).type: - return + self.logger.info(f"Called to move {cache_entry.key} to {target_device}") - # Some models don't have a `to` method, in which case they run in RAM/CPU. - if not hasattr(cache_entry.model, "to"): - return + # Some models don't have a state dictionary, in which case the + # stored model will still reside in CPU + if cache_entry.state_dict is None: + return cache_entry.model # This roundabout method for moving the model around is done to avoid # the cost of moving the model from RAM to VRAM and then back from VRAM to RAM. @@ -338,27 +315,25 @@ def move_model_to_device(self, cache_entry: CacheRecord[AnyModel], target_device # in RAM into the model. So this operation is very fast. start_model_to_time = time.time() snapshot_before = self._capture_memory_snapshot() - try: - if cache_entry.state_dict is not None: - assert hasattr(cache_entry.model, "load_state_dict") - if target_device == self.storage_device: - cache_entry.model.load_state_dict(cache_entry.state_dict, assign=True) + assert isinstance(cache_entry.model, torch.nn.Module) + template = cache_entry.model + cls = template.__class__ + with skip_torch_weight_init(): + if hasattr(cls, "from_config"): + working_model = template.__class__.from_config(template.config) # diffusers style else: - new_dict: Dict[str, torch.Tensor] = {} - for k, v in cache_entry.state_dict.items(): - new_dict[k] = v.to(torch.device(target_device), copy=True, non_blocking=True) - cache_entry.model.load_state_dict(new_dict, assign=True) - cache_entry.model.to(target_device, non_blocking=True) - cache_entry.device = target_device + working_model = template.__class__(config=template.config) # transformers style (sigh) + working_model.to(device=target_device, dtype=self._precision) + working_model.load_state_dict(cache_entry.state_dict) except Exception as e: # blow away cache entry self._delete_cache_entry(cache_entry) raise e snapshot_after = self._capture_memory_snapshot() end_model_to_time = time.time() - self.logger.debug( - f"Moved model '{cache_entry.key}' from {source_device} to" + self.logger.info( + f"Moved model '{cache_entry.key}' to" f" {target_device} in {(end_model_to_time-start_model_to_time):.2f}s." f"Estimated model size: {(cache_entry.size/GIG):.3f} GB." f"{get_pretty_snapshot_diff(snapshot_before, snapshot_after)}" @@ -380,34 +355,21 @@ def move_model_to_device(self, cache_entry: CacheRecord[AnyModel], target_device abs_tol=10 * MB, ): self.logger.debug( - f"Moving model '{cache_entry.key}' from {source_device} to" + f"Moving model '{cache_entry.key}' from to" f" {target_device} caused an unexpected change in VRAM usage. 
The model's" " estimated size may be incorrect. Estimated model size:" f" {(cache_entry.size/GIG):.3f} GB.\n" f"{get_pretty_snapshot_diff(snapshot_before, snapshot_after)}" ) + return working_model def print_cuda_stats(self) -> None: """Log CUDA diagnostics.""" vram = "%4.2fG" % (torch.cuda.memory_allocated() / GIG) ram = "%4.2fG" % (self.cache_size() / GIG) - in_ram_models = 0 - in_vram_models = 0 - locked_in_vram_models = 0 - for cache_record in self._cached_models.values(): - if hasattr(cache_record.model, "device"): - if cache_record.model.device == self.storage_device: - in_ram_models += 1 - else: - in_vram_models += 1 - if cache_record.locked: - locked_in_vram_models += 1 - - self.logger.debug( - f"Current VRAM/RAM usage: {vram}/{ram}; models_in_ram/models_in_vram(locked) =" - f" {in_ram_models}/{in_vram_models}({locked_in_vram_models})" - ) + in_ram_models = len(self._cached_models) + self.logger.debug(f"Current VRAM/RAM usage for {in_ram_models} models: {vram}/{ram}") def make_room(self, size: int) -> None: """Make enough room in the cache to accommodate a new model of indicated size.""" @@ -433,29 +395,6 @@ def make_room(self, size: int) -> None: refs = sys.getrefcount(cache_entry.model) - # HACK: This is a workaround for a memory-management issue that we haven't tracked down yet. We are directly - # going against the advice in the Python docs by using `gc.get_referrers(...)` in this way: - # https://docs.python.org/3/library/gc.html#gc.get_referrers - - # manualy clear local variable references of just finished function calls - # for some reason python don't want to collect it even by gc.collect() immidiately - if refs > 2: - while True: - cleared = False - for referrer in gc.get_referrers(cache_entry.model): - if type(referrer).__name__ == "frame": - # RuntimeError: cannot clear an executing frame - with suppress(RuntimeError): - referrer.clear() - cleared = True - # break - - # repeat if referrers changes(due to frame clear), else exit loop - if cleared: - gc.collect() - else: - break - device = cache_entry.model.device if hasattr(cache_entry.model, "device") else None self.logger.debug( f"Model: {model_key}, locks: {cache_entry._locks}, device: {device}, loaded: {cache_entry.loaded}," diff --git a/invokeai/backend/model_manager/load/model_cache/model_locker.py b/invokeai/backend/model_manager/load/model_cache/model_locker.py index 9f9c05bce56..815fd41f04e 100644 --- a/invokeai/backend/model_manager/load/model_cache/model_locker.py +++ b/invokeai/backend/model_manager/load/model_cache/model_locker.py @@ -37,25 +37,22 @@ def get_state_dict(self) -> Optional[Dict[str, torch.Tensor]]: def lock(self) -> AnyModel: """Move the model into the execution device (GPU) and lock it.""" - self._cache_entry.lock() try: device = self._cache.get_execution_device() - self._cache.offload_unlocked_models(self._cache_entry.size) - self._cache.move_model_to_device(self._cache_entry, device) - self._cache_entry.loaded = True - self._cache.logger.debug(f"Locking {self._cache_entry.key} in {device}") + model_on_device = self._cache.model_to_device(self._cache_entry, device) + self._cache.logger.debug(f"Moved {self._cache_entry.key} to {device}") self._cache.print_cuda_stats() except torch.cuda.OutOfMemoryError: self._cache.logger.warning("Insufficient GPU memory to load model. 
Aborting") - self._cache_entry.unlock() raise except Exception: - self._cache_entry.unlock() raise - return self.model + return model_on_device + # It is no longer necessary to move the model out of VRAM + # because it will be removed when it goes out of scope + # in the caller's context def unlock(self) -> None: """Call upon exit from context.""" - self._cache_entry.unlock() self._cache.print_cuda_stats() diff --git a/invokeai/backend/model_patcher.py b/invokeai/backend/model_patcher.py index fdc79539ae7..0f57c0efdcc 100644 --- a/invokeai/backend/model_patcher.py +++ b/invokeai/backend/model_patcher.py @@ -129,9 +129,7 @@ def apply_lora( dtype = module.weight.dtype if module_key not in original_weights: - if model_state_dict is not None: # we were provided with the CPU copy of the state dict - original_weights[module_key] = model_state_dict[module_key + ".weight"] - else: + if model_state_dict is None: # no CPU copy of the state dict was provided original_weights[module_key] = module.weight.detach().to(device="cpu", copy=True) layer_scale = layer.alpha / layer.rank if (layer.alpha and layer.rank) else 1.0 @@ -158,6 +156,9 @@ def apply_lora( yield # wait for context manager exit finally: + # LS check: for now, we are not reusing models in VRAM but re-copying them each time they are needed. + # Therefore it should not be necessary to copy the original model weights back. + # This needs to be fixed before resurrecting the VRAM cache. assert hasattr(model, "get_submodule") # mypy not picking up fact that torch.nn.Module has get_submodule() with torch.no_grad(): for module_key, weight in original_weights.items(): From 9b7b182cf78732d6992d8b14442c4e51e2fdb5a3 Mon Sep 17 00:00:00 2001 From: Lincoln Stein Date: Mon, 24 Jun 2024 11:58:26 -0400 Subject: [PATCH 27/30] remove dangling attributes in ModelCache class --- .../load/model_cache/model_cache_default.py | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/invokeai/backend/model_manager/load/model_cache/model_cache_default.py b/invokeai/backend/model_manager/load/model_cache/model_cache_default.py index 6357ada241d..dd14526bc29 100644 --- a/invokeai/backend/model_manager/load/model_cache/model_cache_default.py +++ b/invokeai/backend/model_manager/load/model_cache/model_cache_default.py @@ -60,10 +60,7 @@ def __init__( max_cache_size: float = DEFAULT_MAX_CACHE_SIZE, max_vram_cache_size: float = DEFAULT_MAX_VRAM_CACHE_SIZE, storage_device: torch.device = torch.device("cpu"), - execution_devices: Optional[Set[torch.device]] = None, precision: torch.dtype = torch.float16, - sequential_offload: bool = False, - sha_chunksize: int = 16777216, log_memory_usage: bool = False, logger: Optional[Logger] = None, ): @@ -395,17 +392,11 @@ def make_room(self, size: int) -> None: refs = sys.getrefcount(cache_entry.model) - device = cache_entry.model.device if hasattr(cache_entry.model, "device") else None - self.logger.debug( - f"Model: {model_key}, locks: {cache_entry._locks}, device: {device}, loaded: {cache_entry.loaded}," - f" refs: {refs}" - ) - # Expected refs: # 1 from cache_entry # 1 from getrefcount function # 1 from onnx runtime object - if not cache_entry.locked and refs <= (3 if "onnx" in model_key else 2): + if refs <= (3 if "onnx" in model_key else 2): self.logger.debug( f"Removing {model_key} from RAM cache to free at least {(size/GIG):.2f} GB (-{(cache_entry.size/GIG):.2f} GB)" ) From 5d6a77d33692f7d43bfbd97e241f56ee94912c82 Mon Sep 17 00:00:00 2001 From: Lincoln Stein Date: Mon, 24 Jun 2024 14:57:54 -0400 Subject: 
[PATCH 28/30] fixup ip adapter handling --- invokeai/app/invocations/denoise_latents.py | 1 + invokeai/backend/model_manager/config.py | 6 ++++-- .../load/model_cache/model_cache_base.py | 5 +++++ .../load/model_cache/model_cache_default.py | 14 +++++++++++--- 4 files changed, 21 insertions(+), 5 deletions(-) diff --git a/invokeai/app/invocations/denoise_latents.py b/invokeai/app/invocations/denoise_latents.py index e94daf70bdd..13a92efab88 100644 --- a/invokeai/app/invocations/denoise_latents.py +++ b/invokeai/app/invocations/denoise_latents.py @@ -226,6 +226,7 @@ def _preprocess_regional_prompt_mask( # Add a batch dimension to the mask, because torchvision expects shape (batch, channels, h, w). mask = mask.unsqueeze(0) # Shape: (1, h, w) -> (1, 1, h, w) resized_mask = tf(mask) + assert isinstance(resized_mask, torch.Tensor) return resized_mask def _concat_regional_text_embeddings( diff --git a/invokeai/backend/model_manager/config.py b/invokeai/backend/model_manager/config.py index 7ed12a7674d..c0065cefa9b 100644 --- a/invokeai/backend/model_manager/config.py +++ b/invokeai/backend/model_manager/config.py @@ -25,6 +25,7 @@ from typing import Literal, Optional, Type, TypeAlias, Union import torch +from diffusers.configuration_utils import ConfigMixin from diffusers.models.modeling_utils import ModelMixin from pydantic import BaseModel, ConfigDict, Discriminator, Field, Tag, TypeAdapter from typing_extensions import Annotated, Any, Dict @@ -37,7 +38,7 @@ # ModelMixin is the base class for all diffusers and transformers models # RawModel is the InvokeAI wrapper class for ip_adapters, loras, textual_inversion and onnx runtime -AnyModel = Union[ModelMixin, RawModel, torch.nn.Module, Dict[str, torch.Tensor]] +AnyModel = Union[ConfigMixin, ModelMixin, RawModel, torch.nn.Module, Dict[str, torch.Tensor]] class InvalidModelConfigException(Exception): @@ -177,6 +178,7 @@ class ModelConfigBase(BaseModel): @staticmethod def json_schema_extra(schema: dict[str, Any], model_class: Type[BaseModel]) -> None: + """Extend the pydantic schema from a json.""" schema["required"].extend(["key", "type", "format"]) model_config = ConfigDict(validate_assignment=True, json_schema_extra=json_schema_extra) @@ -443,7 +445,7 @@ def make_config( model = dest_class.model_validate(model_data) else: # mypy doesn't typecheck TypeAdapters well? 
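The make_config() path touched above validates raw model data through a pydantic TypeAdapter over a tagged union of config classes. As a rough illustration of that validation pattern only -- with made-up TextModelConfig/ImageModelConfig classes and a "type" discriminator, not the real InvokeAI config taxonomy -- a TypeAdapter can be used like this:

from typing import Annotated, Literal, Union

from pydantic import BaseModel, Field, TypeAdapter


class TextModelConfig(BaseModel):
    """Hypothetical config variant selected when type == "text"."""
    type: Literal["text"] = "text"
    key: str
    max_tokens: int = 77


class ImageModelConfig(BaseModel):
    """Hypothetical config variant selected when type == "image"."""
    type: Literal["image"] = "image"
    key: str
    channels: int = 4


# The discriminator tells pydantic which branch of the union to validate,
# so fields specific to the other branch never cause spurious errors.
AnyDemoConfig = Annotated[
    Union[TextModelConfig, ImageModelConfig],
    Field(discriminator="type"),
]
AnyDemoConfigValidator = TypeAdapter(AnyDemoConfig)

config = AnyDemoConfigValidator.validate_python({"type": "image", "key": "abc123"})
assert isinstance(config, ImageModelConfig)
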
- model = AnyModelConfigValidator.validate_python(model_data) # type: ignore + model = AnyModelConfigValidator.validate_python(model_data) assert model is not None if key: model.key = key diff --git a/invokeai/backend/model_manager/load/model_cache/model_cache_base.py b/invokeai/backend/model_manager/load/model_cache/model_cache_base.py index 9f9dd263622..6e518bc9e3e 100644 --- a/invokeai/backend/model_manager/load/model_cache/model_cache_base.py +++ b/invokeai/backend/model_manager/load/model_cache/model_cache_base.py @@ -188,6 +188,11 @@ def exists( """Return true if the model identified by key and submodel_type is in the cache.""" pass + @abstractmethod + def model_to_device(self, cache_entry: CacheRecord[AnyModel], target_device: torch.device) -> AnyModel: + """Move a copy of the model into the indicated device and return it.""" + pass + @abstractmethod def cache_size(self) -> int: """Get the total size of the models currently cached.""" diff --git a/invokeai/backend/model_manager/load/model_cache/model_cache_default.py b/invokeai/backend/model_manager/load/model_cache/model_cache_default.py index dd14526bc29..3a2d36e87a8 100644 --- a/invokeai/backend/model_manager/load/model_cache/model_cache_default.py +++ b/invokeai/backend/model_manager/load/model_cache/model_cache_default.py @@ -18,6 +18,7 @@ """ +import copy import gc import math import sys @@ -29,6 +30,7 @@ from typing import Dict, Generator, List, Optional, Set import torch +from diffusers.configuration_utils import ConfigMixin from invokeai.backend.model_manager import AnyModel, SubModelType from invokeai.backend.model_manager.load.memory_snapshot import MemorySnapshot, get_pretty_snapshot_diff @@ -294,12 +296,18 @@ def model_to_device(self, cache_entry: CacheRecord[AnyModel], target_device: tor May raise a torch.cuda.OutOfMemoryError """ - self.logger.info(f"Called to move {cache_entry.key} to {target_device}") + self.logger.info(f"Called to move {cache_entry.key} ({type(cache_entry.model)=}) to {target_device}") # Some models don't have a state dictionary, in which case the # stored model will still reside in CPU if cache_entry.state_dict is None: - return cache_entry.model + if hasattr(cache_entry.model, "to"): + model_in_gpu = copy.deepcopy(cache_entry.model) + assert hasattr(model_in_gpu, "to") + model_in_gpu.to(target_device) + return model_in_gpu + else: + return cache_entry.model # what happens in CPU stays in CPU # This roundabout method for moving the model around is done to avoid # the cost of moving the model from RAM to VRAM and then back from VRAM to RAM. 
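The branch above returns a per-call copy for cached objects that expose a to() method and leaves everything else on the CPU. A minimal sketch of that dispatch, using a toy cache record rather than the real CacheRecord class:

import copy
from dataclasses import dataclass
from typing import Any

import torch


@dataclass
class ToyCacheRecord:
    key: str
    model: Any


def copy_to_device(record: ToyCacheRecord, target: torch.device) -> Any:
    """Return a copy of the cached model on `target`, or the original if it cannot move."""
    if hasattr(record.model, "to"):
        # Deep-copy first so the cached master copy is never mutated by .to();
        # each caller gets its own instance to run on its own device.
        working = copy.deepcopy(record.model)
        return working.to(target)
    return record.model  # non-module payloads (e.g. plain dicts) stay where they are


record = ToyCacheRecord(key="demo", model=torch.nn.Linear(4, 4))
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
working_copy = copy_to_device(record, device)
assert next(working_copy.parameters()).device.type == device.type
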
@@ -317,7 +325,7 @@ def model_to_device(self, cache_entry: CacheRecord[AnyModel], target_device: tor template = cache_entry.model cls = template.__class__ with skip_torch_weight_init(): - if hasattr(cls, "from_config"): + if isinstance(cls, ConfigMixin) or hasattr(cls, "from_config"): working_model = template.__class__.from_config(template.config) # diffusers style else: working_model = template.__class__(config=template.config) # transformers style (sigh) From 02957be3332667c62c6aa0d79a36d0f25d8ba3c3 Mon Sep 17 00:00:00 2001 From: Lincoln Stein Date: Thu, 18 Jul 2024 14:53:03 -0400 Subject: [PATCH 29/30] fix compel conditioning object caching issue by applying deepcopy() before moving to VRAM --- invokeai/app/invocations/compel.py | 4 +- invokeai/app/invocations/denoise_latents.py | 7 +- .../object_serializer_disk.py | 3 +- .../load/model_cache/model_cache_base.py | 13 --- .../load/model_cache/model_cache_default.py | 101 ++++-------------- .../load/model_cache/model_locker.py | 10 +- invokeai/backend/model_patcher.py | 13 ++- .../diffusion/conditioning_data.py | 7 +- .../diffusion/shared_invokeai_diffusion.py | 44 ++++---- 9 files changed, 77 insertions(+), 125 deletions(-) diff --git a/invokeai/app/invocations/compel.py b/invokeai/app/invocations/compel.py index 4a56730e056..f860b21dec0 100644 --- a/invokeai/app/invocations/compel.py +++ b/invokeai/app/invocations/compel.py @@ -1,6 +1,7 @@ from typing import Iterator, List, Optional, Tuple, Union, cast import torch +import threading from compel import Compel, ReturnedEmbeddingsType from compel.prompt_parser import Blend, Conjunction, CrossAttentionControlSubstitute, FlattenedPrompt, Fragment from transformers import CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer @@ -139,6 +140,7 @@ def run_clip_compel( lora_prefix: str, zero_on_empty: bool, ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + tid = threading.current_thread().ident tokenizer_info = context.models.load(clip_field.tokenizer) text_encoder_info = context.models.load(clip_field.text_encoder) @@ -205,6 +207,7 @@ def _lora_loader() -> Iterator[Tuple[LoRAModelRaw, float]]: truncate_long_prompts=False, # TODO: returned_embeddings_type=ReturnedEmbeddingsType.PENULTIMATE_HIDDEN_STATES_NON_NORMALIZED, # TODO: clip skip requires_pooled=get_pooled, + device=TorchDevice.choose_torch_device(), ) conjunction = Compel.parse_prompt_string(prompt) @@ -315,7 +318,6 @@ def invoke(self, context: InvocationContext) -> ConditioningOutput: ) ] ) - conditioning_name = context.conditioning.save(conditioning_data) return ConditioningOutput( diff --git a/invokeai/app/invocations/denoise_latents.py b/invokeai/app/invocations/denoise_latents.py index 13a92efab88..a58fb69cb37 100644 --- a/invokeai/app/invocations/denoise_latents.py +++ b/invokeai/app/invocations/denoise_latents.py @@ -1,5 +1,7 @@ # Copyright (c) 2023 Kyle Schouviller (https://github.com/kyle0654) +import copy import inspect +import threading from contextlib import ExitStack from typing import Any, Dict, Iterator, List, Optional, Tuple, Union @@ -192,10 +194,10 @@ def _get_text_embeddings_and_masks( """Get the text embeddings and masks from the input conditioning fields.""" text_embeddings: Union[list[BasicConditioningInfo], list[SDXLConditioningInfo]] = [] text_embeddings_masks: list[Optional[torch.Tensor]] = [] + tid = threading.current_thread().ident for cond in cond_list: - cond_data = context.conditioning.load(cond.conditioning_name) + cond_data = copy.deepcopy(context.conditioning.load(cond.conditioning_name)) 
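Deep-copying the loaded conditioning before calling .to() keeps the serializer's cached object device-neutral when several worker threads target different GPUs. A small sketch of the hazard and the fix, using a stand-in conditioning class rather than the real BasicConditioningInfo:

import copy
from dataclasses import dataclass

import torch


@dataclass
class FakeConditioning:
    embeds: torch.Tensor

    def to(self, device: torch.device) -> "FakeConditioning":
        # Mutates in place, like the real conditioning containers do.
        self.embeds = self.embeds.to(device)
        return self


_cache = {"prompt": FakeConditioning(torch.zeros(2, 77, 768))}


def load_for_thread(name: str, device: torch.device) -> FakeConditioning:
    # Without the deepcopy, one thread's .to() would also move the shared
    # cached object, and a second thread using another device would then
    # receive tensors living on the wrong GPU.
    return copy.deepcopy(_cache[name]).to(device)


cond = load_for_thread("prompt", torch.device("cpu"))
assert _cache["prompt"].embeds.device.type == "cpu"  # master copy untouched
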
text_embeddings.append(cond_data.conditionings[0].to(device=device, dtype=dtype)) - mask = cond.mask if mask is not None: mask = context.tensors.load(mask.tensor_name) @@ -317,6 +319,7 @@ def get_conditioning_data( if not isinstance(uncond_list, list): uncond_list = [uncond_list] + tid = threading.current_thread().ident cond_text_embeddings, cond_text_embedding_masks = self._get_text_embeddings_and_masks( cond_list, context, unet.device, unet.dtype ) diff --git a/invokeai/app/services/object_serializer/object_serializer_disk.py b/invokeai/app/services/object_serializer/object_serializer_disk.py index d3171f85309..e27ea46e6b5 100644 --- a/invokeai/app/services/object_serializer/object_serializer_disk.py +++ b/invokeai/app/services/object_serializer/object_serializer_disk.py @@ -10,6 +10,7 @@ from invokeai.app.services.object_serializer.object_serializer_base import ObjectSerializerBase from invokeai.app.services.object_serializer.object_serializer_common import ObjectNotFoundError from invokeai.app.util.misc import uuid_string +from invokeai.backend.util.devices import TorchDevice if TYPE_CHECKING: from invokeai.app.services.invoker import Invoker @@ -46,7 +47,7 @@ def __init__(self, output_dir: Path, ephemeral: bool = False): def load(self, name: str) -> T: file_path = self._get_path(name) try: - return torch.load(file_path) # pyright: ignore [reportUnknownMemberType] + return torch.load(file_path, map_location=TorchDevice.choose_torch_device()) # pyright: ignore [reportUnknownMemberType] except FileNotFoundError as e: raise ObjectNotFoundError(name) from e diff --git a/invokeai/backend/model_manager/load/model_cache/model_cache_base.py b/invokeai/backend/model_manager/load/model_cache/model_cache_base.py index 6e518bc9e3e..4fe99c31e6d 100644 --- a/invokeai/backend/model_manager/load/model_cache/model_cache_base.py +++ b/invokeai/backend/model_manager/load/model_cache/model_cache_base.py @@ -53,25 +53,12 @@ class CacheRecord(Generic[T]): key: Unique key for each model, same as used in the models database. model: Read-only copy of the model *without weights* residing in the "meta device" - state_dict: A read-only copy of the model's state dict in RAM. It will be - used as a template for creating a copy in the VRAM. size: Size of the model - - Before a model is executed, the state_dict template is copied into VRAM, - and then injected into the model. When the model is finished, the VRAM - copy of the state dict is deleted, and the RAM version is reinjected - into the model. - - The state_dict should be treated as a read-only attribute. Do not attempt - to patch or otherwise modify it. Instead, patch the copy of the state_dict - after it is loaded into the execution device (e.g. CUDA) using the `LoadedModel` - context manager call `model_on_device()`. 
""" key: str size: int model: T - state_dict: Optional[Dict[str, torch.Tensor]] @dataclass diff --git a/invokeai/backend/model_manager/load/model_cache/model_cache_default.py b/invokeai/backend/model_manager/load/model_cache/model_cache_default.py index 3a2d36e87a8..817fcb2ec01 100644 --- a/invokeai/backend/model_manager/load/model_cache/model_cache_default.py +++ b/invokeai/backend/model_manager/load/model_cache/model_cache_default.py @@ -159,7 +159,7 @@ def reserve_execution_device(self, timeout: Optional[int] = None) -> Generator[t device = free_device[0] # we are outside the lock region now - self.logger.info(f"Reserved torch device {device} for execution thread {current_thread}") + self.logger.info(f"{current_thread} Reserved torch device {device}") # Tell TorchDevice to use this object to get the torch device. TorchDevice.set_model_cache(self) @@ -167,7 +167,7 @@ def reserve_execution_device(self, timeout: Optional[int] = None) -> Generator[t yield device finally: with self._device_lock: - self.logger.info(f"Released torch device {device}") + self.logger.info(f"{current_thread} Released torch device {device}") self._execution_devices[device] = 0 self._free_execution_device.release() torch.cuda.empty_cache() @@ -215,20 +215,17 @@ def put( submodel_type: Optional[SubModelType] = None, ) -> None: """Store model under key and optional submodel_type.""" - key = self._make_cache_key(key, submodel_type) - if key in self._cached_models: - return - size = calc_model_size_by_data(model) - self.make_room(size) + with self._ram_lock: + key = self._make_cache_key(key, submodel_type) + if key in self._cached_models: + return + size = calc_model_size_by_data(model) + self.make_room(size) - if isinstance(model, torch.nn.Module): - state_dict = model.state_dict() # keep a master copy of the state dict - model = model.to(device="meta") # and keep a template in the meta device - else: - state_dict = None - cache_record = CacheRecord(key=key, model=model, state_dict=state_dict, size=size) - self._cached_models[key] = cache_record - self._cache_stack.append(key) + tid = threading.current_thread().ident + cache_record = CacheRecord(key=key, model=model, size=size) + self._cached_models[key] = cache_record + self._cache_stack.append(key) def get( self, @@ -296,11 +293,11 @@ def model_to_device(self, cache_entry: CacheRecord[AnyModel], target_device: tor May raise a torch.cuda.OutOfMemoryError """ - self.logger.info(f"Called to move {cache_entry.key} ({type(cache_entry.model)=}) to {target_device}") + with self._ram_lock: + self.logger.debug(f"Called to move {cache_entry.key} ({type(cache_entry.model)=}) to {target_device}") - # Some models don't have a state dictionary, in which case the - # stored model will still reside in CPU - if cache_entry.state_dict is None: + # Some models don't have a state dictionary, in which case the + # stored model will still reside in CPU if hasattr(cache_entry.model, "to"): model_in_gpu = copy.deepcopy(cache_entry.model) assert hasattr(model_in_gpu, "to") @@ -309,65 +306,6 @@ def model_to_device(self, cache_entry: CacheRecord[AnyModel], target_device: tor else: return cache_entry.model # what happens in CPU stays in CPU - # This roundabout method for moving the model around is done to avoid - # the cost of moving the model from RAM to VRAM and then back from VRAM to RAM. - # When moving to VRAM, we copy (not move) each element of the state dict from - # RAM to a new state dict in VRAM, and then inject it into the model. 
- # This operation is slightly faster than running `to()` on the whole model. - # - # When the model needs to be removed from VRAM we simply delete the copy - # of the state dict in VRAM, and reinject the state dict that is cached - # in RAM into the model. So this operation is very fast. - start_model_to_time = time.time() - snapshot_before = self._capture_memory_snapshot() - try: - assert isinstance(cache_entry.model, torch.nn.Module) - template = cache_entry.model - cls = template.__class__ - with skip_torch_weight_init(): - if isinstance(cls, ConfigMixin) or hasattr(cls, "from_config"): - working_model = template.__class__.from_config(template.config) # diffusers style - else: - working_model = template.__class__(config=template.config) # transformers style (sigh) - working_model.to(device=target_device, dtype=self._precision) - working_model.load_state_dict(cache_entry.state_dict) - except Exception as e: # blow away cache entry - self._delete_cache_entry(cache_entry) - raise e - - snapshot_after = self._capture_memory_snapshot() - end_model_to_time = time.time() - self.logger.info( - f"Moved model '{cache_entry.key}' to" - f" {target_device} in {(end_model_to_time-start_model_to_time):.2f}s." - f"Estimated model size: {(cache_entry.size/GIG):.3f} GB." - f"{get_pretty_snapshot_diff(snapshot_before, snapshot_after)}" - ) - - if ( - snapshot_before is not None - and snapshot_after is not None - and snapshot_before.vram is not None - and snapshot_after.vram is not None - ): - vram_change = abs(snapshot_before.vram - snapshot_after.vram) - - # If the estimated model size does not match the change in VRAM, log a warning. - if not math.isclose( - vram_change, - cache_entry.size, - rel_tol=0.1, - abs_tol=10 * MB, - ): - self.logger.debug( - f"Moving model '{cache_entry.key}' from to" - f" {target_device} caused an unexpected change in VRAM usage. The model's" - " estimated size may be incorrect. 
Estimated model size:" - f" {(cache_entry.size/GIG):.3f} GB.\n" - f"{get_pretty_snapshot_diff(snapshot_before, snapshot_after)}" - ) - return working_model - def print_cuda_stats(self) -> None: """Log CUDA diagnostics.""" vram = "%4.2fG" % (torch.cuda.memory_allocated() / GIG) @@ -445,8 +383,11 @@ def _check_free_vram(self, target_device: torch.device, needed_size: int) -> Non raise torch.cuda.OutOfMemoryError def _delete_cache_entry(self, cache_entry: CacheRecord[AnyModel]) -> None: - self._cache_stack.remove(cache_entry.key) - del self._cached_models[cache_entry.key] + try: + self._cache_stack.remove(cache_entry.key) + del self._cached_models[cache_entry.key] + except ValueError: + pass @staticmethod def _device_name(device: torch.device) -> str: diff --git a/invokeai/backend/model_manager/load/model_cache/model_locker.py b/invokeai/backend/model_manager/load/model_cache/model_locker.py index 815fd41f04e..fd85e2d8adf 100644 --- a/invokeai/backend/model_manager/load/model_cache/model_locker.py +++ b/invokeai/backend/model_manager/load/model_cache/model_locker.py @@ -31,10 +31,6 @@ def model(self) -> AnyModel: """Return the model without moving it around.""" return self._cache_entry.model - def get_state_dict(self) -> Optional[Dict[str, torch.Tensor]]: - """Return the state dict (if any) for the cached model.""" - return self._cache_entry.state_dict - def lock(self) -> AnyModel: """Move the model into the execution device (GPU) and lock it.""" try: @@ -56,3 +52,9 @@ def lock(self) -> AnyModel: def unlock(self) -> None: """Call upon exit from context.""" self._cache.print_cuda_stats() + + # This is no longer in use in MGPU. + def get_state_dict(self) -> Optional[Dict[str, torch.Tensor]]: + """Return the state dict (if any) for the cached model.""" + return None + diff --git a/invokeai/backend/model_patcher.py b/invokeai/backend/model_patcher.py index 0f57c0efdcc..b45cf91c981 100644 --- a/invokeai/backend/model_patcher.py +++ b/invokeai/backend/model_patcher.py @@ -2,7 +2,7 @@ """These classes implement model patching with LoRAs and Textual Inversions.""" from __future__ import annotations - +import threading import pickle from contextlib import contextmanager from typing import Any, Dict, Generator, Iterator, List, Optional, Tuple, Union @@ -34,6 +34,9 @@ # TODO: rename smth like ModelPatcher and add TI method? class ModelPatcher: + + _thread_lock = threading.Lock() + @staticmethod def _resolve_lora_key(model: torch.nn.Module, lora_key: str, prefix: str) -> Tuple[str, torch.nn.Module]: assert "." not in lora_key @@ -106,7 +109,10 @@ def apply_lora( """ original_weights = {} try: - with torch.no_grad(): + with ( + torch.no_grad(), + cls._thread_lock + ): for lora, lora_weight in loras: # assert lora.device.type == "cpu" for layer_key, layer in lora.layers.items(): @@ -156,9 +162,6 @@ def apply_lora( yield # wait for context manager exit finally: - # LS check: for now, we are not reusing models in VRAM but re-copying them each time they are needed. - # Therefore it should not be necessary to copy the original model weights back. - # This needs to be fixed before resurrecting the VRAM cache. 
assert hasattr(model, "get_submodule") # mypy not picking up fact that torch.nn.Module has get_submodule() with torch.no_grad(): for module_key, weight in original_weights.items(): diff --git a/invokeai/backend/stable_diffusion/diffusion/conditioning_data.py b/invokeai/backend/stable_diffusion/diffusion/conditioning_data.py index 85950a01df5..b0291d06fe2 100644 --- a/invokeai/backend/stable_diffusion/diffusion/conditioning_data.py +++ b/invokeai/backend/stable_diffusion/diffusion/conditioning_data.py @@ -1,4 +1,5 @@ import math +import threading from dataclasses import dataclass from typing import List, Optional, Union @@ -31,9 +32,13 @@ class SDXLConditioningInfo(BasicConditioningInfo): add_time_ids: torch.Tensor def to(self, device, dtype=None): + tid = threading.current_thread().ident self.pooled_embeds = self.pooled_embeds.to(device=device, dtype=dtype) + assert self.pooled_embeds.device == device self.add_time_ids = self.add_time_ids.to(device=device, dtype=dtype) - return super().to(device=device, dtype=dtype) + result = super().to(device=device, dtype=dtype) + assert self.embeds.device == device + return result @dataclass diff --git a/invokeai/backend/stable_diffusion/diffusion/shared_invokeai_diffusion.py b/invokeai/backend/stable_diffusion/diffusion/shared_invokeai_diffusion.py index f418133e49f..3e3040968db 100644 --- a/invokeai/backend/stable_diffusion/diffusion/shared_invokeai_diffusion.py +++ b/invokeai/backend/stable_diffusion/diffusion/shared_invokeai_diffusion.py @@ -1,6 +1,7 @@ from __future__ import annotations import math +import threading from typing import Any, Callable, Optional, Union import torch @@ -293,24 +294,31 @@ def _apply_standard_conditioning( cross_attention_kwargs["regional_ip_data"] = regional_ip_data added_cond_kwargs = None - if conditioning_data.is_sdxl(): - added_cond_kwargs = { - "text_embeds": torch.cat( - [ - # TODO: how to pad? just by zeros? or even truncate? - conditioning_data.uncond_text.pooled_embeds, - conditioning_data.cond_text.pooled_embeds, - ], - dim=0, - ), - "time_ids": torch.cat( - [ - conditioning_data.uncond_text.add_time_ids, - conditioning_data.cond_text.add_time_ids, - ], - dim=0, - ), - } + try: + if conditioning_data.is_sdxl(): + #tid = threading.current_thread().ident + #print(f'DEBUG {tid} {conditioning_data.uncond_text.pooled_embeds.device=} {conditioning_data.cond_text.pooled_embeds.device=}', flush=True), + added_cond_kwargs = { + "text_embeds": torch.cat( + [ + # TODO: how to pad? just by zeros? or even truncate? + conditioning_data.uncond_text.pooled_embeds, + conditioning_data.cond_text.pooled_embeds, + ], + dim=0, + ), + "time_ids": torch.cat( + [ + conditioning_data.uncond_text.add_time_ids, + conditioning_data.cond_text.add_time_ids, + ], + dim=0, + ), + } + except Exception as e: + tid = threading.current_thread().ident + print(f'DEBUG: {tid} {str(e)}') + raise e if conditioning_data.cond_regions is not None or conditioning_data.uncond_regions is not None: # TODO(ryand): We currently initialize RegionalPromptData for every denoising step. 
The text conditionings From 9dcace7d824b46790d35cbcf3cb96ebbf8779a73 Mon Sep 17 00:00:00 2001 From: Lincoln Stein Date: Thu, 18 Jul 2024 15:07:09 -0400 Subject: [PATCH 30/30] ruff fixes and restore default map location of object serializer load --- invokeai/app/invocations/compel.py | 2 -- invokeai/app/invocations/denoise_latents.py | 3 --- .../services/object_serializer/object_serializer_disk.py | 2 +- invokeai/app/services/shared/graph.py | 2 +- .../load/model_cache/model_cache_default.py | 7 +------ .../model_manager/load/model_cache/model_locker.py | 1 - invokeai/backend/model_patcher.py | 9 +++------ .../stable_diffusion/diffusion/conditioning_data.py | 2 -- .../diffusion/shared_invokeai_diffusion.py | 6 +++--- 9 files changed, 9 insertions(+), 25 deletions(-) diff --git a/invokeai/app/invocations/compel.py b/invokeai/app/invocations/compel.py index f860b21dec0..67b25dcda93 100644 --- a/invokeai/app/invocations/compel.py +++ b/invokeai/app/invocations/compel.py @@ -1,7 +1,6 @@ from typing import Iterator, List, Optional, Tuple, Union, cast import torch -import threading from compel import Compel, ReturnedEmbeddingsType from compel.prompt_parser import Blend, Conjunction, CrossAttentionControlSubstitute, FlattenedPrompt, Fragment from transformers import CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer @@ -140,7 +139,6 @@ def run_clip_compel( lora_prefix: str, zero_on_empty: bool, ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: - tid = threading.current_thread().ident tokenizer_info = context.models.load(clip_field.tokenizer) text_encoder_info = context.models.load(clip_field.text_encoder) diff --git a/invokeai/app/invocations/denoise_latents.py b/invokeai/app/invocations/denoise_latents.py index a58fb69cb37..db43723e339 100644 --- a/invokeai/app/invocations/denoise_latents.py +++ b/invokeai/app/invocations/denoise_latents.py @@ -1,7 +1,6 @@ # Copyright (c) 2023 Kyle Schouviller (https://github.com/kyle0654) import copy import inspect -import threading from contextlib import ExitStack from typing import Any, Dict, Iterator, List, Optional, Tuple, Union @@ -194,7 +193,6 @@ def _get_text_embeddings_and_masks( """Get the text embeddings and masks from the input conditioning fields.""" text_embeddings: Union[list[BasicConditioningInfo], list[SDXLConditioningInfo]] = [] text_embeddings_masks: list[Optional[torch.Tensor]] = [] - tid = threading.current_thread().ident for cond in cond_list: cond_data = copy.deepcopy(context.conditioning.load(cond.conditioning_name)) text_embeddings.append(cond_data.conditionings[0].to(device=device, dtype=dtype)) @@ -319,7 +317,6 @@ def get_conditioning_data( if not isinstance(uncond_list, list): uncond_list = [uncond_list] - tid = threading.current_thread().ident cond_text_embeddings, cond_text_embedding_masks = self._get_text_embeddings_and_masks( cond_list, context, unet.device, unet.dtype ) diff --git a/invokeai/app/services/object_serializer/object_serializer_disk.py b/invokeai/app/services/object_serializer/object_serializer_disk.py index e27ea46e6b5..0c9567553a6 100644 --- a/invokeai/app/services/object_serializer/object_serializer_disk.py +++ b/invokeai/app/services/object_serializer/object_serializer_disk.py @@ -47,7 +47,7 @@ def __init__(self, output_dir: Path, ephemeral: bool = False): def load(self, name: str) -> T: file_path = self._get_path(name) try: - return torch.load(file_path, map_location=TorchDevice.choose_torch_device()) # pyright: ignore [reportUnknownMemberType] + return torch.load(file_path) # pyright: ignore 
[reportUnknownMemberType] except FileNotFoundError as e: raise ObjectNotFoundError(name) from e diff --git a/invokeai/app/services/shared/graph.py b/invokeai/app/services/shared/graph.py index d745e738233..60fd909881b 100644 --- a/invokeai/app/services/shared/graph.py +++ b/invokeai/app/services/shared/graph.py @@ -652,7 +652,7 @@ def _is_iterator_connection_valid( output_fields = [get_input_field(self.get_node(e.node_id), e.field) for e in outputs] # Input type must be a list - if get_origin(input_field) != list: + if get_origin(input_field) is not list: return False # Validate that all outputs match the input type diff --git a/invokeai/backend/model_manager/load/model_cache/model_cache_default.py b/invokeai/backend/model_manager/load/model_cache/model_cache_default.py index 817fcb2ec01..31a10b6ea40 100644 --- a/invokeai/backend/model_manager/load/model_cache/model_cache_default.py +++ b/invokeai/backend/model_manager/load/model_cache/model_cache_default.py @@ -20,25 +20,21 @@ import copy import gc -import math import sys import threading -import time from contextlib import contextmanager, suppress from logging import Logger from threading import BoundedSemaphore from typing import Dict, Generator, List, Optional, Set import torch -from diffusers.configuration_utils import ConfigMixin from invokeai.backend.model_manager import AnyModel, SubModelType -from invokeai.backend.model_manager.load.memory_snapshot import MemorySnapshot, get_pretty_snapshot_diff +from invokeai.backend.model_manager.load.memory_snapshot import MemorySnapshot from invokeai.backend.model_manager.load.model_util import calc_model_size_by_data from invokeai.backend.util.devices import TorchDevice from invokeai.backend.util.logging import InvokeAILogger -from ..optimizations import skip_torch_weight_init from .model_cache_base import CacheRecord, CacheStats, ModelCacheBase, ModelLockerBase from .model_locker import ModelLocker @@ -222,7 +218,6 @@ def put( size = calc_model_size_by_data(model) self.make_room(size) - tid = threading.current_thread().ident cache_record = CacheRecord(key=key, model=model, size=size) self._cached_models[key] = cache_record self._cache_stack.append(key) diff --git a/invokeai/backend/model_manager/load/model_cache/model_locker.py b/invokeai/backend/model_manager/load/model_cache/model_locker.py index fd85e2d8adf..68af7ba97a9 100644 --- a/invokeai/backend/model_manager/load/model_cache/model_locker.py +++ b/invokeai/backend/model_manager/load/model_cache/model_locker.py @@ -57,4 +57,3 @@ def unlock(self) -> None: def get_state_dict(self) -> Optional[Dict[str, torch.Tensor]]: """Return the state dict (if any) for the cached model.""" return None - diff --git a/invokeai/backend/model_patcher.py b/invokeai/backend/model_patcher.py index b45cf91c981..b879c3d4e80 100644 --- a/invokeai/backend/model_patcher.py +++ b/invokeai/backend/model_patcher.py @@ -2,8 +2,9 @@ """These classes implement model patching with LoRAs and Textual Inversions.""" from __future__ import annotations -import threading + import pickle +import threading from contextlib import contextmanager from typing import Any, Dict, Generator, Iterator, List, Optional, Tuple, Union @@ -34,7 +35,6 @@ # TODO: rename smth like ModelPatcher and add TI method? 
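The graph validation tweak above switches the list check to an identity comparison on typing.get_origin(), which is the reliable way to ask whether an annotated input is a list type. A quick illustration of how get_origin behaves on the kinds of annotations involved, using toy annotations rather than the real graph fields:

from typing import List, Optional, get_origin

# get_origin() returns the bare container type for parameterized generics,
# and None for plain classes, so `is list` / `is not list` is an exact test.
assert get_origin(list[int]) is list
assert get_origin(List[str]) is list
assert get_origin(Optional[int]) is not list  # origin is typing.Union
assert get_origin(int) is not list            # origin is None for non-generics
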
class ModelPatcher: - _thread_lock = threading.Lock() @staticmethod @@ -109,10 +109,7 @@ def apply_lora( """ original_weights = {} try: - with ( - torch.no_grad(), - cls._thread_lock - ): + with torch.no_grad(), cls._thread_lock: for lora, lora_weight in loras: # assert lora.device.type == "cpu" for layer_key, layer in lora.layers.items(): diff --git a/invokeai/backend/stable_diffusion/diffusion/conditioning_data.py b/invokeai/backend/stable_diffusion/diffusion/conditioning_data.py index b0291d06fe2..01aae6b5a49 100644 --- a/invokeai/backend/stable_diffusion/diffusion/conditioning_data.py +++ b/invokeai/backend/stable_diffusion/diffusion/conditioning_data.py @@ -1,5 +1,4 @@ import math -import threading from dataclasses import dataclass from typing import List, Optional, Union @@ -32,7 +31,6 @@ class SDXLConditioningInfo(BasicConditioningInfo): add_time_ids: torch.Tensor def to(self, device, dtype=None): - tid = threading.current_thread().ident self.pooled_embeds = self.pooled_embeds.to(device=device, dtype=dtype) assert self.pooled_embeds.device == device self.add_time_ids = self.add_time_ids.to(device=device, dtype=dtype) diff --git a/invokeai/backend/stable_diffusion/diffusion/shared_invokeai_diffusion.py b/invokeai/backend/stable_diffusion/diffusion/shared_invokeai_diffusion.py index 3e3040968db..a8f47247eca 100644 --- a/invokeai/backend/stable_diffusion/diffusion/shared_invokeai_diffusion.py +++ b/invokeai/backend/stable_diffusion/diffusion/shared_invokeai_diffusion.py @@ -296,8 +296,8 @@ def _apply_standard_conditioning( added_cond_kwargs = None try: if conditioning_data.is_sdxl(): - #tid = threading.current_thread().ident - #print(f'DEBUG {tid} {conditioning_data.uncond_text.pooled_embeds.device=} {conditioning_data.cond_text.pooled_embeds.device=}', flush=True), + # tid = threading.current_thread().ident + # print(f'DEBUG {tid} {conditioning_data.uncond_text.pooled_embeds.device=} {conditioning_data.cond_text.pooled_embeds.device=}', flush=True), added_cond_kwargs = { "text_embeds": torch.cat( [ @@ -317,7 +317,7 @@ def _apply_standard_conditioning( } except Exception as e: tid = threading.current_thread().ident - print(f'DEBUG: {tid} {str(e)}') + print(f"DEBUG: {tid} {str(e)}") raise e if conditioning_data.cond_regions is not None or conditioning_data.uncond_regions is not None:
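For SDXL, the block above builds added_cond_kwargs by stacking the unconditioned and conditioned pooled embeddings and time ids along the batch axis, so a single UNet call serves both halves of classifier-free guidance. A shape-only sketch of that batching, with illustrative dimensions rather than the real conditioning objects:

import torch

# Illustrative SDXL-style shapes: pooled text embedding (1, 1280) and time ids (1, 6).
uncond_pooled = torch.zeros(1, 1280)
cond_pooled = torch.randn(1, 1280)
uncond_time_ids = torch.zeros(1, 6)
cond_time_ids = torch.ones(1, 6)

# Batch dim 0 carries [unconditioned, conditioned]; the UNet output is later
# split back into two halves before the guidance-scale combination.
added_cond_kwargs = {
    "text_embeds": torch.cat([uncond_pooled, cond_pooled], dim=0),   # (2, 1280)
    "time_ids": torch.cat([uncond_time_ids, cond_time_ids], dim=0),  # (2, 6)
}
assert added_cond_kwargs["text_embeds"].shape == (2, 1280)
assert added_cond_kwargs["time_ids"].shape == (2, 6)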