|
19 | 19 | import psutil |
20 | 20 | import logging |
21 | 21 | from enum import Enum |
22 | | -from comfy.cli_args import args, PerformanceFeature, enables_dynamic_vram |
| 22 | +from comfy.cli_args import args, PerformanceFeature |
23 | 23 | import threading |
24 | 24 | import torch |
25 | 25 | import sys |
@@ -651,7 +651,7 @@ def free_memory(memory_required, device, keep_loaded=[], for_dynamic=False, ram_ |
651 | 651 | soft_empty_cache() |
652 | 652 | return unloaded_models |
653 | 653 |
|
654 | | -def load_models_gpu_orig(models, memory_required=0, force_patch_weights=False, minimum_memory_required=None, force_full_load=False): |
| 654 | +def load_models_gpu(models, memory_required=0, force_patch_weights=False, minimum_memory_required=None, force_full_load=False): |
655 | 655 | cleanup_models_gc() |
656 | 656 | global vram_state |
657 | 657 |
|
@@ -747,26 +747,6 @@ def load_models_gpu_orig(models, memory_required=0, force_patch_weights=False, m |
747 | 747 | current_loaded_models.insert(0, loaded_model) |
748 | 748 | return |
749 | 749 |
|
750 | | -def load_models_gpu_thread(models, memory_required, force_patch_weights, minimum_memory_required, force_full_load): |
751 | | - with torch.inference_mode(): |
752 | | - load_models_gpu_orig(models, memory_required, force_patch_weights, minimum_memory_required, force_full_load) |
753 | | - soft_empty_cache() |
754 | | - |
755 | | -def load_models_gpu(models, memory_required=0, force_patch_weights=False, minimum_memory_required=None, force_full_load=False): |
756 | | - #Deliberately load models outside of the Aimdo mempool so they can be retained across |
757 | | - #nodes. Use a dummy thread to do it as pytorch documents that mempool contexts are |
758 | | - #thread local. So exploit that to escape context |
759 | | - if enables_dynamic_vram(): |
760 | | - t = threading.Thread( |
761 | | - target=load_models_gpu_thread, |
762 | | - args=(models, memory_required, force_patch_weights, minimum_memory_required, force_full_load) |
763 | | - ) |
764 | | - t.start() |
765 | | - t.join() |
766 | | - else: |
767 | | - load_models_gpu_orig(models, memory_required=memory_required, force_patch_weights=force_patch_weights, |
768 | | - minimum_memory_required=minimum_memory_required, force_full_load=force_full_load) |
769 | | - |
770 | 750 | def load_model_gpu(model): |
771 | 751 | return load_models_gpu([model]) |
772 | 752 |
|
@@ -1226,21 +1206,16 @@ def cast_to(weight, dtype=None, device=None, non_blocking=False, copy=False, str |
1226 | 1206 | if dtype is None: |
1227 | 1207 | dtype = weight._model_dtype |
1228 | 1208 |
|
1229 | | - r = torch.empty_like(weight, dtype=dtype, device=device) |
1230 | | - |
1231 | 1209 | signature = comfy_aimdo.model_vbar.vbar_fault(weight._v) |
1232 | 1210 | if signature is not None: |
1233 | | - raw_tensor = comfy_aimdo.torch.aimdo_to_tensor(weight._v, device) |
1234 | | - v_tensor = comfy.memory_management.interpret_gathered_like(cast_geometry, raw_tensor)[0] |
| 1211 | + v_tensor = comfy.memory_management.interpret_gathered_like(cast_geometry, weight._v_tensor)[0] |
1235 | 1212 | if not comfy_aimdo.model_vbar.vbar_signature_compare(signature, weight._v_signature): |
1236 | 1213 | weight._v_signature = signature |
1237 | 1214 | #Send it over |
1238 | 1215 | v_tensor.copy_(weight, non_blocking=non_blocking) |
1239 | | - #always take a deep copy even if _v is good, as we have no reasonable point to unpin |
1240 | | - #a non comfy weight |
1241 | | - r.copy_(v_tensor) |
1242 | | - comfy_aimdo.model_vbar.vbar_unpin(weight._v) |
1243 | | - return r |
| 1216 | + return v_tensor.to(dtype=dtype) |
| 1217 | + |
| 1218 | + r = torch.empty_like(weight, dtype=dtype, device=device) |
1244 | 1219 |
|
1245 | 1220 | if weight.dtype != r.dtype and weight.dtype != weight._model_dtype: |
1246 | 1221 | #Offloaded casting could skip this, however it would make the quantizations |
|
0 commit comments