diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index 9304c34b4e01..4d373b2a5ded 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -23,7 +23,6 @@ is_transformers_available, ) - # Lazy Import based on # https://github.com/huggingface/transformers/blob/main/src/transformers/__init__.py @@ -60,7 +59,11 @@ } try: - if not is_torch_available() and not is_accelerate_available() and not is_bitsandbytes_available(): + if ( + not is_torch_available() + and not is_accelerate_available() + and not is_bitsandbytes_available() + ): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: from .utils import dummy_bitsandbytes_objects @@ -72,7 +75,11 @@ _import_structure["quantizers.quantization_config"].append("BitsAndBytesConfig") try: - if not is_torch_available() and not is_accelerate_available() and not is_gguf_available(): + if ( + not is_torch_available() + and not is_accelerate_available() + and not is_gguf_available() + ): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: from .utils import dummy_gguf_objects @@ -84,7 +91,11 @@ _import_structure["quantizers.quantization_config"].append("GGUFQuantizationConfig") try: - if not is_torch_available() and not is_accelerate_available() and not is_torchao_available(): + if ( + not is_torch_available() + and not is_accelerate_available() + and not is_torchao_available() + ): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: from .utils import dummy_torchao_objects @@ -96,7 +107,11 @@ _import_structure["quantizers.quantization_config"].append("TorchAoConfig") try: - if not is_torch_available() and not is_accelerate_available() and not is_optimum_quanto_available(): + if ( + not is_torch_available() + and not is_accelerate_available() + and not is_optimum_quanto_available() + ): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: from .utils import dummy_optimum_quanto_objects @@ -126,7 +141,9 @@ except OptionalDependencyNotAvailable: from .utils import dummy_pt_objects # noqa F403 - _import_structure["utils.dummy_pt_objects"] = [name for name in dir(dummy_pt_objects) if not name.startswith("_")] + _import_structure["utils.dummy_pt_objects"] = [ + name for name in dir(dummy_pt_objects) if not name.startswith("_") + ] else: _import_structure["hooks"].extend( @@ -187,6 +204,7 @@ "OmniGenTransformer2DModel", "PixArtTransformer2DModel", "PriorTransformer", + "SanaControlNetModel", "SanaTransformer2DModel", "SD3ControlNetModel", "SD3MultiControlNetModel", @@ -303,11 +321,15 @@ from .utils import dummy_torch_and_torchsde_objects # noqa F403 _import_structure["utils.dummy_torch_and_torchsde_objects"] = [ - name for name in dir(dummy_torch_and_torchsde_objects) if not name.startswith("_") + name + for name in dir(dummy_torch_and_torchsde_objects) + if not name.startswith("_") ] else: - _import_structure["schedulers"].extend(["CosineDPMSolverMultistepScheduler", "DPMSolverSDEScheduler"]) + _import_structure["schedulers"].extend( + ["CosineDPMSolverMultistepScheduler", "DPMSolverSDEScheduler"] + ) try: if not (is_torch_available() and is_transformers_available()): @@ -316,7 +338,9 @@ from .utils import dummy_torch_and_transformers_objects # noqa F403 _import_structure["utils.dummy_torch_and_transformers_objects"] = [ - name for name in dir(dummy_torch_and_transformers_objects) if not name.startswith("_") + name + for name in dir(dummy_torch_and_transformers_objects) + if not name.startswith("_") ] else: @@ -424,6 +448,7 @@ 
"PixArtSigmaPAGPipeline", "PixArtSigmaPipeline", "ReduxImageEncoder", + "SanaControlNetPipeline", "SanaPAGPipeline", "SanaPipeline", "SanaSprintPipeline", @@ -517,39 +542,63 @@ ) try: - if not (is_torch_available() and is_transformers_available() and is_k_diffusion_available()): + if not ( + is_torch_available() + and is_transformers_available() + and is_k_diffusion_available() + ): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: from .utils import dummy_torch_and_transformers_and_k_diffusion_objects # noqa F403 _import_structure["utils.dummy_torch_and_transformers_and_k_diffusion_objects"] = [ - name for name in dir(dummy_torch_and_transformers_and_k_diffusion_objects) if not name.startswith("_") + name + for name in dir(dummy_torch_and_transformers_and_k_diffusion_objects) + if not name.startswith("_") ] else: - _import_structure["pipelines"].extend(["StableDiffusionKDiffusionPipeline", "StableDiffusionXLKDiffusionPipeline"]) + _import_structure["pipelines"].extend( + ["StableDiffusionKDiffusionPipeline", "StableDiffusionXLKDiffusionPipeline"] + ) try: - if not (is_torch_available() and is_transformers_available() and is_sentencepiece_available()): + if not ( + is_torch_available() + and is_transformers_available() + and is_sentencepiece_available() + ): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from .utils import dummy_torch_and_transformers_and_sentencepiece_objects # noqa F403 + from .utils import ( # noqa F403 + dummy_torch_and_transformers_and_sentencepiece_objects, + ) - _import_structure["utils.dummy_torch_and_transformers_and_sentencepiece_objects"] = [ - name for name in dir(dummy_torch_and_transformers_and_sentencepiece_objects) if not name.startswith("_") + _import_structure[ + "utils.dummy_torch_and_transformers_and_sentencepiece_objects" + ] = [ + name + for name in dir(dummy_torch_and_transformers_and_sentencepiece_objects) + if not name.startswith("_") ] else: - _import_structure["pipelines"].extend(["KolorsImg2ImgPipeline", "KolorsPAGPipeline", "KolorsPipeline"]) + _import_structure["pipelines"].extend( + ["KolorsImg2ImgPipeline", "KolorsPAGPipeline", "KolorsPipeline"] + ) try: - if not (is_torch_available() and is_transformers_available() and is_onnx_available()): + if not ( + is_torch_available() and is_transformers_available() and is_onnx_available() + ): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: from .utils import dummy_torch_and_transformers_and_onnx_objects # noqa F403 _import_structure["utils.dummy_torch_and_transformers_and_onnx_objects"] = [ - name for name in dir(dummy_torch_and_transformers_and_onnx_objects) if not name.startswith("_") + name + for name in dir(dummy_torch_and_transformers_and_onnx_objects) + if not name.startswith("_") ] else: @@ -571,20 +620,26 @@ from .utils import dummy_torch_and_librosa_objects # noqa F403 _import_structure["utils.dummy_torch_and_librosa_objects"] = [ - name for name in dir(dummy_torch_and_librosa_objects) if not name.startswith("_") + name + for name in dir(dummy_torch_and_librosa_objects) + if not name.startswith("_") ] else: _import_structure["pipelines"].extend(["AudioDiffusionPipeline", "Mel"]) try: - if not (is_transformers_available() and is_torch_available() and is_note_seq_available()): + if not ( + is_transformers_available() and is_torch_available() and is_note_seq_available() + ): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: from .utils import 
dummy_transformers_and_torch_and_note_seq_objects # noqa F403 _import_structure["utils.dummy_transformers_and_torch_and_note_seq_objects"] = [ - name for name in dir(dummy_transformers_and_torch_and_note_seq_objects) if not name.startswith("_") + name + for name in dir(dummy_transformers_and_torch_and_note_seq_objects) + if not name.startswith("_") ] @@ -605,7 +660,9 @@ else: _import_structure["models.controlnets.controlnet_flax"] = ["FlaxControlNetModel"] _import_structure["models.modeling_flax_utils"] = ["FlaxModelMixin"] - _import_structure["models.unets.unet_2d_condition_flax"] = ["FlaxUNet2DConditionModel"] + _import_structure["models.unets.unet_2d_condition_flax"] = [ + "FlaxUNet2DConditionModel" + ] _import_structure["models.vae_flax"] = ["FlaxAutoencoderKL"] _import_structure["pipelines"].extend(["FlaxDiffusionPipeline"]) _import_structure["schedulers"].extend( @@ -630,7 +687,9 @@ from .utils import dummy_flax_and_transformers_objects # noqa F403 _import_structure["utils.dummy_flax_and_transformers_objects"] = [ - name for name in dir(dummy_flax_and_transformers_objects) if not name.startswith("_") + name + for name in dir(dummy_flax_and_transformers_objects) + if not name.startswith("_") ] @@ -763,6 +822,7 @@ OmniGenTransformer2DModel, PixArtTransformer2DModel, PriorTransformer, + SanaControlNetModel, SanaTransformer2DModel, SD3ControlNetModel, SD3MultiControlNetModel, @@ -979,6 +1039,7 @@ PixArtSigmaPAGPipeline, PixArtSigmaPipeline, ReduxImageEncoder, + SanaControlNetPipeline, SanaPAGPipeline, SanaPipeline, SanaSprintPipeline, @@ -1070,22 +1131,35 @@ ) try: - if not (is_torch_available() and is_transformers_available() and is_k_diffusion_available()): + if not ( + is_torch_available() + and is_transformers_available() + and is_k_diffusion_available() + ): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: from .utils.dummy_torch_and_transformers_and_k_diffusion_objects import * # noqa F403 else: - from .pipelines import StableDiffusionKDiffusionPipeline, StableDiffusionXLKDiffusionPipeline + from .pipelines import ( + StableDiffusionKDiffusionPipeline, + StableDiffusionXLKDiffusionPipeline, + ) try: - if not (is_torch_available() and is_transformers_available() and is_sentencepiece_available()): + if not ( + is_torch_available() + and is_transformers_available() + and is_sentencepiece_available() + ): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: from .utils.dummy_torch_and_transformers_and_sentencepiece_objects import * # noqa F403 else: from .pipelines import KolorsImg2ImgPipeline, KolorsPAGPipeline, KolorsPipeline try: - if not (is_torch_available() and is_transformers_available() and is_onnx_available()): + if not ( + is_torch_available() and is_transformers_available() and is_onnx_available() + ): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: from .utils.dummy_torch_and_transformers_and_onnx_objects import * # noqa F403 @@ -1108,7 +1182,11 @@ from .pipelines import AudioDiffusionPipeline, Mel try: - if not (is_transformers_available() and is_torch_available() and is_note_seq_available()): + if not ( + is_transformers_available() + and is_torch_available() + and is_note_seq_available() + ): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: from .utils.dummy_transformers_and_torch_and_note_seq_objects import * # noqa F403 diff --git a/src/diffusers/models/__init__.py b/src/diffusers/models/__init__.py index f7d70f1d9826..719325de13ef 100755 --- 
a/src/diffusers/models/__init__.py +++ b/src/diffusers/models/__init__.py @@ -21,7 +21,6 @@ is_torch_available, ) - _import_structure = {} if is_torch_available(): @@ -30,58 +29,102 @@ _import_structure["autoencoders.autoencoder_dc"] = ["AutoencoderDC"] _import_structure["autoencoders.autoencoder_kl"] = ["AutoencoderKL"] _import_structure["autoencoders.autoencoder_kl_allegro"] = ["AutoencoderKLAllegro"] - _import_structure["autoencoders.autoencoder_kl_cogvideox"] = ["AutoencoderKLCogVideoX"] - _import_structure["autoencoders.autoencoder_kl_hunyuan_video"] = ["AutoencoderKLHunyuanVideo"] + _import_structure["autoencoders.autoencoder_kl_cogvideox"] = [ + "AutoencoderKLCogVideoX" + ] + _import_structure["autoencoders.autoencoder_kl_hunyuan_video"] = [ + "AutoencoderKLHunyuanVideo" + ] _import_structure["autoencoders.autoencoder_kl_ltx"] = ["AutoencoderKLLTXVideo"] _import_structure["autoencoders.autoencoder_kl_magvit"] = ["AutoencoderKLMagvit"] _import_structure["autoencoders.autoencoder_kl_mochi"] = ["AutoencoderKLMochi"] - _import_structure["autoencoders.autoencoder_kl_temporal_decoder"] = ["AutoencoderKLTemporalDecoder"] + _import_structure["autoencoders.autoencoder_kl_temporal_decoder"] = [ + "AutoencoderKLTemporalDecoder" + ] _import_structure["autoencoders.autoencoder_kl_wan"] = ["AutoencoderKLWan"] _import_structure["autoencoders.autoencoder_oobleck"] = ["AutoencoderOobleck"] _import_structure["autoencoders.autoencoder_tiny"] = ["AutoencoderTiny"] - _import_structure["autoencoders.consistency_decoder_vae"] = ["ConsistencyDecoderVAE"] + _import_structure["autoencoders.consistency_decoder_vae"] = [ + "ConsistencyDecoderVAE" + ] _import_structure["autoencoders.vq_model"] = ["VQModel"] _import_structure["cache_utils"] = ["CacheMixin"] _import_structure["controlnets.controlnet"] = ["ControlNetModel"] - _import_structure["controlnets.controlnet_flux"] = ["FluxControlNetModel", "FluxMultiControlNetModel"] + _import_structure["controlnets.controlnet_sana"] = ["SanaControlNetModel"] + _import_structure["controlnets.controlnet_flux"] = [ + "FluxControlNetModel", + "FluxMultiControlNetModel", + ] _import_structure["controlnets.controlnet_hunyuan"] = [ "HunyuanDiT2DControlNetModel", "HunyuanDiT2DMultiControlNetModel", ] - _import_structure["controlnets.controlnet_sd3"] = ["SD3ControlNetModel", "SD3MultiControlNetModel"] + _import_structure["controlnets.controlnet_sd3"] = [ + "SD3ControlNetModel", + "SD3MultiControlNetModel", + ] _import_structure["controlnets.controlnet_sparsectrl"] = ["SparseControlNetModel"] _import_structure["controlnets.controlnet_union"] = ["ControlNetUnionModel"] - _import_structure["controlnets.controlnet_xs"] = ["ControlNetXSAdapter", "UNetControlNetXSModel"] + _import_structure["controlnets.controlnet_xs"] = [ + "ControlNetXSAdapter", + "UNetControlNetXSModel", + ] _import_structure["controlnets.multicontrolnet"] = ["MultiControlNetModel"] - _import_structure["controlnets.multicontrolnet_union"] = ["MultiControlNetUnionModel"] + _import_structure["controlnets.multicontrolnet_union"] = [ + "MultiControlNetUnionModel" + ] _import_structure["embeddings"] = ["ImageProjection"] _import_structure["modeling_utils"] = ["ModelMixin"] - _import_structure["transformers.auraflow_transformer_2d"] = ["AuraFlowTransformer2DModel"] - _import_structure["transformers.cogvideox_transformer_3d"] = ["CogVideoXTransformer3DModel"] - _import_structure["transformers.consisid_transformer_3d"] = ["ConsisIDTransformer3DModel"] + _import_structure["transformers.auraflow_transformer_2d"] = [ + 
"AuraFlowTransformer2DModel" + ] + _import_structure["transformers.cogvideox_transformer_3d"] = [ + "CogVideoXTransformer3DModel" + ] + _import_structure["transformers.consisid_transformer_3d"] = [ + "ConsisIDTransformer3DModel" + ] _import_structure["transformers.dit_transformer_2d"] = ["DiTTransformer2DModel"] _import_structure["transformers.dual_transformer_2d"] = ["DualTransformer2DModel"] _import_structure["transformers.hunyuan_transformer_2d"] = ["HunyuanDiT2DModel"] _import_structure["transformers.latte_transformer_3d"] = ["LatteTransformer3DModel"] _import_structure["transformers.lumina_nextdit2d"] = ["LuminaNextDiT2DModel"] - _import_structure["transformers.pixart_transformer_2d"] = ["PixArtTransformer2DModel"] + _import_structure["transformers.pixart_transformer_2d"] = [ + "PixArtTransformer2DModel" + ] _import_structure["transformers.prior_transformer"] = ["PriorTransformer"] _import_structure["transformers.sana_transformer"] = ["SanaTransformer2DModel"] _import_structure["transformers.stable_audio_transformer"] = ["StableAudioDiTModel"] _import_structure["transformers.t5_film_transformer"] = ["T5FilmDecoder"] _import_structure["transformers.transformer_2d"] = ["Transformer2DModel"] - _import_structure["transformers.transformer_allegro"] = ["AllegroTransformer3DModel"] - _import_structure["transformers.transformer_cogview3plus"] = ["CogView3PlusTransformer2DModel"] - _import_structure["transformers.transformer_cogview4"] = ["CogView4Transformer2DModel"] - _import_structure["transformers.transformer_easyanimate"] = ["EasyAnimateTransformer3DModel"] + _import_structure["transformers.transformer_allegro"] = [ + "AllegroTransformer3DModel" + ] + _import_structure["transformers.transformer_cogview3plus"] = [ + "CogView3PlusTransformer2DModel" + ] + _import_structure["transformers.transformer_cogview4"] = [ + "CogView4Transformer2DModel" + ] + _import_structure["transformers.transformer_easyanimate"] = [ + "EasyAnimateTransformer3DModel" + ] _import_structure["transformers.transformer_flux"] = ["FluxTransformer2DModel"] - _import_structure["transformers.transformer_hunyuan_video"] = ["HunyuanVideoTransformer3DModel"] + _import_structure["transformers.transformer_hunyuan_video"] = [ + "HunyuanVideoTransformer3DModel" + ] _import_structure["transformers.transformer_ltx"] = ["LTXVideoTransformer3DModel"] - _import_structure["transformers.transformer_lumina2"] = ["Lumina2Transformer2DModel"] + _import_structure["transformers.transformer_lumina2"] = [ + "Lumina2Transformer2DModel" + ] _import_structure["transformers.transformer_mochi"] = ["MochiTransformer3DModel"] - _import_structure["transformers.transformer_omnigen"] = ["OmniGenTransformer2DModel"] + _import_structure["transformers.transformer_omnigen"] = [ + "OmniGenTransformer2DModel" + ] _import_structure["transformers.transformer_sd3"] = ["SD3Transformer2DModel"] - _import_structure["transformers.transformer_temporal"] = ["TransformerTemporalModel"] + _import_structure["transformers.transformer_temporal"] = [ + "TransformerTemporalModel" + ] _import_structure["transformers.transformer_wan"] = ["WanTransformer3DModel"] _import_structure["unets.unet_1d"] = ["UNet1DModel"] _import_structure["unets.unet_2d"] = ["UNet2DModel"] @@ -90,7 +133,9 @@ _import_structure["unets.unet_i2vgen_xl"] = ["I2VGenXLUNet"] _import_structure["unets.unet_kandinsky3"] = ["Kandinsky3UNet"] _import_structure["unets.unet_motion_model"] = ["MotionAdapter", "UNetMotionModel"] - _import_structure["unets.unet_spatio_temporal_condition"] = 
["UNetSpatioTemporalConditionModel"] + _import_structure["unets.unet_spatio_temporal_condition"] = [ + "UNetSpatioTemporalConditionModel" + ] _import_structure["unets.unet_stable_cascade"] = ["StableCascadeUNet"] _import_structure["unets.uvit_2d"] = ["UVit2DModel"] @@ -131,6 +176,7 @@ HunyuanDiT2DMultiControlNetModel, MultiControlNetModel, MultiControlNetUnionModel, + SanaControlNetModel, SD3ControlNetModel, SD3MultiControlNetModel, SparseControlNetModel, @@ -189,4 +235,6 @@ else: import sys - sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) + sys.modules[__name__] = _LazyModule( + __name__, globals()["__file__"], _import_structure, module_spec=__spec__ + ) diff --git a/src/diffusers/models/controlnets/__init__.py b/src/diffusers/models/controlnets/__init__.py index 1dd92e51a44c..621de4329868 100644 --- a/src/diffusers/models/controlnets/__init__.py +++ b/src/diffusers/models/controlnets/__init__.py @@ -1,22 +1,34 @@ from ...utils import is_flax_available, is_torch_available - if is_torch_available(): from .controlnet import ControlNetModel, ControlNetOutput - from .controlnet_flux import FluxControlNetModel, FluxControlNetOutput, FluxMultiControlNetModel + from .controlnet_flux import ( + FluxControlNetModel, + FluxControlNetOutput, + FluxMultiControlNetModel, + ) from .controlnet_hunyuan import ( HunyuanControlNetOutput, HunyuanDiT2DControlNetModel, HunyuanDiT2DMultiControlNetModel, ) - from .controlnet_sd3 import SD3ControlNetModel, SD3ControlNetOutput, SD3MultiControlNetModel + from .controlnet_sana import SanaControlNetModel + from .controlnet_sd3 import ( + SD3ControlNetModel, + SD3ControlNetOutput, + SD3MultiControlNetModel, + ) from .controlnet_sparsectrl import ( SparseControlNetConditioningEmbedding, SparseControlNetModel, SparseControlNetOutput, ) from .controlnet_union import ControlNetUnionModel - from .controlnet_xs import ControlNetXSAdapter, ControlNetXSOutput, UNetControlNetXSModel + from .controlnet_xs import ( + ControlNetXSAdapter, + ControlNetXSOutput, + UNetControlNetXSModel, + ) from .multicontrolnet import MultiControlNetModel from .multicontrolnet_union import MultiControlNetUnionModel diff --git a/src/diffusers/models/controlnets/controlnet_sana.py b/src/diffusers/models/controlnets/controlnet_sana.py new file mode 100644 index 000000000000..ed521adbedda --- /dev/null +++ b/src/diffusers/models/controlnets/controlnet_sana.py @@ -0,0 +1,290 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from dataclasses import dataclass +from typing import Any, Dict, Optional, Tuple, Union + +import torch +from torch import nn + +from ...configuration_utils import ConfigMixin, register_to_config +from ...loaders import PeftAdapterMixin +from ...utils import USE_PEFT_BACKEND, BaseOutput, logging, scale_lora_layers, unscale_lora_layers +from ..attention_processor import AttentionProcessor +from ..embeddings import PatchEmbed, PixArtAlphaTextProjection +from ..modeling_outputs import Transformer2DModelOutput +from ..modeling_utils import ModelMixin +from ..normalization import AdaLayerNormSingle, RMSNorm +from ..transformers.sana_transformer import SanaTransformerBlock +from .controlnet import zero_module + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +@dataclass +class SanaControlNetOutput(BaseOutput): + controlnet_block_samples: Tuple[torch.Tensor] + + +class SanaControlNetModel(ModelMixin, ConfigMixin, PeftAdapterMixin): + _supports_gradient_checkpointing = True + _no_split_modules = ["SanaTransformerBlock", "PatchEmbed"] + _skip_layerwise_casting_patterns = ["patch_embed", "norm"] + + @register_to_config + def __init__( + self, + in_channels: int = 32, + out_channels: Optional[int] = 32, + num_attention_heads: int = 70, + attention_head_dim: int = 32, + num_layers: int = 7, + num_cross_attention_heads: Optional[int] = 20, + cross_attention_head_dim: Optional[int] = 112, + cross_attention_dim: Optional[int] = 2240, + caption_channels: int = 2304, + mlp_ratio: float = 2.5, + dropout: float = 0.0, + attention_bias: bool = False, + sample_size: int = 32, + patch_size: int = 1, + norm_elementwise_affine: bool = False, + norm_eps: float = 1e-6, + interpolation_scale: Optional[int] = None, + ) -> None: + super().__init__() + + out_channels = out_channels or in_channels + inner_dim = num_attention_heads * attention_head_dim + + # 1. Patch Embedding + self.patch_embed = PatchEmbed( + height=sample_size, + width=sample_size, + patch_size=patch_size, + in_channels=in_channels, + embed_dim=inner_dim, + interpolation_scale=interpolation_scale, + pos_embed_type="sincos" if interpolation_scale is not None else None, + ) + + # 2. Additional condition embeddings + self.time_embed = AdaLayerNormSingle(inner_dim) + + self.caption_projection = PixArtAlphaTextProjection(in_features=caption_channels, hidden_size=inner_dim) + self.caption_norm = RMSNorm(inner_dim, eps=1e-5, elementwise_affine=True) + + # 3. 
Transformer blocks + self.transformer_blocks = nn.ModuleList( + [ + SanaTransformerBlock( + inner_dim, + num_attention_heads, + attention_head_dim, + dropout=dropout, + num_cross_attention_heads=num_cross_attention_heads, + cross_attention_head_dim=cross_attention_head_dim, + cross_attention_dim=cross_attention_dim, + attention_bias=attention_bias, + norm_elementwise_affine=norm_elementwise_affine, + norm_eps=norm_eps, + mlp_ratio=mlp_ratio, + ) + for _ in range(num_layers) + ] + ) + + # controlnet_blocks + self.controlnet_blocks = nn.ModuleList([]) + + self.input_block = zero_module(nn.Linear(inner_dim, inner_dim)) + for _ in range(len(self.transformer_blocks)): + controlnet_block = nn.Linear(inner_dim, inner_dim) + controlnet_block = zero_module(controlnet_block) + self.controlnet_blocks.append(controlnet_block) + + self.gradient_checkpointing = False + + @property + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors + def attn_processors(self) -> Dict[str, AttentionProcessor]: + r""" + Returns: + `dict` of attention processors: A dictionary containing all attention processors used in the model with + indexed by its weight name. + """ + # set recursively + processors = {} + + def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]): + if hasattr(module, "get_processor"): + processors[f"{name}.processor"] = module.get_processor() + + for sub_name, child in module.named_children(): + fn_recursive_add_processors(f"{name}.{sub_name}", child, processors) + + return processors + + for name, module in self.named_children(): + fn_recursive_add_processors(name, module, processors) + + return processors + + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor + def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]): + r""" + Sets the attention processor to use to compute attention. + + Parameters: + processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`): + The instantiated processor class or a dictionary of processor classes that will be set as the processor + for **all** `Attention` layers. + + If `processor` is a dict, the key needs to define the path to the corresponding cross attention + processor. This is strongly recommended when setting trainable attention processors. + + """ + count = len(self.attn_processors.keys()) + + if isinstance(processor, dict) and len(processor) != count: + raise ValueError( + f"A dict of processors was passed, but the number of processors {len(processor)} does not match the" + f" number of attention layers: {count}. Please make sure to pass {count} processor classes." 
+ ) + + def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor): + if hasattr(module, "set_processor"): + if not isinstance(processor, dict): + module.set_processor(processor) + else: + module.set_processor(processor.pop(f"{name}.processor")) + + for sub_name, child in module.named_children(): + fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor) + + for name, module in self.named_children(): + fn_recursive_attn_processor(name, module, processor) + + def forward( + self, + hidden_states: torch.Tensor, + encoder_hidden_states: torch.Tensor, + timestep: torch.LongTensor, + controlnet_cond: torch.Tensor, + conditioning_scale: float = 1.0, + encoder_attention_mask: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + attention_kwargs: Optional[Dict[str, Any]] = None, + return_dict: bool = True, + ) -> Union[Tuple[torch.Tensor, ...], Transformer2DModelOutput]: + if attention_kwargs is not None: + attention_kwargs = attention_kwargs.copy() + lora_scale = attention_kwargs.pop("scale", 1.0) + else: + lora_scale = 1.0 + + if USE_PEFT_BACKEND: + # weight the lora layers by setting `lora_scale` for each PEFT layer + scale_lora_layers(self, lora_scale) + else: + if attention_kwargs is not None and attention_kwargs.get("scale", None) is not None: + logger.warning( + "Passing `scale` via `attention_kwargs` when not using the PEFT backend is ineffective." + ) + + # ensure attention_mask is a bias, and give it a singleton query_tokens dimension. + # we may have done this conversion already, e.g. if we came here via UNet2DConditionModel#forward. + # we can tell by counting dims; if ndim == 2: it's a mask rather than a bias. + # expects mask of shape: + # [batch, key_tokens] + # adds singleton query_tokens dimension: + # [batch, 1, key_tokens] + # this helps to broadcast it as a bias over attention scores, which will be in one of the following shapes: + # [batch, heads, query_tokens, key_tokens] (e.g. torch sdp attn) + # [batch * heads, query_tokens, key_tokens] (e.g. xformers or classic attn) + if attention_mask is not None and attention_mask.ndim == 2: + # assume that mask is expressed as: + # (1 = keep, 0 = discard) + # convert mask into a bias that can be added to attention scores: + # (keep = +0, discard = -10000.0) + attention_mask = (1 - attention_mask.to(hidden_states.dtype)) * -10000.0 + attention_mask = attention_mask.unsqueeze(1) + + # convert encoder_attention_mask to a bias the same way we do for attention_mask + if encoder_attention_mask is not None and encoder_attention_mask.ndim == 2: + encoder_attention_mask = (1 - encoder_attention_mask.to(hidden_states.dtype)) * -10000.0 + encoder_attention_mask = encoder_attention_mask.unsqueeze(1) + + # 1. Input + batch_size, num_channels, height, width = hidden_states.shape + p = self.config.patch_size + post_patch_height, post_patch_width = height // p, width // p + + hidden_states = self.patch_embed(hidden_states) + hidden_states = hidden_states + self.input_block(self.patch_embed(controlnet_cond.to(hidden_states.dtype))) + + timestep, embedded_timestep = self.time_embed( + timestep, batch_size=batch_size, hidden_dtype=hidden_states.dtype + ) + + encoder_hidden_states = self.caption_projection(encoder_hidden_states) + encoder_hidden_states = encoder_hidden_states.view(batch_size, -1, hidden_states.shape[-1]) + + encoder_hidden_states = self.caption_norm(encoder_hidden_states) + + # 2. 
Transformer blocks + block_res_samples = () + if torch.is_grad_enabled() and self.gradient_checkpointing: + for block in self.transformer_blocks: + hidden_states = self._gradient_checkpointing_func( + block, + hidden_states, + attention_mask, + encoder_hidden_states, + encoder_attention_mask, + timestep, + post_patch_height, + post_patch_width, + ) + block_res_samples = block_res_samples + (hidden_states,) + else: + for block in self.transformer_blocks: + hidden_states = block( + hidden_states, + attention_mask, + encoder_hidden_states, + encoder_attention_mask, + timestep, + post_patch_height, + post_patch_width, + ) + block_res_samples = block_res_samples + (hidden_states,) + + # 3. ControlNet blocks + controlnet_block_res_samples = () + for block_res_sample, controlnet_block in zip(block_res_samples, self.controlnet_blocks): + block_res_sample = controlnet_block(block_res_sample) + controlnet_block_res_samples = controlnet_block_res_samples + (block_res_sample,) + + if USE_PEFT_BACKEND: + # remove `lora_scale` from each PEFT layer + unscale_lora_layers(self, lora_scale) + + controlnet_block_res_samples = [sample * conditioning_scale for sample in controlnet_block_res_samples] + + if not return_dict: + return (controlnet_block_res_samples,) + + return SanaControlNetOutput(controlnet_block_samples=controlnet_block_res_samples) diff --git a/src/diffusers/models/transformers/sana_transformer.py b/src/diffusers/models/transformers/sana_transformer.py index 48b731406191..54e996e13d42 100644 --- a/src/diffusers/models/transformers/sana_transformer.py +++ b/src/diffusers/models/transformers/sana_transformer.py @@ -1,4 +1,4 @@ -# Copyright 2024 The HuggingFace Team. All rights reserved. +# Copyright 2025 The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
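
The `SanaControlNetModel` defined above follows the usual ControlNet recipe: `input_block` and every entry of `controlnet_blocks` are wrapped in `zero_module`, so a freshly initialised copy emits all-zero residuals and leaves the frozen base transformer untouched until it is trained. A toy-sized sketch of that property (the config values below are made up to keep the example cheap and are not taken from any released checkpoint):

```python
import torch

from diffusers import SanaControlNetModel

# Tiny, assumed config: inner_dim = 2 * 8 = 16 instead of the default 70 * 32.
controlnet = SanaControlNetModel(
    in_channels=4,
    num_attention_heads=2,
    attention_head_dim=8,
    num_layers=2,
    num_cross_attention_heads=2,
    cross_attention_head_dim=8,
    cross_attention_dim=16,
    caption_channels=8,
    sample_size=8,
    patch_size=1,
)

latents = torch.randn(1, 4, 8, 8)        # hidden_states, already in latent space
cond = torch.randn(1, 4, 8, 8)           # controlnet_cond, same shape as the latents
prompt_embeds = torch.randn(1, 5, 8)     # (batch, seq_len, caption_channels)
t = torch.tensor([500], dtype=torch.long)

residuals = controlnet(latents, prompt_embeds, t, controlnet_cond=cond, return_dict=False)[0]
# One residual per transformer block, all exactly zero at initialisation because the
# zero-initialised Linear layers in `controlnet_blocks` map every block output to zero.
print(len(residuals), all(r.abs().sum() == 0 for r in residuals))  # 2 True
```

The diff resumes with the companion changes to `sana_transformer.py`.
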
@@ -26,12 +26,16 @@ AttentionProcessor, SanaLinearAttnProcessor2_0, ) -from ..embeddings import PatchEmbed, PixArtAlphaTextProjection, TimestepEmbedding, Timesteps +from ..embeddings import ( + PatchEmbed, + PixArtAlphaTextProjection, + TimestepEmbedding, + Timesteps, +) from ..modeling_outputs import Transformer2DModelOutput from ..modeling_utils import ModelMixin from ..normalization import AdaLayerNormSingle, RMSNorm - logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -52,12 +56,21 @@ def __init__( self.nonlinearity = nn.SiLU() self.conv_inverted = nn.Conv2d(in_channels, hidden_channels * 2, 1, 1, 0) - self.conv_depth = nn.Conv2d(hidden_channels * 2, hidden_channels * 2, 3, 1, 1, groups=hidden_channels * 2) + self.conv_depth = nn.Conv2d( + hidden_channels * 2, + hidden_channels * 2, + 3, + 1, + 1, + groups=hidden_channels * 2, + ) self.conv_point = nn.Conv2d(hidden_channels, out_channels, 1, 1, 0, bias=False) self.norm = None if norm_type == "rms_norm": - self.norm = RMSNorm(out_channels, eps=1e-5, elementwise_affine=True, bias=True) + self.norm = RMSNorm( + out_channels, eps=1e-5, elementwise_affine=True, bias=True + ) def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: if self.residual_connection: @@ -88,10 +101,15 @@ def __init__(self, dim: int, elementwise_affine: bool = False, eps: float = 1e-6 self.norm = nn.LayerNorm(dim, elementwise_affine=elementwise_affine, eps=eps) def forward( - self, hidden_states: torch.Tensor, temb: torch.Tensor, scale_shift_table: torch.Tensor + self, + hidden_states: torch.Tensor, + temb: torch.Tensor, + scale_shift_table: torch.Tensor, ) -> torch.Tensor: hidden_states = self.norm(hidden_states) - shift, scale = (scale_shift_table[None] + temb[:, None].to(scale_shift_table.device)).chunk(2, dim=1) + shift, scale = ( + scale_shift_table[None] + temb[:, None].to(scale_shift_table.device) + ).chunk(2, dim=1) hidden_states = hidden_states * (1 + scale) + shift return hidden_states @@ -99,18 +117,33 @@ def forward( class SanaCombinedTimestepGuidanceEmbeddings(nn.Module): def __init__(self, embedding_dim): super().__init__() - self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0) - self.timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim) + self.time_proj = Timesteps( + num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0 + ) + self.timestep_embedder = TimestepEmbedding( + in_channels=256, time_embed_dim=embedding_dim + ) - self.guidance_condition_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0) - self.guidance_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim) + self.guidance_condition_proj = Timesteps( + num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0 + ) + self.guidance_embedder = TimestepEmbedding( + in_channels=256, time_embed_dim=embedding_dim + ) self.silu = nn.SiLU() self.linear = nn.Linear(embedding_dim, 6 * embedding_dim, bias=True) - def forward(self, timestep: torch.Tensor, guidance: torch.Tensor = None, hidden_dtype: torch.dtype = None): + def forward( + self, + timestep: torch.Tensor, + guidance: torch.Tensor = None, + hidden_dtype: torch.dtype = None, + ): timesteps_proj = self.time_proj(timestep) - timesteps_emb = self.timestep_embedder(timesteps_proj.to(dtype=hidden_dtype)) # (N, D) + timesteps_emb = self.timestep_embedder( + timesteps_proj.to(dtype=hidden_dtype) + ) # (N, D) guidance_proj = self.guidance_condition_proj(guidance) guidance_emb = 
self.guidance_embedder(guidance_proj.to(dtype=hidden_dtype)) @@ -126,7 +159,9 @@ class SanaAttnProcessor2_0: def __init__(self): if not hasattr(F, "scaled_dot_product_attention"): - raise ImportError("SanaAttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.") + raise ImportError( + "SanaAttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0." + ) def __call__( self, @@ -136,14 +171,20 @@ def __call__( attention_mask: Optional[torch.Tensor] = None, ) -> torch.Tensor: batch_size, sequence_length, _ = ( - hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape + hidden_states.shape + if encoder_hidden_states is None + else encoder_hidden_states.shape ) if attention_mask is not None: - attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) + attention_mask = attn.prepare_attention_mask( + attention_mask, sequence_length, batch_size + ) # scaled_dot_product_attention expects attention_mask shape to be # (batch, heads, source_length, target_length) - attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1]) + attention_mask = attention_mask.view( + batch_size, attn.heads, -1, attention_mask.shape[-1] + ) query = attn.to_q(hidden_states) @@ -172,7 +213,9 @@ def __call__( query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False ) - hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim) + hidden_states = hidden_states.transpose(1, 2).reshape( + batch_size, -1, attn.heads * head_dim + ) hidden_states = hidden_states.to(query.dtype) # linear proj @@ -224,7 +267,9 @@ def __init__( # 2. Cross Attention if cross_attention_dim is not None: - self.norm2 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps) + self.norm2 = nn.LayerNorm( + dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps + ) self.attn2 = Attention( query_dim=dim, qk_norm=qk_norm, @@ -239,7 +284,9 @@ def __init__( ) # 3. Feed-forward - self.ff = GLUMBConv(dim, dim, mlp_ratio, norm_type=None, residual_connection=False) + self.ff = GLUMBConv( + dim, dim, mlp_ratio, norm_type=None, residual_connection=False + ) self.scale_shift_table = nn.Parameter(torch.randn(6, dim) / dim**0.5) @@ -281,7 +328,9 @@ def forward( norm_hidden_states = self.norm2(hidden_states) norm_hidden_states = norm_hidden_states * (1 + scale_mlp) + shift_mlp - norm_hidden_states = norm_hidden_states.unflatten(1, (height, width)).permute(0, 3, 1, 2) + norm_hidden_states = norm_hidden_states.unflatten(1, (height, width)).permute( + 0, 3, 1, 2 + ) ff_output = self.ff(norm_hidden_states) ff_output = ff_output.flatten(2, 3).permute(0, 2, 1) hidden_states = hidden_states + gate_mlp * ff_output @@ -289,7 +338,9 @@ def forward( return hidden_states -class SanaTransformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin): +class SanaTransformer2DModel( + ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin +): r""" A 2D Transformer model introduced in [Sana](https://huggingface.co/papers/2410.10629) family of models. @@ -383,7 +434,9 @@ def __init__( else: self.time_embed = AdaLayerNormSingle(inner_dim) - self.caption_projection = PixArtAlphaTextProjection(in_features=caption_channels, hidden_size=inner_dim) + self.caption_projection = PixArtAlphaTextProjection( + in_features=caption_channels, hidden_size=inner_dim + ) self.caption_norm = RMSNorm(inner_dim, eps=1e-5, elementwise_affine=True) # 3. 
Transformer blocks @@ -408,7 +461,9 @@ def __init__( ) # 4. Output blocks - self.scale_shift_table = nn.Parameter(torch.randn(2, inner_dim) / inner_dim**0.5) + self.scale_shift_table = nn.Parameter( + torch.randn(2, inner_dim) / inner_dim**0.5 + ) self.norm_out = SanaModulatedNorm(inner_dim, elementwise_affine=False, eps=1e-6) self.proj_out = nn.Linear(inner_dim, patch_size * patch_size * out_channels) @@ -425,7 +480,11 @@ def attn_processors(self) -> Dict[str, AttentionProcessor]: # set recursively processors = {} - def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]): + def fn_recursive_add_processors( + name: str, + module: torch.nn.Module, + processors: Dict[str, AttentionProcessor], + ): if hasattr(module, "get_processor"): processors[f"{name}.processor"] = module.get_processor() @@ -440,7 +499,9 @@ def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: return processors # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor - def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]): + def set_attn_processor( + self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]] + ): r""" Sets the attention processor to use to compute attention. @@ -483,6 +544,7 @@ def forward( encoder_attention_mask: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None, attention_kwargs: Optional[Dict[str, Any]] = None, + controlnet_block_samples: Optional[Tuple[torch.Tensor]] = None, return_dict: bool = True, ) -> Union[Tuple[torch.Tensor, ...], Transformer2DModelOutput]: if attention_kwargs is not None: @@ -495,7 +557,10 @@ def forward( # weight the lora layers by setting `lora_scale` for each PEFT layer scale_lora_layers(self, lora_scale) else: - if attention_kwargs is not None and attention_kwargs.get("scale", None) is not None: + if ( + attention_kwargs is not None + and attention_kwargs.get("scale", None) is not None + ): logger.warning( "Passing `scale` via `attention_kwargs` when not using the PEFT backend is ineffective." ) @@ -520,7 +585,9 @@ def forward( # convert encoder_attention_mask to a bias the same way we do for attention_mask if encoder_attention_mask is not None and encoder_attention_mask.ndim == 2: - encoder_attention_mask = (1 - encoder_attention_mask.to(hidden_states.dtype)) * -10000.0 + encoder_attention_mask = ( + 1 - encoder_attention_mask.to(hidden_states.dtype) + ) * -10000.0 encoder_attention_mask = encoder_attention_mask.unsqueeze(1) # 1. Input @@ -540,13 +607,15 @@ def forward( ) encoder_hidden_states = self.caption_projection(encoder_hidden_states) - encoder_hidden_states = encoder_hidden_states.view(batch_size, -1, hidden_states.shape[-1]) + encoder_hidden_states = encoder_hidden_states.view( + batch_size, -1, hidden_states.shape[-1] + ) encoder_hidden_states = self.caption_norm(encoder_hidden_states) # 2. 
Transformer blocks if torch.is_grad_enabled() and self.gradient_checkpointing: - for block in self.transformer_blocks: + for index_block, block in enumerate(self.transformer_blocks): hidden_states = self._gradient_checkpointing_func( block, hidden_states, @@ -557,9 +626,15 @@ def forward( post_patch_height, post_patch_width, ) + if controlnet_block_samples is not None and 0 < index_block <= len( + controlnet_block_samples + ): + hidden_states = ( + hidden_states + controlnet_block_samples[index_block - 1] + ) else: - for block in self.transformer_blocks: + for index_block, block in enumerate(self.transformer_blocks): hidden_states = block( hidden_states, attention_mask, @@ -569,18 +644,33 @@ def forward( post_patch_height, post_patch_width, ) + if controlnet_block_samples is not None and 0 < index_block <= len( + controlnet_block_samples + ): + hidden_states = ( + hidden_states + controlnet_block_samples[index_block - 1] + ) # 3. Normalization - hidden_states = self.norm_out(hidden_states, embedded_timestep, self.scale_shift_table) + hidden_states = self.norm_out( + hidden_states, embedded_timestep, self.scale_shift_table + ) hidden_states = self.proj_out(hidden_states) # 5. Unpatchify hidden_states = hidden_states.reshape( - batch_size, post_patch_height, post_patch_width, self.config.patch_size, self.config.patch_size, -1 + batch_size, + post_patch_height, + post_patch_width, + self.config.patch_size, + self.config.patch_size, + -1, ) hidden_states = hidden_states.permute(0, 5, 1, 3, 2, 4) - output = hidden_states.reshape(batch_size, -1, post_patch_height * p, post_patch_width * p) + output = hidden_states.reshape( + batch_size, -1, post_patch_height * p, post_patch_width * p + ) if USE_PEFT_BACKEND: # remove `lora_scale` from each PEFT layer diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index b901d42d9cf7..90c85547d652 100644 --- a/src/diffusers/pipelines/__init__.py +++ b/src/diffusers/pipelines/__init__.py @@ -16,7 +16,6 @@ is_transformers_available, ) - # These modules contain pipelines from multiple libraries/frameworks _dummy_objects = {} _import_structure = { @@ -78,12 +77,16 @@ _import_structure["deprecated"].extend(["AudioDiffusionPipeline", "Mel"]) try: - if not (is_transformers_available() and is_torch_available() and is_note_seq_available()): + if not ( + is_transformers_available() and is_torch_available() and is_note_seq_available() + ): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: from ..utils import dummy_transformers_and_torch_and_note_seq_objects # noqa F403 - _dummy_objects.update(get_objects_from_module(dummy_transformers_and_torch_and_note_seq_objects)) + _dummy_objects.update( + get_objects_from_module(dummy_transformers_and_torch_and_note_seq_objects) + ) else: _import_structure["deprecated"].extend( [ @@ -117,7 +120,11 @@ ] ) _import_structure["allegro"] = ["AllegroPipeline"] - _import_structure["amused"] = ["AmusedImg2ImgPipeline", "AmusedInpaintPipeline", "AmusedPipeline"] + _import_structure["amused"] = [ + "AmusedImg2ImgPipeline", + "AmusedInpaintPipeline", + "AmusedPipeline", + ] _import_structure["animatediff"] = [ "AnimateDiffPipeline", "AnimateDiffControlNetPipeline", @@ -264,7 +271,11 @@ ] ) _import_structure["latte"] = ["LattePipeline"] - _import_structure["ltx"] = ["LTXPipeline", "LTXImageToVideoPipeline", "LTXConditionPipeline"] + _import_structure["ltx"] = [ + "LTXPipeline", + "LTXImageToVideoPipeline", + "LTXConditionPipeline", + ] _import_structure["lumina"] = 
["LuminaPipeline", "LuminaText2ImgPipeline"] _import_structure["lumina2"] = ["Lumina2Pipeline", "Lumina2Text2ImgPipeline"] _import_structure["marigold"].extend( @@ -280,7 +291,11 @@ _import_structure["paint_by_example"] = ["PaintByExamplePipeline"] _import_structure["pia"] = ["PIAPipeline"] _import_structure["pixart_alpha"] = ["PixArtAlphaPipeline", "PixArtSigmaPipeline"] - _import_structure["sana"] = ["SanaPipeline", "SanaSprintPipeline"] + _import_structure["sana"] = [ + "SanaPipeline", + "SanaSprintPipeline", + "SanaControlNetPipeline", + ] _import_structure["semantic_stable_diffusion"] = ["SemanticStableDiffusionPipeline"] _import_structure["shap_e"] = ["ShapEImg2ImgPipeline", "ShapEPipeline"] _import_structure["stable_audio"] = [ @@ -314,7 +329,9 @@ "StableDiffusion3Img2ImgPipeline", "StableDiffusion3InpaintPipeline", ] - _import_structure["stable_diffusion_attend_and_excite"] = ["StableDiffusionAttendAndExcitePipeline"] + _import_structure["stable_diffusion_attend_and_excite"] = [ + "StableDiffusionAttendAndExcitePipeline" + ] _import_structure["stable_diffusion_safe"] = ["StableDiffusionPipelineSafe"] _import_structure["stable_diffusion_sag"] = ["StableDiffusionSAGPipeline"] _import_structure["stable_diffusion_gligen"] = [ @@ -356,7 +373,11 @@ "WuerstchenDecoderPipeline", "WuerstchenPriorPipeline", ] - _import_structure["wan"] = ["WanPipeline", "WanImageToVideoPipeline", "WanVideoToVideoPipeline"] + _import_structure["wan"] = [ + "WanPipeline", + "WanImageToVideoPipeline", + "WanVideoToVideoPipeline", + ] try: if not is_onnx_available(): raise OptionalDependencyNotAvailable() @@ -367,12 +388,16 @@ else: _import_structure["onnx_utils"] = ["OnnxRuntimeModel"] try: - if not (is_torch_available() and is_transformers_available() and is_onnx_available()): + if not ( + is_torch_available() and is_transformers_available() and is_onnx_available() + ): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: from ..utils import dummy_torch_and_transformers_and_onnx_objects # noqa F403 - _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_and_onnx_objects)) + _dummy_objects.update( + get_objects_from_module(dummy_torch_and_transformers_and_onnx_objects) + ) else: _import_structure["stable_diffusion"].extend( [ @@ -385,14 +410,18 @@ ) try: - if not (is_torch_available() and is_transformers_available() and is_k_diffusion_available()): + if not ( + is_torch_available() + and is_transformers_available() + and is_k_diffusion_available() + ): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from ..utils import ( - dummy_torch_and_transformers_and_k_diffusion_objects, - ) + from ..utils import dummy_torch_and_transformers_and_k_diffusion_objects - _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_and_k_diffusion_objects)) + _dummy_objects.update( + get_objects_from_module(dummy_torch_and_transformers_and_k_diffusion_objects) + ) else: _import_structure["stable_diffusion_k_diffusion"] = [ "StableDiffusionKDiffusionPipeline", @@ -400,14 +429,18 @@ ] try: - if not (is_torch_available() and is_transformers_available() and is_sentencepiece_available()): + if not ( + is_torch_available() + and is_transformers_available() + and is_sentencepiece_available() + ): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from ..utils import ( - dummy_torch_and_transformers_and_sentencepiece_objects, - ) + from ..utils import dummy_torch_and_transformers_and_sentencepiece_objects - 
_dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_and_sentencepiece_objects)) + _dummy_objects.update( + get_objects_from_module(dummy_torch_and_transformers_and_sentencepiece_objects) + ) else: _import_structure["kolors"] = [ "KolorsPipeline", @@ -462,7 +495,13 @@ from .dance_diffusion import DanceDiffusionPipeline from .ddim import DDIMPipeline from .ddpm import DDPMPipeline - from .deprecated import KarrasVePipeline, LDMPipeline, PNDMPipeline, RePaintPipeline, ScoreSdeVePipeline + from .deprecated import ( + KarrasVePipeline, + LDMPipeline, + PNDMPipeline, + RePaintPipeline, + ScoreSdeVePipeline, + ) from .dit import DiTPipeline from .latent_diffusion import LDMSuperResolutionPipeline from .pipeline_utils import ( @@ -525,10 +564,11 @@ StableDiffusionXLControlNetUnionInpaintPipeline, StableDiffusionXLControlNetUnionPipeline, ) - from .controlnet_hunyuandit import ( - HunyuanDiTControlNetPipeline, + from .controlnet_hunyuandit import HunyuanDiTControlNetPipeline + from .controlnet_sd3 import ( + StableDiffusion3ControlNetInpaintingPipeline, + StableDiffusion3ControlNetPipeline, ) - from .controlnet_sd3 import StableDiffusion3ControlNetInpaintingPipeline, StableDiffusion3ControlNetPipeline from .controlnet_xs import ( StableDiffusionControlNetXSPipeline, StableDiffusionXLControlNetXSPipeline, @@ -602,10 +642,7 @@ KandinskyV22PriorEmb2EmbPipeline, KandinskyV22PriorPipeline, ) - from .kandinsky3 import ( - Kandinsky3Img2ImgPipeline, - Kandinsky3Pipeline, - ) + from .kandinsky3 import Kandinsky3Img2ImgPipeline, Kandinsky3Pipeline from .latent_consistency_models import ( LatentConsistencyModelImg2ImgPipeline, LatentConsistencyModelPipeline, @@ -651,7 +688,7 @@ from .paint_by_example import PaintByExamplePipeline from .pia import PIAPipeline from .pixart_alpha import PixArtAlphaPipeline, PixArtSigmaPipeline - from .sana import SanaPipeline, SanaSprintPipeline + from .sana import SanaControlNetPipeline, SanaPipeline, SanaSprintPipeline from .semantic_stable_diffusion import SemanticStableDiffusionPipeline from .shap_e import ShapEImg2ImgPipeline, ShapEPipeline from .stable_audio import StableAudioPipeline, StableAudioProjectionModel @@ -678,9 +715,14 @@ StableDiffusion3InpaintPipeline, StableDiffusion3Pipeline, ) - from .stable_diffusion_attend_and_excite import StableDiffusionAttendAndExcitePipeline + from .stable_diffusion_attend_and_excite import ( + StableDiffusionAttendAndExcitePipeline, + ) from .stable_diffusion_diffedit import StableDiffusionDiffEditPipeline - from .stable_diffusion_gligen import StableDiffusionGLIGENPipeline, StableDiffusionGLIGENTextImagePipeline + from .stable_diffusion_gligen import ( + StableDiffusionGLIGENPipeline, + StableDiffusionGLIGENTextImagePipeline, + ) from .stable_diffusion_ldm3d import StableDiffusionLDM3DPipeline from .stable_diffusion_panorama import StableDiffusionPanoramaPipeline from .stable_diffusion_safe import StableDiffusionPipelineSafe @@ -726,7 +768,11 @@ from .onnx_utils import OnnxRuntimeModel try: - if not (is_torch_available() and is_transformers_available() and is_onnx_available()): + if not ( + is_torch_available() + and is_transformers_available() + and is_onnx_available() + ): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: from ..utils.dummy_torch_and_transformers_and_onnx_objects import * @@ -740,7 +786,11 @@ ) try: - if not (is_torch_available() and is_transformers_available() and is_k_diffusion_available()): + if not ( + is_torch_available() + and is_transformers_available() + 
and is_k_diffusion_available() + ): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: from ..utils.dummy_torch_and_transformers_and_k_diffusion_objects import * @@ -751,15 +801,16 @@ ) try: - if not (is_torch_available() and is_transformers_available() and is_sentencepiece_available()): + if not ( + is_torch_available() + and is_transformers_available() + and is_sentencepiece_available() + ): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: from ..utils.dummy_torch_and_transformers_and_sentencepiece_objects import * else: - from .kolors import ( - KolorsImg2ImgPipeline, - KolorsPipeline, - ) + from .kolors import KolorsImg2ImgPipeline, KolorsPipeline try: if not is_flax_available(): @@ -781,21 +832,20 @@ FlaxStableDiffusionInpaintPipeline, FlaxStableDiffusionPipeline, ) - from .stable_diffusion_xl import ( - FlaxStableDiffusionXLPipeline, - ) + from .stable_diffusion_xl import FlaxStableDiffusionXLPipeline try: - if not (is_transformers_available() and is_torch_available() and is_note_seq_available()): + if not ( + is_transformers_available() + and is_torch_available() + and is_note_seq_available() + ): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: from ..utils.dummy_transformers_and_torch_and_note_seq_objects import * # noqa F403 else: - from .deprecated import ( - MidiProcessor, - SpectrogramDiffusionPipeline, - ) + from .deprecated import MidiProcessor, SpectrogramDiffusionPipeline else: import sys diff --git a/src/diffusers/pipelines/sana/__init__.py b/src/diffusers/pipelines/sana/__init__.py index 1393b37e2d3a..c5814b2eb4da 100644 --- a/src/diffusers/pipelines/sana/__init__.py +++ b/src/diffusers/pipelines/sana/__init__.py @@ -9,7 +9,6 @@ is_transformers_available, ) - _dummy_objects = {} _import_structure = {} @@ -24,6 +23,7 @@ else: _import_structure["pipeline_sana"] = ["SanaPipeline"] _import_structure["pipeline_sana_sprint"] = ["SanaSprintPipeline"] + _import_structure["pipeline_sana_controlnet"] = ["SanaControlNetPipeline"] if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: try: @@ -34,6 +34,7 @@ from ...utils.dummy_torch_and_transformers_objects import * else: from .pipeline_sana import SanaPipeline + from .pipeline_sana_controlnet import SanaControlNetPipeline from .pipeline_sana_sprint import SanaSprintPipeline else: import sys diff --git a/src/diffusers/pipelines/sana/pipeline_sana_controlnet.py b/src/diffusers/pipelines/sana/pipeline_sana_controlnet.py new file mode 100644 index 000000000000..8a23486d6f80 --- /dev/null +++ b/src/diffusers/pipelines/sana/pipeline_sana_controlnet.py @@ -0,0 +1,1236 @@ +# Copyright 2025 PixArt-Sigma Authors and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
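
Before the pipeline implementation that follows, here is a rough sketch (assumptions, not code copied from this patch) of the single denoising step the new `SanaControlNetPipeline` is built around, using only the `forward` signatures added earlier in this diff. The names `latents`, `prompt_embeds`, `control_latents`, and `t` stand for tensors the pipeline prepares elsewhere, with the control image assumed to already be encoded into the same latent space and shape as `latents`:

```python
# 1) The controlnet consumes the current latents plus the latent-space control image
#    and returns one residual tensor per transformer block, each scaled by
#    `conditioning_scale` before it is handed back.
controlnet_block_samples = controlnet(
    hidden_states=latents,
    encoder_hidden_states=prompt_embeds,
    timestep=t,
    controlnet_cond=control_latents,
    conditioning_scale=1.0,
    return_dict=False,
)[0]

# 2) The base transformer adds residual i - 1 after block i (block 0 receives none),
#    matching the indexing added to SanaTransformer2DModel.forward above.
noise_pred = transformer(
    hidden_states=latents,
    encoder_hidden_states=prompt_embeds,
    timestep=t,
    controlnet_block_samples=controlnet_block_samples,
    return_dict=False,
)[0]
```

Setting `conditioning_scale=0.0` zeroes every residual and reduces the step to the plain Sana transformer; end-user usage is shown in the `EXAMPLE_DOC_STRING` of the pipeline file below.
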
+ +import html +import inspect +import re +import urllib.parse as ul +import warnings +from typing import Any, Callable, Dict, List, Optional, Tuple, Union + +import torch +from transformers import Gemma2PreTrainedModel, GemmaTokenizer, GemmaTokenizerFast + +from ...callbacks import MultiPipelineCallbacks, PipelineCallback +from ...image_processor import PipelineImageInput, PixArtImageProcessor +from ...loaders import SanaLoraLoaderMixin +from ...models import AutoencoderDC, SanaControlNetModel, SanaTransformer2DModel +from ...schedulers import DPMSolverMultistepScheduler +from ...utils import ( + BACKENDS_MAPPING, + USE_PEFT_BACKEND, + deprecate, + is_bs4_available, + is_ftfy_available, + is_torch_xla_available, + logging, + replace_example_docstring, + scale_lora_layers, + unscale_lora_layers, +) +from ...utils.torch_utils import get_device, is_torch_version, randn_tensor +from ..pipeline_utils import DiffusionPipeline +from ..pixart_alpha.pipeline_pixart_alpha import ( + ASPECT_RATIO_512_BIN, + ASPECT_RATIO_1024_BIN, +) +from ..pixart_alpha.pipeline_pixart_sigma import ASPECT_RATIO_2048_BIN +from .pipeline_output import SanaPipelineOutput + +if is_torch_xla_available(): + import torch_xla.core.xla_model as xm + + XLA_AVAILABLE = True +else: + XLA_AVAILABLE = False + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +if is_bs4_available(): + from bs4 import BeautifulSoup + +if is_ftfy_available(): + import ftfy + + +ASPECT_RATIO_4096_BIN = { + "0.25": [2048.0, 8192.0], + "0.26": [2048.0, 7936.0], + "0.27": [2048.0, 7680.0], + "0.28": [2048.0, 7424.0], + "0.32": [2304.0, 7168.0], + "0.33": [2304.0, 6912.0], + "0.35": [2304.0, 6656.0], + "0.4": [2560.0, 6400.0], + "0.42": [2560.0, 6144.0], + "0.48": [2816.0, 5888.0], + "0.5": [2816.0, 5632.0], + "0.52": [2816.0, 5376.0], + "0.57": [3072.0, 5376.0], + "0.6": [3072.0, 5120.0], + "0.68": [3328.0, 4864.0], + "0.72": [3328.0, 4608.0], + "0.78": [3584.0, 4608.0], + "0.82": [3584.0, 4352.0], + "0.88": [3840.0, 4352.0], + "0.94": [3840.0, 4096.0], + "1.0": [4096.0, 4096.0], + "1.07": [4096.0, 3840.0], + "1.13": [4352.0, 3840.0], + "1.21": [4352.0, 3584.0], + "1.29": [4608.0, 3584.0], + "1.38": [4608.0, 3328.0], + "1.46": [4864.0, 3328.0], + "1.67": [5120.0, 3072.0], + "1.75": [5376.0, 3072.0], + "2.0": [5632.0, 2816.0], + "2.09": [5888.0, 2816.0], + "2.4": [6144.0, 2560.0], + "2.5": [6400.0, 2560.0], + "2.89": [6656.0, 2304.0], + "3.0": [6912.0, 2304.0], + "3.11": [7168.0, 2304.0], + "3.62": [7424.0, 2048.0], + "3.75": [7680.0, 2048.0], + "3.88": [7936.0, 2048.0], + "4.0": [8192.0, 2048.0], +} + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> import torch + >>> from diffusers import SanaControlNetPipeline + >>> from diffusers.utils import load_image + + >>> pipe = SanaControlNetPipeline.from_pretrained( + ... "ishan24/Sana_600M_1024px_ControlNetPlus_diffusers", + ... variant="fp16", + ... torch_dtype={"default": torch.bfloat16, "controlnet": torch.float16, "transformer": torch.float16}, + ... device_map="balanced", + ... ) + >>> cond_image = load_image( + ... "https://huggingface.co/ishan24/Sana_600M_1024px_ControlNet_diffusers/resolve/main/hed_example.png" + ... ) + >>> prompt = 'a cat with a neon sign that says "Sana"' + >>> image = pipe( + ... prompt, + ... control_image=cond_image, + ... 
).images[0] + >>> image.save("output.png") + ``` +""" + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps +def retrieve_timesteps( + scheduler, + num_inference_steps: Optional[int] = None, + device: Optional[Union[str, torch.device]] = None, + timesteps: Optional[List[int]] = None, + sigmas: Optional[List[float]] = None, + **kwargs, +): + r""" + Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles + custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. + + Args: + scheduler (`SchedulerMixin`): + The scheduler to get timesteps from. + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps` + must be `None`. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + timesteps (`List[int]`, *optional*): + Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed, + `num_inference_steps` and `sigmas` must be `None`. + sigmas (`List[float]`, *optional*): + Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed, + `num_inference_steps` and `timesteps` must be `None`. + + Returns: + `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the + second element is the number of inference steps. + """ + if timesteps is not None and sigmas is not None: + raise ValueError( + "Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values" + ) + if timesteps is not None: + accepts_timesteps = "timesteps" in set( + inspect.signature(scheduler.set_timesteps).parameters.keys() + ) + if not accepts_timesteps: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" timestep schedules. Please check whether you are using the correct scheduler." + ) + scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + elif sigmas is not None: + accept_sigmas = "sigmas" in set( + inspect.signature(scheduler.set_timesteps).parameters.keys() + ) + if not accept_sigmas: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" sigmas schedules. Please check whether you are using the correct scheduler." + ) + scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + else: + scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) + timesteps = scheduler.timesteps + return timesteps, num_inference_steps + + +class SanaControlNetPipeline(DiffusionPipeline, SanaLoraLoaderMixin): + r""" + Pipeline for text-to-image generation using [Sana](https://huggingface.co/papers/2410.10629). 
+ """ + + # fmt: off + bad_punct_regex = re.compile(r"[" + "#®•©™&@·º½¾¿¡§~" + r"\)" + r"\(" + r"\]" + r"\[" + r"\}" + r"\{" + r"\|" + "\\" + r"\/" + r"\*" + r"]{1,}") + # fmt: on + + model_cpu_offload_seq = "text_encoder->controlnet->transformer->vae" + _callback_tensor_inputs = [ + "latents", + "control_image", + "prompt_embeds", + "negative_prompt_embeds", + ] + + def __init__( + self, + tokenizer: Union[GemmaTokenizer, GemmaTokenizerFast], + text_encoder: Gemma2PreTrainedModel, + vae: AutoencoderDC, + transformer: SanaTransformer2DModel, + controlnet: SanaControlNetModel, + scheduler: DPMSolverMultistepScheduler, + ): + super().__init__() + + self.register_modules( + tokenizer=tokenizer, + text_encoder=text_encoder, + vae=vae, + transformer=transformer, + controlnet=controlnet, + scheduler=scheduler, + ) + + self.vae_scale_factor = ( + 2 ** (len(self.vae.config.encoder_block_out_channels) - 1) + if hasattr(self, "vae") and self.vae is not None + else 32 + ) + self.image_processor = PixArtImageProcessor( + vae_scale_factor=self.vae_scale_factor + ) + + def enable_vae_slicing(self): + r""" + Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to + compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. + """ + depr_message = f"Calling `enable_vae_slicing()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version. Please use `pipe.vae.enable_slicing()`." + deprecate( + "enable_vae_slicing", + "0.40.0", + depr_message, + ) + self.vae.enable_slicing() + + def disable_vae_slicing(self): + r""" + Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to + computing decoding in one step. + """ + depr_message = f"Calling `disable_vae_slicing()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version. Please use `pipe.vae.disable_slicing()`." + deprecate( + "disable_vae_slicing", + "0.40.0", + depr_message, + ) + self.vae.disable_slicing() + + def enable_vae_tiling(self): + r""" + Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to + compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow + processing larger images. + """ + depr_message = f"Calling `enable_vae_tiling()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version. Please use `pipe.vae.enable_tiling()`." + deprecate( + "enable_vae_tiling", + "0.40.0", + depr_message, + ) + self.vae.enable_tiling() + + def disable_vae_tiling(self): + r""" + Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to + computing decoding in one step. + """ + depr_message = f"Calling `disable_vae_tiling()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version. Please use `pipe.vae.disable_tiling()`." + deprecate( + "disable_vae_tiling", + "0.40.0", + depr_message, + ) + self.vae.disable_tiling() + + # Copied from diffusers.pipelines.sana.pipeline_sana.SanaPipeline._get_gemma_prompt_embeds + def _get_gemma_prompt_embeds( + self, + prompt: Union[str, List[str]], + device: torch.device, + dtype: torch.dtype, + clean_caption: bool = False, + max_sequence_length: int = 300, + complex_human_instruction: Optional[List[str]] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. 
+ + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + device: (`torch.device`, *optional*): + torch device to place the resulting embeddings on + clean_caption (`bool`, defaults to `False`): + If `True`, the function will preprocess and clean the provided caption before encoding. + max_sequence_length (`int`, defaults to 300): Maximum sequence length to use for the prompt. + complex_human_instruction (`list[str]`, defaults to `complex_human_instruction`): + If `complex_human_instruction` is not empty, the function will use the complex Human instruction for + the prompt. + """ + prompt = [prompt] if isinstance(prompt, str) else prompt + + if getattr(self, "tokenizer", None) is not None: + self.tokenizer.padding_side = "right" + + prompt = self._text_preprocessing(prompt, clean_caption=clean_caption) + + # prepare complex human instruction + if not complex_human_instruction: + max_length_all = max_sequence_length + else: + chi_prompt = "\n".join(complex_human_instruction) + prompt = [chi_prompt + p for p in prompt] + num_chi_prompt_tokens = len(self.tokenizer.encode(chi_prompt)) + max_length_all = num_chi_prompt_tokens + max_sequence_length - 2 + + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=max_length_all, + truncation=True, + add_special_tokens=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + + prompt_attention_mask = text_inputs.attention_mask + prompt_attention_mask = prompt_attention_mask.to(device) + + prompt_embeds = self.text_encoder( + text_input_ids.to(device), attention_mask=prompt_attention_mask + ) + prompt_embeds = prompt_embeds[0].to(dtype=dtype, device=device) + + return prompt_embeds, prompt_attention_mask + + # Copied from diffusers.pipelines.sana.pipeline_sana.SanaPipeline.encode_prompt + def encode_prompt( + self, + prompt: Union[str, List[str]], + do_classifier_free_guidance: bool = True, + negative_prompt: str = "", + num_images_per_prompt: int = 1, + device: Optional[torch.device] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + prompt_attention_mask: Optional[torch.Tensor] = None, + negative_prompt_attention_mask: Optional[torch.Tensor] = None, + clean_caption: bool = False, + max_sequence_length: int = 300, + complex_human_instruction: Optional[List[str]] = None, + lora_scale: Optional[float] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + negative_prompt (`str` or `List[str]`, *optional*): + The prompt not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` + instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). For + PixArt-Alpha, this should be "". + do_classifier_free_guidance (`bool`, *optional*, defaults to `True`): + whether to use classifier free guidance or not + num_images_per_prompt (`int`, *optional*, defaults to 1): + number of images that should be generated per prompt + device: (`torch.device`, *optional*): + torch device to place the resulting embeddings on + prompt_embeds (`torch.Tensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.Tensor`, *optional*): + Pre-generated negative text embeddings. 
For Sana, it's should be the embeddings of the "" string. + clean_caption (`bool`, defaults to `False`): + If `True`, the function will preprocess and clean the provided caption before encoding. + max_sequence_length (`int`, defaults to 300): Maximum sequence length to use for the prompt. + complex_human_instruction (`list[str]`, defaults to `complex_human_instruction`): + If `complex_human_instruction` is not empty, the function will use the complex Human instruction for + the prompt. + """ + + if device is None: + device = self._execution_device + + if self.text_encoder is not None: + dtype = self.text_encoder.dtype + else: + dtype = None + + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, SanaLoraLoaderMixin): + self._lora_scale = lora_scale + + # dynamically adjust the LoRA scale + if self.text_encoder is not None and USE_PEFT_BACKEND: + scale_lora_layers(self.text_encoder, lora_scale) + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if getattr(self, "tokenizer", None) is not None: + self.tokenizer.padding_side = "right" + + # See Section 3.1. of the paper. + max_length = max_sequence_length + select_index = [0] + list(range(-max_length + 1, 0)) + + if prompt_embeds is None: + prompt_embeds, prompt_attention_mask = self._get_gemma_prompt_embeds( + prompt=prompt, + device=device, + dtype=dtype, + clean_caption=clean_caption, + max_sequence_length=max_sequence_length, + complex_human_instruction=complex_human_instruction, + ) + + prompt_embeds = prompt_embeds[:, select_index] + prompt_attention_mask = prompt_attention_mask[:, select_index] + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view( + bs_embed * num_images_per_prompt, seq_len, -1 + ) + prompt_attention_mask = prompt_attention_mask.view(bs_embed, -1) + prompt_attention_mask = prompt_attention_mask.repeat(num_images_per_prompt, 1) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + negative_prompt = ( + [negative_prompt] * batch_size + if isinstance(negative_prompt, str) + else negative_prompt + ) + negative_prompt_embeds, negative_prompt_attention_mask = ( + self._get_gemma_prompt_embeds( + prompt=negative_prompt, + device=device, + dtype=dtype, + clean_caption=clean_caption, + max_sequence_length=max_sequence_length, + complex_human_instruction=False, + ) + ) + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to( + dtype=dtype, device=device + ) + + negative_prompt_embeds = negative_prompt_embeds.repeat( + 1, num_images_per_prompt, 1 + ) + negative_prompt_embeds = negative_prompt_embeds.view( + batch_size * num_images_per_prompt, seq_len, -1 + ) + + negative_prompt_attention_mask = negative_prompt_attention_mask.view( + bs_embed, -1 + ) + negative_prompt_attention_mask = negative_prompt_attention_mask.repeat( + num_images_per_prompt, 1 + ) + else: + negative_prompt_embeds = None + negative_prompt_attention_mask = None + 
+ if self.text_encoder is not None: + if isinstance(self, SanaLoraLoaderMixin) and USE_PEFT_BACKEND: + # Retrieve the original scale by scaling back the LoRA layers + unscale_lora_layers(self.text_encoder, lora_scale) + + return ( + prompt_embeds, + prompt_attention_mask, + negative_prompt_embeds, + negative_prompt_attention_mask, + ) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://huggingface.co/papers/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set( + inspect.signature(self.scheduler.step).parameters.keys() + ) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set( + inspect.signature(self.scheduler.step).parameters.keys() + ) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + def check_inputs( + self, + prompt, + height, + width, + callback_on_step_end_tensor_inputs=None, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + prompt_attention_mask=None, + negative_prompt_attention_mask=None, + ): + if height % 32 != 0 or width % 32 != 0: + raise ValueError( + f"`height` and `width` have to be divisible by 32 but are {height} and {width}." + ) + + if callback_on_step_end_tensor_inputs is not None and not all( + k in self._callback_tensor_inputs + for k in callback_on_step_end_tensor_inputs + ): + raise ValueError( + f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and ( + not isinstance(prompt, str) and not isinstance(prompt, list) + ): + raise ValueError( + f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" + ) + + if prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and prompt_attention_mask is None: + raise ValueError( + "Must provide `prompt_attention_mask` when specifying `prompt_embeds`." + ) + + if ( + negative_prompt_embeds is not None + and negative_prompt_attention_mask is None + ): + raise ValueError( + "Must provide `negative_prompt_attention_mask` when specifying `negative_prompt_embeds`." 
+ ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + if prompt_attention_mask.shape != negative_prompt_attention_mask.shape: + raise ValueError( + "`prompt_attention_mask` and `negative_prompt_attention_mask` must have the same shape when passed directly, but" + f" got: `prompt_attention_mask` {prompt_attention_mask.shape} != `negative_prompt_attention_mask`" + f" {negative_prompt_attention_mask.shape}." + ) + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._text_preprocessing + def _text_preprocessing(self, text, clean_caption=False): + if clean_caption and not is_bs4_available(): + logger.warning( + BACKENDS_MAPPING["bs4"][-1].format("Setting `clean_caption=True`") + ) + logger.warning("Setting `clean_caption` to False...") + clean_caption = False + + if clean_caption and not is_ftfy_available(): + logger.warning( + BACKENDS_MAPPING["ftfy"][-1].format("Setting `clean_caption=True`") + ) + logger.warning("Setting `clean_caption` to False...") + clean_caption = False + + if not isinstance(text, (tuple, list)): + text = [text] + + def process(text: str): + if clean_caption: + text = self._clean_caption(text) + text = self._clean_caption(text) + else: + text = text.lower().strip() + return text + + return [process(t) for t in text] + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._clean_caption + def _clean_caption(self, caption): + caption = str(caption) + caption = ul.unquote_plus(caption) + caption = caption.strip().lower() + caption = re.sub("", "person", caption) + # urls: + caption = re.sub( + r"\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa + "", + caption, + ) # regex for urls + caption = re.sub( + r"\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa + "", + caption, + ) # regex for urls + # html: + caption = BeautifulSoup(caption, features="html.parser").text + + # @ + caption = re.sub(r"@[\w\d]+\b", "", caption) + + # 31C0—31EF CJK Strokes + # 31F0—31FF Katakana Phonetic Extensions + # 3200—32FF Enclosed CJK Letters and Months + # 3300—33FF CJK Compatibility + # 3400—4DBF CJK Unified Ideographs Extension A + # 4DC0—4DFF Yijing Hexagram Symbols + # 4E00—9FFF CJK Unified Ideographs + caption = re.sub(r"[\u31c0-\u31ef]+", "", caption) + caption = re.sub(r"[\u31f0-\u31ff]+", "", caption) + caption = re.sub(r"[\u3200-\u32ff]+", "", caption) + caption = re.sub(r"[\u3300-\u33ff]+", "", caption) + caption = re.sub(r"[\u3400-\u4dbf]+", "", caption) + caption = re.sub(r"[\u4dc0-\u4dff]+", "", caption) + caption = re.sub(r"[\u4e00-\u9fff]+", "", caption) + ####################################################### + + # все виды тире / all types of dash --> "-" + caption = re.sub( + r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+", # noqa + "-", + caption, + ) + + # кавычки к одному стандарту + caption = re.sub(r"[`´«»“”¨]", '"', caption) + caption = re.sub(r"[‘’]", "'", caption) + + # " + caption = re.sub(r""?", "", caption) + # & + caption = re.sub(r"&", "", caption) + + # ip addresses: + caption = 
re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", " ", caption) + + # article ids: + caption = re.sub(r"\d:\d\d\s+$", "", caption) + + # \n + caption = re.sub(r"\\n", " ", caption) + + # "#123" + caption = re.sub(r"#\d{1,3}\b", "", caption) + # "#12345.." + caption = re.sub(r"#\d{5,}\b", "", caption) + # "123456.." + caption = re.sub(r"\b\d{6,}\b", "", caption) + # filenames: + caption = re.sub( + r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", "", caption + ) + + # + caption = re.sub(r"[\"\']{2,}", r'"', caption) # """AUSVERKAUFT""" + caption = re.sub(r"[\.]{2,}", r" ", caption) # """AUSVERKAUFT""" + + caption = re.sub( + self.bad_punct_regex, r" ", caption + ) # ***AUSVERKAUFT***, #AUSVERKAUFT + caption = re.sub(r"\s+\.\s+", r" ", caption) # " . " + + # this-is-my-cute-cat / this_is_my_cute_cat + regex2 = re.compile(r"(?:\-|\_)") + if len(re.findall(regex2, caption)) > 3: + caption = re.sub(regex2, " ", caption) + + caption = ftfy.fix_text(caption) + caption = html.unescape(html.unescape(caption)) + + caption = re.sub(r"\b[a-zA-Z]{1,3}\d{3,15}\b", "", caption) # jc6640 + caption = re.sub(r"\b[a-zA-Z]+\d+[a-zA-Z]+\b", "", caption) # jc6640vc + caption = re.sub(r"\b\d+[a-zA-Z]+\d+\b", "", caption) # 6640vc231 + + caption = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", caption) + caption = re.sub(r"(free\s)?download(\sfree)?", "", caption) + caption = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", caption) + caption = re.sub( + r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption + ) + caption = re.sub(r"\bpage\s+\d+\b", "", caption) + + caption = re.sub( + r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption + ) # j2d1a2a... + + caption = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", caption) + + caption = re.sub(r"\b\s+\:\s+", r": ", caption) + caption = re.sub(r"(\D[,\./])\b", r"\1 ", caption) + caption = re.sub(r"\s+", " ", caption) + + caption.strip() + + caption = re.sub(r"^[\"\']([\w\W]+)[\"\']$", r"\1", caption) + caption = re.sub(r"^[\'\_,\-\:;]", r"", caption) + caption = re.sub(r"[\'\_,\-\:\-\+]$", r"", caption) + caption = re.sub(r"^\.\S+$", "", caption) + + return caption.strip() + + def prepare_image( + self, + image, + width, + height, + batch_size, + num_images_per_prompt, + device, + dtype, + do_classifier_free_guidance=False, + guess_mode=False, + ): + if isinstance(image, torch.Tensor): + pass + else: + image = self.image_processor.preprocess(image, height=height, width=width) + + image_batch_size = image.shape[0] + + if image_batch_size == 1: + repeat_by = batch_size + else: + # image batch size is the same as prompt batch size + repeat_by = num_images_per_prompt + + image = image.repeat_interleave(repeat_by, dim=0) + + image = image.to(device=device, dtype=dtype) + + if do_classifier_free_guidance and not guess_mode: + image = torch.cat([image] * 2) + + return image + + def prepare_latents( + self, + batch_size, + num_channels_latents, + height, + width, + dtype, + device, + generator, + latents=None, + ): + if latents is not None: + return latents.to(device=device, dtype=dtype) + + shape = ( + batch_size, + num_channels_latents, + int(height) // self.vae_scale_factor, + int(width) // self.vae_scale_factor, + ) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
+ ) + + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + return latents + + @property + def guidance_scale(self): + return self._guidance_scale + + @property + def attention_kwargs(self): + return self._attention_kwargs + + @property + def do_classifier_free_guidance(self): + return self._guidance_scale > 1.0 + + @property + def num_timesteps(self): + return self._num_timesteps + + @property + def interrupt(self): + return self._interrupt + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]] = None, + negative_prompt: str = "", + num_inference_steps: int = 20, + timesteps: List[int] = None, + sigmas: List[float] = None, + guidance_scale: float = 4.5, + control_image: PipelineImageInput = None, + controlnet_conditioning_scale: Union[float, List[float]] = 1.0, + num_images_per_prompt: Optional[int] = 1, + height: int = 1024, + width: int = 1024, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + prompt_attention_mask: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_attention_mask: Optional[torch.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + clean_caption: bool = False, + use_resolution_binning: bool = True, + attention_kwargs: Optional[Dict[str, Any]] = None, + callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + max_sequence_length: int = 300, + complex_human_instruction: List[str] = [ + "Given a user prompt, generate an 'Enhanced prompt' that provides detailed visual descriptions suitable for image generation. Evaluate the level of detail in the user prompt:", + "- If the prompt is simple, focus on adding specifics about colors, shapes, sizes, textures, and spatial relationships to create vivid and concrete scenes.", + "- If the prompt is already detailed, refine and enhance the existing details slightly without overcomplicating.", + "Here are examples of how to transform or refine prompts:", + "- User Prompt: A cat sleeping -> Enhanced: A small, fluffy white cat curled up in a round shape, sleeping peacefully on a warm sunny windowsill, surrounded by pots of blooming red flowers.", + "- User Prompt: A busy city street -> Enhanced: A bustling city street scene at dusk, featuring glowing street lamps, a diverse crowd of people in colorful clothing, and a double-decker bus passing by towering glass skyscrapers.", + "Please generate only the enhanced description for the prompt below and avoid including any additional commentary or evaluations:", + "User Prompt: ", + ], + ) -> Union[SanaPipelineOutput, Tuple]: + """ + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. + instead. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + num_inference_steps (`int`, *optional*, defaults to 20): + The number of denoising steps. 
More denoising steps usually lead to a higher quality image at the + expense of slower inference. + timesteps (`List[int]`, *optional*): + Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument + in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is + passed will be used. Must be in descending order. + sigmas (`List[float]`, *optional*): + Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in + their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed + will be used. + guidance_scale (`float`, *optional*, defaults to 4.5): + Guidance scale as defined in [Classifier-Free Diffusion + Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2. + of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting + `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to + the text `prompt`, usually at the expense of lower image quality. + control_image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: + `List[List[torch.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): + The ControlNet input condition to provide guidance to the `unet` for generation. If the type is + specified as `torch.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be accepted + as an image. The dimensions of the output image defaults to `image`'s dimensions. If height and/or + width are passed, `image` is resized accordingly. If multiple ControlNets are specified in `init`, + images must be passed as a list such that each element of the list can be correctly batched for input + to a single ControlNet. + controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0): + The outputs of the ControlNet are multiplied by `controlnet_conditioning_scale` before they are added + to the residual in the original `unet`. If multiple ControlNets are specified in `init`, you can set + the corresponding scale as a list. + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + height (`int`, *optional*, defaults to self.unet.config.sample_size): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to self.unet.config.sample_size): + The width in pixels of the generated image. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) in the DDIM paper: https://huggingface.co/papers/2010.02502. Only + applies to [`schedulers.DDIMScheduler`], will be ignored for others. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + latents (`torch.Tensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will be generated by sampling using the supplied random `generator`. + prompt_embeds (`torch.Tensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. 
If not + provided, text embeddings will be generated from `prompt` input argument. + prompt_attention_mask (`torch.Tensor`, *optional*): Pre-generated attention mask for text embeddings. + negative_prompt_embeds (`torch.Tensor`, *optional*): + Pre-generated negative text embeddings. For PixArt-Sigma this negative prompt should be "". If not + provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. + negative_prompt_attention_mask (`torch.Tensor`, *optional*): + Pre-generated attention mask for negative text embeddings. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between + [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.IFPipelineOutput`] instead of a plain tuple. + attention_kwargs: + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + clean_caption (`bool`, *optional*, defaults to `True`): + Whether or not to clean the caption before creating embeddings. Requires `beautifulsoup4` and `ftfy` to + be installed. If the dependencies are not installed, the embeddings will be created from the raw + prompt. + use_resolution_binning (`bool` defaults to `True`): + If set to `True`, the requested height and width are first mapped to the closest resolutions using + `ASPECT_RATIO_1024_BIN`. After the produced latents are decoded into images, they are resized back to + the requested resolution. Useful for generating non-square images. + callback_on_step_end (`Callable`, *optional*): + A function that calls at the end of each denoising steps during the inference. The function is called + with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, + callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by + `callback_on_step_end_tensor_inputs`. + callback_on_step_end_tensor_inputs (`List`, *optional*): + The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list + will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the + `._callback_tensor_inputs` attribute of your pipeline class. + max_sequence_length (`int` defaults to `300`): + Maximum sequence length to use with the `prompt`. + complex_human_instruction (`List[str]`, *optional*): + Instructions for complex human attention: + https://github.com/NVlabs/Sana/blob/main/configs/sana_app_config/Sana_1600M_app.yaml#L55. + + Examples: + + Returns: + [`~pipelines.sana.pipeline_output.SanaPipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.sana.pipeline_output.SanaPipelineOutput`] is returned, + otherwise a `tuple` is returned where the first element is a list with the generated images + """ + + if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)): + callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs + + # 1. Check inputs. 
Raise error if not correct + if use_resolution_binning: + if self.transformer.config.sample_size == 128: + aspect_ratio_bin = ASPECT_RATIO_4096_BIN + elif self.transformer.config.sample_size == 64: + aspect_ratio_bin = ASPECT_RATIO_2048_BIN + elif self.transformer.config.sample_size == 32: + aspect_ratio_bin = ASPECT_RATIO_1024_BIN + elif self.transformer.config.sample_size == 16: + aspect_ratio_bin = ASPECT_RATIO_512_BIN + else: + raise ValueError("Invalid sample size") + orig_height, orig_width = height, width + height, width = self.image_processor.classify_height_width_bin( + height, width, ratios=aspect_ratio_bin + ) + + self.check_inputs( + prompt, + height, + width, + callback_on_step_end_tensor_inputs, + negative_prompt, + prompt_embeds, + negative_prompt_embeds, + prompt_attention_mask, + negative_prompt_attention_mask, + ) + + self._guidance_scale = guidance_scale + self._attention_kwargs = attention_kwargs + self._interrupt = False + + # 2. Default height and width to transformer + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + lora_scale = ( + self.attention_kwargs.get("scale", None) + if self.attention_kwargs is not None + else None + ) + + # 3. Encode input prompt + ( + prompt_embeds, + prompt_attention_mask, + negative_prompt_embeds, + negative_prompt_attention_mask, + ) = self.encode_prompt( + prompt, + self.do_classifier_free_guidance, + negative_prompt=negative_prompt, + num_images_per_prompt=num_images_per_prompt, + device=device, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + prompt_attention_mask=prompt_attention_mask, + negative_prompt_attention_mask=negative_prompt_attention_mask, + clean_caption=clean_caption, + max_sequence_length=max_sequence_length, + complex_human_instruction=complex_human_instruction, + lora_scale=lora_scale, + ) + if self.do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0) + prompt_attention_mask = torch.cat( + [negative_prompt_attention_mask, prompt_attention_mask], dim=0 + ) + + # 4. Prepare control image + if isinstance(self.controlnet, SanaControlNetModel): + control_image = self.prepare_image( + image=control_image, + width=width, + height=height, + batch_size=batch_size * num_images_per_prompt, + num_images_per_prompt=num_images_per_prompt, + device=device, + dtype=self.vae.dtype, + do_classifier_free_guidance=self.do_classifier_free_guidance, + guess_mode=False, + ) + height, width = control_image.shape[-2:] + + control_image = self.vae.encode(control_image).latent + control_image = control_image * self.vae.config.scaling_factor + else: + raise ValueError("`controlnet` must be of type `SanaControlNetModel`.") + + # 5. Prepare timesteps + timesteps, num_inference_steps = retrieve_timesteps( + self.scheduler, num_inference_steps, device, timesteps, sigmas + ) + + # 6. Prepare latents. + latent_channels = self.transformer.config.in_channels + latents = self.prepare_latents( + batch_size * num_images_per_prompt, + latent_channels, + height, + width, + torch.float32, + device, + generator, + latents, + ) + + # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 8. 
Denoising loop + num_warmup_steps = max( + len(timesteps) - num_inference_steps * self.scheduler.order, 0 + ) + self._num_timesteps = len(timesteps) + + controlnet_dtype = self.controlnet.dtype + transformer_dtype = self.transformer.dtype + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + if self.interrupt: + continue + + latent_model_input = ( + torch.cat([latents] * 2) + if self.do_classifier_free_guidance + else latents + ) + + # broadcast to batch dimension in a way that's compatible with ONNX/Core ML + timestep = t.expand(latent_model_input.shape[0]) + + # controlnet(s) inference + controlnet_block_samples = self.controlnet( + latent_model_input.to(dtype=controlnet_dtype), + encoder_hidden_states=prompt_embeds.to(dtype=controlnet_dtype), + encoder_attention_mask=prompt_attention_mask, + timestep=timestep, + return_dict=False, + attention_kwargs=self.attention_kwargs, + controlnet_cond=control_image, + conditioning_scale=controlnet_conditioning_scale, + )[0] + + # predict noise model_output + noise_pred = self.transformer( + latent_model_input.to(dtype=transformer_dtype), + encoder_hidden_states=prompt_embeds.to(dtype=transformer_dtype), + encoder_attention_mask=prompt_attention_mask, + timestep=timestep, + return_dict=False, + attention_kwargs=self.attention_kwargs, + controlnet_block_samples=tuple( + t.to(dtype=transformer_dtype) for t in controlnet_block_samples + ), + )[0] + noise_pred = noise_pred.float() + + # perform guidance + if self.do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * ( + noise_pred_text - noise_pred_uncond + ) + + # learned sigma + if self.transformer.config.out_channels // 2 == latent_channels: + noise_pred = noise_pred.chunk(2, dim=1)[0] + + # compute previous image: x_t -> x_t-1 + latents = self.scheduler.step( + noise_pred, t, latents, **extra_step_kwargs, return_dict=False + )[0] + + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) + + latents = callback_outputs.pop("latents", latents) + prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) + negative_prompt_embeds = callback_outputs.pop( + "negative_prompt_embeds", negative_prompt_embeds + ) + + # call the callback, if provided + if i == len(timesteps) - 1 or ( + (i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0 + ): + progress_bar.update() + + if XLA_AVAILABLE: + xm.mark_step() + + if output_type == "latent": + image = latents + else: + latents = latents.to(self.vae.dtype) + torch_accelerator_module = getattr(torch, get_device(), torch.cuda) + oom_error = ( + torch.OutOfMemoryError + if is_torch_version(">=", "2.5.0") + else torch_accelerator_module.OutOfMemoryError + ) + try: + image = self.vae.decode( + latents / self.vae.config.scaling_factor, return_dict=False + )[0] + except oom_error as e: + warnings.warn( + f"{e}. \n" + f"Try to use VAE tiling for large images. 
For example: \n" + f"pipe.vae.enable_tiling(tile_sample_min_width=512, tile_sample_min_height=512)" + ) + if use_resolution_binning: + image = self.image_processor.resize_and_crop_tensor( + image, orig_width, orig_height + ) + + image = self.image_processor.postprocess(image, output_type=output_type) + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (image,) + + return SanaPipelineOutput(images=image) diff --git a/src/diffusers/utils/torch_utils.py b/src/diffusers/utils/torch_utils.py index 3c8911773e39..06be5cb961ac 100644 --- a/src/diffusers/utils/torch_utils.py +++ b/src/diffusers/utils/torch_utils.py @@ -14,13 +14,12 @@ """ PyTorch utilities: Utilities related to PyTorch """ - +import functools from typing import List, Optional, Tuple, Union from . import logging from .import_utils import is_torch_available, is_torch_version - if is_torch_available(): import torch from torch.fft import fftn, fftshift, ifftn, ifftshift @@ -54,7 +53,11 @@ def randn_tensor( device = device or torch.device("cpu") if generator is not None: - gen_device_type = generator.device.type if not isinstance(generator, list) else generator[0].device.type + gen_device_type = ( + generator.device.type + if not isinstance(generator, list) + else generator[0].device.type + ) if gen_device_type != device.type and gen_device_type == "cpu": rand_device = "cpu" if device != "mps": @@ -64,7 +67,9 @@ def randn_tensor( f" slighly speed up this function by passing a generator that was created on the {device} device." ) elif gen_device_type != device.type and gen_device_type == "cuda": - raise ValueError(f"Cannot generate a {device} tensor from a generator of type {gen_device_type}.") + raise ValueError( + f"Cannot generate a {device} tensor from a generator of type {gen_device_type}." + ) # make sure generator list of length 1 is treated like a non-list if isinstance(generator, list) and len(generator) == 1: @@ -73,12 +78,20 @@ def randn_tensor( if isinstance(generator, list): shape = (1,) + shape[1:] latents = [ - torch.randn(shape, generator=generator[i], device=rand_device, dtype=dtype, layout=layout) + torch.randn( + shape, + generator=generator[i], + device=rand_device, + dtype=dtype, + layout=layout, + ) for i in range(batch_size) ] latents = torch.cat(latents, dim=0).to(device) else: - latents = torch.randn(shape, generator=generator, device=rand_device, dtype=dtype, layout=layout).to(device) + latents = torch.randn( + shape, generator=generator, device=rand_device, dtype=dtype, layout=layout + ).to(device) return latents @@ -114,7 +127,9 @@ def fourier_filter(x_in: "torch.Tensor", threshold: int, scale: int) -> "torch.T mask = torch.ones((B, C, H, W), device=x.device) crow, ccol = H // 2, W // 2 - mask[..., crow - threshold : crow + threshold, ccol - threshold : ccol + threshold] = scale + mask[ + ..., crow - threshold : crow + threshold, ccol - threshold : ccol + threshold + ] = scale x_freq = x_freq * mask # IFFT @@ -125,7 +140,10 @@ def fourier_filter(x_in: "torch.Tensor", threshold: int, scale: int) -> "torch.T def apply_freeu( - resolution_idx: int, hidden_states: "torch.Tensor", res_hidden_states: "torch.Tensor", **freeu_kwargs + resolution_idx: int, + hidden_states: "torch.Tensor", + res_hidden_states: "torch.Tensor", + **freeu_kwargs, ) -> Tuple["torch.Tensor", "torch.Tensor"]: """Applies the FreeU mechanism as introduced in https: //arxiv.org/abs/2309.11497. Adapted from the official code repository: https://github.com/ChenyangSi/FreeU. 
@@ -141,12 +159,20 @@ def apply_freeu( """ if resolution_idx == 0: num_half_channels = hidden_states.shape[1] // 2 - hidden_states[:, :num_half_channels] = hidden_states[:, :num_half_channels] * freeu_kwargs["b1"] - res_hidden_states = fourier_filter(res_hidden_states, threshold=1, scale=freeu_kwargs["s1"]) + hidden_states[:, :num_half_channels] = ( + hidden_states[:, :num_half_channels] * freeu_kwargs["b1"] + ) + res_hidden_states = fourier_filter( + res_hidden_states, threshold=1, scale=freeu_kwargs["s1"] + ) if resolution_idx == 1: num_half_channels = hidden_states.shape[1] // 2 - hidden_states[:, :num_half_channels] = hidden_states[:, :num_half_channels] * freeu_kwargs["b2"] - res_hidden_states = fourier_filter(res_hidden_states, threshold=1, scale=freeu_kwargs["s2"]) + hidden_states[:, :num_half_channels] = ( + hidden_states[:, :num_half_channels] * freeu_kwargs["b2"] + ) + res_hidden_states = fourier_filter( + res_hidden_states, threshold=1, scale=freeu_kwargs["s2"] + ) return hidden_states, res_hidden_states @@ -159,3 +185,17 @@ def get_torch_cuda_device_capability(): return float(compute_capability) else: return None + + +@functools.lru_cache +def get_device(): + if torch.cuda.is_available(): + return "cuda" + elif is_torch_npu_available(): + return "npu" + elif hasattr(torch, "xpu") and torch.xpu.is_available(): + return "xpu" + elif torch.backends.mps.is_available(): + return "mps" + else: + return "cpu"
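The `get_device()` helper added above is what `SanaControlNetPipeline.__call__` relies on to pick the accelerator-specific `OutOfMemoryError` class before decoding latents, and the `functools.lru_cache` wrapper means the backend probe runs only once per process. A sketch of that consumption pattern, with `vae` and `latents` as hypothetical stand-ins for the pipeline state, might look like:

```py
# Sketch of how get_device() feeds the OOM fallback used around VAE decoding.
# `vae` and `latents` are hypothetical stand-ins for pipeline state.
import warnings

import torch

from diffusers.utils.torch_utils import get_device, is_torch_version


def decode_latents(vae, latents):
    # torch.OutOfMemoryError exists from PyTorch 2.5 onwards; older versions
    # only expose it on the backend module (e.g. torch.cuda.OutOfMemoryError).
    torch_accelerator_module = getattr(torch, get_device(), torch.cuda)
    oom_error = (
        torch.OutOfMemoryError
        if is_torch_version(">=", "2.5.0")
        else torch_accelerator_module.OutOfMemoryError
    )
    try:
        return vae.decode(latents / vae.config.scaling_factor, return_dict=False)[0]
    except oom_error as e:
        warnings.warn(
            f"{e}. Try VAE tiling for large images, e.g. "
            "pipe.vae.enable_tiling(tile_sample_min_width=512, tile_sample_min_height=512)"
        )
        raise
```

Unlike the pipeline above, which only emits the tiling hint after catching the error, this sketch re-raises so the caller still sees the failure; the point it illustrates is only how `get_device()` selects the right exception type per backend.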