@@ -57,19 +57,18 @@ def compile_clip(model, engine_path, dtype=tp.int32, verbose=False):
     return compile_model(model, inputs, engine_path, verbose=verbose)
 
 
-def compile_unet(model, steps, engine_path, dtype, verbose=False):
+def compile_unet(model, engine_path, dtype, verbose=False):
     unconditional_context_shape = (1, 77, 768)
     conditional_context_shape = (1, 77, 768)
     latent_shape = (1, 4, 64, 64)
     inputs = (
         tp.InputInfo(unconditional_context_shape, dtype=dtype),
         tp.InputInfo(conditional_context_shape, dtype=dtype),
         tp.InputInfo(latent_shape, dtype=dtype),
-        tp.InputInfo((steps,), dtype=dtype),
-        tp.InputInfo((steps,), dtype=dtype),
-        tp.InputInfo((steps,), dtype=dtype),
         tp.InputInfo((1,), dtype=dtype),
-        tp.InputInfo((1,), dtype=tp.int32),
+        tp.InputInfo((1,), dtype=dtype),
+        tp.InputInfo((1,), dtype=dtype),
+        tp.InputInfo((1,), dtype=dtype),
     )
     return compile_model(model, inputs, engine_path, verbose=verbose)
 
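To make the shape change above concrete: every runtime-varying UNet input is now a fixed `(1,)`-shaped tensor, so the engine's shape signature no longer depends on the step count. A hypothetical helper (not part of the patch; the name and layout are illustrative) showing the per-call inputs the compiled UNet now expects:

```python
# Hypothetical helper, not in the patch: builds one step's inputs with the
# exact shapes compile_unet now declares via tp.InputInfo.
def unet_step_inputs(uncond_ctx, cond_ctx, latent, timesteps, alphas, alphas_prev, guidance, index):
    return (
        uncond_ctx,                      # (1, 77, 768)
        cond_ctx,                        # (1, 77, 768)
        latent,                          # (1, 4, 64, 64)
        timesteps[index : index + 1],    # (1,) -- was the full (steps,) schedule
        alphas[index : index + 1],       # (1,) -- was (steps,)
        alphas_prev[index : index + 1],  # (1,) -- was (steps,)
        guidance,                        # (1,) -- and the (1,) int32 loop index is gone
    )
```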
@@ -91,7 +90,6 @@ def run_diffusion_loop(model, unconditional_context, context, latent, steps, gui
     torch_dtype = torch.float16 if dtype == tp.float16 else torch.float32
     idx_timesteps = list(range(1, 1000, 1000 // steps))
     num_timesteps = len(idx_timesteps)
-    print(f"num_timesteps: {num_timesteps}")
     timesteps = torch.tensor(idx_timesteps, dtype=torch_dtype, device="cuda")
     guidance = torch.tensor([guidance], dtype=torch_dtype, device="cuda")
 
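A side note on the schedule length in this hunk: `num_timesteps` is derived from the materialized list rather than assumed equal to `steps`, because the integer-division stride makes the two diverge whenever `steps` does not divide 1000 evenly. A quick check:

```python
>>> len(list(range(1, 1000, 1000 // 10)))  # stride 100: exactly 10 timesteps
10
>>> len(list(range(1, 1000, 1000 // 7)))   # stride 142: 8 timesteps, not 7
8
```

This is also why the old code recomputed `timesteps_size` from the same expression in `tripy_diffusion` instead of passing `args.steps` straight into `compile_unet`.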
@@ -106,16 +104,14 @@ def run_diffusion_loop(model, unconditional_context, context, latent, steps, gui
     iterator = list(range(num_timesteps))[::-1]
 
     for index in iterator:
-        idx = torch.tensor([index], dtype=torch.int32, device="cuda")
         latent = model(
             unconditional_context,
             context,
             latent,
-            tp.Tensor(timesteps),
-            tp.Tensor(alphas),
-            tp.Tensor(alphas_prev),
+            tp.Tensor(timesteps[index : index + 1]),
+            tp.Tensor(alphas[index : index + 1]),
+            tp.Tensor(alphas_prev[index : index + 1]),
             tp.Tensor(guidance),
-            tp.Tensor(idx),
         )
 
     return latent
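The slicing style matters here: `timesteps[index : index + 1]` keeps a rank-1 tensor of shape `(1,)`, matching the new `tp.InputInfo((1,), ...)` entries, whereas plain `timesteps[index]` would yield a 0-d scalar. A minimal check (on CPU for portability; the real loop uses `device="cuda"`):

```python
import torch

timesteps = torch.tensor(list(range(1, 1000, 1000 // 10)), dtype=torch.float32)
index = 3
print(timesteps[index : index + 1].shape)  # torch.Size([1]) -- matches tp.InputInfo((1,), ...)
print(timesteps[index].shape)              # torch.Size([])  -- 0-d, would not match
```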
@@ -165,9 +161,8 @@ def tripy_diffusion(args):
         os.mkdir(args.engine_dir)
 
     # Load existing engines if they exist, otherwise compile and save them
-    timesteps_size = len(list(range(1, 1000, 1000 // args.steps)))
     clip_compiled = compile_clip(model.text_encoder, engine_path=clip_path, verbose=args.verbose)
-    unet_compiled = compile_unet(model, timesteps_size, engine_path=unet_path, dtype=dtype, verbose=args.verbose)
+    unet_compiled = compile_unet(model, engine_path=unet_path, dtype=dtype, verbose=args.verbose)
     vae_compiled = compile_vae(model.decode, engine_path=vae_path, dtype=dtype, verbose=args.verbose)
 
     # Run through CLIP to get context from prompt
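My reading of why `timesteps_size` can be dropped entirely (implied rather than stated by the patch): the old `compile_unet` baked the schedule length into three `(steps,)`-shaped inputs, tying the cached engine to one particular `--steps` value, while the new all-`(1,)` signature lets one cached engine serve any step count. A sketch of the old coupling:

```python
# Reconstructed old coupling, for illustration only:
compiled_for = len(list(range(1, 1000, 1000 // 10)))  # engine built for (10,)-shaped inputs
requested = len(list(range(1, 1000, 1000 // 25)))     # a later run wants (25,)-shaped inputs
assert compiled_for != requested  # -> shape mismatch; previously forced a recompile
```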