 import time
 
 import torch
-import cupy as cp
 import numpy as np
 import nvtripy as tp
 
@@ -84,18 +83,24 @@ def get_alphas_cumprod(beta_start=0.00085, beta_end=0.0120, n_training_steps=100
     return alphas_cumprod
 
 
-def run_diffusion_loop(model, unconditional_context, context, latent, steps, guidance, dtype):
+def run_diffusion_loop(model, unconditional_context, context, latent, steps, guidance, dtype, verbose=False):
     torch_dtype = torch.float16 if dtype == tp.float16 else torch.float32
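     # Sample `steps` evenly spaced timesteps out of the 1000-step training schedule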
     idx_timesteps = list(range(1, 1000, 1000 // steps))
     num_timesteps = len(idx_timesteps)
     timesteps = torch.tensor(idx_timesteps, dtype=torch_dtype, device="cuda")
     guidance = torch.tensor([guidance], dtype=torch_dtype, device="cuda")
 
-    print(f"[I] Running diffusion for {steps} timesteps...")
+    if verbose:
+        print(f"[I] Running diffusion for {steps} timesteps...")
     alphas = get_alphas_cumprod(dtype=torch_dtype)[idx_timesteps]
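     # Shift by one step: the "previous" cumulative alpha for the first step is 1.0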
     alphas_prev = torch.cat((torch.tensor([1.0], dtype=torch_dtype, device="cuda"), alphas[:-1]))
 
-    for index in (t := tqdm(list(range(num_timesteps))[::-1])):
+    if verbose:
+        iterator = tqdm(list(range(num_timesteps))[::-1])
+    else:
+        iterator = list(range(num_timesteps))[::-1]
+
+    for index in iterator:
         latent = model(
             unconditional_context,
             context,
@@ -121,36 +126,39 @@ def save_image(image, args):
         f"seed{args.seed if args.seed else 'rand'}-"
         f"{int(time.time())}.png"
     )
+    filename = os.path.join("output", filename)
 
-    target = os.path.join("output", filename)
     # Save image
-    print(f"[I] Saving image to {target}")
-    if not os.path.isdir("output"):
-        print("[I] Creating 'output' directory.")
-        os.mkdir("output")
-    image.save(target)
+    print(f"[I] Saving image to {filename}")
+    if not os.path.isdir(os.path.dirname(filename)):
+        print(f"[I] Creating '{os.path.dirname(filename)}' directory.")
+        os.makedirs(os.path.dirname(filename))
+    image.save(filename)
 
 
 def tripy_diffusion(args):
-    run_start_time = time.perf_counter()
+    run_start_time = time.perf_counter() if args.verbose else None
 
     dtype, torch_dtype = (tp.float16, torch.float16) if args.fp16 else (tp.float32, torch.float32)
 
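     # Reuse cached compiled engines when the engine directory already exists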
     if os.path.isdir(args.engine_dir):
-        print(f"[I] Loading cached engines from {args.engine_dir}...")
+        if args.verbose:
+            print(f"[I] Loading cached engines from {args.engine_dir}...")
         clip_compiled = tp.Executable.load(os.path.join(args.engine_dir, "clip_executable.tpymodel"))
         unet_compiled = tp.Executable.load(os.path.join(args.engine_dir, "unet_executable.tpymodel"))
         vae_compiled = tp.Executable.load(os.path.join(args.engine_dir, "vae_executable.tpymodel"))
     else:
         model = StableDiffusion(StableDiffusionConfig(dtype=dtype))
-        print("[I] Loading model weights...", flush=True)
+        if args.verbose:
+            print("[I] Loading model weights...", flush=True)
         load_from_diffusers(model, dtype, args.hf_token, debug=True)
-        clip_compiled = compile_clip(model.cond_stage_model.transformer.text_model, verbose=True)
-        unet_compiled = compile_unet(model, dtype, verbose=True)
-        vae_compiled = compile_vae(model.decode, dtype, verbose=True)
+        clip_compiled = compile_clip(model.cond_stage_model.transformer.text_model, verbose=args.verbose)
+        unet_compiled = compile_unet(model, dtype, verbose=args.verbose)
+        vae_compiled = compile_vae(model.decode, dtype, verbose=args.verbose)
 
         os.mkdir(args.engine_dir)
-        print(f"[I] Saving engines to ./{args.engine_dir}...")
+        if args.verbose:
+            print(f"[I] Saving engines to ./{args.engine_dir}...")
         clip_compiled.save(os.path.join(args.engine_dir, "clip_executable.tpymodel"))
         unet_compiled.save(os.path.join(args.engine_dir, "unet_executable.tpymodel"))
         vae_compiled.save(os.path.join(args.engine_dir, "vae_executable.tpymodel"))
@@ -161,147 +169,74 @@ def tripy_diffusion(args):
         args.prompt, padding="max_length", max_length=CLIPConfig.max_seq_len, truncation=True, return_tensors="pt"
     )
     prompt = tp.Tensor(torch_prompt.input_ids.to(torch.int32).to("cuda"))
-    print(f"[I] Got tokenized prompt.")
+    if args.verbose:
+        print(f"[I] Got tokenized prompt.")
     torch_unconditional_prompt = tokenizer(
         [""], padding="max_length", max_length=CLIPConfig.max_seq_len, return_tensors="pt"
     )
     unconditional_prompt = tp.Tensor(torch_unconditional_prompt.input_ids.to(torch.int32).to("cuda"))
-    print(f"[I] Got unconditional tokenized prompt.")
+    if args.verbose:
+        print(f"[I] Got unconditional tokenized prompt.")
 
-    print("[I] Getting CLIP conditional and unconditional context...", end=" ")
-    clip_run_start = time.perf_counter()
+    if args.verbose:
+        print("[I] Getting CLIP conditional and unconditional context...", end=" ")
+    clip_run_start = time.perf_counter() if args.verbose else None
     context = clip_compiled(prompt)
     unconditional_context = clip_compiled(unconditional_prompt)
-    tp.default_stream().synchronize()
-    clip_run_end = time.perf_counter()
-    print(f"took {clip_run_end - clip_run_start} seconds.")
+    if args.verbose:
+        tp.default_stream().synchronize()
+        clip_run_end = time.perf_counter()
+        print(f"took {clip_run_end - clip_run_start} seconds.")
+    else:
+        clip_run_start = None
+        clip_run_end = None
 
     # Backbone of diffusion - the UNet
     if args.seed is not None:
         torch.manual_seed(args.seed)
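     # 1x4x64x64 initial latent noise; the VAE later decodes it into a 512x512 image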
     torch_latent = torch.randn((1, 4, 64, 64), dtype=torch_dtype, device="cuda")
     latent = tp.Tensor(torch_latent)
 
-    diffusion_run_start = time.perf_counter()
-    latent = run_diffusion_loop(unet_compiled, unconditional_context, context, latent, args.steps, args.guidance, dtype)
-    tp.default_stream().synchronize()
-    diffusion_run_end = time.perf_counter()
-    print(f"[I] Finished diffusion denoising. Inference took {diffusion_run_end - diffusion_run_start} seconds.")
+    diffusion_run_start = time.perf_counter() if args.verbose else None
+    latent = run_diffusion_loop(
+        unet_compiled, unconditional_context, context, latent, args.steps, args.guidance, dtype, verbose=args.verbose
+    )
+    if args.verbose:
+        tp.default_stream().synchronize()
+        diffusion_run_end = time.perf_counter()
+        print(f"[I] Finished diffusion denoising. Inference took {diffusion_run_end - diffusion_run_start} seconds.")
+    else:
+        diffusion_run_start = None
+        diffusion_run_end = None
 
     # Upsample latent space to image with autoencoder
-    print(f"[I] Decoding latent...", end=" ")
-    vae_run_start = time.perf_counter()
+    if args.verbose:
+        print(f"[I] Decoding latent...", end=" ")
+    vae_run_start = time.perf_counter() if args.verbose else None
     x = vae_compiled(latent)
-    tp.default_stream().synchronize()
-    vae_run_end = time.perf_counter()
-    print(f"took {vae_run_end - vae_run_start} seconds.")
+    if args.verbose:
+        tp.default_stream().synchronize()
+        vae_run_end = time.perf_counter()
+        print(f"took {vae_run_end - vae_run_start} seconds.")
+    else:
+        vae_run_start = None
+        vae_run_end = None
 
     # Evaluate output
-    run_end_time = time.perf_counter()
-    print(f"[I] Full script took {run_end_time - run_start_time} seconds.")
+    run_end_time = time.perf_counter() if args.verbose else None
+    if args.verbose:
+        print(f"[I] Full script took {run_end_time - run_start_time} seconds.")
 
-    image = Image.fromarray(cp.from_dlpack(x).get().astype(np.uint8, copy=False))
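+    # Copy the output tensor to host memory and hand it to NumPy via DLPack (drops the CuPy dependency)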
+    image_array = np.from_dlpack(tp.copy(x, tp.device("cpu"))).astype(np.uint8, copy=False)
+    image = Image.fromarray(image_array)
 
     return image, [clip_run_start, clip_run_end, diffusion_run_start, diffusion_run_end, vae_run_start, vae_run_end]
 
 
-# referenced from https://huggingface.co/blog/stable_diffusion
-def hf_diffusion(args):
-    from transformers import CLIPTextModel, CLIPTokenizer
-    from diffusers import AutoencoderKL, UNet2DConditionModel, LMSDiscreteScheduler
-    from tqdm.auto import tqdm
-
-    run_start_time = time.perf_counter()
+def print_summary(denoising_steps, times, verbose=False):
+    if not verbose or times is None or None in times:
+        return
 
-    dtype = torch.float16 if args.fp16 else torch.float32
-    model_opts = {"variant": "fp16", "torch_dtype": torch.float16} if args.fp16 else {}
-
-    # Initialize models
-    model_id = "KiwiXR/stable-diffusion-v1-5"
-
-    print("[I] Loading models...")
-    hf_tokenizer = CLIPTokenizer.from_pretrained(model_id, subfolder="tokenizer")
-    hf_encoder = CLIPTextModel.from_pretrained(model_id, subfolder="text_encoder").to("cuda")
-    unet = UNet2DConditionModel.from_pretrained(
-        model_id, subfolder="unet", use_auth_token=args.hf_token, **model_opts
-    ).to("cuda")
-    vae = AutoencoderKL.from_pretrained(model_id, subfolder="vae", use_auth_token=args.hf_token, **model_opts).to(
-        "cuda"
-    )
-    scheduler = LMSDiscreteScheduler(
-        beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000
-    )
-
-    # Run through CLIP to get context from prompt
-    print("[I] Starting tokenization and running clip...", end=" ")
-    clip_run_start = time.perf_counter()
-    text_input = hf_tokenizer(
-        args.prompt,
-        padding="max_length",
-        max_length=hf_tokenizer.model_max_length,
-        truncation=True,
-        return_tensors="pt",
-    ).to("cuda")
-    max_length = text_input.input_ids.shape[-1]  # 77
-    uncond_input = hf_tokenizer([""], padding="max_length", max_length=max_length, return_tensors="pt").to("cuda")
-    text_embeddings = hf_encoder(text_input.input_ids, output_hidden_states=True)[0]
-    uncond_embeddings = hf_encoder(uncond_input.input_ids)[0]
-    text_embeddings = torch.cat([uncond_embeddings, text_embeddings]).to(dtype)
-    clip_run_end = time.perf_counter()
-    print(f"took {clip_run_end - clip_run_start} seconds.")
-
-    # Backbone of diffusion - the UNet
-    if args.seed is not None:
-        torch.manual_seed(args.seed)
-    torch_latent = torch.randn((1, 4, 64, 64), dtype=dtype, device="cuda")
-    torch_latent *= scheduler.init_noise_sigma
-
-    scheduler.set_timesteps(args.steps)
-
-    diffusion_run_start = time.perf_counter()
-    print(f"[I] Running diffusion for {args.steps} timesteps...")
-    for t in tqdm(scheduler.timesteps):
-        # expand the latents if we are doing classifier-free guidance to avoid doing two forward passes.
-        latent_model_input = torch.cat([torch_latent] * 2)
-        latent_model_input = scheduler.scale_model_input(latent_model_input, timestep=t)
-
-        # predict the noise residual
-        with torch.no_grad():
-            noise_pred = unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample
-
-        # perform guidance
-        noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
-        noise_pred = noise_pred_uncond + args.guidance * (noise_pred_text - noise_pred_uncond)
-
-        # compute the previous noisy sample x_t -> x_t-1
-        torch_latent = scheduler.step(noise_pred, t, torch_latent).prev_sample
-
-    diffusion_run_end = time.perf_counter()
-    print(f"[I] Finished diffusion denoising. Inference took {diffusion_run_end - diffusion_run_start} seconds.")
-
-    # Upsample latent space to image with autoencoder
-    print(f"[I] Decoding latent...", end=" ")
-    vae_run_start = time.perf_counter()
-    torch_latent = 1 / 0.18215 * torch_latent
-    with torch.no_grad():
-        image = vae.decode(torch_latent).sample
-    vae_run_end = time.perf_counter()
-    print(f"took {vae_run_end - vae_run_start} seconds.")
-
-    # Evaluate Output
-    image = (image / 2 + 0.5).clamp(0, 1)
-    image = image.detach().cpu().permute(0, 2, 3, 1).numpy()
-    images = (image * 255).round().astype("uint8")
-    pil_images = [Image.fromarray(image) for image in images]
-    image = pil_images[0]
-
-    run_end_time = time.perf_counter()
-    print(f"[I] Full script took {run_end_time - run_start_time} seconds.")
-
-    return image, [clip_run_start, clip_run_end, diffusion_run_start, diffusion_run_end, vae_run_start, vae_run_end]
-
-
-def print_summary(denoising_steps, times):
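     # times holds (start, end) pairs for the CLIP, UNet, and VAE stages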
     stages_ms = [1000 * (times[i + 1] - times[i]) for i in range(0, 6, 2)]
     total_ms = sum(stages_ms)
     print("|-----------------|--------------|")
@@ -316,8 +251,6 @@ def print_summary(denoising_steps, times):
     print("Throughput: {:.2f} image/s".format(1000.0 / total_ms))
 
 
-# TODO: Add torch compilation
-# TODO: Add Timing context (depends on how we measure perf)
 def main():
     default_prompt = "a beautiful photograph of Mt. Fuji during cherry blossom"
     parser = argparse.ArgumentParser(
@@ -336,6 +269,9 @@ def main():
         "--hf-token", type=str, default="", help="HuggingFace API access token for downloading model checkpoints"
     )
     parser.add_argument("--engine-dir", type=str, default="engines", help="Output directory for TensorRT engines")
+    parser.add_argument(
+        "--verbose", action="store_true", default=False, help="Enable verbose output with timing and progress bars"
+    )
     args = parser.parse_args()
 
     if args.torch_inference:
@@ -344,7 +280,7 @@ def main():
         image, times = tripy_diffusion(args)
 
     save_image(image, args)
-    print_summary(args.steps, times)
+    print_summary(args.steps, times, verbose=args.verbose)
 
 
 if __name__ == "__main__":