
Commit df34e8b

chore: clean up
1 parent 0b0c3fc commit df34e8b

File tree:
  tools/llm/quantize_utils.py
  tools/llm/run_llm.py
  tools/llm/utils.py

3 files changed: +14 additions, -63 deletions

tools/llm/quantize_utils.py

Lines changed: 6 additions & 12 deletions
@@ -68,11 +68,6 @@ def quantize_model(model, args, tokenizer):
 class TensorRTQuantizedLinear(torch.nn.Module):
     """
     TensorRT quantized linear layer that applies quantization to both input and weight tensors.
-
-    This class implements a quantized linear layer that:
-    1. Applies quantization to input tensor using TensorQuantizer
-    2. Applies quantization to weight tensor using TensorQuantizer
-    3. Performs linear operation with quantized tensors
     """

     def __init__(
@@ -114,7 +109,7 @@ def forward(self, input):

 def convert_linear_to_tensorrt_quantized(model, model_name):
     """
-    Convert linear layers in a model to TensorRT quantized versions using pre-quantized weights.
+    Convert linear layers in a model to TensorRT quantized versions from pre-quantized weights.

     This function is specifically designed for Hugging Face quantized models and only
     applies quantization to linear operations. It loads pre-quantized models from
@@ -172,7 +167,7 @@ def convert_linear_to_tensorrt_quantized(model, model_name):

         hf_quant_algo = hf_quant_config.pop("quant_algo", None)
         if hf_quant_algo != "FP8" and hf_quant_algo != "NVFP4":
-            raise RuntimeError("Only FP8 and NVFP4 quantization is supported")
+            raise RuntimeError("Only FP8 or NVFP4 quantization is supported")
     else:
         raise RuntimeError("No quantization config found")
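For context on the hunk above: the config's quant_algo field gates which formats the converter accepts. A minimal standalone sketch of that check, with a hypothetical config dict standing in for the one loaded from the Hugging Face checkpoint:

# Hypothetical example value; the real dict comes from the checkpoint.
hf_quant_config = {"quant_algo": "FP8"}

hf_quant_algo = hf_quant_config.pop("quant_algo", None)
if hf_quant_algo != "FP8" and hf_quant_algo != "NVFP4":
    raise RuntimeError("Only FP8 or NVFP4 quantization is supported")
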
@@ -186,7 +181,6 @@ def convert_linear_to_tensorrt_quantized(model, model_name):
         weight_scale_name = name + ".weight_scale"
         input_scale_name = name + ".input_scale"

-        # Verify that required scale tensors exist in the loaded data
         if weight_scale_name not in tensors:
             print(f"Weight scale tensor {weight_scale_name} not found")
             continue
@@ -202,7 +196,7 @@ def convert_linear_to_tensorrt_quantized(model, model_name):
             input_amax = tensors.pop(input_scale_name) * 448.0

             # Dequantize the weight using the scale factor
-            dequantized_weight_data = module.weight.to(torch.float16) * weight_scale
+            dequantized_weight_data = module.weight.to(torch.float32) * weight_scale

             # Configure quantizer for FP8 format (4 exponent bits, 3 mantissa bits)
             quantizer_attribute_config = QuantizerAttributeConfig(
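
The 448.0 multiplier above is the largest magnitude representable in FP8 E4M3, so scale * 448.0 recovers the amax (absolute-maximum) calibration value the quantizer expects. A minimal sketch of the arithmetic on stand-in tensors (names, shapes, and values are hypothetical):

import torch

# Stand-ins for tensors loaded from a pre-quantized checkpoint.
fp8_weight = torch.randn(16, 16).to(torch.float8_e4m3fn)
weight_scale = torch.tensor(0.02)
input_scale = torch.tensor(0.01)

E4M3_MAX = 448.0  # largest representable FP8 E4M3 value
input_amax = input_scale * E4M3_MAX
weight_amax = weight_scale * E4M3_MAX

# Dequantize in float32; the float16 intermediate this commit replaces
# can lose precision when the scale is large.
dequantized_weight = fp8_weight.to(torch.float32) * weight_scale
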
@@ -226,7 +220,7 @@ def convert_linear_to_tensorrt_quantized(model, model_name):
             original_shape = list(weight_data.shape)
             original_shape[-1] *= 2  # NVFP4 packs 2 values per element
             nvfp4_tensor = NVFP4QTensor(
-                torch.Size(original_shape), torch.float16, weight_data
+                torch.Size(original_shape), torch.float32, weight_data
             )

             # Dequantize using both scales and block size configuration
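
Shape note for the hunk above: NVFP4 stores two 4-bit values in each uint8 element, so the logical weight has twice as many columns as the packed buffer, hence original_shape[-1] *= 2. A rough sketch of nibble unpacking (the nibble order is an assumption; the actual decode, including block scales, is handled by NVFP4QTensor):

import torch

packed = torch.randint(0, 256, (16, 8), dtype=torch.uint8)  # hypothetical buffer

low = packed & 0x0F          # first 4-bit code in each byte
high = (packed >> 4) & 0x0F  # second 4-bit code in each byte

original_shape = list(packed.shape)
original_shape[-1] *= 2      # two values per packed element
unpacked = torch.stack((low, high), dim=-1).reshape(original_shape)
assert unpacked.shape == (16, 16)
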
@@ -242,8 +236,8 @@ def convert_linear_to_tensorrt_quantized(model, model_name):
                 enable=True,
             )

-            # Apply dequantization to the original quantized weight using the scale
-            # This ensures the weight is in the correct range for the quantized layer
+            # Restore the weight to its original full-precision format so that QDQ nodes
+            # can be properly inserted and optimized during TensorRT compilation
             module.weight.data = dequantized_weight_data

             # Create the quantized linear layer with calculated amax values
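
The rewritten comment points at why the weight goes back to full precision: TensorRT recognizes quantize/dequantize (QDQ) pairs around full-precision tensors and fuses them into quantized kernels at compile time. A conceptual sketch of such a QDQ round trip, not TensorRT's actual lowering:

import torch

def fake_quant_fp8(x: torch.Tensor, amax: float) -> torch.Tensor:
    """Quantize-dequantize round trip that simulates FP8 E4M3 precision."""
    scale = amax / 448.0                     # map amax onto the FP8 max magnitude
    q = (x / scale).to(torch.float8_e4m3fn)  # quantize (lossy cast)
    return q.to(torch.float32) * scale       # dequantize back to float32

w = torch.randn(8, 8)
w_qdq = fake_quant_fp8(w, float(w.abs().max()))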

tools/llm/run_llm.py

Lines changed: 8 additions & 50 deletions
@@ -49,7 +49,6 @@ def get_model(args):
         torch.nn.Module: The loaded and configured model ready for inference,
             moved to CUDA device with the specified precision
     """
-
     with torch.no_grad():
         model = (
             AutoModelForCausalLM.from_pretrained(
@@ -112,23 +111,7 @@ def compile_torchtrt(model, input_ids, args):
     else:
         enabled_precisions = {torch.float32}

-    qformat = "_q_" + args.qformat if args.qformat else ""
-
-    logging_dir = f"./{args.model}_{args.precision}{qformat}"
-    # with torch_tensorrt.logging.debug() if args.debug else nullcontext():
-    with (
-        torch_tensorrt.dynamo.Debugger(
-            "debug",
-            logging_dir=logging_dir,
-            # capture_fx_graph_after=["constant_fold"],
-            # save_engine_profile=True,
-            # profile_format="trex",
-            engine_builder_monitor=False,
-            # save_layer_info=True,
-        )
-        if args.debug
-        else nullcontext()
-    ):
+    with torch_tensorrt.logging.debug() if args.debug else nullcontext():
         trt_model = torch_tensorrt.dynamo.compile(
             ep,
             inputs=[input_ids, position_ids],
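
The restored one-liner uses a common idiom: pick between a real context manager and contextlib.nullcontext at runtime, so the with body stays identical either way. A self-contained sketch (debug_logging is a hypothetical stand-in for torch_tensorrt.logging.debug()):

from contextlib import contextmanager, nullcontext

@contextmanager
def debug_logging():
    print("verbose logging on")   # hypothetical stand-in
    try:
        yield
    finally:
        print("verbose logging off")

debug = False  # stand-in for args.debug
with debug_logging() if debug else nullcontext():
    result = sum(range(10))  # the compile call would go here
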
@@ -151,14 +134,12 @@ def print_outputs(backend_name, gen_tokens, tokenizer):
     """
     Print the generated tokens from the model.
     """
-    out = tokenizer.decode(gen_tokens[0], skip_special_tokens=True)
     print(f"========= {backend_name} =========")
     print(
         f"{backend_name} model generated text: ",
-        out,
+        tokenizer.decode(gen_tokens[0], skip_special_tokens=True),
     )
     print("===================================")
-    return out


 def measure_perf(trt_model, input_signature, backend_name):
@@ -260,13 +241,13 @@ def measure_perf(trt_model, input_signature, backend_name):
     )
     arg_parser.add_argument(
         "--qformat",
-        help=("Apply quantization format. Options: fp8 (default: None)"),
+        help=("Apply quantization format. Options: fp8, nvfp4 (default: None)"),
         default=None,
     )
     arg_parser.add_argument(
         "--pre_quantized",
         action="store_true",
-        help="Use pre-quantized model weights (default: False)",
+        help="Use pre-quantized hf model weights (default: False)",
     )
     args = arg_parser.parse_args()
@@ -300,6 +281,7 @@ def measure_perf(trt_model, input_signature, backend_name):
     pyt_gen_tokens = None
     pyt_timings = None
     pyt_stats = None
+
     if args.qformat != None:
         model = quantize_model(model, args, tokenizer)
     if args.enable_pytorch_run:
@@ -380,43 +362,19 @@ def measure_perf(trt_model, input_signature, backend_name):
             batch_size=args.batch_size,
             compile_time_s=None,
         )
-    match_result = "N/A"
-    torch_out = "N/A"
-    model_name = args.model.replace("/", "_")
-    qformat = args.qformat if args.qformat else "no_quant"

     if not args.benchmark:
         if args.enable_pytorch_run:
-            torch_out = print_outputs("PyTorch", pyt_gen_tokens, tokenizer)
+            print_outputs("PyTorch", pyt_gen_tokens, tokenizer)

-        trt_out = print_outputs("TensorRT", trt_gen_tokens, tokenizer)
+        print_outputs("TensorRT", trt_gen_tokens, tokenizer)

         if args.enable_pytorch_run:
             print(
                 f"PyTorch and TensorRT outputs match: {torch.equal(pyt_gen_tokens, trt_gen_tokens)}"
             )
-            match_result = str(torch.equal(pyt_gen_tokens, trt_gen_tokens))
-        out_json_file = f"{model_name}_{qformat}_match.json"
-        result = {}
-        args_dict = vars(args)
-        result["args"] = args_dict
-        result["match"] = match_result
-        result["torch_out"] = torch_out
-        result["trt_out"] = trt_out
-        with open(os.path.join("result", out_json_file), "w") as f:
-            json.dump(result, f, indent=4)
-        print(f"Results saved to {out_json_file}")
+
     if args.benchmark:
-        result = {}
-        args_dict = vars(args)
-
-        result["args"] = args_dict
-        result["pyt_stats"] = pyt_stats if args.enable_pytorch_run else None
-        result["trt_stats"] = trt_stats if args.benchmark else None
-        out_json_file = f"{model_name}_{qformat}_benchmark.json"
-        with open(os.path.join("result", out_json_file), "w") as f:
-            json.dump(result, f, indent=4)
-        print(f"Results saved to {out_json_file}")
         if args.enable_pytorch_run:
             print("=========PyTorch PERFORMANCE============ \n")
             print(pyt_stats)

tools/llm/utils.py

Lines changed: 0 additions & 1 deletion
@@ -1,4 +1,3 @@
-import os
 import timeit

 import numpy as np
