
Commit f5c7d6b

Merge pull request #66 from hmorimitsu/rapidflow
Adapt RAPIDFlow code to TensorRT and add simple test script
2 parents f42066c + 2831fc4 commit f5c7d6b

File tree

7 files changed: +280 −16 lines

.github/workflows/build.yml

Lines changed: 1 addition & 1 deletion
@@ -40,4 +40,4 @@ jobs:
           mv ptlflow ptlflow_tmp
       - name: Test with pytest
         run: |
-          python -m pytest
+          python -m pytest tests/

.github/workflows/lightning.yml

Lines changed: 1 addition & 1 deletion
@@ -35,4 +35,4 @@ jobs:
       - name: Test with pytest
         run: |
           pip install pytest
-          python -m pytest
+          python -m pytest tests/

.github/workflows/python.yml

Lines changed: 1 addition & 1 deletion
@@ -32,4 +32,4 @@ jobs:
      - name: Test with pytest
        run: |
          pip install pytest
-          python -m pytest
+          python -m pytest tests/

.github/workflows/pytorch.yml

Lines changed: 1 addition & 1 deletion
@@ -32,4 +32,4 @@ jobs:
      - name: Test with pytest
        run: |
          pip install pytest
-          python -m pytest
+          python -m pytest tests/

ptlflow/models/rapidflow/README.md

Lines changed: 11 additions & 3 deletions
@@ -106,11 +106,19 @@ You can also provide your own images to test by providing an additional argument
 python onnx_infer.py rapidflow_it12.onnx --image_paths /path/to/first/image /path/to/second/image
 ```
 
-### ONNX example limitations
+## Compiling the model to TensorRT
 
-Directly converting the model to ONNX as shown in this example will work, but it is not optimal.
+The script [tensorrt_test.py](tensorrt_test.py) provides a simple example of how to compile RAPIDFlow models to TensorRT.
+Run it by typing:
+
+```bash
+python tensorrt_test.py rapidflow_it12 --checkpoint things
+```
+
+### ONNX and TensorRT example limitations
+
+Directly converting the model to ONNX or TensorRT as shown in these examples will work, but it is not optimal.
 To obtain the best conversion, it would be necessary to rewrite some parts of the code to remove conditions and operations that may change according to the input size.
-Also, ONNX conversion only supports `--corr_mode allpairs`, which is not suitable for large images.
+Also, these conversions only support `--corr_mode allpairs`, which is not suitable for large images.
 
 ## Code license
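As a usage note, the test script added below also saves the compiled module to disk. A minimal sketch of reloading it later for inference, assuming a Torch-TensorRT version that provides `torch_tensorrt.save`/`load` (as the script itself uses); the `.tc` filename and input shape follow the script's conventions:

```python
import torch
import torch_tensorrt

# Reload the module saved by tensorrt_test.py (name follows its convention).
trt_model = torch_tensorrt.load("rapidflow_it12.tc")
# Depending on the Torch-TensorRT version, load() may return an
# ExportedProgram; in that case, .module() yields a callable module.
if hasattr(trt_model, "module"):
    trt_model = trt_model.module()

# The script feeds a single (1, 2, 3, H, W) half-precision CUDA tensor
# holding the stacked image pair.
images = torch.rand(1, 2, 3, 384, 1280).half().to("cuda")
flow_pred = trt_model(images)
```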

ptlflow/models/rapidflow/rapidflow.py

Lines changed: 66 additions & 9 deletions
@@ -24,7 +24,7 @@
 import torch.nn.functional as F
 
 from ptlflow.utils.utils import forward_interpolate_batch
-from .pwc_modules import rescale_flow, upsample2d_as
+from .pwc_modules import rescale_flow
 from .update import UpdateBlock
 from .corr import get_corr_block
 from .local_timm.norm import LayerNorm2d
@@ -353,8 +353,11 @@ def forward(self, inputs):
             and "prev_flows" in inputs
             and inputs["prev_flows"] is not None
         ):
-            flow = upsample2d_as(
-                inputs["prev_flows"][:, 0], pass_pyramid1[0], mode="bilinear"
+            flow = F.interpolate(
+                inputs["prev_flows"][:, 0],
+                [pass_pyramid1[0].shape[-2], pass_pyramid1[0].shape[-1]],
+                mode="bilinear",
+                align_corners=True,
             )
             flow = rescale_flow(flow, width_im, height_im, to_local=True)
             flow = forward_interpolate_batch(flow)
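For reference, both calls perform the same bilinear resize; a minimal sketch, assuming `upsample2d_as` simply matched the spatial size of its second argument (as in the original pwc_modules helper; tensor shapes below are illustrative):

```python
import torch
import torch.nn.functional as F

def upsample2d_as(x, target, mode="bilinear"):
    # Old helper style: the output size is hidden inside another tensor.
    return F.interpolate(x, size=target.shape[-2:], mode=mode, align_corners=True)

x = torch.randn(1, 2, 12, 20)        # e.g. a coarse flow field
target = torch.randn(1, 32, 48, 80)  # e.g. a finer pyramid level

# New style used by this commit: the size expression is spelled out at the
# call site, which tends to trace and export to ONNX/TensorRT more cleanly.
a = upsample2d_as(x, target)
b = F.interpolate(
    x, [target.shape[-2], target.shape[-1]], mode="bilinear", align_corners=True
)
assert torch.equal(a, b)
```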
@@ -385,7 +388,12 @@ def forward(self, inputs):
             if net is None:
                 net = torch.tanh(net_tmp)
             else:
-                net = upsample2d_as(net, x1, mode="bilinear")
+                net = F.interpolate(
+                    net,
+                    [x1.shape[-2], x1.shape[-1]],
+                    mode="bilinear",
+                    align_corners=True,
+                )
 
             net_skip = torch.tanh(net_tmp)
             gate = torch.sigmoid(
@@ -395,7 +403,12 @@ def forward(self, inputs):
 
             if l > 0:
                 flow = rescale_flow(flow, x1.shape[-1], x1.shape[-2], to_local=False)
-                flow = upsample2d_as(flow, x1, mode="bilinear")
+                flow = F.interpolate(
+                    flow,
+                    [x1.shape[-2], x1.shape[-1]],
+                    mode="bilinear",
+                    align_corners=True,
+                )
 
             for k in range(iters_per_level[l]):
                 flow = flow.detach()
@@ -414,16 +427,60 @@ def forward(self, inputs):
                 out_flow = rescale_flow(flow, width_im, height_im, to_local=False)
                 if self.training:
                     if mask is not None and l == (output_level - start_level):
-                        out_flow = self.upsample_flow(out_flow, mask, pred_stride)
+                        if self.args.simple_io:
+                            # Code copied from self.upsample_flow.
+                            # For some reason, the TensorRT backend does not
+                            # compile when calling the function.
+                            N, _, H, W = out_flow.shape
+                            mask = mask.view(N, 1, 9, pred_stride, pred_stride, H, W)
+                            mask = torch.softmax(mask, dim=2)
+
+                            up_flow = F.unfold(flow, [3, 3], padding=1)
+                            up_flow = up_flow.view(N, 2, 9, 1, 1, H, W)
+
+                            up_flow = torch.sum(mask * up_flow, dim=2)
+                            up_flow = up_flow.permute(0, 1, 4, 2, 5, 3)
+                            up_flow = up_flow.reshape(
+                                N, 2, pred_stride * H, pred_stride * W
+                            )
+                            out_flow = up_flow
+                        else:
+                            out_flow = self.upsample_flow(out_flow, mask, pred_stride)
                     else:
-                        out_flow = upsample2d_as(out_flow, x1_raw, mode="bilinear")
+                        out_flow = F.interpolate(
+                            out_flow,
+                            [x1_raw.shape[-2], x1_raw.shape[-1]],
+                            mode="bilinear",
+                            align_corners=True,
+                        )
                 elif l == (output_level - start_level) and k == (
                     iters_per_level[l] - 1
                 ):
                     if mask is not None:
-                        out_flow = self.upsample_flow(out_flow, mask, pred_stride)
+                        if self.args.simple_io:
+                            # Code copied from self.upsample_flow.
+                            # For some reason, the TensorRT backend does not
+                            # compile when calling the function.
+                            N, _, H, W = out_flow.shape
+                            mask = mask.view(N, 1, 9, pred_stride, pred_stride, H, W)
+                            mask = torch.softmax(mask, dim=2)
+
+                            up_flow = F.unfold(flow, [3, 3], padding=1)
+                            up_flow = up_flow.view(N, 2, 9, 1, 1, H, W)
+
+                            up_flow = torch.sum(mask * up_flow, dim=2)
+                            up_flow = up_flow.permute(0, 1, 4, 2, 5, 3)
+                            up_flow = up_flow.reshape(
+                                N, 2, pred_stride * H, pred_stride * W
+                            )
+                            out_flow = up_flow
+                        else:
+                            out_flow = self.upsample_flow(out_flow, mask, pred_stride)
                     else:
-                        out_flow = upsample2d_as(out_flow, x1_raw, mode="bilinear")
+                        out_flow = F.interpolate(
+                            out_flow,
+                            [x1_raw.shape[-2], x1_raw.shape[-1]],
+                            mode="bilinear",
+                            align_corners=True,
+                        )
                 out_flow = self.postprocess_predictions(
                     out_flow, image_resizer, is_flow=True
                 )
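The `simple_io` branches above inline RAFT-style convex upsampling: each fine-resolution flow vector is a softmax-weighted combination of the 3x3 coarse neighborhood around it. A minimal sketch of the same operation as a standalone function (shapes are illustrative):

```python
import torch
import torch.nn.functional as F

def convex_upsample(flow: torch.Tensor, mask: torch.Tensor, stride: int) -> torch.Tensor:
    """Upsample flow by `stride`, mirroring the inlined block above."""
    N, _, H, W = flow.shape
    # One 9-way softmax per output sub-pixel position.
    mask = mask.view(N, 1, 9, stride, stride, H, W)
    mask = torch.softmax(mask, dim=2)
    # Gather the 3x3 neighborhood of every coarse flow vector.
    up_flow = F.unfold(flow, [3, 3], padding=1).view(N, 2, 9, 1, 1, H, W)
    # Convex combination, then rearrange sub-pixels into a dense grid.
    up_flow = torch.sum(mask * up_flow, dim=2)
    up_flow = up_flow.permute(0, 1, 4, 2, 5, 3)
    return up_flow.reshape(N, 2, stride * H, stride * W)

# Shape check with illustrative sizes (stride = 8).
flow = torch.randn(1, 2, 24, 32)
mask = torch.randn(1, 9 * 8 * 8, 24, 32)
assert convex_upsample(flow, mask, 8).shape == (1, 2, 192, 256)
```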
ptlflow/models/rapidflow/tensorrt_test.py

Lines changed: 199 additions & 0 deletions

# TensorRT conversion code comes from the tutorial:
# https://pytorch.org/TensorRT/tutorials/_rendered_examples/dynamo/torch_compile_resnet_example.html

import sys
from argparse import ArgumentParser
from pathlib import Path
import time

import cv2 as cv
import numpy as np
import torch
import torch_tensorrt

this_dir = Path(__file__).parent.resolve()
sys.path.insert(0, str(this_dir.parent.parent.parent))

from ptlflow import get_model, load_checkpoint
from ptlflow.models.rapidflow.rapidflow import RAPIDFlow
from ptlflow.utils import flow_utils


def _init_parser() -> ArgumentParser:
    parser = ArgumentParser()
    parser.add_argument(
        "model",
        type=str,
        choices=(
            "rapidflow",
            "rapidflow_it1",
            "rapidflow_it2",
            "rapidflow_it3",
            "rapidflow_it6",
            "rapidflow_it12",
        ),
        help="Name of the model to use.",
    )
    parser.add_argument(
        "--checkpoint",
        type=str,
        default=None,
        help="Path to the checkpoint to be loaded. It can also be one of the following names: {chairs, things, sintel, kitti}, in which case the respective pretrained checkpoint will be downloaded.",
    )
    parser.add_argument(
        "--image_paths",
        type=str,
        nargs=2,
        default=(
            str(this_dir / "image_samples" / "000000_10.png"),
            str(this_dir / "image_samples" / "000000_11.png"),
        ),
        help="Paths to two images to estimate the optical flow with the TensorRT model.",
    )
    parser.add_argument(
        "--output_path",
        type=str,
        default=".",
        help="Path to the directory where the predictions will be saved.",
    )
    parser.add_argument(
        "--input_size",
        type=int,
        nargs=2,
        default=(384, 1280),
        help="Size of the input images.",
    )
    return parser


def compile_engine_and_infer(args):
    # Initialize the model with half precision and sample inputs
    model = load_model(args).half().eval().to("cuda")
    images = [
        torch.from_numpy(load_images(args.image_paths, args.input_size))
        .half()
        .to("cuda")
    ]

    # Time the original model (the first run is discarded as warmup)
    num_tries = 11
    total_time_orig = 0.0
    for i in range(num_tries):
        torch.cuda.synchronize()
        start = time.perf_counter()
        model(images[0])
        torch.cuda.synchronize()
        end = time.perf_counter()
        if i > 0:
            total_time_orig += end - start

    # Enabled precision for TensorRT optimization
    enabled_precisions = {torch.half}

    # Whether to print verbose logs
    debug = True

    # Workspace size for TensorRT
    workspace_size = 20 << 30

    # Maximum number of TRT Engines
    # (Lower value allows more graph segmentation)
    min_block_size = 7

    # Operations to run in Torch, regardless of converter support
    torch_executed_ops = {}

    # Build and compile the model with torch.compile, using the Torch-TensorRT backend
    compiled_model = torch_tensorrt.compile(
        model,
        ir="torch_compile",
        inputs=images,
        enabled_precisions=enabled_precisions,
        debug=debug,
        workspace_size=workspace_size,
        min_block_size=min_block_size,
        torch_executed_ops=torch_executed_ops,
    )

    # Time the compiled model (the first run triggers the compilation)
    total_time_optimized = 0.0
    for i in range(num_tries):
        torch.cuda.synchronize()
        start = time.perf_counter()
        flow_pred = compiled_model(*images)
        torch.cuda.synchronize()
        end = time.perf_counter()
        if i > 0:
            total_time_optimized += end - start

    try:
        torch_tensorrt.save(compiled_model, f"{args.model}.tc", inputs=images)
        print(f"Saved compiled model to {args.model}.tc")
        compiled_model = torch_tensorrt.load(f"{args.model}.tc")
        print(f"Loaded compiled model from {args.model}.tc")
    except Exception as e:
        print("WARNING: The compiled model was not saved due to the error:")
        print(e)

    print(f"Model: {args.model}. Average time of {num_tries - 1} runs:")
    print(f"Time (original): {(1000 * total_time_orig / (num_tries - 1)):.2f} ms.")
    print(f"Time (compiled): {(1000 * total_time_optimized / (num_tries - 1)):.2f} ms.")

    flow_pred_npy = flow_pred[0].permute(1, 2, 0).detach().cpu().numpy()

    output_dir = Path(args.output_path)
    output_dir.mkdir(parents=True, exist_ok=True)

    flo_output_path = output_dir / "flow_pred.flo"
    flow_utils.flow_write(flo_output_path, flow_pred_npy)
    print(f"Saved flow prediction to: {flo_output_path}")

    viz_output_path = output_dir / "flow_pred_viz.png"
    flow_viz = flow_utils.flow_to_rgb(flow_pred_npy)
    cv.imwrite(str(viz_output_path), cv.cvtColor(flow_viz, cv.COLOR_RGB2BGR))
    print(f"Saved flow prediction visualization to: {viz_output_path}")

    # Finally, we use Torch utilities to clean up the workspace
    torch._dynamo.reset()


def load_images(image_paths, input_size):
    # Load both frames, resize them to the model input size, and stack them
    # into a single (1, 2, 3, H, W) float array in the [0, 1] range.
    images = [cv.imread(p) for p in image_paths]
    images = [cv.resize(im, input_size[::-1]) for im in images]
    images = np.stack(images)
    images = images.transpose(0, 3, 1, 2)[None]
    images = images.astype(np.float32) / 255.0
    return images


def load_model(args):
    model = get_model(args.model, args=args)
    ckpt = load_checkpoint(args.checkpoint, RAPIDFlow, "rapidflow")
    state_dict = fuse_checkpoint_next1d_layers(ckpt["state_dict"])
    model.load_state_dict(state_dict, strict=True)
    return model


def fuse_checkpoint_next1d_layers(state_dict):
    # Fuse each horizontal/vertical pair of 1D weights from the checkpoint
    # into a single 2D convolution weight.
    fused_sd = {}
    hv_pairs = {}
    for name, param in state_dict.items():
        if name.endswith("weight_h") or name.endswith("weight_v"):
            name_prefix = name[: -(len("weight_h") + 1)]
            orientation = name[-1]
            if name_prefix not in hv_pairs:
                hv_pairs[name_prefix] = {}
            hv_pairs[name_prefix][orientation] = param
        else:
            fused_sd[name] = param

    for name_prefix, param_pairs in hv_pairs.items():
        weight = torch.einsum("cijk,cimj->cimk", param_pairs["h"], param_pairs["v"])
        fused_sd[f"{name_prefix}.weight"] = weight
    return fused_sd


if __name__ == "__main__":
    parser = _init_parser()
    parser = RAPIDFlow.add_model_specific_args(parser)
    args = parser.parse_args()
    args.corr_mode = "allpairs"
    args.fuse_next1d_weights = True
    args.simple_io = True

    compile_engine_and_infer(args)
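In `fuse_checkpoint_next1d_layers`, since the contracted einsum index has size one, the fusion amounts to an outer product per (output, input) channel pair. A small sketch checking that, assuming the checkpoint stores a 1 x k horizontal factor and a k x 1 vertical factor per pair (shapes are illustrative):

```python
import torch

# Illustrative shapes: one k-wide horizontal factor and one k-tall vertical
# factor per (out_channel, in_channel) pair.
C_out, C_in, k = 4, 3, 7
weight_h = torch.randn(C_out, C_in, 1, k)  # 1 x k kernel factor
weight_v = torch.randn(C_out, C_in, k, 1)  # k x 1 kernel factor

# Same einsum as in the script: contracts over j, which has size 1 here.
fused = torch.einsum("cijk,cimj->cimk", weight_h, weight_v)  # (C_out, C_in, k, k)

# With a size-1 contracted index, the einsum reduces to an outer product:
# fused[c, i, m, n] == weight_v[c, i, m, 0] * weight_h[c, i, 0, n]
expected = weight_v[:, :, :, 0:1] * weight_h[:, :, 0:1, :]
assert torch.allclose(fused, expected)
```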
