Commit 1d03fba (parent d5447fd)

Support OpenCV convention for transforms

11 files changed: +278 -46 lines changed

README.md

Lines changed: 2 additions & 1 deletion

@@ -71,7 +71,7 @@ pip install torchhull
 torchhull gets as input mask images with camera information:
 
 - `masks`: Single-channel images `M` with binary values {0, 1}.
-- `transforms`: Fused extrinsic and intrinsic matrix `K * T`, i.e. transformation from world coordinates to OpenGL clip space (right before perspective division).
+- `transforms`: Fused extrinsic and intrinsic matrix `K * T`, i.e. from world coordinates to image coordinates (right before perspective division), either in OpenGL or OpenCV convention.
 
 The visual hull is then evaluated inside a cube with bottom-front-left corner `cube_corner_bfl` and extent `cube_length`, extracted at octree level `level`. The remaining flags control how the output mesh `(verts, faces)` should look.
 
@@ -91,6 +91,7 @@ verts, faces = torchhull.visual_hull(masks, # [B, H, W, 1]
                                      cube_corner_bfl,
                                      cube_length,
                                      masks_partial=False,
+                                     transforms_convention="opengl",
                                      unique_verts=True,
                                      )
 ```
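To make the new option concrete, here is a minimal usage sketch on the caller's side. It follows the README snippet above and the parameter list in the bindings; the tensor contents, the `level`, and the cube parameters are placeholders rather than values from this commit.

```python
import torch

import torchhull

# Placeholder inputs: B binary masks plus per-view OpenCV-style cameras on the GPU
# (only a CUDA backend is dispatched). masks: [B, H, W, 1] with values {0, 1};
# K: [B, 4, 4] intrinsics; T: [B, 4, 4] world-to-camera extrinsics.
B, H, W = 4, 480, 640
masks = torch.zeros([B, H, W, 1], device="cuda")
masks[:, 180:300, 260:380, :] = 1.0
K = torch.eye(4, device="cuda").repeat(B, 1, 1)
T = torch.eye(4, device="cuda").repeat(B, 1, 1)

transforms = K @ T  # fused intrinsic and extrinsic matrices, as described above

verts, faces = torchhull.visual_hull(masks,
                                     transforms,
                                     level=7,
                                     cube_corner_bfl=(-1.0, -1.0, -1.0),
                                     cube_length=2.0,
                                     masks_partial=False,
                                     transforms_convention="opencv",  # new in this commit
                                     unique_verts=True,
                                     )
```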

benchmarks/test_bench_gaussian_blur.py

Lines changed: 1 addition & 1 deletion

@@ -42,7 +42,7 @@ def test_gaussian_blur(
     data_dir = pathlib.Path(__file__).parents[1] / "data"
     file = "Armadillo.ply"
 
-    _, _, masks = generate_dataset(
+    _, _, masks, _, _, _ = generate_dataset(
         mesh_file=data_dir / file,
         number_cameras=number_cameras,
         device=DEVICE,

benchmarks/test_bench_visual_hull.py

Lines changed: 3 additions & 1 deletion

@@ -27,7 +27,7 @@ def test_visual_hull(benchmark, level: int, number_cameras: int) -> None: # noq
     data_dir = pathlib.Path(__file__).parents[1] / "data"
     file = "Armadillo.ply"
 
-    projection_matrices, view_matrices, masks = generate_dataset(
+    projection_matrices, view_matrices, masks, _, _, _ = generate_dataset(
         mesh_file=data_dir / file,
         number_cameras=number_cameras,
         device=DEVICE,
@@ -47,6 +47,7 @@ def test_visual_hull(benchmark, level: int, number_cameras: int) -> None: # noq
         cube_corner_bfl=(-scale, -scale, -scale),
         cube_length=2.0 * scale,
         masks_partial=False,
+        transforms_convention="opengl",
         unique_verts=True,
     )
 
@@ -58,5 +59,6 @@ def test_visual_hull(benchmark, level: int, number_cameras: int) -> None: # noq
         cube_corner_bfl=(-scale, -scale, -scale),
         cube_length=2.0 * scale,
         masks_partial=False,
+        transforms_convention="opengl",
         unique_verts=True,
     )

data/generate_dataset.py

Lines changed: 37 additions & 4 deletions

@@ -30,6 +30,26 @@ def perspective(
     )
 
 
+def perspective_cv(
+    fovy: float,
+    aspect: float,
+    height: int,
+    width: int,
+    dtype: torch.dtype,
+    device: torch.device,
+) -> torch.Tensor:
+    return torch.tensor(
+        [
+            [(width / 2.0) / (np.tan(fovy / 2.0) * aspect), 0.0, width / 2.0, 0.0],
+            [0.0, (height / 2.0) / np.tan(fovy / 2.0), height / 2.0, 0.0],
+            [0.0, 0.0, 1.0, 0.0],
+            [0.0, 0.0, 0.0, 1.0],
+        ],
+        dtype=dtype,
+        device=device,
+    )
+
+
 def rotate(
     angle: float,
     x: float,
@@ -77,6 +97,7 @@ def generate_random_camera(
     device: torch.device,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     projection_matrix = perspective(fovy, width / height, near, far, dtype, device)
+    projection_matrix_cv = perspective_cv(fovy, width / height, height, width, dtype, device)
 
     random_axis = scipy.stats.uniform_direction.rvs(3)
     rng = np.random.default_rng(1337)
@@ -87,8 +108,13 @@ def generate_random_camera(
     random_t = translate(random_translate[0], random_translate[1], random_translate[2], dtype, device)
 
     view_matrix = translate(0, 0, -camera_origin_distance, dtype, device) @ random_t @ random_r
+    view_matrix_cv = (
+        torch.tensor([1, -1, -1, 1], dtype=dtype, device=device).diag()
+        @ view_matrix
+        @ torch.tensor([1, -1, -1, 1], dtype=dtype, device=device).diag()
+    )
 
-    return projection_matrix, view_matrix
+    return projection_matrix, view_matrix, projection_matrix_cv, view_matrix_cv
 
 
 def render_masks(
@@ -180,8 +206,15 @@ def generate_dataset(
 
     projection_matrices = torch.empty([number_cameras, 4, 4], dtype=dtype, device=device)
     view_matrices = torch.empty([number_cameras, 4, 4], dtype=dtype, device=device)
+    projection_matrices_cv = torch.empty([number_cameras, 4, 4], dtype=dtype, device=device)
+    view_matrices_cv = torch.empty([number_cameras, 4, 4], dtype=dtype, device=device)
     for i in range(number_cameras):
-        projection_matrices[i, :, :], view_matrices[i, :, :] = generate_random_camera(
+        (
+            projection_matrices[i, :, :],
+            view_matrices[i, :, :],
+            projection_matrices_cv[i, :, :],
+            view_matrices_cv[i, :, :],
+        ) = generate_random_camera(
             fovy,
             near,
             far,
@@ -204,7 +237,7 @@ def generate_dataset(
         device=device,
     )
 
-    return projection_matrices, view_matrices, masks
+    return projection_matrices, view_matrices, masks, projection_matrices_cv, view_matrices_cv, masks.flip((1,))
 
 
 def main() -> None:
@@ -215,7 +248,7 @@ def main() -> None:
 
     output_dir.mkdir(exist_ok=True)
 
-    projection_matrices, view_matrices, masks = generate_dataset(
+    projection_matrices, view_matrices, masks, _, _, _ = generate_dataset(
         mesh_file=data_dir / file,
         dtype=torch.float32,
         device=torch.device("cuda"),
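The conversion above has three ingredients: `perspective_cv` rewrites the field of view as pixel-unit intrinsics (`fx`, `fy`, `cx = W/2`, `cy = H/2`), the left `diag(1, -1, -1, 1)` factor flips the camera-space y and z axes (OpenGL looks down -z with y up, OpenCV looks down +z with y down), and `masks.flip((1,))` moves the image origin from bottom-left to top-left. The stand-alone check below confirms that both routes hit the same pixel for a camera-space point; it assumes the usual gluPerspective-style matrix for `perspective()`, which is not shown in this diff, and only exercises the camera-space side of the conjugation.

```python
import numpy as np

# Assumed standard OpenGL projection terms (gluPerspective form); only the x/y rows matter here.
fovy, aspect, H, W = np.deg2rad(60.0), 640.0 / 480.0, 480, 640
f = 1.0 / np.tan(fovy / 2.0)
P00, P11 = f / aspect, f

# OpenCV intrinsics exactly as built by perspective_cv() above.
fx = (W / 2.0) / (np.tan(fovy / 2.0) * aspect)
fy = (H / 2.0) / np.tan(fovy / 2.0)
cx, cy = W / 2.0, H / 2.0

# A point in OpenGL camera space (camera looks down -z, y points up).
x, y, z = 0.3, -0.2, -2.5

# OpenGL route: NDC, then viewport transform (origin at the bottom-left).
x_ndc, y_ndc = P00 * x / -z, P11 * y / -z
u_gl, v_gl_bottom = (x_ndc + 1.0) * W / 2.0, (y_ndc + 1.0) * H / 2.0

# OpenCV route: flip camera y and z (the left diag(1, -1, -1, 1) factor), then K @ p / z.
x_cv, y_cv, z_cv = x, -y, -z
u_cv = fx * x_cv / z_cv + cx
v_cv_top = fy * y_cv / z_cv + cy

assert np.isclose(u_gl, u_cv)
assert np.isclose(H - v_gl_bottom, v_cv_top)  # vertical flip, matching masks.flip((1,))
```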

src/torchhull/_C/include/torchhull/image_utils.h

Lines changed: 11 additions & 0 deletions

@@ -70,6 +70,17 @@ unnormalize_ndc_false(const float coordinate, const int64_t size)
     return (coordinate + 1.f) * static_cast<float>(size) / 2.f - 0.5f;
 }
 
+// NOTE
+// ----
+// false refers to torch.nn.functional.grid_sample()'s align_corners=false
+//
+// A pixel with integer coordinates (y, x) covers the area inside [y - 0.5, y + 0.5] and [x - 0.5, x + 0.5].
+inline C10_HOST_DEVICE float
+align_cv_false(const float coordinate)
+{
+    return coordinate - 0.5f;
+}
+
 template <typename ValueT>
 inline C10_DEVICE ValueT
 sample_zeros_padding(const torch::PackedTensorAccessor64<ValueT, 4, torch::RestrictPtrTraits> image,
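`unnormalize_ndc_false` maps an OpenGL NDC coordinate into the continuous pixel-index space that `torch.nn.functional.grid_sample` uses with `align_corners=False` (pixel centres at integer indices). The new `align_cv_false` only needs a half-pixel shift because, with `cx = width / 2` as in `perspective_cv` above, the OpenCV-convention image coordinate of the centre of pixel `i` is `i + 0.5`. A small stand-alone check of both mappings (not repository code):

```python
import torch
import torch.nn.functional as F

H, W = 4, 8
# Each pixel stores its own column index, so a sample returns the x coordinate it hit.
image = torch.arange(W, dtype=torch.float32).expand(1, 1, H, W).contiguous()

def unnormalize_ndc_false(coordinate, size):
    # Same formula as the C++ helper: NDC in [-1, 1] -> continuous pixel index.
    return (coordinate + 1.0) * size / 2.0 - 0.5

def align_cv_false(coordinate):
    # OpenCV-style image coordinate (centre of pixel i at i + 0.5) -> continuous pixel index.
    return coordinate - 0.5

# The centre of pixel column 3 sits at NDC x = 2 * (3 + 0.5) / W - 1 and at OpenCV x = 3.5.
x_ndc = 2.0 * (3 + 0.5) / W - 1.0
y_ndc = 2.0 * (1 + 0.5) / H - 1.0
assert unnormalize_ndc_false(x_ndc, W) == 3.0
assert align_cv_false(3.5) == 3.0

# grid_sample with align_corners=False agrees: it reads back the value stored at column 3.
grid = torch.tensor([[[[x_ndc, y_ndc]]]], dtype=torch.float32)  # (x, y) in NDC
sample = F.grid_sample(image, grid, mode="bilinear", align_corners=False)
assert torch.isclose(sample.squeeze(), torch.tensor(3.0))
```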

src/torchhull/_C/include/torchhull/visual_hull.h

Lines changed: 4 additions & 1 deletion

@@ -15,6 +15,7 @@ visual_hull(const torch::Tensor& masks,
             const std::array<float, 3>& cube_corner_bfl,
             const float cube_length,
             const bool masks_partial,
+            const std::string& transforms_convention,
             const bool unique_verts);
 
 std::tuple<torch::Tensor, torch::Tensor, std::vector<torch::Tensor>>
@@ -24,6 +25,7 @@ visual_hull_with_candidate_voxels(const torch::Tensor& masks,
                                   const std::array<float, 3>& cube_corner_bfl,
                                   const float cube_length,
                                   const bool masks_partial,
+                                  const std::string& transforms_convention,
                                   const bool unique_verts);
 
 std::vector<std::tuple<torch::Tensor, torch::Tensor>>
@@ -37,6 +39,7 @@ sparse_visual_hull_field(const torch::Tensor& masks,
                          const int level,
                          const std::array<float, 3>& cube_corner_bfl,
                          const float cube_length,
-                         const bool masks_partial);
+                         const bool masks_partial,
+                         const std::string& transforms_convention);
 
 } // namespace torchhull

src/torchhull/_C/python/bindings.cpp

Lines changed: 12 additions & 3 deletions

@@ -17,6 +17,7 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
           "cube_corner_bfl"_a,
           "cube_length"_a,
           "masks_partial"_a,
+          "transforms_convention"_a,
           "unique_verts"_a = true,
           R"(
 Compute the visual hull of the given masks in terms of a mesh.
@@ -34,7 +35,7 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
 masks
     Single-channel mask images with binary values {0, 1}. B x H x W x 1.
 transforms
-    The combined transformations from world coordinates to OpenGL clip space (right before perspective division). B x 4 x 4.
+    The combined transformations, i.e. intrinsics * extrinsics, from world coordinates to image coordinates (right before perspective division). B x 4 x 4.
 level
     The hierarchy level to compute the visual hull at.
 cube_corner_bfl
@@ -43,6 +44,8 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
     The length of the cube in world space.
 masks_partial
     Whether some masks may only contain the object partially.
+transforms_convention
+    Convention used to specify the transformations. Options: `opengl`, `opencv`.
 unique_verts
     Whether a compact mesh without duplicate vertices (\|F\| approx. 2 * \|V\|) if true, or a triangle soup
     (\|F\| = (1/3) * \|V\|) if false should be returned.
@@ -61,6 +64,7 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
           "cube_corner_bfl"_a,
           "cube_length"_a,
           "masks_partial"_a,
+          "transforms_convention"_a,
           "unique_verts"_a = true,
           R"(
 Compute the visual hull of the given masks in terms of a mesh.
@@ -78,7 +82,7 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
 masks
     Single-channel mask images with binary values {0, 1}. B x H x W x 1.
 transforms
-    The combined transformations from world coordinates to OpenGL clip space (right before perspective division). B x 4 x 4.
+    The combined transformations, i.e. intrinsics * extrinsics, from world coordinates to image coordinates (right before perspective division). B x 4 x 4.
 level
     The hierarchy level to compute the visual hull at.
 cube_corner_bfl
@@ -87,6 +91,8 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
     The length of the cube in world space.
 masks_partial
     Whether some masks may only contain the object partially.
+transforms_convention
+    Convention used to specify the transformations. Options: `opengl`, `opencv`.
 unique_verts
     Whether a compact mesh without duplicate vertices (\|F\| approx. 2 * \|V\|) if true, or a triangle soup
     (\|F\| = (1/3) * \|V\|) if false should be returned.
@@ -136,6 +142,7 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
           "cube_corner_bfl"_a,
           "cube_length"_a,
           "masks_partial"_a,
+          "transforms_convention"_a,
           R"(
 Compute a sparse scalar field of the sum of projected foreground pixels per detected candidate voxel. In this
 field, the visual hull is located at isolevel \|M\| - 0.5.
@@ -153,7 +160,7 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
 masks
     Single-channel mask images with binary values {0, 1}. B x H x W x 1.
 transforms
-    The combined transformations from world coordinates to OpenGL clip space (right before perspective division). B x 4 x 4.
+    The combined transformations, i.e. intrinsics * extrinsics, from world coordinates to image coordinates (right before perspective division). B x 4 x 4.
 level
     The hierarchy level to compute the counts at.
 cube_corner_bfl
@@ -162,6 +169,8 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
     The length of the cube in world space.
 masks_partial
     Whether some masks may only contain the object partially.
+transforms_convention
+    Convention used to specify the transformations. Options: `opengl`, `opencv`.
 
 Returns
 -------
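The isolevel in the last docstring follows from what the field stores: each candidate voxel accumulates how many of the `|M|` masks project it onto foreground, so with `masks_partial=False` a threshold of `|M| - 0.5` separates voxels seen as foreground in every mask from all others. A toy illustration of that rule, not library code:

```python
import torch

M = 5                                   # number of masks |M|
counts = torch.tensor([0, 3, 4, 5, 5])  # foreground hits per candidate voxel
inside_hull = counts > (M - 0.5)        # the isolevel at which the hull is extracted
print(inside_hull)                      # tensor([False, False, False,  True,  True])
```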

src/torchhull/_C/src/visual_hull.cpp

Lines changed: 30 additions & 7 deletions

@@ -16,7 +16,8 @@ sparse_visual_hull_field_cuda_ravelled(const torch::Tensor& masks,
                                        const int level,
                                        const std::array<float, 3>& cube_corner_bfl,
                                        const float cube_length,
-                                       const bool masks_partial);
+                                       const bool masks_partial,
+                                       const std::string& transforms_convention);
 
 std::tuple<torch::Tensor, torch::Tensor>
 marching_cubes_cuda_sparse(const RavelledSparseTensor& sparse_volume,
@@ -37,14 +38,16 @@ visual_hull_cuda(const torch::Tensor& masks,
                  const std::array<float, 3>& cube_corner_bfl,
                  const float cube_length,
                  const bool masks_partial,
+                 const std::string& transforms_convention,
                  const bool unique_verts)
 {
     auto [volume, _] = sparse_visual_hull_field_cuda_ravelled(masks,
                                                               transforms,
                                                               level,
                                                               cube_corner_bfl,
                                                               cube_length,
-                                                              masks_partial);
+                                                              masks_partial,
+                                                              transforms_convention);
 
     auto isolevel = 0.5f;
     auto mesh = marching_cubes_cuda_sparse(volume, isolevel, false, unique_verts);
@@ -61,11 +64,19 @@ visual_hull(const torch::Tensor& masks,
             const std::array<float, 3>& cube_corner_bfl,
             const float cube_length,
             const bool masks_partial,
+            const std::string& transforms_convention,
             const bool unique_verts)
 {
     if (masks.is_cuda())
     {
-        return visual_hull_cuda(masks, transforms, level, cube_corner_bfl, cube_length, masks_partial, unique_verts);
+        return visual_hull_cuda(masks,
+                                transforms,
+                                level,
+                                cube_corner_bfl,
+                                cube_length,
+                                masks_partial,
+                                transforms_convention,
+                                unique_verts);
     }
 
     TORCH_CHECK(false, "No backend implementation available for device \"" + masks.device().str() + "\".");
@@ -78,14 +89,16 @@ visual_hull_cuda_with_candidate_voxels_cuda(const torch::Tensor& masks,
                                             const std::array<float, 3>& cube_corner_bfl,
                                             const float cube_length,
                                             const bool masks_partial,
+                                            const std::string& transforms_convention,
                                             const bool unique_verts)
 {
     auto [volume, candidates_octree] = sparse_visual_hull_field_cuda_ravelled(masks,
                                                                               transforms,
                                                                               level,
                                                                               cube_corner_bfl,
                                                                               cube_length,
-                                                                              masks_partial);
+                                                                              masks_partial,
+                                                                              transforms_convention);
 
     auto isolevel = 0.5f;
     auto mesh = marching_cubes_cuda_sparse(volume, isolevel, false, unique_verts);
@@ -102,6 +115,7 @@ visual_hull_with_candidate_voxels(const torch::Tensor& masks,
                                   const std::array<float, 3>& cube_corner_bfl,
                                   const float cube_length,
                                   const bool masks_partial,
+                                  const std::string& transforms_convention,
                                   const bool unique_verts)
 {
     if (masks.is_cuda())
@@ -112,6 +126,7 @@ visual_hull_with_candidate_voxels(const torch::Tensor& masks,
                                                            cube_corner_bfl,
                                                            cube_length,
                                                            masks_partial,
+                                                           transforms_convention,
                                                            unique_verts);
     }
 
@@ -145,19 +160,27 @@ sparse_visual_hull_field_cuda(const torch::Tensor& masks,
                               const int level,
                               const std::array<float, 3>& cube_corner_bfl,
                               const float cube_length,
-                              const bool masks_partial);
+                              const bool masks_partial,
+                              const std::string& transforms_convention);
 
 torch::Tensor
 sparse_visual_hull_field(const torch::Tensor& masks,
                          const torch::Tensor& transforms,
                          const int level,
                          const std::array<float, 3>& cube_corner_bfl,
                          const float cube_length,
-                         const bool masks_partial)
+                         const bool masks_partial,
+                         const std::string& transforms_convention)
 {
     if (masks.is_cuda())
     {
-        return sparse_visual_hull_field_cuda(masks, transforms, level, cube_corner_bfl, cube_length, masks_partial);
+        return sparse_visual_hull_field_cuda(masks,
                                             transforms,
                                             level,
                                             cube_corner_bfl,
                                             cube_length,
                                             masks_partial,
                                             transforms_convention);
     }
 
     TORCH_CHECK(false, "No backend implementation available for device \"" + masks.device().str() + "\".");
