From 2a361efc9b005ef8a512b2ab2c604366e4c75abe Mon Sep 17 00:00:00 2001 From: Antoine Simoulin Date: Fri, 20 Jun 2025 13:32:30 -0700 Subject: [PATCH 01/11] Adjust rotated clamping conditions Test Plan: ```bash pytest test/test_transforms_v2.py -k box -v ``` --- test/common_utils.py | 6 +- test/test_transforms_v2.py | 8 ++- .../transforms/v2/functional/_geometry.py | 15 ++--- torchvision/transforms/v2/functional/_meta.py | 60 +++++++++---------- 4 files changed, 47 insertions(+), 42 deletions(-) diff --git a/test/common_utils.py b/test/common_utils.py index 9da3cf52d1c..b0481b1badf 100644 --- a/test/common_utils.py +++ b/test/common_utils.py @@ -469,9 +469,9 @@ def sample_position(values, max_value): raise ValueError(f"Format {format} is not supported") out_boxes = torch.stack(parts, dim=-1).to(dtype=dtype, device=device) if tv_tensors.is_rotated_bounding_format(format): - # The rotated bounding boxes are not guaranteed to be within the canvas by design, - # so we apply clamping. We also add a 2 buffer to the canvas size to avoid - # numerical issues during the testing + # Rotated bounding boxes are not inherently confined within the canvas, so clamping is applied. + # Transform tests allow a 2-pixel tolerance relative to the canvas size. + # To prevent discrepancies when clamping with different canvas sizes, we add a 2-pixel buffer. buffer = 4 out_boxes = clamp_bounding_boxes( out_boxes, format=format, canvas_size=(canvas_size[0] - buffer, canvas_size[1] - buffer) diff --git a/test/test_transforms_v2.py b/test/test_transforms_v2.py index 7e667586ac1..4ef91cbf605 100644 --- a/test/test_transforms_v2.py +++ b/test/test_transforms_v2.py @@ -4421,9 +4421,15 @@ def _reference_resized_crop_bounding_boxes(self, bounding_boxes, *, top, left, h else reference_affine_bounding_boxes_helper ) + bounding_boxes = helper( + bounding_boxes, + affine_matrix=crop_affine_matrix, + new_canvas_size=(height, width) + ) + return helper( bounding_boxes, - affine_matrix=affine_matrix, + affine_matrix=resize_affine_matrix, new_canvas_size=size, ) diff --git a/torchvision/transforms/v2/functional/_geometry.py b/torchvision/transforms/v2/functional/_geometry.py index 7e9766bdaf5..b28f2aced28 100644 --- a/torchvision/transforms/v2/functional/_geometry.py +++ b/torchvision/transforms/v2/functional/_geometry.py @@ -1104,8 +1104,9 @@ def _affine_bounding_boxes_with_expand( original_shape = bounding_boxes.shape dtype = bounding_boxes.dtype - need_cast = not bounding_boxes.is_floating_point() - bounding_boxes = bounding_boxes.float() if need_cast else bounding_boxes.clone() + acceptable_dtypes = [torch.float64] # Ensure consistency between CPU and GPU. 
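+        # float64 is deliberate: float32 kernels can round differently on CPU
+        # and CUDA, which would make the transformed coordinates device-dependent.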
+ need_cast = dtype not in acceptable_dtypes + bounding_boxes = bounding_boxes.to(torch.float64) if need_cast else bounding_boxes.clone() device = bounding_boxes.device is_rotated = tv_tensors.is_rotated_bounding_format(format) intermediate_format = tv_tensors.BoundingBoxFormat.XYXYXYXY if is_rotated else tv_tensors.BoundingBoxFormat.XYXY @@ -2397,11 +2398,11 @@ def elastic_bounding_boxes( original_shape = bounding_boxes.shape # TODO: first cast to float if bbox is int64 before convert_bounding_box_format - intermediate_format = tv_tensors.BoundingBoxFormat.XYXYXYXY if is_rotated else tv_tensors.BoundingBoxFormat.XYXY + intermediate_format = tv_tensors.BoundingBoxFormat.CXCYWHR if is_rotated else tv_tensors.BoundingBoxFormat.XYXY bounding_boxes = ( convert_bounding_box_format(bounding_boxes.clone(), old_format=format, new_format=intermediate_format) - ).reshape(-1, 8 if is_rotated else 4) + ).reshape(-1, 5 if is_rotated else 4) id_grid = _create_identity_grid(canvas_size, device=device, dtype=dtype) # We construct an approximation of inverse grid as inv_grid = id_grid - displacement @@ -2409,7 +2410,7 @@ def elastic_bounding_boxes( inv_grid = id_grid.sub_(displacement) # Get points from bboxes - points = bounding_boxes if is_rotated else bounding_boxes[:, [[0, 1], [2, 1], [2, 3], [0, 3]]] + points = bounding_boxes[:, :2] if is_rotated else bounding_boxes[:, [[0, 1], [2, 1], [2, 3], [0, 3]]] points = points.reshape(-1, 2) if points.is_floating_point(): points = points.ceil_() @@ -2421,8 +2422,8 @@ def elastic_bounding_boxes( transformed_points = inv_grid[0, index_y, index_x, :].add_(1).mul_(0.5 * t_size).sub_(0.5) if is_rotated: - transformed_points = transformed_points.reshape(-1, 8) - out_bboxes = _parallelogram_to_bounding_boxes(transformed_points).to(bounding_boxes.dtype) + transformed_points = transformed_points.reshape(-1, 2) + out_bboxes = torch.cat([transformed_points, bounding_boxes[:, 2:]], dim=1).to(bounding_boxes.dtype) else: transformed_points = transformed_points.reshape(-1, 4, 2) out_bbox_mins, out_bbox_maxs = torch.aminmax(transformed_points, dim=1) diff --git a/torchvision/transforms/v2/functional/_meta.py b/torchvision/transforms/v2/functional/_meta.py index 1729aa4bbaf..96ee69c46c0 100644 --- a/torchvision/transforms/v2/functional/_meta.py +++ b/torchvision/transforms/v2/functional/_meta.py @@ -409,23 +409,17 @@ def _order_bounding_boxes_points( if indices is None: output_xyxyxyxy = bounding_boxes.reshape(-1, 8) x, y = output_xyxyxyxy[..., 0::2], output_xyxyxyxy[..., 1::2] - y_max = torch.max(y, dim=1, keepdim=True)[0] - _, x1 = ((y_max - y) / y_max + (x + 1) * 100).min(dim=1) + y_max = torch.max(y.abs(), dim=1, keepdim=True)[0] + _, x1 = (y / y_max + (x + 1) * 100).min(dim=1) indices = torch.ones_like(output_xyxyxyxy) indices[..., 0] = x1.mul(2) indices.cumsum_(1).remainder_(8) return indices, bounding_boxes.gather(1, indices.to(torch.int64)) -def _area(box: torch.Tensor) -> torch.Tensor: - x1, y1, x2, y2, x3, y3, x4, y4 = box.reshape(-1, 8).unbind(-1) - w = torch.sqrt((y2 - y1) ** 2 + (x2 - x1) ** 2) - h = torch.sqrt((y3 - y2) ** 2 + (x3 - x2) ** 2) - return w * h - - def _clamp_along_y_axis( bounding_boxes: torch.Tensor, + canvas_size: tuple[int, int], ) -> torch.Tensor: """ Adjusts bounding boxes along the y-axis based on specific conditions. 
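For orientation, `_order_bounding_boxes_points` exists so the case analysis below always sees the same physical corner first. A rough standalone sketch of that invariant (the helper and its scoring are illustrative, not the library code):

```python
import torch

def order_box_points(boxes_xyxyxyxy: torch.Tensor) -> torch.Tensor:
    """Cyclically shift each box's 4 (x, y) vertices so a canonical corner comes first."""
    pts = boxes_xyxyxyxy.reshape(-1, 4, 2)
    x, y = pts[..., 0], pts[..., 1]
    # Score vertices so that x dominates and y only breaks ties, mirroring the patch's heuristic.
    score = (x + 1) * 100 + y / y.abs().amax(dim=1, keepdim=True)
    first = score.argmin(dim=1)  # canonical first vertex per box
    idx = (first.unsqueeze(1) + torch.arange(4)) % 4  # cyclic shift keeps the winding order
    return pts.gather(1, idx.unsqueeze(-1).expand(-1, 4, 2)).reshape(-1, 8)

box = torch.tensor([[3.0, 1.0, 5.0, 3.0, 3.0, 5.0, 1.0, 3.0]])
print(order_box_points(box))  # tensor([[1., 3., 3., 1., 5., 3., 3., 5.]])
```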
@@ -448,29 +442,33 @@ def _clamp_along_y_axis(
     b2 = y2 + x2 / a
     b3 = y3 - a * x3
     b4 = y4 + x4 / a
-    b23 = (b2 - b3) / 2 * a / (1 + a**2)
-    z = torch.zeros_like(b1)
-    case_a = torch.cat([x.unsqueeze(1) for x in [z, b1, x2, y2, x3, y3, x3 - x2, y3 + b1 - y2]], dim=1)
-    case_b = torch.cat([x.unsqueeze(1) for x in [z, b4, x2 - x1, y2 - y1 + b4, x3, y3, x4, y4]], dim=1)
-    case_c = torch.cat(
-        [x.unsqueeze(1) for x in [z, (b2 + b3) / 2, b23, -b23 / a + b2, x3, y3, b23, b23 * a + b3]], dim=1
+    c = a / (1 + a**2)
+    b1 = b2.clamp(0).clamp(b1, b3)
+    b4 = b3.clamp(max=canvas_size[0]).clamp(b2, b4)
+    case_a = torch.stack(
+        (
+            (b4 - b1) * c,
+            (b4 - b1) * c * a + b1,
+            (b2 - b1) * c,
+            (b1 - b2) * c / a + b2,
+            x3,
+            y3,
+            (b4 - b3) * c,
+            (b3 - b4) * c / a + b4,
+        ),
+        dim=-1,
     )
-    case_d = torch.zeros_like(case_c)
-    case_e = torch.cat([x.unsqueeze(1) for x in [x1.clamp(0), y1, x2.clamp(0), y2, x3, y3, x4, y4]], dim=1)
-
-    cond_a = (x1 < 0).logical_and(x2 >= 0).logical_and(x3 >= 0).logical_and(x4 >= 0)
-    cond_a = cond_a.logical_and(_area(case_a) > _area(case_b))
-    cond_a = cond_a.logical_or((x1 < 0).logical_and(x2 >= 0).logical_and(x3 >= 0).logical_and(x4 <= 0))
-    cond_b = (x1 < 0).logical_and(x2 >= 0).logical_and(x3 >= 0).logical_and(x4 >= 0)
-    cond_b = cond_b.logical_and(_area(case_a) <= _area(case_b))
-    cond_b = cond_b.logical_or((x1 < 0).logical_and(x2 <= 0).logical_and(x3 >= 0).logical_and(x4 >= 0))
-    cond_c = (x1 < 0).logical_and(x2 <= 0).logical_and(x3 >= 0).logical_and(x4 <= 0)
-    cond_d = (x1 < 0).logical_and(x2 <= 0).logical_and(x3 <= 0).logical_and(x4 <= 0)
-    cond_e = x1.isclose(x2)
-
+    case_b = bounding_boxes.clone()
+    case_b[..., 0].clamp_(0)
+    case_b[..., 6].clamp_(0)
+    case_c = torch.zeros_like(case_b)
+
+    cond_a = x1 < 0
+    cond_b = y1.isclose(y2, rtol=1e-05, atol=1e-05)
+    cond_c = (x1 <= 0).logical_and(x2 <= 0).logical_and(x3 <= 0).logical_and(x4 <= 0)
     for cond, case in zip(
-        [cond_a, cond_b, cond_c, cond_d, cond_e],
-        [case_a, case_b, case_c, case_d, case_e],
+        [cond_a, cond_b, cond_c],
+        [case_a, case_b, case_c],
    ):
         bounding_boxes = torch.where(cond.unsqueeze(1).repeat(1, 8), case.reshape(-1, 8), bounding_boxes)
     return bounding_boxes.to(original_dtype).reshape(original_shape)
@@ -512,7 +510,7 @@ def _clamp_rotated_bounding_boxes(
 
     for _ in range(4):  # Iterate over the 4 vertices.
         indices, out_boxes = _order_bounding_boxes_points(out_boxes)
-        out_boxes = _clamp_along_y_axis(out_boxes)
+        out_boxes = _clamp_along_y_axis(out_boxes, canvas_size)
         _, out_boxes = _order_bounding_boxes_points(out_boxes, indices)
         # rotate 90 degrees counterclockwise
         out_boxes[:, ::2], out_boxes[:, 1::2] = (

From 4261ed3201ced834533259d5b361a2d8df0cd2c0 Mon Sep 17 00:00:00 2001
From: Antoine Simoulin
Date: Fri, 20 Jun 2025 13:46:41 -0700
Subject: [PATCH 02/11] apply linting

---
 test/common_utils.py       | 2 +-
 test/test_transforms_v2.py | 7 +------
 2 files changed, 2 insertions(+), 7 deletions(-)

diff --git a/test/common_utils.py b/test/common_utils.py
index b0481b1badf..8ecfd81d3a0 100644
--- a/test/common_utils.py
+++ b/test/common_utils.py
@@ -469,7 +469,7 @@ def sample_position(values, max_value):
         raise ValueError(f"Format {format} is not supported")
     out_boxes = torch.stack(parts, dim=-1).to(dtype=dtype, device=device)
     if tv_tensors.is_rotated_bounding_format(format):
-        # Rotated bounding boxes are not inherently confined within the canvas, so clamping is applied.
+        # Rotated bounding boxes are not inherently confined within the canvas, so clamping is applied.
# Transform tests allow a 2-pixel tolerance relative to the canvas size. # To prevent discrepancies when clamping with different canvas sizes, we add a 2-pixel buffer. buffer = 4 diff --git a/test/test_transforms_v2.py b/test/test_transforms_v2.py index 4ef91cbf605..19b832a14bd 100644 --- a/test/test_transforms_v2.py +++ b/test/test_transforms_v2.py @@ -4413,7 +4413,6 @@ def _reference_resized_crop_bounding_boxes(self, bounding_boxes, *, top, left, h [0, 0, 1], ], ) - affine_matrix = (resize_affine_matrix @ crop_affine_matrix)[:2, :] helper = ( reference_affine_rotated_bounding_boxes_helper @@ -4421,11 +4420,7 @@ def _reference_resized_crop_bounding_boxes(self, bounding_boxes, *, top, left, h else reference_affine_bounding_boxes_helper ) - bounding_boxes = helper( - bounding_boxes, - affine_matrix=crop_affine_matrix, - new_canvas_size=(height, width) - ) + bounding_boxes = helper(bounding_boxes, affine_matrix=crop_affine_matrix, new_canvas_size=(height, width)) return helper( bounding_boxes, From 42bae572fd426368354d024346fd59ff8f3086c7 Mon Sep 17 00:00:00 2001 From: Antoine Simoulin Date: Thu, 26 Jun 2025 19:51:36 -0700 Subject: [PATCH 03/11] Fix hard clamping --- test/common_utils.py | 12 -- test/test_transforms_v2.py | 18 ++- torchvision/transforms/v2/functional/_meta.py | 148 ++++++++++++++---- 3 files changed, 124 insertions(+), 54 deletions(-) diff --git a/test/common_utils.py b/test/common_utils.py index 8ecfd81d3a0..61feee4c896 100644 --- a/test/common_utils.py +++ b/test/common_utils.py @@ -468,18 +468,6 @@ def sample_position(values, max_value): else: raise ValueError(f"Format {format} is not supported") out_boxes = torch.stack(parts, dim=-1).to(dtype=dtype, device=device) - if tv_tensors.is_rotated_bounding_format(format): - # Rotated bounding boxes are not inherently confined within the canvas, so clamping is applied. - # Transform tests allow a 2-pixel tolerance relative to the canvas size. - # To prevent discrepancies when clamping with different canvas sizes, we add a 2-pixel buffer. 
- buffer = 4 - out_boxes = clamp_bounding_boxes( - out_boxes, format=format, canvas_size=(canvas_size[0] - buffer, canvas_size[1] - buffer) - ) - if format is tv_tensors.BoundingBoxFormat.XYWHR or format is tv_tensors.BoundingBoxFormat.CXCYWHR: - out_boxes[:, :2] += buffer // 2 - elif format is tv_tensors.BoundingBoxFormat.XYXYXYXY: - out_boxes[:, :] += buffer // 2 return tv_tensors.BoundingBoxes(out_boxes, format=format, canvas_size=canvas_size) diff --git a/test/test_transforms_v2.py b/test/test_transforms_v2.py index 19b832a14bd..9b44f0f5a1a 100644 --- a/test/test_transforms_v2.py +++ b/test/test_transforms_v2.py @@ -1298,7 +1298,7 @@ def _reference_horizontal_flip_bounding_boxes(self, bounding_boxes: tv_tensors.B ) helper = ( - functools.partial(reference_affine_rotated_bounding_boxes_helper, flip=True) + functools.partial(reference_affine_rotated_bounding_boxes_helper, flip=True, clamp=False) if tv_tensors.is_rotated_bounding_format(bounding_boxes.format) else reference_affine_bounding_boxes_helper ) @@ -1907,7 +1907,7 @@ def _reference_vertical_flip_bounding_boxes(self, bounding_boxes: tv_tensors.Bou ) helper = ( - functools.partial(reference_affine_rotated_bounding_boxes_helper, flip=True) + functools.partial(reference_affine_rotated_bounding_boxes_helper, flip=True, clamp=False) if tv_tensors.is_rotated_bounding_format(bounding_boxes.format) else reference_affine_bounding_boxes_helper ) @@ -2196,7 +2196,7 @@ def _recenter_bounding_boxes_after_expand(self, bounding_boxes, *, recenter_xy): (bounding_boxes.to(torch.float64) - torch.tensor(translate)).to(bounding_boxes.dtype), like=bounding_boxes ) - def _reference_rotate_bounding_boxes(self, bounding_boxes, *, angle, expand, center): + def _reference_rotate_bounding_boxes(self, bounding_boxes, *, angle, expand, center, canvas_size=None): if center is None: center = [s * 0.5 for s in bounding_boxes.canvas_size[::-1]] cx, cy = center @@ -2222,7 +2222,7 @@ def _reference_rotate_bounding_boxes(self, bounding_boxes, *, angle, expand, cen output = helper( bounding_boxes, affine_matrix=affine_matrix, - new_canvas_size=new_canvas_size, + new_canvas_size=new_canvas_size if canvas_size is None else canvas_size, clamp=False, ) @@ -2239,9 +2239,10 @@ def test_functional_bounding_boxes_correctness(self, format, angle, expand, cent actual = F.rotate(bounding_boxes, angle=angle, expand=expand, center=center) expected = self._reference_rotate_bounding_boxes(bounding_boxes, angle=angle, expand=expand, center=center) + torch.testing.assert_close(F.get_size(actual), F.get_size(expected), atol=2 if expand else 0, rtol=0) + expected = self._reference_rotate_bounding_boxes(bounding_boxes, angle=angle, expand=expand, center=center, canvas_size=actual.canvas_size) torch.testing.assert_close(actual, expected) - torch.testing.assert_close(F.get_size(actual), F.get_size(expected), atol=2 if expand else 0, rtol=0) @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) @pytest.mark.parametrize("expand", [False, True]) @@ -2259,9 +2260,10 @@ def test_transform_bounding_boxes_correctness(self, format, expand, center, seed actual = transform(bounding_boxes) expected = self._reference_rotate_bounding_boxes(bounding_boxes, **params, expand=expand, center=center) - - torch.testing.assert_close(actual, expected) torch.testing.assert_close(F.get_size(actual), F.get_size(expected), atol=2 if expand else 0, rtol=0) + + expected = self._reference_rotate_bounding_boxes(bounding_boxes, **params, expand=expand, center=center, canvas_size=actual.canvas_size) + 
torch.testing.assert_close(actual, expected) def _recenter_keypoints_after_expand(self, keypoints, *, recenter_xy): x, y = recenter_xy @@ -4437,7 +4439,7 @@ def test_functional_bounding_boxes_correctness(self, format): bounding_boxes, **self.CROP_KWARGS, size=self.OUTPUT_SIZE ) - torch.testing.assert_close(actual, expected) + torch.testing.assert_close(actual, expected, atol=1e-5, rtol=1e-5) assert_equal(F.get_size(actual), F.get_size(expected)) def _reference_resized_crop_keypoints(self, keypoints, *, top, left, height, width, size): diff --git a/torchvision/transforms/v2/functional/_meta.py b/torchvision/transforms/v2/functional/_meta.py index 96ee69c46c0..7d14d2eb084 100644 --- a/torchvision/transforms/v2/functional/_meta.py +++ b/torchvision/transforms/v2/functional/_meta.py @@ -410,16 +410,87 @@ def _order_bounding_boxes_points( output_xyxyxyxy = bounding_boxes.reshape(-1, 8) x, y = output_xyxyxyxy[..., 0::2], output_xyxyxyxy[..., 1::2] y_max = torch.max(y.abs(), dim=1, keepdim=True)[0] - _, x1 = (y / y_max + (x + 1) * 100).min(dim=1) + x_max = torch.max(x.abs(), dim=1, keepdim=True)[0] + _, x1 = (y / y_max + (x / x_max) * 100).min(dim=1) indices = torch.ones_like(output_xyxyxyxy) indices[..., 0] = x1.mul(2) indices.cumsum_(1).remainder_(8) return indices, bounding_boxes.gather(1, indices.to(torch.int64)) +def _get_slope_and_intercept(box: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: + """ + Calculate the slope and y-intercept of the lines defined by consecutive vertices in a bounding box. + This function computes the slope (a) and y-intercept (b) for each line segment in a bounding box, + where each line is defined by two consecutive vertices. + """ + x, y = box[..., ::2], box[..., 1::2] + a = y.diff(append=y[..., 0:1]) / x.diff(append=x[..., 0:1]) + b = y - a * x + return a, b + + +def _get_intersection_point(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor: + """ + Calculate the intersection point of two lines defined by their slopes and y-intercepts. + This function computes the intersection points between pairs of lines, where each line + is defined by the equation y = ax + b (slope and y-intercept form). + """ + batch_size = a.shape[0] + x = b.diff(prepend=b[..., 3:4]).neg() / a.diff(prepend=a[..., 3:4]) + y = a * x + b + return torch.cat((x.unsqueeze(-1), y.unsqueeze(-1)), dim=-1).view(batch_size, 8) + + +def _clamp_y_intercept( + bounding_boxes: torch.Tensor, + original_bounding_boxes: torch.Tensor, + canvas_size: tuple[int, int], + clamping: str = "hard", +) -> torch.Tensor: + """ + Apply clamping to bounding box y-intercepts. This function handles two clamping strategies: + - Hard clamping: Ensures all box vertices stay within canvas boundaries, finding the largest + angle-preserving box enclosed within the original box and the image canvas. + - Soft clamping: Allows some vertices to extend beyond the canvas, finding the smallest + angle-preserving box that encloses the intersection of the original box and the image canvas. + + The function first calculates the slopes and y-intercepts of the lines forming the bounding box, + then applies various constraints to ensure the clamping conditions are respected. 
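+
+    Concretely, each side of the box is treated as a line y = a * x + b: two
+    vertices (x1, y1) and (x2, y2) give a = (y2 - y1) / (x2 - x1) and
+    b = y1 - a * x1, and two adjacent sides y = a1 * x + b1 and y = a2 * x + b2
+    intersect at x = (b2 - b1) / (a1 - a2), y = a1 * x + b1.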
+ """ + + a, b = _get_slope_and_intercept(bounding_boxes) + a1, a2, a3, a4 = a.unbind(-1) + b1, b2, b3, b4 = b.unbind(-1) + + # Clamp y-intercepts (soft clamping) + b1 = b2.clamp(0).clamp(b1, b3) + b4 = b3.clamp(max=canvas_size[0]).clamp(b2, b4) + + if clamping == "hard": + # Get y-intercepts from original bounding boxes + _, b = _get_slope_and_intercept(original_bounding_boxes) + _, b2, b3, _ = b.unbind(-1) + + # Set b1 and b4 to the average of their clamped values + b1 = b4 = (b1.clamp(0, canvas_size[0]) + b4.clamp(0, canvas_size[0])) / 2 + + # Ensure b2 and b3 defined the box of maximum area after clamping b1 and b4 + b2.clamp_(b1 * a2 / a1, b4).clamp_((a1 - a2) * canvas_size[1] + b1) + b2.clamp_(b3 * a2 / a3, b4).clamp_((a3 - a2) * canvas_size[1] + b3) + b3.clamp_(max=canvas_size[0] * (1 - a3 / a4) + b4 * a3 / a4) + b3.clamp_(max=canvas_size[0] * (1 - a3 / a2) + b2 * a3 / a2) + b3.clamp_(b1, (a2 - a3) * canvas_size[1] + b2) + b3.clamp_(b1, (a4 - a3) * canvas_size[1] + b4) + + return torch.stack([b1, b2, b3, b4], dim=-1) + + def _clamp_along_y_axis( bounding_boxes: torch.Tensor, + original_bounding_boxes: torch.Tensor, canvas_size: tuple[int, int], + clamping: str = "hard", ) -> torch.Tensor: """ Adjusts bounding boxes along the y-axis based on specific conditions. @@ -430,52 +501,53 @@ def _clamp_along_y_axis( Args: bounding_boxes (torch.Tensor): A tensor containing bounding box coordinates. + original_bounding_boxes (torch.Tensor): The original bounding boxes before any clamping is applied. + canvas_size (tuple[int, int]): The size of the canvas as (height, width). + clamping (str, optional): The clamping strategy to use. Defaults to "hard". Returns: torch.Tensor: The adjusted bounding boxes. """ - original_dtype = bounding_boxes.dtype + dtype = bounding_boxes.dtype + acceptable_dtypes = [torch.float64] # Ensure consistency between CPU and GPU. + need_cast = dtype not in acceptable_dtypes + eps = 1e-06 # Ensure consistency between CPU and GPU. 
original_shape = bounding_boxes.shape - x1, y1, x2, y2, x3, y3, x4, y4 = bounding_boxes.reshape(-1, 8).unbind(-1) - a = (y2 - y1) / (x2 - x1) - b1 = y1 - a * x1 - b2 = y2 + x2 / a - b3 = y3 - a * x3 - b4 = y4 + x4 / a - c = a / (1 + a**2) - b1 = b2.clamp(0).clamp(b1, b3) - b4 = b3.clamp(max=canvas_size[0]).clamp(b2, b4) - case_a = torch.stack( - ( - (b4 - b1) * c, - (b4 - b1) * c * a + b1, - (b2 - b1) * c, - (b1 - b2) * c / a + b2, - x3, - y3, - (b4 - b3) * c, - (b3 - b4) * c / a + b4, - ), - dim=-1, - ) + bounding_boxes = bounding_boxes.reshape(-1, 8) + original_bounding_boxes = original_bounding_boxes.reshape(-1, 8) + + # Calculate slopes (a) and y-intercepts (b) for all lines in the bounding boxes + a, b = _get_slope_and_intercept(bounding_boxes) + x1, y1, x2, y2, x3, y3, x4, y4 = bounding_boxes.unbind(-1) + b = _clamp_y_intercept(bounding_boxes, original_bounding_boxes, canvas_size, clamping) + + case_a = _get_intersection_point(a, b) case_b = bounding_boxes.clone() - case_b[..., 0].clamp_(0) - case_b[..., 6].clamp_(0) + case_b[..., 0].clamp_(0) # Clamp x1 to 0 + case_b[..., 6].clamp_(0) # Clamp x4 to 0 case_c = torch.zeros_like(case_b) - cond_a = x1 < 0 - cond_b = y1.isclose(y2, rtol=1e-05, atol=1e-05) - cond_c = (x1 <= 0).logical_and(x2 <= 0).logical_and(x3 <= 0).logical_and(x4 <= 0) - for cond, case in zip( + cond_a = (x1 < eps) & ~case_a.isnan().any(-1) # First point is outside left boundary + cond_b = y1.isclose(y2, rtol=eps, atol=eps) | y3.isclose(y4, rtol=eps, atol=eps) # First line is nearly vertical + cond_c = (x1 <= 0) & (x2 <= 0) & (x3 <= 0) & (x4 <= 0) # All points outside left boundary + cond_c = cond_c | y1.isclose(y4, rtol=eps, atol=eps) | y2.isclose(y3, rtol=eps, atol=eps) | (cond_b & x1.isclose(x2, rtol=eps, atol=eps)) # First line is nearly horizontal + + for (cond, case) in zip( [cond_a, cond_b, cond_c], [case_a, case_b, case_c], ): bounding_boxes = torch.where(cond.unsqueeze(1).repeat(1, 8), case.reshape(-1, 8), bounding_boxes) - return bounding_boxes.to(original_dtype).reshape(original_shape) + bounding_boxes[..., 0].clamp_(0) # Clamp x1 to 0 + + if need_cast: + if dtype in (torch.uint8, torch.int8, torch.int16, torch.int32, torch.int64): + bounding_boxes.round_() + bounding_boxes = bounding_boxes.to(dtype) + return bounding_boxes.reshape(original_shape) def _clamp_rotated_bounding_boxes( - bounding_boxes: torch.Tensor, format: BoundingBoxFormat, canvas_size: tuple[int, int] + bounding_boxes: torch.Tensor, format: BoundingBoxFormat, canvas_size: tuple[int, int], clamping_mode: str = "soft" ) -> torch.Tensor: """ Clamp rotated bounding boxes to ensure they stay within the canvas boundaries. @@ -508,15 +580,22 @@ def _clamp_rotated_bounding_boxes( ) ).reshape(-1, 8) + original_boxes = out_boxes.clone() for _ in range(4): # Iterate over the 4 vertices. 
indices, out_boxes = _order_bounding_boxes_points(out_boxes)
-        out_boxes = _clamp_along_y_axis(out_boxes, canvas_size)
+        _, original_boxes = _order_bounding_boxes_points(original_boxes, indices)
+        out_boxes = _clamp_along_y_axis(out_boxes, original_boxes, canvas_size, clamping_mode)
         _, out_boxes = _order_bounding_boxes_points(out_boxes, indices)
+        _, original_boxes = _order_bounding_boxes_points(original_boxes, indices)
         # rotate 90 degrees counterclockwise
         out_boxes[:, ::2], out_boxes[:, 1::2] = (
             out_boxes[:, 1::2].clone(),
             canvas_size[1] - out_boxes[:, ::2].clone(),
         )
+        original_boxes[:, ::2], original_boxes[:, 1::2] = (
+            original_boxes[:, 1::2].clone(),
+            canvas_size[1] - original_boxes[:, ::2].clone(),
+        )
         canvas_size = (canvas_size[1], canvas_size[0])
 
     out_boxes = convert_bounding_box_format(
@@ -525,7 +604,8 @@ def _clamp_rotated_bounding_boxes(
 
     if need_cast:
         if dtype in (torch.uint8, torch.int8, torch.int16, torch.int32, torch.int64):
-            out_boxes.round_()
+            # Adding epsilon to ensure consistency between CPU and GPU rounding.
+            out_boxes.add_(1e-7).round_()
         out_boxes = out_boxes.to(dtype)
     return out_boxes
 

From 9e3f7c03e86e8e84a39e79a8f7512a24386a4fd5 Mon Sep 17 00:00:00 2001
From: Antoine Simoulin
Date: Thu, 26 Jun 2025 20:00:18 -0700
Subject: [PATCH 04/11] Adjust soft clamping

---
 torchvision/transforms/v2/functional/_meta.py | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/torchvision/transforms/v2/functional/_meta.py b/torchvision/transforms/v2/functional/_meta.py
index 7d14d2eb084..1ea73a7cf8e 100644
--- a/torchvision/transforms/v2/functional/_meta.py
+++ b/torchvision/transforms/v2/functional/_meta.py
@@ -446,7 +446,7 @@ def _clamp_y_intercept(
     bounding_boxes: torch.Tensor,
     original_bounding_boxes: torch.Tensor,
     canvas_size: tuple[int, int],
-    clamping: str = "hard",
+    clamping_mode: str = "hard",
 ) -> torch.Tensor:
     """
     Apply clamping to bounding box y-intercepts. This function handles two clamping strategies:
@@ -464,10 +464,10 @@ def _clamp_y_intercept(
     b1, b2, b3, b4 = b.unbind(-1)
 
     # Clamp y-intercepts (soft clamping)
-    b1 = b2.clamp(0).clamp(b1, b3)
-    b4 = b3.clamp(max=canvas_size[0]).clamp(b2, b4)
+    b1 = b2.clamp(b1, b3).clamp(0, canvas_size[0])
+    b4 = b3.clamp(b2, b4).clamp(0, canvas_size[0])
 
-    if clamping == "hard":
+    if clamping_mode == "hard":
         # Get y-intercepts from original bounding boxes
         _, b = _get_slope_and_intercept(original_bounding_boxes)
         _, b2, b3, _ = b.unbind(-1)
@@ -490,7 +490,7 @@ def _clamp_along_y_axis(
     bounding_boxes: torch.Tensor,
     original_bounding_boxes: torch.Tensor,
     canvas_size: tuple[int, int],
-    clamping: str = "hard",
+    clamping_mode: str = "hard",
 ) -> torch.Tensor:
     """
     Adjusts bounding boxes along the y-axis based on specific conditions.
@@ -503,7 +503,7 @@ def _clamp_along_y_axis(
         bounding_boxes (torch.Tensor): A tensor containing bounding box coordinates.
         original_bounding_boxes (torch.Tensor): The original bounding boxes before any clamping is applied.
         canvas_size (tuple[int, int]): The size of the canvas as (height, width).
-        clamping (str, optional): The clamping strategy to use. Defaults to "hard".
+        clamping_mode (str, optional): The clamping strategy to use. Defaults to "hard".
 
     Returns:
         torch.Tensor: The adjusted bounding boxes.
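The loop above only ever clamps against the left canvas edge (the y-axis); each of the four passes then rotates every vertex a quarter turn counterclockwise so the next edge takes its place. A minimal self-contained sketch of that bookkeeping (the helper name and values here are ours, not the library's):

```python
import torch

def quarter_turn_ccw(points: torch.Tensor, canvas_size: tuple[int, int]) -> torch.Tensor:
    # Mirrors the in-place swap in _clamp_rotated_bounding_boxes:
    # (x, y) -> (y, W - x), where W is the current canvas width.
    x, y = points[..., 0], points[..., 1]
    return torch.stack((y, canvas_size[1] - x), dim=-1)

pts = torch.tensor([[10.0, 20.0]])
canvas = (100, 200)  # (height, width)
for _ in range(4):
    pts = quarter_turn_ccw(pts, canvas)
    canvas = (canvas[1], canvas[0])  # height and width trade places each turn
print(pts)  # tensor([[10., 20.]]) -- four quarter turns restore the input
```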
@@ -519,7 +519,7 @@ def _clamp_along_y_axis( # Calculate slopes (a) and y-intercepts (b) for all lines in the bounding boxes a, b = _get_slope_and_intercept(bounding_boxes) x1, y1, x2, y2, x3, y3, x4, y4 = bounding_boxes.unbind(-1) - b = _clamp_y_intercept(bounding_boxes, original_bounding_boxes, canvas_size, clamping) + b = _clamp_y_intercept(bounding_boxes, original_bounding_boxes, canvas_size, clamping_mode) case_a = _get_intersection_point(a, b) case_b = bounding_boxes.clone() @@ -537,7 +537,8 @@ def _clamp_along_y_axis( [case_a, case_b, case_c], ): bounding_boxes = torch.where(cond.unsqueeze(1).repeat(1, 8), case.reshape(-1, 8), bounding_boxes) - bounding_boxes[..., 0].clamp_(0) # Clamp x1 to 0 + if clamping_mode == "hard": + bounding_boxes[..., 0].clamp_(0) # Clamp x1 to 0 if need_cast: if dtype in (torch.uint8, torch.int8, torch.int16, torch.int32, torch.int64): From be3619feb4a90dac2938836768e0c6f6611679d3 Mon Sep 17 00:00:00 2001 From: Antoine Simoulin Date: Thu, 26 Jun 2025 20:08:52 -0700 Subject: [PATCH 05/11] lint --- test/common_utils.py | 2 +- test/test_transforms_v2.py | 10 +++++++--- torchvision/transforms/v2/functional/_meta.py | 7 ++++++- 3 files changed, 14 insertions(+), 5 deletions(-) diff --git a/test/common_utils.py b/test/common_utils.py index 61feee4c896..afb48dce541 100644 --- a/test/common_utils.py +++ b/test/common_utils.py @@ -21,7 +21,7 @@ from torch.testing._comparison import BooleanPair, NonePair, not_close_error_metas, NumberPair, TensorLikePair from torchvision import io, tv_tensors from torchvision.transforms._functional_tensor import _max_value as get_max_value -from torchvision.transforms.v2.functional import clamp_bounding_boxes, to_image, to_pil_image +from torchvision.transforms.v2.functional import to_image, to_pil_image IN_OSS_CI = any(os.getenv(var) == "true" for var in ["CIRCLECI", "GITHUB_ACTIONS"]) diff --git a/test/test_transforms_v2.py b/test/test_transforms_v2.py index 9b44f0f5a1a..bea3fe8976d 100644 --- a/test/test_transforms_v2.py +++ b/test/test_transforms_v2.py @@ -2241,7 +2241,9 @@ def test_functional_bounding_boxes_correctness(self, format, angle, expand, cent expected = self._reference_rotate_bounding_boxes(bounding_boxes, angle=angle, expand=expand, center=center) torch.testing.assert_close(F.get_size(actual), F.get_size(expected), atol=2 if expand else 0, rtol=0) - expected = self._reference_rotate_bounding_boxes(bounding_boxes, angle=angle, expand=expand, center=center, canvas_size=actual.canvas_size) + expected = self._reference_rotate_bounding_boxes( + bounding_boxes, angle=angle, expand=expand, center=center, canvas_size=actual.canvas_size + ) torch.testing.assert_close(actual, expected) @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) @@ -2261,8 +2263,10 @@ def test_transform_bounding_boxes_correctness(self, format, expand, center, seed expected = self._reference_rotate_bounding_boxes(bounding_boxes, **params, expand=expand, center=center) torch.testing.assert_close(F.get_size(actual), F.get_size(expected), atol=2 if expand else 0, rtol=0) - - expected = self._reference_rotate_bounding_boxes(bounding_boxes, **params, expand=expand, center=center, canvas_size=actual.canvas_size) + + expected = self._reference_rotate_bounding_boxes( + bounding_boxes, **params, expand=expand, center=center, canvas_size=actual.canvas_size + ) torch.testing.assert_close(actual, expected) def _recenter_keypoints_after_expand(self, keypoints, *, recenter_xy): diff --git a/torchvision/transforms/v2/functional/_meta.py 
b/torchvision/transforms/v2/functional/_meta.py index 1ea73a7cf8e..c30838905fd 100644 --- a/torchvision/transforms/v2/functional/_meta.py +++ b/torchvision/transforms/v2/functional/_meta.py @@ -530,7 +530,12 @@ def _clamp_along_y_axis( cond_a = (x1 < eps) & ~case_a.isnan().any(-1) # First point is outside left boundary cond_b = y1.isclose(y2, rtol=eps, atol=eps) | y3.isclose(y4, rtol=eps, atol=eps) # First line is nearly vertical cond_c = (x1 <= 0) & (x2 <= 0) & (x3 <= 0) & (x4 <= 0) # All points outside left boundary - cond_c = cond_c | y1.isclose(y4, rtol=eps, atol=eps) | y2.isclose(y3, rtol=eps, atol=eps) | (cond_b & x1.isclose(x2, rtol=eps, atol=eps)) # First line is nearly horizontal + cond_c = ( + cond_c + | y1.isclose(y4, rtol=eps, atol=eps) + | y2.isclose(y3, rtol=eps, atol=eps) + | (cond_b & x1.isclose(x2, rtol=eps, atol=eps)) + ) # First line is nearly horizontal for (cond, case) in zip( [cond_a, cond_b, cond_c], From 62f5f78347a335b30442ca27869e185303c63706 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 30 Jun 2025 10:31:49 +0100 Subject: [PATCH 06/11] remove debug stuff --- test/test_transforms_v2.py | 1 - 1 file changed, 1 deletion(-) diff --git a/test/test_transforms_v2.py b/test/test_transforms_v2.py index 87abd7516b2..c4209c5e05c 100644 --- a/test/test_transforms_v2.py +++ b/test/test_transforms_v2.py @@ -5615,7 +5615,6 @@ class TestSetClampingMode: def test_setter(self, format, constructor_clamping_mode, desired_clamping_mode): in_boxes = make_bounding_boxes(format=format, clamping_mode=constructor_clamping_mode) - assert in_boxes.clamping_mode == constructor_clamping_mode # input is unchanged: no leak out_boxes = transforms.SetClampingMode(clamping_mode=desired_clamping_mode)(in_boxes) assert in_boxes.clamping_mode == constructor_clamping_mode # input is unchanged: no leak From 9e0c2ddcc483d3a47589a544d2aa0b7ec190686e Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 30 Jun 2025 13:48:34 +0100 Subject: [PATCH 07/11] Make soft the default clamping_mode, and add a test --- test/common_utils.py | 2 +- test/test_tv_tensors.py | 5 +++++ torchvision/transforms/v2/functional/_geometry.py | 14 +++++++------- torchvision/tv_tensors/_bounding_boxes.py | 2 +- 4 files changed, 14 insertions(+), 9 deletions(-) diff --git a/test/common_utils.py b/test/common_utils.py index c4e950997ca..ee3a2d5cbde 100644 --- a/test/common_utils.py +++ b/test/common_utils.py @@ -410,7 +410,7 @@ def make_bounding_boxes( canvas_size=DEFAULT_SIZE, *, format=tv_tensors.BoundingBoxFormat.XYXY, - clamping_mode="hard", # TODOBB + clamping_mode="soft", num_boxes=1, dtype=None, device="cpu", diff --git a/test/test_tv_tensors.py b/test/test_tv_tensors.py index 43efceba5c9..bed419b312c 100644 --- a/test/test_tv_tensors.py +++ b/test/test_tv_tensors.py @@ -406,3 +406,8 @@ def test_return_type_input(): tv_tensors.set_return_type("typo") tv_tensors.set_return_type("tensor") + + +def test_box_clamping_mode_default(): + assert tv_tensors.BoundingBoxes([0, 0, 10, 10], format="XYXY", canvas_size=(100, 100)).clamping_mode == "soft" + assert tv_tensors.BoundingBoxes([0, 0, 10, 10, 0], format="XYWHR", canvas_size=(100, 100)).clamping_mode == "soft" diff --git a/torchvision/transforms/v2/functional/_geometry.py b/torchvision/transforms/v2/functional/_geometry.py index 8fd7a776de9..57646d957aa 100644 --- a/torchvision/transforms/v2/functional/_geometry.py +++ b/torchvision/transforms/v2/functional/_geometry.py @@ -522,7 +522,7 @@ def resize_bounding_boxes( size: Optional[list[int]], max_size: Optional[int] = None, 
format: tv_tensors.BoundingBoxFormat = tv_tensors.BoundingBoxFormat.XYXY, - clamping_mode: CLAMPING_MODE_TYPE = "hard", # TODOBB soft + clamping_mode: CLAMPING_MODE_TYPE = "soft", ) -> tuple[torch.Tensor, tuple[int, int]]: # We set the default format as `tv_tensors.BoundingBoxFormat.XYXY` # to ensure backward compatibility. @@ -1108,7 +1108,7 @@ def _affine_bounding_boxes_with_expand( shear: list[float], center: Optional[list[float]] = None, expand: bool = False, - clamping_mode: CLAMPING_MODE_TYPE = "hard", # TODOBB soft + clamping_mode: CLAMPING_MODE_TYPE = "soft", ) -> tuple[torch.Tensor, tuple[int, int]]: if bounding_boxes.numel() == 0: return bounding_boxes, canvas_size @@ -1211,7 +1211,7 @@ def affine_bounding_boxes( scale: float, shear: list[float], center: Optional[list[float]] = None, - clamping_mode: CLAMPING_MODE_TYPE = "hard", # TODOBB soft + clamping_mode: CLAMPING_MODE_TYPE = "soft", ) -> torch.Tensor: out_box, _ = _affine_bounding_boxes_with_expand( bounding_boxes, @@ -1740,7 +1740,7 @@ def pad_bounding_boxes( canvas_size: tuple[int, int], padding: list[int], padding_mode: str = "constant", - clamping_mode: CLAMPING_MODE_TYPE = "hard", # TODOBB soft + clamping_mode: CLAMPING_MODE_TYPE = "soft", ) -> tuple[torch.Tensor, tuple[int, int]]: if padding_mode not in ["constant"]: # TODO: add support of other padding modes @@ -1858,7 +1858,7 @@ def crop_bounding_boxes( left: int, height: int, width: int, - clamping_mode: CLAMPING_MODE_TYPE = "hard", # TODOBB soft + clamping_mode: CLAMPING_MODE_TYPE = "soft", ) -> tuple[torch.Tensor, tuple[int, int]]: # Crop or implicit pad if left and/or top have negative values: @@ -2098,7 +2098,7 @@ def perspective_bounding_boxes( startpoints: Optional[list[list[int]]], endpoints: Optional[list[list[int]]], coefficients: Optional[list[float]] = None, - clamping_mode: CLAMPING_MODE_TYPE = "hard", # TODOBB soft + clamping_mode: CLAMPING_MODE_TYPE = "soft", ) -> torch.Tensor: if bounding_boxes.numel() == 0: return bounding_boxes @@ -2413,7 +2413,7 @@ def elastic_bounding_boxes( format: tv_tensors.BoundingBoxFormat, canvas_size: tuple[int, int], displacement: torch.Tensor, - clamping_mode: CLAMPING_MODE_TYPE = "hard", # TODOBB soft + clamping_mode: CLAMPING_MODE_TYPE = "soft", ) -> torch.Tensor: expected_shape = (1, canvas_size[0], canvas_size[1], 2) if not isinstance(displacement, torch.Tensor): diff --git a/torchvision/tv_tensors/_bounding_boxes.py b/torchvision/tv_tensors/_bounding_boxes.py index 22a32b7dfa5..72a2825aad1 100644 --- a/torchvision/tv_tensors/_bounding_boxes.py +++ b/torchvision/tv_tensors/_bounding_boxes.py @@ -105,7 +105,7 @@ def __new__( *, format: BoundingBoxFormat | str, canvas_size: tuple[int, int], - clamping_mode: CLAMPING_MODE_TYPE = "hard", # TODOBB change default to soft! 
+ clamping_mode: CLAMPING_MODE_TYPE = "soft", dtype: torch.dtype | None = None, device: torch.device | str | int | None = None, requires_grad: bool | None = None, From 64e104e3ab5d939af750cba3f6957d1db35df095 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 30 Jun 2025 14:37:26 +0100 Subject: [PATCH 08/11] set clamp=False in reference for both rotated and non-rotated --- test/test_transforms_v2.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/test_transforms_v2.py b/test/test_transforms_v2.py index c4209c5e05c..69f5def29c5 100644 --- a/test/test_transforms_v2.py +++ b/test/test_transforms_v2.py @@ -1301,11 +1301,11 @@ def _reference_horizontal_flip_bounding_boxes(self, bounding_boxes: tv_tensors.B ) helper = ( - functools.partial(reference_affine_rotated_bounding_boxes_helper, flip=True, clamp=False) + functools.partial(reference_affine_rotated_bounding_boxes_helper, flip=True) if tv_tensors.is_rotated_bounding_format(bounding_boxes.format) else reference_affine_bounding_boxes_helper ) - return helper(bounding_boxes, affine_matrix=affine_matrix) + return helper(bounding_boxes, affine_matrix=affine_matrix, clamp=False) @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) @pytest.mark.parametrize( @@ -1910,11 +1910,11 @@ def _reference_vertical_flip_bounding_boxes(self, bounding_boxes: tv_tensors.Bou ) helper = ( - functools.partial(reference_affine_rotated_bounding_boxes_helper, flip=True, clamp=False) + functools.partial(reference_affine_rotated_bounding_boxes_helper, flip=True) if tv_tensors.is_rotated_bounding_format(bounding_boxes.format) else reference_affine_bounding_boxes_helper ) - return helper(bounding_boxes, affine_matrix=affine_matrix) + return helper(bounding_boxes, affine_matrix=affine_matrix, clamp=False) @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) @pytest.mark.parametrize("fn", [F.vertical_flip, transform_cls_to_functional(transforms.RandomVerticalFlip, p=1)]) From 064eb9ff7cc7601220fbf272beebb9b4a3a40d05 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 30 Jun 2025 15:27:49 +0100 Subject: [PATCH 09/11] Simplify test and fix some kernels that I forgot --- test/test_transforms_v2.py | 21 ++++++------ .../transforms/v2/functional/_geometry.py | 33 ++++++++++++++++--- 2 files changed, 39 insertions(+), 15 deletions(-) diff --git a/test/test_transforms_v2.py b/test/test_transforms_v2.py index 69f5def29c5..77c39f6a414 100644 --- a/test/test_transforms_v2.py +++ b/test/test_transforms_v2.py @@ -551,6 +551,7 @@ def affine_bounding_boxes(bounding_boxes): ), format=format, canvas_size=canvas_size, + clamping_mode=clamping_mode, ) @@ -639,6 +640,7 @@ def affine_rotated_bounding_boxes(bounding_boxes): ).reshape(bounding_boxes.shape), format=format, canvas_size=canvas_size, + clamping_mode=clamping_mode, ) @@ -4355,7 +4357,6 @@ def test_functional(self, make_input): (F.resized_crop_image, torch.Tensor), (F._geometry._resized_crop_image_pil, PIL.Image.Image), (F.resized_crop_image, tv_tensors.Image), - (F.resized_crop_bounding_boxes, tv_tensors.BoundingBoxes), (F.resized_crop_mask, tv_tensors.Mask), (F.resized_crop_video, tv_tensors.Video), (F.resized_crop_keypoints, tv_tensors.KeyPoints), @@ -4422,30 +4423,30 @@ def _reference_resized_crop_bounding_boxes(self, bounding_boxes, *, top, left, h ], ) + affine_matrix = (resize_affine_matrix @ crop_affine_matrix)[:2, :] + helper = ( reference_affine_rotated_bounding_boxes_helper if tv_tensors.is_rotated_bounding_format(bounding_boxes.format) else 
reference_affine_bounding_boxes_helper
         )
 
-        bounding_boxes = helper(bounding_boxes, affine_matrix=crop_affine_matrix, new_canvas_size=(height, width))
-
-        return helper(
-            bounding_boxes,
-            affine_matrix=resize_affine_matrix,
-            new_canvas_size=size,
-        )
+        return helper(bounding_boxes, affine_matrix=affine_matrix, new_canvas_size=size, clamp=False)
 
     @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat))
     def test_functional_bounding_boxes_correctness(self, format):
-        bounding_boxes = make_bounding_boxes(self.INPUT_SIZE, format=format)
+        # Note that we don't want to clamp because in
+        # _reference_resized_crop_bounding_boxes we are fusing the crop and the
+        # resize operation, where none of the clamping happens - particularly,
+        # the intermediate one.
+        bounding_boxes = make_bounding_boxes(self.INPUT_SIZE, format=format, clamping_mode="none")
 
         actual = F.resized_crop(bounding_boxes, **self.CROP_KWARGS, size=self.OUTPUT_SIZE)
         expected = self._reference_resized_crop_bounding_boxes(
             bounding_boxes, **self.CROP_KWARGS, size=self.OUTPUT_SIZE
         )
 
-        torch.testing.assert_close(actual, expected, atol=1e-5, rtol=1e-5)
+        torch.testing.assert_close(actual, expected)
         assert_equal(F.get_size(actual), F.get_size(expected))
 
     def _reference_resized_crop_keypoints(self, keypoints, *, top, left, height, width, size):
diff --git a/torchvision/transforms/v2/functional/_geometry.py b/torchvision/transforms/v2/functional/_geometry.py
index 57646d957aa..4a3b20b7fb3 100644
--- a/torchvision/transforms/v2/functional/_geometry.py
+++ b/torchvision/transforms/v2/functional/_geometry.py
@@ -2620,11 +2620,18 @@ def center_crop_bounding_boxes(
     format: tv_tensors.BoundingBoxFormat,
     canvas_size: tuple[int, int],
     output_size: list[int],
+    clamping_mode: CLAMPING_MODE_TYPE = "soft",
 ) -> tuple[torch.Tensor, tuple[int, int]]:
     crop_height, crop_width = _center_crop_parse_output_size(output_size)
     crop_top, crop_left = _center_crop_compute_crop_anchor(crop_height, crop_width, *canvas_size)
     return crop_bounding_boxes(
-        bounding_boxes, format, top=crop_top, left=crop_left, height=crop_height, width=crop_width
+        bounding_boxes,
+        format,
+        top=crop_top,
+        left=crop_left,
+        height=crop_height,
+        width=crop_width,
+        clamping_mode=clamping_mode,
     )
 
 
@@ -2633,7 +2640,11 @@ def _center_crop_bounding_boxes_dispatch(
     inpt: tv_tensors.BoundingBoxes, output_size: list[int]
 ) -> tv_tensors.BoundingBoxes:
     output, canvas_size = center_crop_bounding_boxes(
-        inpt.as_subclass(torch.Tensor), format=inpt.format, canvas_size=inpt.canvas_size, output_size=output_size
+        inpt.as_subclass(torch.Tensor),
+        format=inpt.format,
+        canvas_size=inpt.canvas_size,
+        output_size=output_size,
+        clamping_mode=inpt.clamping_mode,
     )
     return tv_tensors.wrap(output, like=inpt, canvas_size=canvas_size)
 
@@ -2780,9 +2791,14 @@ def resized_crop_bounding_boxes(
     height: int,
     width: int,
     size: list[int],
+    clamping_mode: CLAMPING_MODE_TYPE = "soft",
 ) -> tuple[torch.Tensor, tuple[int, int]]:
-    bounding_boxes, canvas_size = crop_bounding_boxes(bounding_boxes, format, top, left, height, width)
-    return resize_bounding_boxes(bounding_boxes, format=format, canvas_size=canvas_size, size=size)
+    bounding_boxes, canvas_size = crop_bounding_boxes(
+        bounding_boxes, format, top, left, height, width, clamping_mode=clamping_mode
+    )
+    return resize_bounding_boxes(
+        bounding_boxes, format=format, canvas_size=canvas_size, size=size, clamping_mode=clamping_mode
+    )
 
 
 @_register_kernel_internal(resized_crop, tv_tensors.BoundingBoxes, tv_tensor_wrapper=False)
@@ 
def _resized_crop_bounding_boxes_dispatch( inpt: tv_tensors.BoundingBoxes, top: int, left: int, height: int, width: int, size: list[int], **kwargs ) -> tv_tensors.BoundingBoxes: output, canvas_size = resized_crop_bounding_boxes( - inpt.as_subclass(torch.Tensor), format=inpt.format, top=top, left=left, height=height, width=width, size=size + inpt.as_subclass(torch.Tensor), + format=inpt.format, + top=top, + left=left, + height=height, + width=width, + size=size, + clamping_mode=inpt.clamping_mode, ) return tv_tensors.wrap(output, like=inpt, canvas_size=canvas_size) From 90a578bc66dc9b306ded73a77923414ebf509b12 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 30 Jun 2025 15:43:19 +0100 Subject: [PATCH 10/11] Add clamping_mode to rotate and set it to none in test --- test/test_transforms_v2.py | 21 +++++-------------- .../transforms/v2/functional/_geometry.py | 3 +++ 2 files changed, 8 insertions(+), 16 deletions(-) diff --git a/test/test_transforms_v2.py b/test/test_transforms_v2.py index 77c39f6a414..416b2e4facb 100644 --- a/test/test_transforms_v2.py +++ b/test/test_transforms_v2.py @@ -2081,7 +2081,6 @@ def test_functional(self, make_input): (F.rotate_image, torch.Tensor), (F._geometry._rotate_image_pil, PIL.Image.Image), (F.rotate_image, tv_tensors.Image), - (F.rotate_bounding_boxes, tv_tensors.BoundingBoxes), (F.rotate_mask, tv_tensors.Mask), (F.rotate_video, tv_tensors.Video), (F.rotate_keypoints, tv_tensors.KeyPoints), @@ -2201,7 +2200,7 @@ def _recenter_bounding_boxes_after_expand(self, bounding_boxes, *, recenter_xy): (bounding_boxes.to(torch.float64) - torch.tensor(translate)).to(bounding_boxes.dtype), like=bounding_boxes ) - def _reference_rotate_bounding_boxes(self, bounding_boxes, *, angle, expand, center, canvas_size=None): + def _reference_rotate_bounding_boxes(self, bounding_boxes, *, angle, expand, center): if center is None: center = [s * 0.5 for s in bounding_boxes.canvas_size[::-1]] cx, cy = center @@ -2227,28 +2226,22 @@ def _reference_rotate_bounding_boxes(self, bounding_boxes, *, angle, expand, cen output = helper( bounding_boxes, affine_matrix=affine_matrix, - new_canvas_size=new_canvas_size if canvas_size is None else canvas_size, + new_canvas_size=new_canvas_size, clamp=False, ) - return F.clamp_bounding_boxes(self._recenter_bounding_boxes_after_expand(output, recenter_xy=recenter_xy)).to( - bounding_boxes - ) + return self._recenter_bounding_boxes_after_expand(output, recenter_xy=recenter_xy).to(bounding_boxes) @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) @pytest.mark.parametrize("angle", _CORRECTNESS_AFFINE_KWARGS["angle"]) @pytest.mark.parametrize("expand", [False, True]) @pytest.mark.parametrize("center", _CORRECTNESS_AFFINE_KWARGS["center"]) def test_functional_bounding_boxes_correctness(self, format, angle, expand, center): - bounding_boxes = make_bounding_boxes(format=format) + bounding_boxes = make_bounding_boxes(format=format, clamping_mode="none") actual = F.rotate(bounding_boxes, angle=angle, expand=expand, center=center) expected = self._reference_rotate_bounding_boxes(bounding_boxes, angle=angle, expand=expand, center=center) torch.testing.assert_close(F.get_size(actual), F.get_size(expected), atol=2 if expand else 0, rtol=0) - - expected = self._reference_rotate_bounding_boxes( - bounding_boxes, angle=angle, expand=expand, center=center, canvas_size=actual.canvas_size - ) torch.testing.assert_close(actual, expected) @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) @@ -2256,7 +2249,7 @@ def 
test_functional_bounding_boxes_correctness(self, format, angle, expand, cent @pytest.mark.parametrize("center", _CORRECTNESS_AFFINE_KWARGS["center"]) @pytest.mark.parametrize("seed", list(range(5))) def test_transform_bounding_boxes_correctness(self, format, expand, center, seed): - bounding_boxes = make_bounding_boxes(format=format) + bounding_boxes = make_bounding_boxes(format=format, clamping_mode="none") transform = transforms.RandomRotation(**self._CORRECTNESS_TRANSFORM_AFFINE_RANGES, expand=expand, center=center) @@ -2268,10 +2261,6 @@ def test_transform_bounding_boxes_correctness(self, format, expand, center, seed expected = self._reference_rotate_bounding_boxes(bounding_boxes, **params, expand=expand, center=center) torch.testing.assert_close(F.get_size(actual), F.get_size(expected), atol=2 if expand else 0, rtol=0) - - expected = self._reference_rotate_bounding_boxes( - bounding_boxes, **params, expand=expand, center=center, canvas_size=actual.canvas_size - ) torch.testing.assert_close(actual, expected) def _recenter_keypoints_after_expand(self, keypoints, *, recenter_xy): diff --git a/torchvision/transforms/v2/functional/_geometry.py b/torchvision/transforms/v2/functional/_geometry.py index 4a3b20b7fb3..f109247dc6b 100644 --- a/torchvision/transforms/v2/functional/_geometry.py +++ b/torchvision/transforms/v2/functional/_geometry.py @@ -1449,6 +1449,7 @@ def rotate_bounding_boxes( angle: float, expand: bool = False, center: Optional[list[float]] = None, + clamping_mode: CLAMPING_MODE_TYPE = "soft", ) -> tuple[torch.Tensor, tuple[int, int]]: return _affine_bounding_boxes_with_expand( bounding_boxes, @@ -1460,6 +1461,7 @@ def rotate_bounding_boxes( shear=[0.0, 0.0], center=center, expand=expand, + clamping_mode=clamping_mode, ) @@ -1474,6 +1476,7 @@ def _rotate_bounding_boxes_dispatch( angle=angle, expand=expand, center=center, + clamping_mode=inpt.clamping_mode, ) return tv_tensors.wrap(output, like=inpt, canvas_size=canvas_size) From 9b91d9448a7c132457173f6441f81d55214e3881 Mon Sep 17 00:00:00 2001 From: Antoine Simoulin Date: Mon, 30 Jun 2025 07:57:06 -0700 Subject: [PATCH 11/11] Adjust hard clamping Test Plan: ```bash pytest test/test_transforms_v2.py -k box -v ``` --- torchvision/transforms/v2/functional/_meta.py | 55 ++++++++++++++----- 1 file changed, 40 insertions(+), 15 deletions(-) diff --git a/torchvision/transforms/v2/functional/_meta.py b/torchvision/transforms/v2/functional/_meta.py index 4cc3c2f3f8e..bca7a6de088 100644 --- a/torchvision/transforms/v2/functional/_meta.py +++ b/torchvision/transforms/v2/functional/_meta.py @@ -466,29 +466,54 @@ def _clamp_y_intercept( then applies various constraints to ensure the clamping conditions are respected. 
""" + # Calculate slopes and y-intercepts for bounding boxes a, b = _get_slope_and_intercept(bounding_boxes) a1, a2, a3, a4 = a.unbind(-1) b1, b2, b3, b4 = b.unbind(-1) - # Clamp y-intercepts (soft clamping) + # Get y-intercepts from original bounding boxes + _, bm = _get_slope_and_intercept(original_bounding_boxes) + b1m, b2m, b3m, b4m = bm.unbind(-1) + + # Soft clamping: Clamp y-intercepts within canvas boundaries b1 = b2.clamp(b1, b3).clamp(0, canvas_size[0]) b4 = b3.clamp(b2, b4).clamp(0, canvas_size[0]) if clamping_mode == "hard": - # Get y-intercepts from original bounding boxes - _, b = _get_slope_and_intercept(original_bounding_boxes) - _, b2, b3, _ = b.unbind(-1) - - # Set b1 and b4 to the average of their clamped values - b1 = b4 = (b1.clamp(0, canvas_size[0]) + b4.clamp(0, canvas_size[0])) / 2 - - # Ensure b2 and b3 defined the box of maximum area after clamping b1 and b4 - b2.clamp_(b1 * a2 / a1, b4).clamp_((a1 - a2) * canvas_size[1] + b1) - b2.clamp_(b3 * a2 / a3, b4).clamp_((a3 - a2) * canvas_size[1] + b3) - b3.clamp_(max=canvas_size[0] * (1 - a3 / a4) + b4 * a3 / a4) - b3.clamp_(max=canvas_size[0] * (1 - a3 / a2) + b2 * a3 / a2) - b3.clamp_(b1, (a2 - a3) * canvas_size[1] + b2) - b3.clamp_(b1, (a4 - a3) * canvas_size[1] + b4) + # Hard clamping: Average b1 and b4, and adjust b2 and b3 for maximum area + b1 = b4 = (b1 + b4) / 2 + + # Calculate candidate values for b2 based on geometric constraints + b2_candidates = torch.stack( + [ + b1 * a2 / a1, # Constraint at y=0 + b3 * a2 / a3, # Constraint at y=0 + (a1 - a2) * canvas_size[1] + b1, # Constraint at x=canvas_width + (a3 - a2) * canvas_size[1] + b3, # Constraint at x=canvas_width + ], + dim=1, + ) + # Take maximum value that doesn't exceed original b2 + b2 = torch.max(b2_candidates, dim=1)[0].clamp(max=b2) + + # Calculate candidate values for b3 based on geometric constraints + b3_candidates = torch.stack( + [ + canvas_size[0] * (1 - a3 / a4) + b4 * a3 / a4, # Constraint at y=canvas_height + canvas_size[0] * (1 - a3 / a2) + b2 * a3 / a2, # Constraint at y=canvas_height + (a2 - a3) * canvas_size[1] + b2, # Constraint at x=canvas_width + (a4 - a3) * canvas_size[1] + b4, # Constraint at x=canvas_width + ], + dim=1, + ) + # Take minimum value that doesn't go below original b3 + b3 = torch.min(b3_candidates, dim=1)[0].clamp(min=b3) + + # Final clamping to ensure y-intercepts are within original box bounds + b1.clamp_(b1m, b3m) + b3.clamp_(b1m, b3m) + b2.clamp_(b2m, b4m) + b4.clamp_(b2m, b4m) return torch.stack([b1, b2, b3, b4], dim=-1)