From eb0d3f48d3d3597356e359adcd6d0d07bdc78537 Mon Sep 17 00:00:00 2001 From: Mike Wake Date: Fri, 10 Jan 2025 15:15:57 +1100 Subject: [PATCH 1/8] intraprocess bond::Bond::BondStatusCB use after free The Bond mechanism includes creation of a subscription using a reference to a member function(bondStatusCB) of the Bond class. This member function operates on member variables. The lifecycle_node was calling bond_.reset() which releases the memory as far as the lifecycle_node is concerned but this is not immediately released from the rclcpp internal mechanisms (especially intraprocess). As a result the bondStatusCB function can called after it has been freed. This use after free shows up reliably with asan when running the test_bond test. This change allows the test_bond to suceed by calling bond_->breakBond() (rather than bond_.reset()) to break the bond rather than expecting it to be done cleanly by the ~Bond() destructor. Is it enough is TBC. Other possibilities might be to get the Bond to inherit from std::enable_shared_from_this(), as Ros2 Nodes do, so that the pointer to the Bond member function bondStatusCB function remains valid until the subscription is released during destruction. Signed-off-by: Mike Wake --- nav2_util/src/lifecycle_node.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nav2_util/src/lifecycle_node.cpp b/nav2_util/src/lifecycle_node.cpp index 49fdf51ec70..57a520e6f35 100644 --- a/nav2_util/src/lifecycle_node.cpp +++ b/nav2_util/src/lifecycle_node.cpp @@ -149,7 +149,7 @@ void LifecycleNode::destroyBond() RCLCPP_INFO(get_logger(), "Destroying bond (%s) to lifecycle manager.", this->get_name()); if (bond_) { - bond_.reset(); + bond_->breakBond(); } } } From 7b2cf7cea4a419701e540f37f42fc95237da33b1 Mon Sep 17 00:00:00 2001 From: Mike Wake Date: Mon, 13 Jan 2025 16:55:48 +1100 Subject: [PATCH 2/8] Store/Create Bond as a shared_ptr to allow shared_from_this() In order for Bond to inherit from std::enable_shared_from_this to allow its lifetime to be managed to avoid a use after free where its subcriber callback is called before it has been cleaned up from internal rclcpp mechanism. This is being explored either by modification to rclcpp or using a lambda that protects against the the member function going out of scope by testing a weak_ptr to shared_from_this for expiry before the member function bondStatusCB is called. Signed-off-by: Mike Wake --- nav2_util/include/nav2_util/lifecycle_node.hpp | 2 +- nav2_util/src/lifecycle_node.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/nav2_util/include/nav2_util/lifecycle_node.hpp b/nav2_util/include/nav2_util/lifecycle_node.hpp index a838fae2dbd..0ed25f8f1b7 100644 --- a/nav2_util/include/nav2_util/lifecycle_node.hpp +++ b/nav2_util/include/nav2_util/lifecycle_node.hpp @@ -210,7 +210,7 @@ class LifecycleNode : public rclcpp_lifecycle::LifecycleNode void runCleanups(); // Connection to tell that server is still up - std::unique_ptr bond_{nullptr}; + std::shared_ptr bond_{nullptr}; double bond_heartbeat_period; rclcpp::TimerBase::SharedPtr autostart_timer_; }; diff --git a/nav2_util/src/lifecycle_node.cpp b/nav2_util/src/lifecycle_node.cpp index 57a520e6f35..eb0b993476d 100644 --- a/nav2_util/src/lifecycle_node.cpp +++ b/nav2_util/src/lifecycle_node.cpp @@ -73,7 +73,7 @@ void LifecycleNode::createBond() if (bond_heartbeat_period > 0.0) { RCLCPP_INFO(get_logger(), "Creating bond (%s) to lifecycle manager.", this->get_name()); - bond_ = std::make_unique( + bond_ = std::make_shared( std::string("bond"), this->get_name(), shared_from_this()); From 39426e8a43edc9fc0628b7fefb13d8e595e9fe46 Mon Sep 17 00:00:00 2001 From: Mike Wake Date: Mon, 13 Jan 2025 16:58:46 +1100 Subject: [PATCH 3/8] Revert experiment bond_->breakBond() back to bond_.reset() Signed-off-by: Mike Wake --- nav2_lifecycle_manager/test/test_bond.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nav2_lifecycle_manager/test/test_bond.cpp b/nav2_lifecycle_manager/test/test_bond.cpp index 38c772e6b1b..4cb70bad2a4 100644 --- a/nav2_lifecycle_manager/test/test_bond.cpp +++ b/nav2_lifecycle_manager/test/test_bond.cpp @@ -90,7 +90,7 @@ class TestLifecycleNode : public nav2_util::LifecycleNode void breakBond() { - bond_->breakBond(); + bond_.reset(); } std::string getState() From b57c9a300406dd1fad8e021b1032d692785ba639 Mon Sep 17 00:00:00 2001 From: Mike Wake Date: Mon, 13 Jan 2025 17:03:27 +1100 Subject: [PATCH 4/8] LifecycleBondTest Use nav2_util::generate_internal_node This wraps the creation of the node currently (ipc experiment branch) sets the node option .use_intra_process_comms(true) Signed-off-by: Mike Wake --- nav2_lifecycle_manager/test/test_bond.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nav2_lifecycle_manager/test/test_bond.cpp b/nav2_lifecycle_manager/test/test_bond.cpp index 4cb70bad2a4..fb0fa15f9b3 100644 --- a/nav2_lifecycle_manager/test/test_bond.cpp +++ b/nav2_lifecycle_manager/test/test_bond.cpp @@ -170,7 +170,7 @@ TEST(LifecycleBondTest, POSITIVE) TEST(LifecycleBondTest, NEGATIVE) { - auto node = std::make_shared("lifecycle_manager_test_service_client"); + auto node = nav2_util::generate_internal_node("lifecycle_manager_test_service_client"); nav2_lifecycle_manager::LifecycleManagerClient client("lifecycle_manager_test", node); // create node, now without bond setup to connect to. Should fail because no bond From e08f89b9137b0b0e96aeff18555e36f4e0839479 Mon Sep 17 00:00:00 2001 From: Mike Wake Date: Tue, 28 Jan 2025 14:46:44 +1100 Subject: [PATCH 5/8] underlay.repos temporarily use ewak/bondcpp feature/mw/memory_safe_subscription_callback Signed-off-by: Mike Wake --- tools/underlay.repos | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tools/underlay.repos b/tools/underlay.repos index 005f11ec3f0..97de32c5e7d 100644 --- a/tools/underlay.repos +++ b/tools/underlay.repos @@ -11,18 +11,18 @@ repositories: # type: git # url: https://github.com/ros-perception/vision_opencv.git # version: rolling - # ros/bond_core: - # type: git - # url: https://github.com/ros/bond_core.git - # version: ros2 + ros/bond_core: + type: git + url: https://github.com/ewak/bond_core.git + version: feature/mw/memory_safe_subscription_callback # ros/diagnostics: # type: git # url: https://github.com/ros/diagnostics.git - # version: ros2-devel + # version: ros2-devel # ros/geographic_info: # type: git # url: https://github.com/ros-geographic-info/geographic_info.git - # version: ros2 + # version: ros2 # ompl/ompl: # type: git # url: https://github.com/ompl/ompl.git From ed40eaedda5c77361a072f2b40ce7e45363633fd Mon Sep 17 00:00:00 2001 From: Mike Wake Date: Tue, 28 Jan 2025 14:58:08 +1100 Subject: [PATCH 6/8] lint Signed-off-by: Mike Wake --- nav2_costmap_2d/src/costmap_2d_cloud.cpp | 6 ++---- .../include/nav2_graceful_controller/utils.hpp | 2 ++ 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/nav2_costmap_2d/src/costmap_2d_cloud.cpp b/nav2_costmap_2d/src/costmap_2d_cloud.cpp index 5aa78255e72..1119c508273 100644 --- a/nav2_costmap_2d/src/costmap_2d_cloud.cpp +++ b/nav2_costmap_2d/src/costmap_2d_cloud.cpp @@ -193,15 +193,13 @@ void voxelCallback(const nav2_msgs::msg::VoxelGrid::ConstSharedPtr grid) pcl_header.frame_id = frame_id; pcl_header.stamp = stamp; - if (pub_marked->get_subscription_count() > 0) - { + if (pub_marked->get_subscription_count() > 0) { auto cloud = std::make_unique(); pointCloud2Helper(cloud, num_marked, pcl_header, g_marked); pub_marked->publish(std::move(cloud)); } - if (pub_unknown->get_subscription_count() > 0) - { + if (pub_unknown->get_subscription_count() > 0) { auto cloud = std::make_unique(); pointCloud2Helper(cloud, num_unknown, pcl_header, g_unknown); pub_unknown->publish(std::move(cloud)); diff --git a/nav2_graceful_controller/include/nav2_graceful_controller/utils.hpp b/nav2_graceful_controller/include/nav2_graceful_controller/utils.hpp index 062edbf4242..2bc65fae444 100644 --- a/nav2_graceful_controller/include/nav2_graceful_controller/utils.hpp +++ b/nav2_graceful_controller/include/nav2_graceful_controller/utils.hpp @@ -15,6 +15,8 @@ #ifndef NAV2_GRACEFUL_CONTROLLER__UTILS_HPP_ #define NAV2_GRACEFUL_CONTROLLER__UTILS_HPP_ +#include + #include "geometry_msgs/msg/point_stamped.hpp" #include "geometry_msgs/msg/pose_stamped.hpp" #include "visualization_msgs/msg/marker.hpp" From 0e7e3f68d799b910375ea5f15923ea8d8c0621ad Mon Sep 17 00:00:00 2001 From: Mike Wake Date: Tue, 28 Jan 2025 23:28:57 +1100 Subject: [PATCH 7/8] opennav_docking - attempt remove test_docking_server flakeyness Wait some time for tf to propogate. Allows docking_server time to activate and not reject initial action request. Will try to sleep before assert to allow logs of docking server to be produced before being killed by end of test shutdown due to assert. Signed-off-by: Mike Wake --- .../test/test_docking_server.py | 56 +++++++++++++++++-- 1 file changed, 50 insertions(+), 6 deletions(-) diff --git a/nav2_docking/opennav_docking/test/test_docking_server.py b/nav2_docking/opennav_docking/test/test_docking_server.py index 43e3e2df329..f4fd05cdf38 100644 --- a/nav2_docking/opennav_docking/test/test_docking_server.py +++ b/nav2_docking/opennav_docking/test/test_docking_server.py @@ -24,9 +24,12 @@ from nav2_msgs.action import DockRobot, NavigateToPose, UndockRobot import pytest import rclpy -from rclpy.action import ActionClient, ActionServer +import rclpy.time +import rclpy.duration +from rclpy.action.client import ActionClient +from rclpy.action.server import ActionServer from sensor_msgs.msg import BatteryState -from tf2_ros import TransformBroadcaster +from tf2_ros import TransformBroadcaster, Buffer, TransformListener # This test can be run standalone with: @@ -142,9 +145,40 @@ def nav_execute_callback(self, goal_handle): result.error_code = 0 return result + def check_transform(self, timeout_sec, source_frame='odom', target_frame='base_link'): + """Try to look up the transform we're publishing""" + if self.transform_available: + return True + + try: + # Wait for transform to become available with timeout + when = rclpy.time.Time() + transform = self.tf_buffer.lookup_transform( + target_frame, + source_frame, + when, + timeout=rclpy.duration.Duration(seconds=timeout_sec) + ) + + self.node.get_logger().info('Successfully found transform:') + self.node.get_logger().info(f'Translation: [{transform.transform.translation.x}, ' + f'{transform.transform.translation.y}, ' + f'{transform.transform.translation.z}]') + self.transform_available = True + except Exception as e: + self.node.get_logger().error(f'Error looking up transform: {str(e)}') + self.transform_available = False + + return self.transform_available + def test_docking_server(self): # Publish TF for odometry self.tf_broadcaster = TransformBroadcaster(self.node) + # Create tf buffer and listener + self.tf_buffer = Buffer() + self.tf_listener = TransformListener(self.tf_buffer, self.node) + self.transform_available = False + # Create a timer to run "control loop" at 20hz self.timer = self.node.create_timer(0.05, self.timer_callback) @@ -175,12 +209,21 @@ def test_docking_server(self): 'navigate_to_pose', self.nav_execute_callback) - # Spin once so that TF is published - rclpy.spin_once(self.node, timeout_sec=0.1) + # Publish transform + self.publish() + + # Run for 2 seconds to allow tf to propogate + for i in range(20): + rclpy.spin_once(self.node, timeout_sec=0.1) + time.sleep(0.1) + + assert self.check_transform(timeout_sec=1.0), 'transform not available' # Test docking action self.action_result = [] - self.dock_action_client.wait_for_server(timeout_sec=5.0) + assert self.dock_action_client.wait_for_server(timeout_sec=5.0), \ + 'dock_robot service not available' + goal = DockRobot.Goal() goal.use_dock_id = True goal.dock_id = 'test_dock' @@ -188,7 +231,8 @@ def test_docking_server(self): goal, feedback_callback=self.action_feedback_callback) rclpy.spin_until_future_complete(self.node, future) self.goal_handle = future.result() - assert self.goal_handle.accepted + assert self.goal_handle is not None, 'goal_handle should not be None' + assert self.goal_handle.accepted, 'goal not accepted' result_future_original = self.goal_handle.get_result_async() # Run for 2 seconds From 10c8272ceaa2cc0870ce8484e90461fc35c78057 Mon Sep 17 00:00:00 2001 From: Mike Wake Date: Wed, 29 Jan 2025 07:33:35 +1100 Subject: [PATCH 8/8] WIP test_docking_server flakiness extra debug needs pruning Committing full debug WIP to simplify flake8 fixup will back out unnecessary when it works. Signed-off-by: Mike Wake --- .../test/test_docking_server.py | 69 +++++++++++++++---- 1 file changed, 54 insertions(+), 15 deletions(-) diff --git a/nav2_docking/opennav_docking/test/test_docking_server.py b/nav2_docking/opennav_docking/test/test_docking_server.py index f4fd05cdf38..6adddd4405a 100644 --- a/nav2_docking/opennav_docking/test/test_docking_server.py +++ b/nav2_docking/opennav_docking/test/test_docking_server.py @@ -17,32 +17,47 @@ import unittest from action_msgs.msg import GoalStatus +from ament_index_python.packages import get_package_prefix from geometry_msgs.msg import TransformStamped, Twist, TwistStamped from launch import LaunchDescription +from launch.actions import ( + LogInfo, + SetEnvironmentVariable, +) from launch_ros.actions import Node import launch_testing +import launch_testing.actions +import launch_testing.asserts +import launch_testing.markers +import launch_testing.util from nav2_msgs.action import DockRobot, NavigateToPose, UndockRobot import pytest import rclpy -import rclpy.time -import rclpy.duration from rclpy.action.client import ActionClient from rclpy.action.server import ActionServer +import rclpy.duration +import rclpy.time from sensor_msgs.msg import BatteryState -from tf2_ros import TransformBroadcaster, Buffer, TransformListener +from tf2_ros import Buffer, TransformBroadcaster, TransformListener # This test can be run standalone with: # python3 -u -m pytest test_docking_server.py -s @pytest.mark.rostest +@launch_testing.markers.keep_alive def generate_test_description(): - - return LaunchDescription([ - Node( + tmux_gdb_prefix = ( + 'tmux split-window ' + + get_package_prefix('nav2_bringup') + + '/lib/nav2_bringup/gdb_tmux_splitwindow.sh' + ) + logme = LogInfo(msg=f'tmux_gdb_prefix={tmux_gdb_prefix}') + docking_server = Node( package='opennav_docking', executable='opennav_docking', name='docking_server', + # arguments=['--ros-args', '--log-level', 'debug'], parameters=[{'wait_charge_timeout': 1.0, 'controller': { 'use_collision_detection': False, @@ -59,17 +74,42 @@ def generate_test_description(): 'pose': [10.0, 0.0, 0.0] }}], output='screen', - ), - Node( + ) + lifecycle_manager_navigation = Node( package='nav2_lifecycle_manager', executable='lifecycle_manager', name='lifecycle_manager_navigation', output='screen', + # prefix=tmux_gdb_prefix, parameters=[{'autostart': True}, {'node_names': ['docking_server']}] - ), + ) + + return LaunchDescription([ + SetEnvironmentVariable('RCUTILS_LOGGING_BUFFERED_STREAM', '1'), + SetEnvironmentVariable('RCUTILS_LOGGING_USE_STDOUT', '1'), + logme, + docking_server, + lifecycle_manager_navigation, + # In tests where all of the procs under tests terminate themselves, it's necessary + # to add a dummy process not under test to keep the launch alive. launch_test + # provides a simple launch action that does this: + launch_testing.util.KeepAliveProc(), launch_testing.actions.ReadyToTest(), - ]) + ]), locals() + + +@launch_testing.post_shutdown_test() +class TestPostShutdown(unittest.TestCase): + + def test_action_graceful_shutdown(self, + proc_info, + docking_server, + lifecycle_manager_navigation): + """Test graceful shutdown.""" + launch_testing.asserts.assertExitCodes(proc_info, process=docking_server) + launch_testing.asserts.assertExitCodes(proc_info, + process=lifecycle_manager_navigation) class TestDockingServer(unittest.TestCase): @@ -146,7 +186,7 @@ def nav_execute_callback(self, goal_handle): return result def check_transform(self, timeout_sec, source_frame='odom', target_frame='base_link'): - """Try to look up the transform we're publishing""" + """Try to look up the transform we're publishing.""" if self.transform_available: return True @@ -162,8 +202,8 @@ def check_transform(self, timeout_sec, source_frame='odom', target_frame='base_l self.node.get_logger().info('Successfully found transform:') self.node.get_logger().info(f'Translation: [{transform.transform.translation.x}, ' - f'{transform.transform.translation.y}, ' - f'{transform.transform.translation.z}]') + f'{transform.transform.translation.y}, ' + f'{transform.transform.translation.z}]') self.transform_available = True except Exception as e: self.node.get_logger().error(f'Error looking up transform: {str(e)}') @@ -179,7 +219,6 @@ def test_docking_server(self): self.tf_listener = TransformListener(self.tf_buffer, self.node) self.transform_available = False - # Create a timer to run "control loop" at 20hz self.timer = self.node.create_timer(0.05, self.timer_callback) @@ -222,7 +261,7 @@ def test_docking_server(self): # Test docking action self.action_result = [] assert self.dock_action_client.wait_for_server(timeout_sec=5.0), \ - 'dock_robot service not available' + 'dock_robot service not available' goal = DockRobot.Goal() goal.use_dock_id = True