Nav2 Navigation Stack - jazzy  jazzy
ROS 2 Navigation Stack
lifecycle_manager.cpp
1 // Copyright (c) 2019 Intel Corporation
2 // Copyright (c) 2022 Samsung Research America
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 
16 #include "nav2_lifecycle_manager/lifecycle_manager.hpp"
17 
18 #include <chrono>
19 #include <memory>
20 #include <string>
21 #include <vector>
22 
23 #include "rclcpp/rclcpp.hpp"
24 
25 using namespace std::chrono_literals;
26 using namespace std::placeholders;
27 
28 using lifecycle_msgs::msg::Transition;
29 using lifecycle_msgs::msg::State;
31 
32 namespace nav2_lifecycle_manager
33 {
34 
35 LifecycleManager::LifecycleManager(const rclcpp::NodeOptions & options)
36 : Node("lifecycle_manager", options), diagnostics_updater_(this)
37 {
38  RCLCPP_INFO(get_logger(), "Creating");
39 
40  // The list of names is parameterized, allowing this module to be used with a different set
41  // of nodes
42  declare_parameter("node_names", rclcpp::PARAMETER_STRING_ARRAY);
43  declare_parameter("autostart", rclcpp::ParameterValue(false));
44  declare_parameter("bond_timeout", 4.0);
45  declare_parameter("bond_respawn_max_duration", 10.0);
46  declare_parameter("attempt_respawn_reconnection", true);
47 
49 
50  node_names_ = get_parameter("node_names").as_string_array();
51  get_parameter("autostart", autostart_);
52  double bond_timeout_s;
53  get_parameter("bond_timeout", bond_timeout_s);
54  bond_timeout_ = std::chrono::duration_cast<std::chrono::milliseconds>(
55  std::chrono::duration<double>(bond_timeout_s));
56 
57  double respawn_timeout_s;
58  get_parameter("bond_respawn_max_duration", respawn_timeout_s);
59  bond_respawn_max_duration_ = rclcpp::Duration::from_seconds(respawn_timeout_s);
60 
61  get_parameter("attempt_respawn_reconnection", attempt_respawn_reconnection_);
62 
63  callback_group_ = create_callback_group(rclcpp::CallbackGroupType::MutuallyExclusive, false);
64  manager_srv_ = create_service<ManageLifecycleNodes>(
65  get_name() + std::string("/manage_nodes"),
66  std::bind(&LifecycleManager::managerCallback, this, _1, _2, _3),
67  rclcpp::SystemDefaultsQoS(),
68  callback_group_);
69 
70  is_active_srv_ = create_service<std_srvs::srv::Trigger>(
71  get_name() + std::string("/is_active"),
72  std::bind(&LifecycleManager::isActiveCallback, this, _1, _2, _3),
73  rclcpp::SystemDefaultsQoS(),
74  callback_group_);
75 
76  transition_state_map_[Transition::TRANSITION_CONFIGURE] = State::PRIMARY_STATE_INACTIVE;
77  transition_state_map_[Transition::TRANSITION_CLEANUP] = State::PRIMARY_STATE_UNCONFIGURED;
78  transition_state_map_[Transition::TRANSITION_ACTIVATE] = State::PRIMARY_STATE_ACTIVE;
79  transition_state_map_[Transition::TRANSITION_DEACTIVATE] = State::PRIMARY_STATE_INACTIVE;
80  transition_state_map_[Transition::TRANSITION_UNCONFIGURED_SHUTDOWN] =
81  State::PRIMARY_STATE_FINALIZED;
82 
83  transition_label_map_[Transition::TRANSITION_CONFIGURE] = std::string("Configuring ");
84  transition_label_map_[Transition::TRANSITION_CLEANUP] = std::string("Cleaning up ");
85  transition_label_map_[Transition::TRANSITION_ACTIVATE] = std::string("Activating ");
86  transition_label_map_[Transition::TRANSITION_DEACTIVATE] = std::string("Deactivating ");
87  transition_label_map_[Transition::TRANSITION_UNCONFIGURED_SHUTDOWN] =
88  std::string("Shutting down ");
89 
90  init_timer_ = this->create_wall_timer(
91  0s,
92  [this]() -> void {
93  init_timer_->cancel();
95  if (autostart_) {
96  init_timer_ = this->create_wall_timer(
97  0s,
98  [this]() -> void {
99  init_timer_->cancel();
100  startup();
101  },
102  callback_group_);
103  }
104  auto executor = std::make_shared<rclcpp::executors::SingleThreadedExecutor>();
105  executor->add_callback_group(callback_group_, get_node_base_interface());
106  service_thread_ = std::make_unique<nav2_util::NodeThread>(executor);
107  });
108  diagnostics_updater_.setHardwareID("Nav2");
109  diagnostics_updater_.add("Nav2 Health", this, &LifecycleManager::CreateDiagnostic);
110 }
111 
113 {
114  RCLCPP_INFO(get_logger(), "Destroying %s", get_name());
115  service_thread_.reset();
116 }
117 
118 void
120  const std::shared_ptr<rmw_request_id_t>/*request_header*/,
121  const std::shared_ptr<ManageLifecycleNodes::Request> request,
122  std::shared_ptr<ManageLifecycleNodes::Response> response)
123 {
124  switch (request->command) {
125  case ManageLifecycleNodes::Request::STARTUP:
126  response->success = startup();
127  break;
128  case ManageLifecycleNodes::Request::CONFIGURE:
129  response->success = configure();
130  break;
131  case ManageLifecycleNodes::Request::CLEANUP:
132  response->success = cleanup();
133  break;
134  case ManageLifecycleNodes::Request::RESET:
135  response->success = reset();
136  break;
137  case ManageLifecycleNodes::Request::SHUTDOWN:
138  response->success = shutdown();
139  break;
140  case ManageLifecycleNodes::Request::PAUSE:
141  response->success = pause();
142  break;
143  case ManageLifecycleNodes::Request::RESUME:
144  response->success = resume();
145  break;
146  }
147 }
148 
149 inline bool
151 {
152  return managed_nodes_state_ == NodeState::ACTIVE;
153 }
154 
155 void
157  const std::shared_ptr<rmw_request_id_t>/*request_header*/,
158  const std::shared_ptr<std_srvs::srv::Trigger::Request>/*request*/,
159  std::shared_ptr<std_srvs::srv::Trigger::Response> response)
160 {
161  response->success = isActive();
162 }
163 
164 void
165 LifecycleManager::CreateDiagnostic(diagnostic_updater::DiagnosticStatusWrapper & stat)
166 {
167  unsigned char error_level;
168  std::string message;
169  switch (managed_nodes_state_) {
170  case NodeState::ACTIVE:
171  error_level = diagnostic_msgs::msg::DiagnosticStatus::OK;
172  message = "Managed nodes are active";
173  break;
174  case NodeState::INACTIVE:
175  error_level = diagnostic_msgs::msg::DiagnosticStatus::OK;
176  message = "Managed nodes are inactive";
177  break;
178  case NodeState::UNCONFIGURED:
179  error_level = diagnostic_msgs::msg::DiagnosticStatus::OK;
180  message = "Managed nodes are unconfigured";
181  break;
182  case NodeState::FINALIZED:
183  error_level = diagnostic_msgs::msg::DiagnosticStatus::WARN;
184  message = "Managed nodes have been shut down";
185  break;
186  default: // NodeState::UNKNOWN
187  error_level = diagnostic_msgs::msg::DiagnosticStatus::ERROR;
188  message = "An error has occurred during a node state transition";
189  break;
190  }
191  stat.summary(error_level, message);
192 }
193 
194 void
196 {
197  message("Creating and initializing lifecycle service clients");
198  for (auto & node_name : node_names_) {
199  node_map_[node_name] =
200  std::make_shared<LifecycleServiceClient>(node_name, shared_from_this());
201  }
202 }
203 
204 void
206 {
207  message("Destroying lifecycle service clients");
208  for (auto & kv : node_map_) {
209  kv.second.reset();
210  }
211 }
212 
213 bool
214 LifecycleManager::createBondConnection(const std::string & node_name)
215 {
216  const double timeout_ns =
217  std::chrono::duration_cast<std::chrono::nanoseconds>(bond_timeout_).count();
218  const double timeout_s = timeout_ns / 1e9;
219 
220  if (bond_map_.find(node_name) == bond_map_.end() && bond_timeout_.count() > 0.0) {
221  bond_map_[node_name] =
222  std::make_shared<bond::Bond>("bond", node_name, shared_from_this());
223  bond_map_[node_name]->setHeartbeatTimeout(timeout_s);
224  bond_map_[node_name]->setHeartbeatPeriod(0.10);
225  bond_map_[node_name]->start();
226  if (
227  !bond_map_[node_name]->waitUntilFormed(
228  rclcpp::Duration(rclcpp::Duration::from_nanoseconds(timeout_ns / 2))))
229  {
230  RCLCPP_ERROR(
231  get_logger(),
232  "Server %s was unable to be reached after %0.2fs by bond. "
233  "This server may be misconfigured.",
234  node_name.c_str(), timeout_s);
235  return false;
236  }
237  RCLCPP_INFO(get_logger(), "Server %s connected with bond.", node_name.c_str());
238  }
239 
240  return true;
241 }
242 
243 bool
244 LifecycleManager::changeStateForNode(const std::string & node_name, std::uint8_t transition)
245 {
246  message(transition_label_map_[transition] + node_name);
247 
248  if (!node_map_[node_name]->change_state(transition) ||
249  !(node_map_[node_name]->get_state() == transition_state_map_[transition]))
250  {
251  RCLCPP_ERROR(get_logger(), "Failed to change state for node: %s", node_name.c_str());
252  return false;
253  }
254 
255  if (transition == Transition::TRANSITION_ACTIVATE) {
256  return createBondConnection(node_name);
257  } else if (transition == Transition::TRANSITION_DEACTIVATE) {
258  bond_map_.erase(node_name);
259  }
260 
261  return true;
262 }
263 
264 bool
265 LifecycleManager::changeStateForAllNodes(std::uint8_t transition, bool hard_change)
266 {
267  // Hard change will continue even if a node fails
268  if (transition == Transition::TRANSITION_CONFIGURE ||
269  transition == Transition::TRANSITION_ACTIVATE)
270  {
271  for (auto & node_name : node_names_) {
272  try {
273  if (!changeStateForNode(node_name, transition) && !hard_change) {
274  return false;
275  }
276  } catch (const std::runtime_error & e) {
277  RCLCPP_ERROR(
278  get_logger(),
279  "Failed to change state for node: %s. Exception: %s.", node_name.c_str(), e.what());
280  return false;
281  }
282  }
283  } else {
284  std::vector<std::string>::reverse_iterator rit;
285  for (rit = node_names_.rbegin(); rit != node_names_.rend(); ++rit) {
286  try {
287  if (!changeStateForNode(*rit, transition) && !hard_change) {
288  return false;
289  }
290  } catch (const std::runtime_error & e) {
291  RCLCPP_ERROR(
292  get_logger(),
293  "Failed to change state for node: %s. Exception: %s.", (*rit).c_str(), e.what());
294  return false;
295  }
296  }
297  }
298  return true;
299 }
300 
301 void
303 {
304  message("Deactivate, cleanup, and shutdown nodes");
305  managed_nodes_state_ = NodeState::FINALIZED;
306  changeStateForAllNodes(Transition::TRANSITION_DEACTIVATE);
307  changeStateForAllNodes(Transition::TRANSITION_CLEANUP);
308  changeStateForAllNodes(Transition::TRANSITION_UNCONFIGURED_SHUTDOWN);
309 }
310 
311 bool
313 {
314  message("Starting managed nodes bringup...");
315  if (!changeStateForAllNodes(Transition::TRANSITION_CONFIGURE) ||
316  !changeStateForAllNodes(Transition::TRANSITION_ACTIVATE))
317  {
318  RCLCPP_ERROR(get_logger(), "Failed to bring up all requested nodes. Aborting bringup.");
319  managed_nodes_state_ = NodeState::UNKNOWN;
320  return false;
321  }
322  message("Managed nodes are active");
323  managed_nodes_state_ = NodeState::ACTIVE;
324  createBondTimer();
325  return true;
326 }
327 
328 bool
330 {
331  message("Configuring managed nodes...");
332  if (!changeStateForAllNodes(Transition::TRANSITION_CONFIGURE)) {
333  RCLCPP_ERROR(get_logger(), "Failed to configure all requested nodes. Aborting bringup.");
334  managed_nodes_state_ = NodeState::UNKNOWN;
335  return false;
336  }
337  message("Managed nodes are now configured");
338  managed_nodes_state_ = NodeState::INACTIVE;
339  return true;
340 }
341 
342 bool
344 {
345  message("Cleaning up managed nodes...");
346  if (!changeStateForAllNodes(Transition::TRANSITION_CLEANUP)) {
347  RCLCPP_ERROR(get_logger(), "Failed to cleanup all requested nodes. Aborting cleanup.");
348  managed_nodes_state_ = NodeState::UNKNOWN;
349  return false;
350  }
351  message("Managed nodes have been cleaned up");
352  managed_nodes_state_ = NodeState::UNCONFIGURED;
353  return true;
354 }
355 
356 bool
358 {
360 
361  message("Shutting down managed nodes...");
364  message("Managed nodes have been shut down");
365  return true;
366 }
367 
368 bool
369 LifecycleManager::reset(bool hard_reset)
370 {
372 
373  message("Resetting managed nodes...");
374  // Should transition in reverse order
375  if (!changeStateForAllNodes(Transition::TRANSITION_DEACTIVATE, hard_reset) ||
376  !changeStateForAllNodes(Transition::TRANSITION_CLEANUP, hard_reset))
377  {
378  if (!hard_reset) {
379  RCLCPP_ERROR(get_logger(), "Failed to reset nodes: aborting reset");
380  managed_nodes_state_ = NodeState::UNKNOWN;
381  return false;
382  }
383  }
384 
385  message("Managed nodes have been reset");
386  managed_nodes_state_ = NodeState::UNCONFIGURED;
387  return true;
388 }
389 
390 bool
392 {
394 
395  message("Pausing managed nodes...");
396  if (!changeStateForAllNodes(Transition::TRANSITION_DEACTIVATE)) {
397  RCLCPP_ERROR(get_logger(), "Failed to pause nodes: aborting pause");
398  managed_nodes_state_ = NodeState::UNKNOWN;
399  return false;
400  }
401 
402  message("Managed nodes have been paused");
403  managed_nodes_state_ = NodeState::INACTIVE;
404  return true;
405 }
406 
407 bool
409 {
410  message("Resuming managed nodes...");
411  if (!changeStateForAllNodes(Transition::TRANSITION_ACTIVATE)) {
412  RCLCPP_ERROR(get_logger(), "Failed to resume nodes: aborting resume");
413  managed_nodes_state_ = NodeState::UNKNOWN;
414  return false;
415  }
416 
417  message("Managed nodes are active");
418  managed_nodes_state_ = NodeState::ACTIVE;
419  createBondTimer();
420  return true;
421 }
422 
423 void
425 {
426  if (bond_timeout_.count() <= 0) {
427  return;
428  }
429 
430  message("Creating bond timer...");
431  bond_timer_ = this->create_wall_timer(
432  200ms,
433  std::bind(&LifecycleManager::checkBondConnections, this),
434  callback_group_);
435 }
436 
437 void
439 {
440  if (bond_timer_) {
441  message("Terminating bond timer...");
442  bond_timer_->cancel();
443  bond_timer_.reset();
444  }
445 }
446 
447 void
449 {
450  RCLCPP_INFO(
451  get_logger(), "Running Nav2 LifecycleManager rcl preshutdown (%s)",
452  this->get_name());
453 
455 
456  /*
457  * Dropping the bond map is what we really need here, but we drop the others
458  * to prevent the bond map being used. Likewise, squash the service thread.
459  */
460  service_thread_.reset();
461  node_names_.clear();
462  node_map_.clear();
463  bond_map_.clear();
464 }
465 
466 void
468 {
469  rclcpp::Context::SharedPtr context = get_node_base_interface()->get_context();
470 
471  context->add_pre_shutdown_callback(
472  std::bind(&LifecycleManager::onRclPreshutdown, this)
473  );
474 }
475 
476 void
478 {
479  if (!isActive() || !rclcpp::ok() || bond_map_.empty()) {
480  return;
481  }
482 
483  for (auto & node_name : node_names_) {
484  if (!rclcpp::ok()) {
485  return;
486  }
487 
488  if (bond_map_[node_name]->isBroken()) {
489  message(
490  std::string(
491  "Have not received a heartbeat from " + node_name + "."));
492 
493  // if one is down, bring them all down
494  RCLCPP_ERROR(
495  get_logger(),
496  "CRITICAL FAILURE: SERVER %s IS DOWN after not receiving a heartbeat for %i ms."
497  " Shutting down related nodes.",
498  node_name.c_str(), static_cast<int>(bond_timeout_.count()));
499  reset(true); // hard reset to transition all still active down
500  // if a server crashed, it won't get cleared due to failed transition, clear manually
501  bond_map_.clear();
502 
503  // Initialize the bond respawn timer to check if server comes back online
504  // after a failure, within a maximum timeout period.
505  if (attempt_respawn_reconnection_) {
506  bond_respawn_timer_ = this->create_wall_timer(
507  1s,
509  callback_group_);
510  }
511  return;
512  }
513  }
514 }
515 
516 void
518 {
519  // First attempt in respawn, start maximum duration to respawn
520  if (bond_respawn_start_time_.nanoseconds() == 0) {
521  bond_respawn_start_time_ = now();
522  }
523 
524  // Note: isActive() is inverted since this should be in a failure
525  // condition. If another outside user actives the system again, this should not process.
526  if (isActive() || !rclcpp::ok() || node_names_.empty()) {
527  bond_respawn_start_time_ = rclcpp::Time(0);
528  bond_respawn_timer_.reset();
529  return;
530  }
531 
532  // Check number of live connections after a bond failure
533  int live_servers = 0;
534  const int max_live_servers = node_names_.size();
535  for (auto & node_name : node_names_) {
536  if (!rclcpp::ok()) {
537  return;
538  }
539 
540  try {
541  node_map_[node_name]->get_state(); // Only won't throw if the server exists
542  live_servers++;
543  } catch (...) {
544  break;
545  }
546  }
547 
548  // If all are alive, kill timer and retransition system to active
549  // Else, check if maximum timeout has occurred
550  if (live_servers == max_live_servers) {
551  message("Successfully re-established connections from server respawns, starting back up.");
552  bond_respawn_start_time_ = rclcpp::Time(0);
553  bond_respawn_timer_.reset();
554  startup();
555  } else if (now() - bond_respawn_start_time_ >= bond_respawn_max_duration_) {
556  message("Failed to re-establish connection from a server crash after maximum timeout.");
557  bond_respawn_start_time_ = rclcpp::Time(0);
558  bond_respawn_timer_.reset();
559  }
560 }
561 
562 #define ANSI_COLOR_RESET "\x1b[0m"
563 #define ANSI_COLOR_BLUE "\x1b[34m"
564 
565 void
566 LifecycleManager::message(const std::string & msg)
567 {
568  RCLCPP_INFO(get_logger(), ANSI_COLOR_BLUE "\33[1m%s\33[0m" ANSI_COLOR_RESET, msg.c_str());
569 }
570 
571 } // namespace nav2_lifecycle_manager
572 
573 #include "rclcpp_components/register_node_macro.hpp"
574 RCLCPP_COMPONENTS_REGISTER_NODE(nav2_lifecycle_manager::LifecycleManager)
Implements service interface to transition the lifecycle nodes of Nav2 stack. It receives transition ...
void createBondTimer()
Support function for creating bond timer.
void destroyLifecycleServiceClients()
Destroy all the lifecycle service clients.
void destroyBondTimer()
Support function for killing bond connections.
bool cleanup()
Cleanups the managed nodes.
bool pause()
Pause all the managed nodes.
bool shutdown()
Deactivate, clean up and shut down all the managed nodes.
bool resume()
Resume all the managed nodes.
void onRclPreshutdown()
Perform preshutdown activities before our Context is shutdown. Note that this is related to our Conte...
bool isActive()
function to check if managed nodes are active
bool createBondConnection(const std::string &node_name)
Support function for creating bond connections.
void CreateDiagnostic(diagnostic_updater::DiagnosticStatusWrapper &stat)
function to check the state of Nav2 nodes
void message(const std::string &msg)
Helper function to highlight the output on the console.
~LifecycleManager()
A destructor for nav2_lifecycle_manager::LifecycleManager.
bool reset(bool hard_reset=false)
Reset all the managed nodes.
void shutdownAllNodes()
Support function for shutdown.
void managerCallback(const std::shared_ptr< rmw_request_id_t > request_header, const std::shared_ptr< ManageLifecycleNodes::Request > request, std::shared_ptr< ManageLifecycleNodes::Response > response)
Lifecycle node manager callback function.
bool changeStateForNode(const std::string &node_name, std::uint8_t transition)
For a node, transition to the new target state.
bool configure()
Configures the managed nodes.
bool changeStateForAllNodes(std::uint8_t transition, bool hard_change=false)
For each node in the map, transition to the new target state.
void createLifecycleServiceClients()
Support function for creating service clients.
void isActiveCallback(const std::shared_ptr< rmw_request_id_t > request_header, const std::shared_ptr< std_srvs::srv::Trigger::Request > request, std::shared_ptr< std_srvs::srv::Trigger::Response > response)
Trigger callback function checks if the managed nodes are in active state.
Helper functions to interact with a lifecycle node.