Nav2 Navigation Stack - rolling  main
ROS 2 Navigation Stack
lifecycle_manager.cpp
1 // Copyright (c) 2019 Intel Corporation
2 // Copyright (c) 2022 Samsung Research America
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 
16 #include "nav2_lifecycle_manager/lifecycle_manager.hpp"
17 
18 #include <chrono>
19 #include <memory>
20 #include <string>
21 #include <vector>
22 
23 #include "rclcpp/rclcpp.hpp"
24 
25 using namespace std::chrono_literals;
26 using namespace std::placeholders;
27 
28 using lifecycle_msgs::msg::Transition;
29 using lifecycle_msgs::msg::State;
31 
32 namespace nav2_lifecycle_manager
33 {
34 
35 LifecycleManager::LifecycleManager(const rclcpp::NodeOptions & options)
36 : Node("lifecycle_manager", options), diagnostics_updater_(this)
37 {
38  RCLCPP_INFO(get_logger(), "Creating");
39 
40  // The list of names is parameterized, allowing this module to be used with a different set
41  // of nodes
42  declare_parameter("node_names", rclcpp::PARAMETER_STRING_ARRAY);
43  declare_parameter("autostart", rclcpp::ParameterValue(false));
44  declare_parameter("bond_timeout", 4.0);
45  declare_parameter("service_timeout", 5.0);
46  declare_parameter("bond_respawn_max_duration", 10.0);
47  declare_parameter("attempt_respawn_reconnection", true);
48 
50 
51  node_names_ = get_parameter("node_names").as_string_array();
52  get_parameter("autostart", autostart_);
53  double bond_timeout_s;
54  get_parameter("bond_timeout", bond_timeout_s);
55  bond_timeout_ = std::chrono::duration_cast<std::chrono::milliseconds>(
56  std::chrono::duration<double>(bond_timeout_s));
57 
58  double service_timeout_s;
59  get_parameter("service_timeout", service_timeout_s);
60  service_timeout_ = std::chrono::duration_cast<std::chrono::milliseconds>(
61  std::chrono::duration<double>(service_timeout_s));
62 
63  double respawn_timeout_s;
64  get_parameter("bond_respawn_max_duration", respawn_timeout_s);
65  bond_respawn_max_duration_ = rclcpp::Duration::from_seconds(respawn_timeout_s);
66 
67  get_parameter("attempt_respawn_reconnection", attempt_respawn_reconnection_);
68 
69  callback_group_ = create_callback_group(rclcpp::CallbackGroupType::MutuallyExclusive, false);
70 
71  transition_state_map_[Transition::TRANSITION_CONFIGURE] = State::PRIMARY_STATE_INACTIVE;
72  transition_state_map_[Transition::TRANSITION_CLEANUP] = State::PRIMARY_STATE_UNCONFIGURED;
73  transition_state_map_[Transition::TRANSITION_ACTIVATE] = State::PRIMARY_STATE_ACTIVE;
74  transition_state_map_[Transition::TRANSITION_DEACTIVATE] = State::PRIMARY_STATE_INACTIVE;
75  transition_state_map_[Transition::TRANSITION_UNCONFIGURED_SHUTDOWN] =
76  State::PRIMARY_STATE_FINALIZED;
77 
78  transition_label_map_[Transition::TRANSITION_CONFIGURE] = std::string("Configuring ");
79  transition_label_map_[Transition::TRANSITION_CLEANUP] = std::string("Cleaning up ");
80  transition_label_map_[Transition::TRANSITION_ACTIVATE] = std::string("Activating ");
81  transition_label_map_[Transition::TRANSITION_DEACTIVATE] = std::string("Deactivating ");
82  transition_label_map_[Transition::TRANSITION_UNCONFIGURED_SHUTDOWN] =
83  std::string("Shutting down ");
84 
85  init_timer_ = this->create_wall_timer(
86  0s,
87  [this]() -> void {
88  init_timer_->cancel();
92  if (autostart_) {
93  init_timer_ = this->create_wall_timer(
94  0s,
95  [this]() -> void {
96  init_timer_->cancel();
97  startup();
98  },
99  callback_group_);
100  }
101  auto executor = std::make_shared<rclcpp::executors::SingleThreadedExecutor>();
102  executor->add_callback_group(callback_group_, get_node_base_interface());
103  service_thread_ = std::make_unique<nav2::NodeThread>(executor);
104  });
105  diagnostics_updater_.setHardwareID("Nav2");
106  diagnostics_updater_.add("Nav2 Health", this, &LifecycleManager::CreateDiagnostic);
107 }
108 
110 {
111  RCLCPP_INFO(get_logger(), "Destroying %s", get_name());
112  service_thread_.reset();
113 }
114 
115 void
117  const std::shared_ptr<rmw_request_id_t>/*request_header*/,
118  const std::shared_ptr<ManageLifecycleNodes::Request> request,
119  std::shared_ptr<ManageLifecycleNodes::Response> response)
120 {
121  switch (request->command) {
122  case ManageLifecycleNodes::Request::STARTUP:
123  response->success = startup();
124  break;
125  case ManageLifecycleNodes::Request::CONFIGURE:
126  response->success = configure();
127  break;
128  case ManageLifecycleNodes::Request::CLEANUP:
129  response->success = cleanup();
130  break;
131  case ManageLifecycleNodes::Request::RESET:
132  response->success = reset();
133  break;
134  case ManageLifecycleNodes::Request::SHUTDOWN:
135  response->success = shutdown();
136  break;
137  case ManageLifecycleNodes::Request::PAUSE:
138  response->success = pause();
139  break;
140  case ManageLifecycleNodes::Request::RESUME:
141  response->success = resume();
142  break;
143  }
144 }
145 
146 void
147 LifecycleManager::setState(const NodeState & state)
148 {
149  managed_nodes_state_ = state;
151 }
152 
153 inline bool
155 {
156  return managed_nodes_state_ == NodeState::ACTIVE;
157 }
158 
159 void
161 {
162  if (is_active_pub_ && is_active_pub_->is_activated()) {
163  auto message = std::make_unique<std_msgs::msg::Bool>();
164  message->data = isActive();
165  is_active_pub_->publish(std::move(message));
166  }
167 }
168 
169 void
171  const std::shared_ptr<rmw_request_id_t>/*request_header*/,
172  const std::shared_ptr<std_srvs::srv::Trigger::Request>/*request*/,
173  std::shared_ptr<std_srvs::srv::Trigger::Response> response)
174 {
175  response->success = isActive();
176 }
177 
178 void
179 LifecycleManager::CreateDiagnostic(diagnostic_updater::DiagnosticStatusWrapper & stat)
180 {
181  unsigned char error_level;
182  std::string message;
183  switch (managed_nodes_state_) {
184  case NodeState::ACTIVE:
185  error_level = diagnostic_msgs::msg::DiagnosticStatus::OK;
186  message = "Managed nodes are active";
187  break;
188  case NodeState::INACTIVE:
189  error_level = diagnostic_msgs::msg::DiagnosticStatus::OK;
190  message = "Managed nodes are inactive";
191  break;
192  case NodeState::UNCONFIGURED:
193  error_level = diagnostic_msgs::msg::DiagnosticStatus::OK;
194  message = "Managed nodes are unconfigured";
195  break;
196  case NodeState::FINALIZED:
197  error_level = diagnostic_msgs::msg::DiagnosticStatus::WARN;
198  message = "Managed nodes have been shut down";
199  break;
200  default: // NodeState::UNKNOWN
201  error_level = diagnostic_msgs::msg::DiagnosticStatus::ERROR;
202  message = "An error has occurred during a node state transition";
203  break;
204  }
205  stat.summary(error_level, message);
206 }
207 
208 void
210 {
211  message("Creating and initializing lifecycle service clients");
212  for (auto & node_name : node_names_) {
213  node_map_[node_name] =
214  std::make_shared<LifecycleServiceClient>(node_name, shared_from_this());
215  }
216 }
217 
218 void
220 {
221  message("Creating and initializing lifecycle service servers");
222  // Since the LifecycleManager is a special node in that it manages other nodes,
223  // this is an rclcpp::Node, meaning we can't use the node->create_server API.
224  // to make a Nav2 ServiceServer. This must be constructed manually using the interfaces.
225  manager_srv_ = nav2::interfaces::create_service<ManageLifecycleNodes>(
226  shared_from_this(),
227  get_name() + std::string("/manage_nodes"),
228  std::bind(&LifecycleManager::managerCallback, this, _1, _2, _3),
229  callback_group_);
230 
231  is_active_srv_ = nav2::interfaces::create_service<std_srvs::srv::Trigger>(
232  shared_from_this(),
233  get_name() + std::string("/is_active"),
234  std::bind(&LifecycleManager::isActiveCallback, this, _1, _2, _3),
235  callback_group_);
236 }
237 
238 void
240 {
241  message("Creating and initializing lifecycle publishers");
242 
243  is_active_pub_ = nav2::interfaces::create_publisher<std_msgs::msg::Bool>(
244  shared_from_this(),
245  get_name() + std::string("/managed_nodes_activated"),
247  callback_group_);
248  is_active_pub_->on_activate();
249  // Publish the initial state once at startup
251 }
252 
253 void
255 {
256  message("Destroying lifecycle service clients");
257  for (auto & kv : node_map_) {
258  kv.second.reset();
259  }
260 }
261 
262 void
264 {
265  message("Destroying lifecycle publishers");
266  if (is_active_pub_) {
267  is_active_pub_->on_deactivate();
268  is_active_pub_.reset();
269  }
270 }
271 
272 bool
273 LifecycleManager::createBondConnection(const std::string & node_name)
274 {
275  const double timeout_ns =
276  std::chrono::duration_cast<std::chrono::nanoseconds>(bond_timeout_).count();
277  const double timeout_s = timeout_ns / 1e9;
278 
279  if (bond_map_.find(node_name) == bond_map_.end() && bond_timeout_.count() > 0.0) {
280  bond_map_[node_name] =
281  std::make_shared<bond::Bond>("bond", node_name, shared_from_this());
282  bond_map_[node_name]->setHeartbeatTimeout(timeout_s);
283  bond_map_[node_name]->setHeartbeatPeriod(0.10);
284  bond_map_[node_name]->start();
285  if (
286  !bond_map_[node_name]->waitUntilFormed(
287  rclcpp::Duration(rclcpp::Duration::from_nanoseconds(timeout_ns / 2))))
288  {
289  RCLCPP_ERROR(
290  get_logger(),
291  "Server %s was unable to be reached after %0.2fs by bond. "
292  "This server may be misconfigured.",
293  node_name.c_str(), timeout_s);
294  return false;
295  }
296  RCLCPP_INFO(get_logger(), "Server %s connected with bond.", node_name.c_str());
297  }
298 
299  return true;
300 }
301 
302 bool
303 LifecycleManager::changeStateForNode(const std::string & node_name, std::uint8_t transition)
304 {
305  message(transition_label_map_[transition] + node_name);
306 
307  if (!node_map_[node_name]->change_state(transition, std::chrono::milliseconds(-1),
308  service_timeout_) ||
309  !(node_map_[node_name]->get_state(service_timeout_) == transition_state_map_[transition]))
310  {
311  RCLCPP_ERROR(get_logger(), "Failed to change state for node: %s", node_name.c_str());
312  return false;
313  }
314 
315  if (transition == Transition::TRANSITION_ACTIVATE) {
316  return createBondConnection(node_name);
317  } else if (transition == Transition::TRANSITION_DEACTIVATE) {
318  bond_map_.erase(node_name);
319  }
320 
321  return true;
322 }
323 
324 bool
325 LifecycleManager::changeStateForAllNodes(std::uint8_t transition, bool hard_change)
326 {
327  // Hard change will continue even if a node fails
328  if (transition == Transition::TRANSITION_CONFIGURE ||
329  transition == Transition::TRANSITION_ACTIVATE)
330  {
331  for (auto & node_name : node_names_) {
332  try {
333  if (!changeStateForNode(node_name, transition) && !hard_change) {
334  return false;
335  }
336  } catch (const std::runtime_error & e) {
337  RCLCPP_ERROR(
338  get_logger(),
339  "Failed to change state for node: %s. Exception: %s.", node_name.c_str(), e.what());
340  return false;
341  }
342  }
343  } else {
344  std::vector<std::string>::reverse_iterator rit;
345  for (rit = node_names_.rbegin(); rit != node_names_.rend(); ++rit) {
346  try {
347  if (!changeStateForNode(*rit, transition) && !hard_change) {
348  return false;
349  }
350  } catch (const std::runtime_error & e) {
351  RCLCPP_ERROR(
352  get_logger(),
353  "Failed to change state for node: %s. Exception: %s.", (*rit).c_str(), e.what());
354  return false;
355  }
356  }
357  }
358  return true;
359 }
360 
361 void
363 {
364  message("Deactivate, cleanup, and shutdown nodes");
365  setState(NodeState::FINALIZED);
366  changeStateForAllNodes(Transition::TRANSITION_DEACTIVATE);
367  changeStateForAllNodes(Transition::TRANSITION_CLEANUP);
368  changeStateForAllNodes(Transition::TRANSITION_UNCONFIGURED_SHUTDOWN);
369 }
370 
371 bool
373 {
374  message("Starting managed nodes bringup...");
375  if (!changeStateForAllNodes(Transition::TRANSITION_CONFIGURE) ||
376  !changeStateForAllNodes(Transition::TRANSITION_ACTIVATE))
377  {
378  RCLCPP_ERROR(get_logger(), "Failed to bring up all requested nodes. Aborting bringup.");
379  setState(NodeState::UNKNOWN);
380  return false;
381  }
382  message("Managed nodes are active");
383  setState(NodeState::ACTIVE);
384  createBondTimer();
385  return true;
386 }
387 
388 bool
390 {
391  message("Configuring managed nodes...");
392  if (!changeStateForAllNodes(Transition::TRANSITION_CONFIGURE)) {
393  RCLCPP_ERROR(get_logger(), "Failed to configure all requested nodes. Aborting bringup.");
394  setState(NodeState::UNKNOWN);
395  return false;
396  }
397  message("Managed nodes are now configured");
398  setState(NodeState::INACTIVE);
399  return true;
400 }
401 
402 bool
404 {
405  message("Cleaning up managed nodes...");
406  if (!changeStateForAllNodes(Transition::TRANSITION_CLEANUP)) {
407  RCLCPP_ERROR(get_logger(), "Failed to cleanup all requested nodes. Aborting cleanup.");
408  setState(NodeState::UNKNOWN);
409  return false;
410  }
411  message("Managed nodes have been cleaned up");
412  setState(NodeState::UNCONFIGURED);
413  return true;
414 }
415 
416 bool
418 {
420 
421  message("Shutting down managed nodes...");
425  message("Managed nodes have been shut down");
426  return true;
427 }
428 
429 bool
430 LifecycleManager::reset(bool hard_reset)
431 {
433 
434  message("Resetting managed nodes...");
435  // Should transition in reverse order
436  if (!changeStateForAllNodes(Transition::TRANSITION_DEACTIVATE, hard_reset) ||
437  !changeStateForAllNodes(Transition::TRANSITION_CLEANUP, hard_reset))
438  {
439  if (!hard_reset) {
440  RCLCPP_ERROR(get_logger(), "Failed to reset nodes: aborting reset");
441  setState(NodeState::UNKNOWN);
442  return false;
443  }
444  }
445 
446  message("Managed nodes have been reset");
447  setState(NodeState::UNCONFIGURED);
448  return true;
449 }
450 
451 bool
453 {
455 
456  message("Pausing managed nodes...");
457  if (!changeStateForAllNodes(Transition::TRANSITION_DEACTIVATE)) {
458  RCLCPP_ERROR(get_logger(), "Failed to pause nodes: aborting pause");
459  setState(NodeState::UNKNOWN);
460  return false;
461  }
462 
463  message("Managed nodes have been paused");
464  setState(NodeState::INACTIVE);
465  return true;
466 }
467 
468 bool
470 {
471  message("Resuming managed nodes...");
472  if (!changeStateForAllNodes(Transition::TRANSITION_ACTIVATE)) {
473  RCLCPP_ERROR(get_logger(), "Failed to resume nodes: aborting resume");
474  setState(NodeState::UNKNOWN);
475  return false;
476  }
477 
478  message("Managed nodes are active");
479  setState(NodeState::ACTIVE);
480  createBondTimer();
481  return true;
482 }
483 
484 void
486 {
487  if (bond_timeout_.count() <= 0) {
488  return;
489  }
490 
491  message("Creating bond timer...");
492  bond_timer_ = this->create_wall_timer(
493  200ms,
494  std::bind(&LifecycleManager::checkBondConnections, this),
495  callback_group_);
496 }
497 
498 void
500 {
501  if (bond_timer_) {
502  message("Terminating bond timer...");
503  bond_timer_->cancel();
504  bond_timer_.reset();
505  }
506 }
507 
508 void
510 {
511  RCLCPP_INFO(
512  get_logger(), "Running Nav2 LifecycleManager rcl preshutdown (%s)",
513  this->get_name());
514 
516 
517  /*
518  * Dropping the bond map is what we really need here, but we drop the others
519  * to prevent the bond map being used. Likewise, squash the service thread.
520  */
521  service_thread_.reset();
522  node_names_.clear();
523  node_map_.clear();
524  bond_map_.clear();
525 }
526 
527 void
529 {
530  rclcpp::Context::SharedPtr context = get_node_base_interface()->get_context();
531 
532  context->add_pre_shutdown_callback(
533  std::bind(&LifecycleManager::onRclPreshutdown, this)
534  );
535 }
536 
537 void
539 {
540  if (!isActive() || !rclcpp::ok() || bond_map_.empty()) {
541  return;
542  }
543 
544  for (auto & node_name : node_names_) {
545  if (!rclcpp::ok()) {
546  return;
547  }
548 
549  if (bond_map_[node_name]->isBroken()) {
550  message(
551  std::string(
552  "Have not received a heartbeat from " + node_name + "."));
553 
554  // if one is down, bring them all down
555  RCLCPP_ERROR(
556  get_logger(),
557  "CRITICAL FAILURE: SERVER %s IS DOWN after not receiving a heartbeat for %i ms."
558  " Shutting down related nodes.",
559  node_name.c_str(), static_cast<int>(bond_timeout_.count()));
560  reset(true); // hard reset to transition all still active down
561  // if a server crashed, it won't get cleared due to failed transition, clear manually
562  bond_map_.clear();
563 
564  // Initialize the bond respawn timer to check if server comes back online
565  // after a failure, within a maximum timeout period.
566  if (attempt_respawn_reconnection_) {
567  bond_respawn_timer_ = this->create_wall_timer(
568  1s,
570  callback_group_);
571  }
572  return;
573  }
574  }
575 }
576 
577 void
579 {
580  // First attempt in respawn, start maximum duration to respawn
581  if (bond_respawn_start_time_.nanoseconds() == 0) {
582  bond_respawn_start_time_ = now();
583  }
584 
585  // Note: isActive() is inverted since this should be in a failure
586  // condition. If another outside user actives the system again, this should not process.
587  if (isActive() || !rclcpp::ok() || node_names_.empty()) {
588  bond_respawn_start_time_ = rclcpp::Time(0);
589  bond_respawn_timer_.reset();
590  return;
591  }
592 
593  // Check number of live connections after a bond failure
594  int live_servers = 0;
595  const int max_live_servers = node_names_.size();
596  for (auto & node_name : node_names_) {
597  if (!rclcpp::ok()) {
598  return;
599  }
600 
601  try {
602  node_map_[node_name]->get_state(service_timeout_); // Only won't throw if the server exists
603  live_servers++;
604  } catch (...) {
605  break;
606  }
607  }
608 
609  // If all are alive, kill timer and retransition system to active
610  // Else, check if maximum timeout has occurred
611  if (live_servers == max_live_servers) {
612  message("Successfully re-established connections from server respawns, starting back up.");
613  bond_respawn_start_time_ = rclcpp::Time(0);
614  bond_respawn_timer_.reset();
615  startup();
616  } else if (now() - bond_respawn_start_time_ >= bond_respawn_max_duration_) {
617  message("Failed to re-establish connection from a server crash after maximum timeout.");
618  bond_respawn_start_time_ = rclcpp::Time(0);
619  bond_respawn_timer_.reset();
620  }
621 }
622 
623 #define ANSI_COLOR_RESET "\x1b[0m"
624 #define ANSI_COLOR_BLUE "\x1b[34m"
625 
626 void
627 LifecycleManager::message(const std::string & msg)
628 {
629  RCLCPP_INFO(get_logger(), ANSI_COLOR_BLUE "\33[1m%s\33[0m" ANSI_COLOR_RESET, msg.c_str());
630 }
631 
632 } // namespace nav2_lifecycle_manager
633 
634 #include "rclcpp_components/register_node_macro.hpp"
635 RCLCPP_COMPONENTS_REGISTER_NODE(nav2_lifecycle_manager::LifecycleManager)
A QoS profile for latched, reliable topics with a history of 1 messages.
Implements service interface to transition the lifecycle nodes of Nav2 stack. It receives transition ...
void createBondTimer()
Support function for creating bond timer.
void destroyLifecycleServiceClients()
Destroy all the lifecycle service clients.
void destroyBondTimer()
Support function for killing bond connections.
bool cleanup()
Cleanups the managed nodes.
bool pause()
Pause all the managed nodes.
bool shutdown()
Deactivate, clean up and shut down all the managed nodes.
bool resume()
Resume all the managed nodes.
void createLifecycleServiceServers()
Support function for creating service servers.
void onRclPreshutdown()
Perform preshutdown activities before our Context is shutdown. Note that this is related to our Conte...
void destroyLifecyclePublishers()
Destroy all the lifecycle publishers.
bool isActive()
function to check if managed nodes are active
bool createBondConnection(const std::string &node_name)
Support function for creating bond connections.
void createLifecyclePublishers()
Support function for creating publishers.
void CreateDiagnostic(diagnostic_updater::DiagnosticStatusWrapper &stat)
function to check the state of Nav2 nodes
void publishIsActiveState()
Publish the is_active state.
void message(const std::string &msg)
Helper function to highlight the output on the console.
void setState(const NodeState &state)
Set the state of managed nodes.
~LifecycleManager()
A destructor for nav2_lifecycle_manager::LifecycleManager.
bool reset(bool hard_reset=false)
Reset all the managed nodes.
void shutdownAllNodes()
Support function for shutdown.
void managerCallback(const std::shared_ptr< rmw_request_id_t > request_header, const std::shared_ptr< ManageLifecycleNodes::Request > request, std::shared_ptr< ManageLifecycleNodes::Response > response)
Lifecycle node manager callback function.
bool changeStateForNode(const std::string &node_name, std::uint8_t transition)
For a node, transition to the new target state.
bool configure()
Configures the managed nodes.
bool changeStateForAllNodes(std::uint8_t transition, bool hard_change=false)
For each node in the map, transition to the new target state.
void createLifecycleServiceClients()
Support function for creating service clients.
void isActiveCallback(const std::shared_ptr< rmw_request_id_t > request_header, const std::shared_ptr< std_srvs::srv::Trigger::Request > request, std::shared_ptr< std_srvs::srv::Trigger::Response > response)
Trigger callback function checks if the managed nodes are in active state.
Helper functions to interact with a lifecycle node.