Nav2 Navigation Stack - rolling  main
ROS 2 Navigation Stack
lifecycle_manager.cpp
1 // Copyright (c) 2019 Intel Corporation
2 // Copyright (c) 2022 Samsung Research America
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 
16 #include "nav2_lifecycle_manager/lifecycle_manager.hpp"
17 
18 #include <chrono>
19 #include <memory>
20 #include <string>
21 #include <vector>
22 
23 #include "rclcpp/rclcpp.hpp"
24 
25 using namespace std::chrono_literals;
26 using namespace std::placeholders;
27 
28 using lifecycle_msgs::msg::Transition;
29 using lifecycle_msgs::msg::State;
31 
32 namespace nav2_lifecycle_manager
33 {
34 
35 LifecycleManager::LifecycleManager(const rclcpp::NodeOptions & options)
36 : Node("lifecycle_manager", options), diagnostics_updater_(this)
37 {
38  RCLCPP_INFO(get_logger(), "Creating");
39 
40  // The list of names is parameterized, allowing this module to be used with a different set
41  // of nodes
42  declare_parameter("node_names", rclcpp::PARAMETER_STRING_ARRAY);
43  declare_parameter("autostart", rclcpp::ParameterValue(false));
44  declare_parameter("bond_timeout", 4.0);
45  declare_parameter("service_timeout", 5.0);
46  declare_parameter("bond_respawn_max_duration", 10.0);
47  declare_parameter("attempt_respawn_reconnection", true);
48 
50 
51  node_names_ = get_parameter("node_names").as_string_array();
52  get_parameter("autostart", autostart_);
53  double bond_timeout_s;
54  get_parameter("bond_timeout", bond_timeout_s);
55  bond_timeout_ = std::chrono::duration_cast<std::chrono::milliseconds>(
56  std::chrono::duration<double>(bond_timeout_s));
57 
58  double service_timeout_s;
59  get_parameter("service_timeout", service_timeout_s);
60  service_timeout_ = std::chrono::duration_cast<std::chrono::milliseconds>(
61  std::chrono::duration<double>(service_timeout_s));
62 
63  double respawn_timeout_s;
64  get_parameter("bond_respawn_max_duration", respawn_timeout_s);
65  bond_respawn_max_duration_ = rclcpp::Duration::from_seconds(respawn_timeout_s);
66 
67  get_parameter("attempt_respawn_reconnection", attempt_respawn_reconnection_);
68 
69  callback_group_ = create_callback_group(rclcpp::CallbackGroupType::MutuallyExclusive, false);
70 
71  transition_state_map_[Transition::TRANSITION_CONFIGURE] = State::PRIMARY_STATE_INACTIVE;
72  transition_state_map_[Transition::TRANSITION_CLEANUP] = State::PRIMARY_STATE_UNCONFIGURED;
73  transition_state_map_[Transition::TRANSITION_ACTIVATE] = State::PRIMARY_STATE_ACTIVE;
74  transition_state_map_[Transition::TRANSITION_DEACTIVATE] = State::PRIMARY_STATE_INACTIVE;
75  transition_state_map_[Transition::TRANSITION_UNCONFIGURED_SHUTDOWN] =
76  State::PRIMARY_STATE_FINALIZED;
77 
78  transition_label_map_[Transition::TRANSITION_CONFIGURE] = std::string("Configuring ");
79  transition_label_map_[Transition::TRANSITION_CLEANUP] = std::string("Cleaning up ");
80  transition_label_map_[Transition::TRANSITION_ACTIVATE] = std::string("Activating ");
81  transition_label_map_[Transition::TRANSITION_DEACTIVATE] = std::string("Deactivating ");
82  transition_label_map_[Transition::TRANSITION_UNCONFIGURED_SHUTDOWN] =
83  std::string("Shutting down ");
84 
85  init_timer_ = this->create_wall_timer(
86  0s,
87  [this]() -> void {
88  init_timer_->cancel();
91  if (autostart_) {
92  init_timer_ = this->create_wall_timer(
93  0s,
94  [this]() -> void {
95  init_timer_->cancel();
96  startup();
97  },
98  callback_group_);
99  }
100  auto executor = std::make_shared<rclcpp::executors::SingleThreadedExecutor>();
101  executor->add_callback_group(callback_group_, get_node_base_interface());
102  service_thread_ = std::make_unique<nav2::NodeThread>(executor);
103  });
104  diagnostics_updater_.setHardwareID("Nav2");
105  diagnostics_updater_.add("Nav2 Health", this, &LifecycleManager::CreateDiagnostic);
106 }
107 
109 {
110  RCLCPP_INFO(get_logger(), "Destroying %s", get_name());
111  service_thread_.reset();
112 }
113 
114 void
116  const std::shared_ptr<rmw_request_id_t>/*request_header*/,
117  const std::shared_ptr<ManageLifecycleNodes::Request> request,
118  std::shared_ptr<ManageLifecycleNodes::Response> response)
119 {
120  switch (request->command) {
121  case ManageLifecycleNodes::Request::STARTUP:
122  response->success = startup();
123  break;
124  case ManageLifecycleNodes::Request::CONFIGURE:
125  response->success = configure();
126  break;
127  case ManageLifecycleNodes::Request::CLEANUP:
128  response->success = cleanup();
129  break;
130  case ManageLifecycleNodes::Request::RESET:
131  response->success = reset();
132  break;
133  case ManageLifecycleNodes::Request::SHUTDOWN:
134  response->success = shutdown();
135  break;
136  case ManageLifecycleNodes::Request::PAUSE:
137  response->success = pause();
138  break;
139  case ManageLifecycleNodes::Request::RESUME:
140  response->success = resume();
141  break;
142  }
143 }
144 
145 inline bool
147 {
148  return managed_nodes_state_ == NodeState::ACTIVE;
149 }
150 
151 void
153  const std::shared_ptr<rmw_request_id_t>/*request_header*/,
154  const std::shared_ptr<std_srvs::srv::Trigger::Request>/*request*/,
155  std::shared_ptr<std_srvs::srv::Trigger::Response> response)
156 {
157  response->success = isActive();
158 }
159 
160 void
161 LifecycleManager::CreateDiagnostic(diagnostic_updater::DiagnosticStatusWrapper & stat)
162 {
163  unsigned char error_level;
164  std::string message;
165  switch (managed_nodes_state_) {
166  case NodeState::ACTIVE:
167  error_level = diagnostic_msgs::msg::DiagnosticStatus::OK;
168  message = "Managed nodes are active";
169  break;
170  case NodeState::INACTIVE:
171  error_level = diagnostic_msgs::msg::DiagnosticStatus::OK;
172  message = "Managed nodes are inactive";
173  break;
174  case NodeState::UNCONFIGURED:
175  error_level = diagnostic_msgs::msg::DiagnosticStatus::OK;
176  message = "Managed nodes are unconfigured";
177  break;
178  case NodeState::FINALIZED:
179  error_level = diagnostic_msgs::msg::DiagnosticStatus::WARN;
180  message = "Managed nodes have been shut down";
181  break;
182  default: // NodeState::UNKNOWN
183  error_level = diagnostic_msgs::msg::DiagnosticStatus::ERROR;
184  message = "An error has occurred during a node state transition";
185  break;
186  }
187  stat.summary(error_level, message);
188 }
189 
190 void
192 {
193  message("Creating and initializing lifecycle service clients");
194  for (auto & node_name : node_names_) {
195  node_map_[node_name] =
196  std::make_shared<LifecycleServiceClient>(node_name, shared_from_this());
197  }
198 }
199 
200 void
202 {
203  message("Creating and initializing lifecycle service servers");
204  // Since the LifecycleManager is a special node in that it manages other nodes,
205  // this is an rclcpp::Node, meaning we can't use the node->create_server API.
206  // to make a Nav2 ServiceServer. This must be constructed manually using the interfaces.
207  manager_srv_ = nav2::interfaces::create_service<ManageLifecycleNodes>(
208  shared_from_this(),
209  get_name() + std::string("/manage_nodes"),
210  std::bind(&LifecycleManager::managerCallback, this, _1, _2, _3),
211  callback_group_);
212 
213  is_active_srv_ = nav2::interfaces::create_service<std_srvs::srv::Trigger>(
214  shared_from_this(),
215  get_name() + std::string("/is_active"),
216  std::bind(&LifecycleManager::isActiveCallback, this, _1, _2, _3),
217  callback_group_);
218 }
219 
220 void
222 {
223  message("Destroying lifecycle service clients");
224  for (auto & kv : node_map_) {
225  kv.second.reset();
226  }
227 }
228 
229 bool
230 LifecycleManager::createBondConnection(const std::string & node_name)
231 {
232  const double timeout_ns =
233  std::chrono::duration_cast<std::chrono::nanoseconds>(bond_timeout_).count();
234  const double timeout_s = timeout_ns / 1e9;
235 
236  if (bond_map_.find(node_name) == bond_map_.end() && bond_timeout_.count() > 0.0) {
237  bond_map_[node_name] =
238  std::make_shared<bond::Bond>("bond", node_name, shared_from_this());
239  bond_map_[node_name]->setHeartbeatTimeout(timeout_s);
240  bond_map_[node_name]->setHeartbeatPeriod(0.10);
241  bond_map_[node_name]->start();
242  if (
243  !bond_map_[node_name]->waitUntilFormed(
244  rclcpp::Duration(rclcpp::Duration::from_nanoseconds(timeout_ns / 2))))
245  {
246  RCLCPP_ERROR(
247  get_logger(),
248  "Server %s was unable to be reached after %0.2fs by bond. "
249  "This server may be misconfigured.",
250  node_name.c_str(), timeout_s);
251  return false;
252  }
253  RCLCPP_INFO(get_logger(), "Server %s connected with bond.", node_name.c_str());
254  }
255 
256  return true;
257 }
258 
259 bool
260 LifecycleManager::changeStateForNode(const std::string & node_name, std::uint8_t transition)
261 {
262  message(transition_label_map_[transition] + node_name);
263 
264  if (!node_map_[node_name]->change_state(transition, std::chrono::milliseconds(-1),
265  service_timeout_) ||
266  !(node_map_[node_name]->get_state(service_timeout_) == transition_state_map_[transition]))
267  {
268  RCLCPP_ERROR(get_logger(), "Failed to change state for node: %s", node_name.c_str());
269  return false;
270  }
271 
272  if (transition == Transition::TRANSITION_ACTIVATE) {
273  return createBondConnection(node_name);
274  } else if (transition == Transition::TRANSITION_DEACTIVATE) {
275  bond_map_.erase(node_name);
276  }
277 
278  return true;
279 }
280 
281 bool
282 LifecycleManager::changeStateForAllNodes(std::uint8_t transition, bool hard_change)
283 {
284  // Hard change will continue even if a node fails
285  if (transition == Transition::TRANSITION_CONFIGURE ||
286  transition == Transition::TRANSITION_ACTIVATE)
287  {
288  for (auto & node_name : node_names_) {
289  try {
290  if (!changeStateForNode(node_name, transition) && !hard_change) {
291  return false;
292  }
293  } catch (const std::runtime_error & e) {
294  RCLCPP_ERROR(
295  get_logger(),
296  "Failed to change state for node: %s. Exception: %s.", node_name.c_str(), e.what());
297  return false;
298  }
299  }
300  } else {
301  std::vector<std::string>::reverse_iterator rit;
302  for (rit = node_names_.rbegin(); rit != node_names_.rend(); ++rit) {
303  try {
304  if (!changeStateForNode(*rit, transition) && !hard_change) {
305  return false;
306  }
307  } catch (const std::runtime_error & e) {
308  RCLCPP_ERROR(
309  get_logger(),
310  "Failed to change state for node: %s. Exception: %s.", (*rit).c_str(), e.what());
311  return false;
312  }
313  }
314  }
315  return true;
316 }
317 
318 void
320 {
321  message("Deactivate, cleanup, and shutdown nodes");
322  managed_nodes_state_ = NodeState::FINALIZED;
323  changeStateForAllNodes(Transition::TRANSITION_DEACTIVATE);
324  changeStateForAllNodes(Transition::TRANSITION_CLEANUP);
325  changeStateForAllNodes(Transition::TRANSITION_UNCONFIGURED_SHUTDOWN);
326 }
327 
328 bool
330 {
331  message("Starting managed nodes bringup...");
332  if (!changeStateForAllNodes(Transition::TRANSITION_CONFIGURE) ||
333  !changeStateForAllNodes(Transition::TRANSITION_ACTIVATE))
334  {
335  RCLCPP_ERROR(get_logger(), "Failed to bring up all requested nodes. Aborting bringup.");
336  managed_nodes_state_ = NodeState::UNKNOWN;
337  return false;
338  }
339  message("Managed nodes are active");
340  managed_nodes_state_ = NodeState::ACTIVE;
341  createBondTimer();
342  return true;
343 }
344 
345 bool
347 {
348  message("Configuring managed nodes...");
349  if (!changeStateForAllNodes(Transition::TRANSITION_CONFIGURE)) {
350  RCLCPP_ERROR(get_logger(), "Failed to configure all requested nodes. Aborting bringup.");
351  managed_nodes_state_ = NodeState::UNKNOWN;
352  return false;
353  }
354  message("Managed nodes are now configured");
355  managed_nodes_state_ = NodeState::INACTIVE;
356  return true;
357 }
358 
359 bool
361 {
362  message("Cleaning up managed nodes...");
363  if (!changeStateForAllNodes(Transition::TRANSITION_CLEANUP)) {
364  RCLCPP_ERROR(get_logger(), "Failed to cleanup all requested nodes. Aborting cleanup.");
365  managed_nodes_state_ = NodeState::UNKNOWN;
366  return false;
367  }
368  message("Managed nodes have been cleaned up");
369  managed_nodes_state_ = NodeState::UNCONFIGURED;
370  return true;
371 }
372 
373 bool
375 {
377 
378  message("Shutting down managed nodes...");
381  message("Managed nodes have been shut down");
382  return true;
383 }
384 
385 bool
386 LifecycleManager::reset(bool hard_reset)
387 {
389 
390  message("Resetting managed nodes...");
391  // Should transition in reverse order
392  if (!changeStateForAllNodes(Transition::TRANSITION_DEACTIVATE, hard_reset) ||
393  !changeStateForAllNodes(Transition::TRANSITION_CLEANUP, hard_reset))
394  {
395  if (!hard_reset) {
396  RCLCPP_ERROR(get_logger(), "Failed to reset nodes: aborting reset");
397  managed_nodes_state_ = NodeState::UNKNOWN;
398  return false;
399  }
400  }
401 
402  message("Managed nodes have been reset");
403  managed_nodes_state_ = NodeState::UNCONFIGURED;
404  return true;
405 }
406 
407 bool
409 {
411 
412  message("Pausing managed nodes...");
413  if (!changeStateForAllNodes(Transition::TRANSITION_DEACTIVATE)) {
414  RCLCPP_ERROR(get_logger(), "Failed to pause nodes: aborting pause");
415  managed_nodes_state_ = NodeState::UNKNOWN;
416  return false;
417  }
418 
419  message("Managed nodes have been paused");
420  managed_nodes_state_ = NodeState::INACTIVE;
421  return true;
422 }
423 
424 bool
426 {
427  message("Resuming managed nodes...");
428  if (!changeStateForAllNodes(Transition::TRANSITION_ACTIVATE)) {
429  RCLCPP_ERROR(get_logger(), "Failed to resume nodes: aborting resume");
430  managed_nodes_state_ = NodeState::UNKNOWN;
431  return false;
432  }
433 
434  message("Managed nodes are active");
435  managed_nodes_state_ = NodeState::ACTIVE;
436  createBondTimer();
437  return true;
438 }
439 
440 void
442 {
443  if (bond_timeout_.count() <= 0) {
444  return;
445  }
446 
447  message("Creating bond timer...");
448  bond_timer_ = this->create_wall_timer(
449  200ms,
450  std::bind(&LifecycleManager::checkBondConnections, this),
451  callback_group_);
452 }
453 
454 void
456 {
457  if (bond_timer_) {
458  message("Terminating bond timer...");
459  bond_timer_->cancel();
460  bond_timer_.reset();
461  }
462 }
463 
464 void
466 {
467  RCLCPP_INFO(
468  get_logger(), "Running Nav2 LifecycleManager rcl preshutdown (%s)",
469  this->get_name());
470 
472 
473  /*
474  * Dropping the bond map is what we really need here, but we drop the others
475  * to prevent the bond map being used. Likewise, squash the service thread.
476  */
477  service_thread_.reset();
478  node_names_.clear();
479  node_map_.clear();
480  bond_map_.clear();
481 }
482 
483 void
485 {
486  rclcpp::Context::SharedPtr context = get_node_base_interface()->get_context();
487 
488  context->add_pre_shutdown_callback(
489  std::bind(&LifecycleManager::onRclPreshutdown, this)
490  );
491 }
492 
493 void
495 {
496  if (!isActive() || !rclcpp::ok() || bond_map_.empty()) {
497  return;
498  }
499 
500  for (auto & node_name : node_names_) {
501  if (!rclcpp::ok()) {
502  return;
503  }
504 
505  if (bond_map_[node_name]->isBroken()) {
506  message(
507  std::string(
508  "Have not received a heartbeat from " + node_name + "."));
509 
510  // if one is down, bring them all down
511  RCLCPP_ERROR(
512  get_logger(),
513  "CRITICAL FAILURE: SERVER %s IS DOWN after not receiving a heartbeat for %i ms."
514  " Shutting down related nodes.",
515  node_name.c_str(), static_cast<int>(bond_timeout_.count()));
516  reset(true); // hard reset to transition all still active down
517  // if a server crashed, it won't get cleared due to failed transition, clear manually
518  bond_map_.clear();
519 
520  // Initialize the bond respawn timer to check if server comes back online
521  // after a failure, within a maximum timeout period.
522  if (attempt_respawn_reconnection_) {
523  bond_respawn_timer_ = this->create_wall_timer(
524  1s,
526  callback_group_);
527  }
528  return;
529  }
530  }
531 }
532 
533 void
535 {
536  // First attempt in respawn, start maximum duration to respawn
537  if (bond_respawn_start_time_.nanoseconds() == 0) {
538  bond_respawn_start_time_ = now();
539  }
540 
541  // Note: isActive() is inverted since this should be in a failure
542  // condition. If another outside user actives the system again, this should not process.
543  if (isActive() || !rclcpp::ok() || node_names_.empty()) {
544  bond_respawn_start_time_ = rclcpp::Time(0);
545  bond_respawn_timer_.reset();
546  return;
547  }
548 
549  // Check number of live connections after a bond failure
550  int live_servers = 0;
551  const int max_live_servers = node_names_.size();
552  for (auto & node_name : node_names_) {
553  if (!rclcpp::ok()) {
554  return;
555  }
556 
557  try {
558  node_map_[node_name]->get_state(service_timeout_); // Only won't throw if the server exists
559  live_servers++;
560  } catch (...) {
561  break;
562  }
563  }
564 
565  // If all are alive, kill timer and retransition system to active
566  // Else, check if maximum timeout has occurred
567  if (live_servers == max_live_servers) {
568  message("Successfully re-established connections from server respawns, starting back up.");
569  bond_respawn_start_time_ = rclcpp::Time(0);
570  bond_respawn_timer_.reset();
571  startup();
572  } else if (now() - bond_respawn_start_time_ >= bond_respawn_max_duration_) {
573  message("Failed to re-establish connection from a server crash after maximum timeout.");
574  bond_respawn_start_time_ = rclcpp::Time(0);
575  bond_respawn_timer_.reset();
576  }
577 }
578 
579 #define ANSI_COLOR_RESET "\x1b[0m"
580 #define ANSI_COLOR_BLUE "\x1b[34m"
581 
582 void
583 LifecycleManager::message(const std::string & msg)
584 {
585  RCLCPP_INFO(get_logger(), ANSI_COLOR_BLUE "\33[1m%s\33[0m" ANSI_COLOR_RESET, msg.c_str());
586 }
587 
588 } // namespace nav2_lifecycle_manager
589 
590 #include "rclcpp_components/register_node_macro.hpp"
591 RCLCPP_COMPONENTS_REGISTER_NODE(nav2_lifecycle_manager::LifecycleManager)
Implements service interface to transition the lifecycle nodes of Nav2 stack. It receives transition ...
void createBondTimer()
Support function for creating bond timer.
void destroyLifecycleServiceClients()
Destroy all the lifecycle service clients.
void destroyBondTimer()
Support function for killing bond connections.
bool cleanup()
Cleanups the managed nodes.
bool pause()
Pause all the managed nodes.
bool shutdown()
Deactivate, clean up and shut down all the managed nodes.
bool resume()
Resume all the managed nodes.
void createLifecycleServiceServers()
Support function for creating service servers.
void onRclPreshutdown()
Perform preshutdown activities before our Context is shutdown. Note that this is related to our Conte...
bool isActive()
function to check if managed nodes are active
bool createBondConnection(const std::string &node_name)
Support function for creating bond connections.
void CreateDiagnostic(diagnostic_updater::DiagnosticStatusWrapper &stat)
function to check the state of Nav2 nodes
void message(const std::string &msg)
Helper function to highlight the output on the console.
~LifecycleManager()
A destructor for nav2_lifecycle_manager::LifecycleManager.
bool reset(bool hard_reset=false)
Reset all the managed nodes.
void shutdownAllNodes()
Support function for shutdown.
void managerCallback(const std::shared_ptr< rmw_request_id_t > request_header, const std::shared_ptr< ManageLifecycleNodes::Request > request, std::shared_ptr< ManageLifecycleNodes::Response > response)
Lifecycle node manager callback function.
bool changeStateForNode(const std::string &node_name, std::uint8_t transition)
For a node, transition to the new target state.
bool configure()
Configures the managed nodes.
bool changeStateForAllNodes(std::uint8_t transition, bool hard_change=false)
For each node in the map, transition to the new target state.
void createLifecycleServiceClients()
Support function for creating service clients.
void isActiveCallback(const std::shared_ptr< rmw_request_id_t > request_header, const std::shared_ptr< std_srvs::srv::Trigger::Request > request, std::shared_ptr< std_srvs::srv::Trigger::Response > response)
Trigger callback function checks if the managed nodes are in active state.
Helper functions to interact with a lifecycle node.