Nav2 Navigation Stack - kilted  kilted
ROS 2 Navigation Stack
lifecycle_manager.cpp
1 // Copyright (c) 2019 Intel Corporation
2 // Copyright (c) 2022 Samsung Research America
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 
16 #include "nav2_lifecycle_manager/lifecycle_manager.hpp"
17 
18 #include <chrono>
19 #include <memory>
20 #include <string>
21 #include <vector>
22 
23 #include "rclcpp/rclcpp.hpp"
24 
25 using namespace std::chrono_literals;
26 using namespace std::placeholders;
27 
28 using lifecycle_msgs::msg::Transition;
29 using lifecycle_msgs::msg::State;
31 
32 namespace nav2_lifecycle_manager
33 {
34 
35 LifecycleManager::LifecycleManager(const rclcpp::NodeOptions & options)
36 : Node("lifecycle_manager", options), diagnostics_updater_(this)
37 {
38  RCLCPP_INFO(get_logger(), "Creating");
39 
40  // The list of names is parameterized, allowing this module to be used with a different set
41  // of nodes
42  declare_parameter("node_names", rclcpp::PARAMETER_STRING_ARRAY);
43  declare_parameter("autostart", rclcpp::ParameterValue(false));
44  declare_parameter("bond_timeout", 4.0);
45  declare_parameter("service_timeout", 5.0);
46  declare_parameter("bond_respawn_max_duration", 10.0);
47  declare_parameter("attempt_respawn_reconnection", true);
48 
50 
51  node_names_ = get_parameter("node_names").as_string_array();
52  get_parameter("autostart", autostart_);
53  double bond_timeout_s;
54  get_parameter("bond_timeout", bond_timeout_s);
55  bond_timeout_ = std::chrono::duration_cast<std::chrono::milliseconds>(
56  std::chrono::duration<double>(bond_timeout_s));
57 
58  double service_timeout_s;
59  get_parameter("service_timeout", service_timeout_s);
60  service_timeout_ = std::chrono::duration_cast<std::chrono::milliseconds>(
61  std::chrono::duration<double>(service_timeout_s));
62 
63  double respawn_timeout_s;
64  get_parameter("bond_respawn_max_duration", respawn_timeout_s);
65  bond_respawn_max_duration_ = rclcpp::Duration::from_seconds(respawn_timeout_s);
66 
67  get_parameter("attempt_respawn_reconnection", attempt_respawn_reconnection_);
68 
69  callback_group_ = create_callback_group(rclcpp::CallbackGroupType::MutuallyExclusive, false);
70 
71  transition_state_map_[Transition::TRANSITION_CONFIGURE] = State::PRIMARY_STATE_INACTIVE;
72  transition_state_map_[Transition::TRANSITION_CLEANUP] = State::PRIMARY_STATE_UNCONFIGURED;
73  transition_state_map_[Transition::TRANSITION_ACTIVATE] = State::PRIMARY_STATE_ACTIVE;
74  transition_state_map_[Transition::TRANSITION_DEACTIVATE] = State::PRIMARY_STATE_INACTIVE;
75  transition_state_map_[Transition::TRANSITION_UNCONFIGURED_SHUTDOWN] =
76  State::PRIMARY_STATE_FINALIZED;
77 
78  transition_label_map_[Transition::TRANSITION_CONFIGURE] = std::string("Configuring ");
79  transition_label_map_[Transition::TRANSITION_CLEANUP] = std::string("Cleaning up ");
80  transition_label_map_[Transition::TRANSITION_ACTIVATE] = std::string("Activating ");
81  transition_label_map_[Transition::TRANSITION_DEACTIVATE] = std::string("Deactivating ");
82  transition_label_map_[Transition::TRANSITION_UNCONFIGURED_SHUTDOWN] =
83  std::string("Shutting down ");
84 
85  init_timer_ = this->create_wall_timer(
86  0s,
87  [this]() -> void {
88  init_timer_->cancel();
91  if (autostart_) {
92  init_timer_ = this->create_wall_timer(
93  0s,
94  [this]() -> void {
95  init_timer_->cancel();
96  startup();
97  },
98  callback_group_);
99  }
100  auto executor = std::make_shared<rclcpp::executors::SingleThreadedExecutor>();
101  executor->add_callback_group(callback_group_, get_node_base_interface());
102  service_thread_ = std::make_unique<nav2_util::NodeThread>(executor);
103  });
104  diagnostics_updater_.setHardwareID("Nav2");
105  diagnostics_updater_.add("Nav2 Health", this, &LifecycleManager::CreateDiagnostic);
106 }
107 
109 {
110  RCLCPP_INFO(get_logger(), "Destroying %s", get_name());
111  service_thread_.reset();
112 }
113 
114 void
116  const std::shared_ptr<rmw_request_id_t>/*request_header*/,
117  const std::shared_ptr<ManageLifecycleNodes::Request> request,
118  std::shared_ptr<ManageLifecycleNodes::Response> response)
119 {
120  switch (request->command) {
121  case ManageLifecycleNodes::Request::STARTUP:
122  response->success = startup();
123  break;
124  case ManageLifecycleNodes::Request::CONFIGURE:
125  response->success = configure();
126  break;
127  case ManageLifecycleNodes::Request::CLEANUP:
128  response->success = cleanup();
129  break;
130  case ManageLifecycleNodes::Request::RESET:
131  response->success = reset();
132  break;
133  case ManageLifecycleNodes::Request::SHUTDOWN:
134  response->success = shutdown();
135  break;
136  case ManageLifecycleNodes::Request::PAUSE:
137  response->success = pause();
138  break;
139  case ManageLifecycleNodes::Request::RESUME:
140  response->success = resume();
141  break;
142  }
143 }
144 
145 inline bool
147 {
148  return managed_nodes_state_ == NodeState::ACTIVE;
149 }
150 
151 void
153  const std::shared_ptr<rmw_request_id_t>/*request_header*/,
154  const std::shared_ptr<std_srvs::srv::Trigger::Request>/*request*/,
155  std::shared_ptr<std_srvs::srv::Trigger::Response> response)
156 {
157  response->success = isActive();
158 }
159 
160 void
161 LifecycleManager::CreateDiagnostic(diagnostic_updater::DiagnosticStatusWrapper & stat)
162 {
163  unsigned char error_level;
164  std::string message;
165  switch (managed_nodes_state_) {
166  case NodeState::ACTIVE:
167  error_level = diagnostic_msgs::msg::DiagnosticStatus::OK;
168  message = "Managed nodes are active";
169  break;
170  case NodeState::INACTIVE:
171  error_level = diagnostic_msgs::msg::DiagnosticStatus::OK;
172  message = "Managed nodes are inactive";
173  break;
174  case NodeState::UNCONFIGURED:
175  error_level = diagnostic_msgs::msg::DiagnosticStatus::OK;
176  message = "Managed nodes are unconfigured";
177  break;
178  case NodeState::FINALIZED:
179  error_level = diagnostic_msgs::msg::DiagnosticStatus::WARN;
180  message = "Managed nodes have been shut down";
181  break;
182  default: // NodeState::UNKNOWN
183  error_level = diagnostic_msgs::msg::DiagnosticStatus::ERROR;
184  message = "An error has occurred during a node state transition";
185  break;
186  }
187  stat.summary(error_level, message);
188 }
189 
190 void
192 {
193  message("Creating and initializing lifecycle service clients");
194  for (auto & node_name : node_names_) {
195  node_map_[node_name] =
196  std::make_shared<LifecycleServiceClient>(node_name, shared_from_this());
197  }
198 }
199 
200 void
202 {
203  message("Creating and initializing lifecycle service servers");
204  manager_srv_ = std::make_shared<nav2_util::ServiceServer<ManageLifecycleNodes>>(
205  get_name() + std::string("/manage_nodes"),
206  shared_from_this(),
207  std::bind(&LifecycleManager::managerCallback, this, _1, _2, _3),
208  rclcpp::SystemDefaultsQoS(),
209  callback_group_);
210 
211  is_active_srv_ = std::make_shared<nav2_util::ServiceServer<std_srvs::srv::Trigger>>(
212  get_name() + std::string("/is_active"),
213  shared_from_this(),
214  std::bind(&LifecycleManager::isActiveCallback, this, _1, _2, _3),
215  rclcpp::SystemDefaultsQoS(),
216  callback_group_);
217 }
218 
219 void
221 {
222  message("Destroying lifecycle service clients");
223  for (auto & kv : node_map_) {
224  kv.second.reset();
225  }
226 }
227 
228 bool
229 LifecycleManager::createBondConnection(const std::string & node_name)
230 {
231  const double timeout_ns =
232  std::chrono::duration_cast<std::chrono::nanoseconds>(bond_timeout_).count();
233  const double timeout_s = timeout_ns / 1e9;
234 
235  if (bond_map_.find(node_name) == bond_map_.end() && bond_timeout_.count() > 0.0) {
236  bond_map_[node_name] =
237  std::make_shared<bond::Bond>("bond", node_name, shared_from_this());
238  bond_map_[node_name]->setHeartbeatTimeout(timeout_s);
239  bond_map_[node_name]->setHeartbeatPeriod(0.10);
240  bond_map_[node_name]->start();
241  if (
242  !bond_map_[node_name]->waitUntilFormed(
243  rclcpp::Duration(rclcpp::Duration::from_nanoseconds(timeout_ns / 2))))
244  {
245  RCLCPP_ERROR(
246  get_logger(),
247  "Server %s was unable to be reached after %0.2fs by bond. "
248  "This server may be misconfigured.",
249  node_name.c_str(), timeout_s);
250  return false;
251  }
252  RCLCPP_INFO(get_logger(), "Server %s connected with bond.", node_name.c_str());
253  }
254 
255  return true;
256 }
257 
258 bool
259 LifecycleManager::changeStateForNode(const std::string & node_name, std::uint8_t transition)
260 {
261  message(transition_label_map_[transition] + node_name);
262 
263  if (!node_map_[node_name]->change_state(transition, std::chrono::milliseconds(-1),
264  service_timeout_) ||
265  !(node_map_[node_name]->get_state(service_timeout_) == transition_state_map_[transition]))
266  {
267  RCLCPP_ERROR(get_logger(), "Failed to change state for node: %s", node_name.c_str());
268  return false;
269  }
270 
271  if (transition == Transition::TRANSITION_ACTIVATE) {
272  return createBondConnection(node_name);
273  } else if (transition == Transition::TRANSITION_DEACTIVATE) {
274  bond_map_.erase(node_name);
275  }
276 
277  return true;
278 }
279 
280 bool
281 LifecycleManager::changeStateForAllNodes(std::uint8_t transition, bool hard_change)
282 {
283  // Hard change will continue even if a node fails
284  if (transition == Transition::TRANSITION_CONFIGURE ||
285  transition == Transition::TRANSITION_ACTIVATE)
286  {
287  for (auto & node_name : node_names_) {
288  try {
289  if (!changeStateForNode(node_name, transition) && !hard_change) {
290  return false;
291  }
292  } catch (const std::runtime_error & e) {
293  RCLCPP_ERROR(
294  get_logger(),
295  "Failed to change state for node: %s. Exception: %s.", node_name.c_str(), e.what());
296  return false;
297  }
298  }
299  } else {
300  std::vector<std::string>::reverse_iterator rit;
301  for (rit = node_names_.rbegin(); rit != node_names_.rend(); ++rit) {
302  try {
303  if (!changeStateForNode(*rit, transition) && !hard_change) {
304  return false;
305  }
306  } catch (const std::runtime_error & e) {
307  RCLCPP_ERROR(
308  get_logger(),
309  "Failed to change state for node: %s. Exception: %s.", (*rit).c_str(), e.what());
310  return false;
311  }
312  }
313  }
314  return true;
315 }
316 
317 void
319 {
320  message("Deactivate, cleanup, and shutdown nodes");
321  managed_nodes_state_ = NodeState::FINALIZED;
322  changeStateForAllNodes(Transition::TRANSITION_DEACTIVATE);
323  changeStateForAllNodes(Transition::TRANSITION_CLEANUP);
324  changeStateForAllNodes(Transition::TRANSITION_UNCONFIGURED_SHUTDOWN);
325 }
326 
327 bool
329 {
330  message("Starting managed nodes bringup...");
331  if (!changeStateForAllNodes(Transition::TRANSITION_CONFIGURE) ||
332  !changeStateForAllNodes(Transition::TRANSITION_ACTIVATE))
333  {
334  RCLCPP_ERROR(get_logger(), "Failed to bring up all requested nodes. Aborting bringup.");
335  managed_nodes_state_ = NodeState::UNKNOWN;
336  return false;
337  }
338  message("Managed nodes are active");
339  managed_nodes_state_ = NodeState::ACTIVE;
340  createBondTimer();
341  return true;
342 }
343 
344 bool
346 {
347  message("Configuring managed nodes...");
348  if (!changeStateForAllNodes(Transition::TRANSITION_CONFIGURE)) {
349  RCLCPP_ERROR(get_logger(), "Failed to configure all requested nodes. Aborting bringup.");
350  managed_nodes_state_ = NodeState::UNKNOWN;
351  return false;
352  }
353  message("Managed nodes are now configured");
354  managed_nodes_state_ = NodeState::INACTIVE;
355  return true;
356 }
357 
358 bool
360 {
361  message("Cleaning up managed nodes...");
362  if (!changeStateForAllNodes(Transition::TRANSITION_CLEANUP)) {
363  RCLCPP_ERROR(get_logger(), "Failed to cleanup all requested nodes. Aborting cleanup.");
364  managed_nodes_state_ = NodeState::UNKNOWN;
365  return false;
366  }
367  message("Managed nodes have been cleaned up");
368  managed_nodes_state_ = NodeState::UNCONFIGURED;
369  return true;
370 }
371 
372 bool
374 {
376 
377  message("Shutting down managed nodes...");
380  message("Managed nodes have been shut down");
381  return true;
382 }
383 
384 bool
385 LifecycleManager::reset(bool hard_reset)
386 {
388 
389  message("Resetting managed nodes...");
390  // Should transition in reverse order
391  if (!changeStateForAllNodes(Transition::TRANSITION_DEACTIVATE, hard_reset) ||
392  !changeStateForAllNodes(Transition::TRANSITION_CLEANUP, hard_reset))
393  {
394  if (!hard_reset) {
395  RCLCPP_ERROR(get_logger(), "Failed to reset nodes: aborting reset");
396  managed_nodes_state_ = NodeState::UNKNOWN;
397  return false;
398  }
399  }
400 
401  message("Managed nodes have been reset");
402  managed_nodes_state_ = NodeState::UNCONFIGURED;
403  return true;
404 }
405 
406 bool
408 {
410 
411  message("Pausing managed nodes...");
412  if (!changeStateForAllNodes(Transition::TRANSITION_DEACTIVATE)) {
413  RCLCPP_ERROR(get_logger(), "Failed to pause nodes: aborting pause");
414  managed_nodes_state_ = NodeState::UNKNOWN;
415  return false;
416  }
417 
418  message("Managed nodes have been paused");
419  managed_nodes_state_ = NodeState::INACTIVE;
420  return true;
421 }
422 
423 bool
425 {
426  message("Resuming managed nodes...");
427  if (!changeStateForAllNodes(Transition::TRANSITION_ACTIVATE)) {
428  RCLCPP_ERROR(get_logger(), "Failed to resume nodes: aborting resume");
429  managed_nodes_state_ = NodeState::UNKNOWN;
430  return false;
431  }
432 
433  message("Managed nodes are active");
434  managed_nodes_state_ = NodeState::ACTIVE;
435  createBondTimer();
436  return true;
437 }
438 
439 void
441 {
442  if (bond_timeout_.count() <= 0) {
443  return;
444  }
445 
446  message("Creating bond timer...");
447  bond_timer_ = this->create_wall_timer(
448  200ms,
449  std::bind(&LifecycleManager::checkBondConnections, this),
450  callback_group_);
451 }
452 
453 void
455 {
456  if (bond_timer_) {
457  message("Terminating bond timer...");
458  bond_timer_->cancel();
459  bond_timer_.reset();
460  }
461 }
462 
463 void
465 {
466  RCLCPP_INFO(
467  get_logger(), "Running Nav2 LifecycleManager rcl preshutdown (%s)",
468  this->get_name());
469 
471 
472  /*
473  * Dropping the bond map is what we really need here, but we drop the others
474  * to prevent the bond map being used. Likewise, squash the service thread.
475  */
476  service_thread_.reset();
477  node_names_.clear();
478  node_map_.clear();
479  bond_map_.clear();
480 }
481 
482 void
484 {
485  rclcpp::Context::SharedPtr context = get_node_base_interface()->get_context();
486 
487  context->add_pre_shutdown_callback(
488  std::bind(&LifecycleManager::onRclPreshutdown, this)
489  );
490 }
491 
492 void
494 {
495  if (!isActive() || !rclcpp::ok() || bond_map_.empty()) {
496  return;
497  }
498 
499  for (auto & node_name : node_names_) {
500  if (!rclcpp::ok()) {
501  return;
502  }
503 
504  if (bond_map_[node_name]->isBroken()) {
505  message(
506  std::string(
507  "Have not received a heartbeat from " + node_name + "."));
508 
509  // if one is down, bring them all down
510  RCLCPP_ERROR(
511  get_logger(),
512  "CRITICAL FAILURE: SERVER %s IS DOWN after not receiving a heartbeat for %i ms."
513  " Shutting down related nodes.",
514  node_name.c_str(), static_cast<int>(bond_timeout_.count()));
515  reset(true); // hard reset to transition all still active down
516  // if a server crashed, it won't get cleared due to failed transition, clear manually
517  bond_map_.clear();
518 
519  // Initialize the bond respawn timer to check if server comes back online
520  // after a failure, within a maximum timeout period.
521  if (attempt_respawn_reconnection_) {
522  bond_respawn_timer_ = this->create_wall_timer(
523  1s,
525  callback_group_);
526  }
527  return;
528  }
529  }
530 }
531 
532 void
534 {
535  // First attempt in respawn, start maximum duration to respawn
536  if (bond_respawn_start_time_.nanoseconds() == 0) {
537  bond_respawn_start_time_ = now();
538  }
539 
540  // Note: isActive() is inverted since this should be in a failure
541  // condition. If another outside user actives the system again, this should not process.
542  if (isActive() || !rclcpp::ok() || node_names_.empty()) {
543  bond_respawn_start_time_ = rclcpp::Time(0);
544  bond_respawn_timer_.reset();
545  return;
546  }
547 
548  // Check number of live connections after a bond failure
549  int live_servers = 0;
550  const int max_live_servers = node_names_.size();
551  for (auto & node_name : node_names_) {
552  if (!rclcpp::ok()) {
553  return;
554  }
555 
556  try {
557  node_map_[node_name]->get_state(service_timeout_); // Only won't throw if the server exists
558  live_servers++;
559  } catch (...) {
560  break;
561  }
562  }
563 
564  // If all are alive, kill timer and retransition system to active
565  // Else, check if maximum timeout has occurred
566  if (live_servers == max_live_servers) {
567  message("Successfully re-established connections from server respawns, starting back up.");
568  bond_respawn_start_time_ = rclcpp::Time(0);
569  bond_respawn_timer_.reset();
570  startup();
571  } else if (now() - bond_respawn_start_time_ >= bond_respawn_max_duration_) {
572  message("Failed to re-establish connection from a server crash after maximum timeout.");
573  bond_respawn_start_time_ = rclcpp::Time(0);
574  bond_respawn_timer_.reset();
575  }
576 }
577 
578 #define ANSI_COLOR_RESET "\x1b[0m"
579 #define ANSI_COLOR_BLUE "\x1b[34m"
580 
581 void
582 LifecycleManager::message(const std::string & msg)
583 {
584  RCLCPP_INFO(get_logger(), ANSI_COLOR_BLUE "\33[1m%s\33[0m" ANSI_COLOR_RESET, msg.c_str());
585 }
586 
587 } // namespace nav2_lifecycle_manager
588 
589 #include "rclcpp_components/register_node_macro.hpp"
590 RCLCPP_COMPONENTS_REGISTER_NODE(nav2_lifecycle_manager::LifecycleManager)
Implements service interface to transition the lifecycle nodes of Nav2 stack. It receives transition ...
void createBondTimer()
Support function for creating bond timer.
void destroyLifecycleServiceClients()
Destroy all the lifecycle service clients.
void destroyBondTimer()
Support function for killing bond connections.
bool cleanup()
Cleanups the managed nodes.
bool pause()
Pause all the managed nodes.
bool shutdown()
Deactivate, clean up and shut down all the managed nodes.
bool resume()
Resume all the managed nodes.
void createLifecycleServiceServers()
Support function for creating service servers.
void onRclPreshutdown()
Perform preshutdown activities before our Context is shutdown. Note that this is related to our Conte...
bool isActive()
function to check if managed nodes are active
bool createBondConnection(const std::string &node_name)
Support function for creating bond connections.
void CreateDiagnostic(diagnostic_updater::DiagnosticStatusWrapper &stat)
function to check the state of Nav2 nodes
void message(const std::string &msg)
Helper function to highlight the output on the console.
~LifecycleManager()
A destructor for nav2_lifecycle_manager::LifecycleManager.
bool reset(bool hard_reset=false)
Reset all the managed nodes.
void shutdownAllNodes()
Support function for shutdown.
void managerCallback(const std::shared_ptr< rmw_request_id_t > request_header, const std::shared_ptr< ManageLifecycleNodes::Request > request, std::shared_ptr< ManageLifecycleNodes::Response > response)
Lifecycle node manager callback function.
bool changeStateForNode(const std::string &node_name, std::uint8_t transition)
For a node, transition to the new target state.
bool configure()
Configures the managed nodes.
bool changeStateForAllNodes(std::uint8_t transition, bool hard_change=false)
For each node in the map, transition to the new target state.
void createLifecycleServiceClients()
Support function for creating service clients.
void isActiveCallback(const std::shared_ptr< rmw_request_id_t > request_header, const std::shared_ptr< std_srvs::srv::Trigger::Request > request, std::shared_ptr< std_srvs::srv::Trigger::Response > response)
Trigger callback function checks if the managed nodes are in active state.
Helper functions to interact with a lifecycle node.