Nav2 Navigation Stack - humble  humble
ROS 2 Navigation Stack
lifecycle_manager.cpp
1 // Copyright (c) 2019 Intel Corporation
2 // Copyright (c) 2022 Samsung Research America
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 
16 #include "nav2_lifecycle_manager/lifecycle_manager.hpp"
17 
18 #include <chrono>
19 #include <memory>
20 #include <string>
21 #include <vector>
22 
23 #include "rclcpp/rclcpp.hpp"
24 
25 using namespace std::chrono_literals;
26 using namespace std::placeholders;
27 
28 using lifecycle_msgs::msg::Transition;
29 using lifecycle_msgs::msg::State;
31 
32 namespace nav2_lifecycle_manager
33 {
34 
35 LifecycleManager::LifecycleManager(const rclcpp::NodeOptions & options)
36 : Node("lifecycle_manager", options), diagnostics_updater_(this)
37 {
38  RCLCPP_INFO(get_logger(), "Creating");
39 
40  // The list of names is parameterized, allowing this module to be used with a different set
41  // of nodes
42  declare_parameter("node_names", rclcpp::PARAMETER_STRING_ARRAY);
43  declare_parameter("autostart", rclcpp::ParameterValue(false));
44  declare_parameter("bond_timeout", 4.0);
45  declare_parameter("bond_respawn_max_duration", 10.0);
46  declare_parameter("attempt_respawn_reconnection", true);
47 
49 
50  node_names_ = get_parameter("node_names").as_string_array();
51  get_parameter("autostart", autostart_);
52  double bond_timeout_s;
53  get_parameter("bond_timeout", bond_timeout_s);
54  bond_timeout_ = std::chrono::duration_cast<std::chrono::milliseconds>(
55  std::chrono::duration<double>(bond_timeout_s));
56 
57  double respawn_timeout_s;
58  get_parameter("bond_respawn_max_duration", respawn_timeout_s);
59  bond_respawn_max_duration_ = rclcpp::Duration::from_seconds(respawn_timeout_s);
60 
61  get_parameter("attempt_respawn_reconnection", attempt_respawn_reconnection_);
62 
63  callback_group_ = create_callback_group(rclcpp::CallbackGroupType::MutuallyExclusive, false);
64  manager_srv_ = create_service<ManageLifecycleNodes>(
65  get_name() + std::string("/manage_nodes"),
66  std::bind(&LifecycleManager::managerCallback, this, _1, _2, _3),
67  rclcpp::ServicesQoS().get_rmw_qos_profile(),
68  callback_group_);
69 
70  is_active_srv_ = create_service<std_srvs::srv::Trigger>(
71  get_name() + std::string("/is_active"),
72  std::bind(&LifecycleManager::isActiveCallback, this, _1, _2, _3),
73  rclcpp::ServicesQoS().get_rmw_qos_profile(),
74  callback_group_);
75 
76  transition_state_map_[Transition::TRANSITION_CONFIGURE] = State::PRIMARY_STATE_INACTIVE;
77  transition_state_map_[Transition::TRANSITION_CLEANUP] = State::PRIMARY_STATE_UNCONFIGURED;
78  transition_state_map_[Transition::TRANSITION_ACTIVATE] = State::PRIMARY_STATE_ACTIVE;
79  transition_state_map_[Transition::TRANSITION_DEACTIVATE] = State::PRIMARY_STATE_INACTIVE;
80  transition_state_map_[Transition::TRANSITION_UNCONFIGURED_SHUTDOWN] =
81  State::PRIMARY_STATE_FINALIZED;
82 
83  transition_label_map_[Transition::TRANSITION_CONFIGURE] = std::string("Configuring ");
84  transition_label_map_[Transition::TRANSITION_CLEANUP] = std::string("Cleaning up ");
85  transition_label_map_[Transition::TRANSITION_ACTIVATE] = std::string("Activating ");
86  transition_label_map_[Transition::TRANSITION_DEACTIVATE] = std::string("Deactivating ");
87  transition_label_map_[Transition::TRANSITION_UNCONFIGURED_SHUTDOWN] =
88  std::string("Shutting down ");
89 
90  init_timer_ = this->create_wall_timer(
91  0s,
92  [this]() -> void {
93  init_timer_->cancel();
95  if (autostart_) {
96  init_timer_ = this->create_wall_timer(
97  0s,
98  [this]() -> void {
99  init_timer_->cancel();
100  startup();
101  },
102  callback_group_);
103  }
104  auto executor = std::make_shared<rclcpp::executors::SingleThreadedExecutor>();
105  executor->add_callback_group(callback_group_, get_node_base_interface());
106  service_thread_ = std::make_unique<nav2_util::NodeThread>(executor);
107  });
108  diagnostics_updater_.setHardwareID("Nav2");
109  diagnostics_updater_.add("Nav2 Health", this, &LifecycleManager::CreateActiveDiagnostic);
110 }
111 
113 {
114  RCLCPP_INFO(get_logger(), "Destroying %s", get_name());
115  service_thread_.reset();
116 }
117 
118 void
120  const std::shared_ptr<rmw_request_id_t>/*request_header*/,
121  const std::shared_ptr<ManageLifecycleNodes::Request> request,
122  std::shared_ptr<ManageLifecycleNodes::Response> response)
123 {
124  switch (request->command) {
125  case ManageLifecycleNodes::Request::STARTUP:
126  response->success = startup();
127  break;
128  case ManageLifecycleNodes::Request::RESET:
129  response->success = reset();
130  break;
131  case ManageLifecycleNodes::Request::SHUTDOWN:
132  response->success = shutdown();
133  break;
134  case ManageLifecycleNodes::Request::PAUSE:
135  response->success = pause();
136  break;
137  case ManageLifecycleNodes::Request::RESUME:
138  response->success = resume();
139  break;
140  }
141 }
142 
143 void
145  const std::shared_ptr<rmw_request_id_t>/*request_header*/,
146  const std::shared_ptr<std_srvs::srv::Trigger::Request>/*request*/,
147  std::shared_ptr<std_srvs::srv::Trigger::Response> response)
148 {
149  response->success = system_active_;
150 }
151 
152 void
153 LifecycleManager::CreateActiveDiagnostic(diagnostic_updater::DiagnosticStatusWrapper & stat)
154 {
155  if (system_active_) {
156  stat.summary(diagnostic_msgs::msg::DiagnosticStatus::OK, "Nav2 is active");
157  } else {
158  stat.summary(diagnostic_msgs::msg::DiagnosticStatus::ERROR, "Nav2 is inactive");
159  }
160 }
161 
162 void
164 {
165  message("Creating and initializing lifecycle service clients");
166  for (auto & node_name : node_names_) {
167  node_map_[node_name] =
168  std::make_shared<LifecycleServiceClient>(node_name, shared_from_this());
169  }
170 }
171 
172 void
174 {
175  message("Destroying lifecycle service clients");
176  for (auto & kv : node_map_) {
177  kv.second.reset();
178  }
179 }
180 
181 bool
182 LifecycleManager::createBondConnection(const std::string & node_name)
183 {
184  const double timeout_ns =
185  std::chrono::duration_cast<std::chrono::nanoseconds>(bond_timeout_).count();
186  const double timeout_s = timeout_ns / 1e9;
187 
188  if (bond_map_.find(node_name) == bond_map_.end() && bond_timeout_.count() > 0.0) {
189  bond_map_[node_name] =
190  std::make_shared<bond::Bond>("bond", node_name, shared_from_this());
191  bond_map_[node_name]->setHeartbeatTimeout(timeout_s);
192  bond_map_[node_name]->setHeartbeatPeriod(0.10);
193  bond_map_[node_name]->start();
194  if (
195  !bond_map_[node_name]->waitUntilFormed(
196  rclcpp::Duration(rclcpp::Duration::from_nanoseconds(timeout_ns / 2))))
197  {
198  RCLCPP_ERROR(
199  get_logger(),
200  "Server %s was unable to be reached after %0.2fs by bond. "
201  "This server may be misconfigured.",
202  node_name.c_str(), timeout_s);
203  return false;
204  }
205  RCLCPP_INFO(get_logger(), "Server %s connected with bond.", node_name.c_str());
206  }
207 
208  return true;
209 }
210 
211 bool
212 LifecycleManager::changeStateForNode(const std::string & node_name, std::uint8_t transition)
213 {
214  message(transition_label_map_[transition] + node_name);
215 
216  if (!node_map_[node_name]->change_state(transition) ||
217  !(node_map_[node_name]->get_state() == transition_state_map_[transition]))
218  {
219  RCLCPP_ERROR(get_logger(), "Failed to change state for node: %s", node_name.c_str());
220  return false;
221  }
222 
223  if (transition == Transition::TRANSITION_ACTIVATE) {
224  return createBondConnection(node_name);
225  } else if (transition == Transition::TRANSITION_DEACTIVATE) {
226  bond_map_.erase(node_name);
227  }
228 
229  return true;
230 }
231 
232 bool
233 LifecycleManager::changeStateForAllNodes(std::uint8_t transition, bool hard_change)
234 {
235  // Hard change will continue even if a node fails
236  if (transition == Transition::TRANSITION_CONFIGURE ||
237  transition == Transition::TRANSITION_ACTIVATE)
238  {
239  for (auto & node_name : node_names_) {
240  try {
241  if (!changeStateForNode(node_name, transition) && !hard_change) {
242  return false;
243  }
244  } catch (const std::runtime_error & e) {
245  RCLCPP_ERROR(
246  get_logger(),
247  "Failed to change state for node: %s. Exception: %s.", node_name.c_str(), e.what());
248  return false;
249  }
250  }
251  } else {
252  std::vector<std::string>::reverse_iterator rit;
253  for (rit = node_names_.rbegin(); rit != node_names_.rend(); ++rit) {
254  try {
255  if (!changeStateForNode(*rit, transition) && !hard_change) {
256  return false;
257  }
258  } catch (const std::runtime_error & e) {
259  RCLCPP_ERROR(
260  get_logger(),
261  "Failed to change state for node: %s. Exception: %s.", (*rit).c_str(), e.what());
262  return false;
263  }
264  }
265  }
266  return true;
267 }
268 
269 void
271 {
272  message("Deactivate, cleanup, and shutdown nodes");
273  changeStateForAllNodes(Transition::TRANSITION_DEACTIVATE);
274  changeStateForAllNodes(Transition::TRANSITION_CLEANUP);
275  changeStateForAllNodes(Transition::TRANSITION_UNCONFIGURED_SHUTDOWN);
276 }
277 
278 bool
280 {
281  message("Starting managed nodes bringup...");
282  if (!changeStateForAllNodes(Transition::TRANSITION_CONFIGURE) ||
283  !changeStateForAllNodes(Transition::TRANSITION_ACTIVATE))
284  {
285  RCLCPP_ERROR(get_logger(), "Failed to bring up all requested nodes. Aborting bringup.");
286  return false;
287  }
288  message("Managed nodes are active");
289  system_active_ = true;
290  createBondTimer();
291  return true;
292 }
293 
294 bool
296 {
297  system_active_ = false;
299 
300  message("Shutting down managed nodes...");
303  message("Managed nodes have been shut down");
304  return true;
305 }
306 
307 bool
308 LifecycleManager::reset(bool hard_reset)
309 {
310  system_active_ = false;
312 
313  message("Resetting managed nodes...");
314  // Should transition in reverse order
315  if (!changeStateForAllNodes(Transition::TRANSITION_DEACTIVATE, hard_reset) ||
316  !changeStateForAllNodes(Transition::TRANSITION_CLEANUP, hard_reset))
317  {
318  if (!hard_reset) {
319  RCLCPP_ERROR(get_logger(), "Failed to reset nodes: aborting reset");
320  return false;
321  }
322  }
323 
324  message("Managed nodes have been reset");
325  return true;
326 }
327 
328 bool
330 {
331  system_active_ = false;
333 
334  message("Pausing managed nodes...");
335  if (!changeStateForAllNodes(Transition::TRANSITION_DEACTIVATE)) {
336  RCLCPP_ERROR(get_logger(), "Failed to pause nodes: aborting pause");
337  return false;
338  }
339 
340  message("Managed nodes have been paused");
341  return true;
342 }
343 
344 bool
346 {
347  message("Resuming managed nodes...");
348  if (!changeStateForAllNodes(Transition::TRANSITION_ACTIVATE)) {
349  RCLCPP_ERROR(get_logger(), "Failed to resume nodes: aborting resume");
350  return false;
351  }
352 
353  message("Managed nodes are active");
354  system_active_ = true;
355  createBondTimer();
356  return true;
357 }
358 
359 void
361 {
362  if (bond_timeout_.count() <= 0) {
363  return;
364  }
365 
366  message("Creating bond timer...");
367  bond_timer_ = this->create_wall_timer(
368  200ms,
369  std::bind(&LifecycleManager::checkBondConnections, this),
370  callback_group_);
371 }
372 
373 void
375 {
376  if (bond_timer_) {
377  message("Terminating bond timer...");
378  bond_timer_->cancel();
379  bond_timer_.reset();
380  }
381 }
382 
383 void
385 {
386  RCLCPP_INFO(
387  get_logger(), "Running Nav2 LifecycleManager rcl preshutdown (%s)",
388  this->get_name());
389 
391 
392  /*
393  * Dropping the bond map is what we really need here, but we drop the others
394  * to prevent the bond map being used. Likewise, squash the service thread.
395  */
396  service_thread_.reset();
397  node_names_.clear();
398  node_map_.clear();
399  bond_map_.clear();
400 }
401 
402 void
404 {
405  rclcpp::Context::SharedPtr context = get_node_base_interface()->get_context();
406 
407  context->add_pre_shutdown_callback(
408  std::bind(&LifecycleManager::onRclPreshutdown, this)
409  );
410 }
411 
412 void
414 {
415  if (!system_active_ || !rclcpp::ok() || bond_map_.empty()) {
416  return;
417  }
418 
419  for (auto & node_name : node_names_) {
420  if (!rclcpp::ok()) {
421  return;
422  }
423 
424  if (bond_map_[node_name]->isBroken()) {
425  message(
426  std::string(
427  "Have not received a heartbeat from " + node_name + "."));
428 
429  // if one is down, bring them all down
430  RCLCPP_ERROR(
431  get_logger(),
432  "CRITICAL FAILURE: SERVER %s IS DOWN after not receiving a heartbeat for %i ms."
433  " Shutting down related nodes.",
434  node_name.c_str(), static_cast<int>(bond_timeout_.count()));
435  reset(true); // hard reset to transition all still active down
436  // if a server crashed, it won't get cleared due to failed transition, clear manually
437  bond_map_.clear();
438 
439  // Initialize the bond respawn timer to check if server comes back online
440  // after a failure, within a maximum timeout period.
441  if (attempt_respawn_reconnection_) {
442  bond_respawn_timer_ = this->create_wall_timer(
443  1s,
445  callback_group_);
446  }
447  return;
448  }
449  }
450 }
451 
452 void
454 {
455  // First attempt in respawn, start maximum duration to respawn
456  if (bond_respawn_start_time_.nanoseconds() == 0) {
457  bond_respawn_start_time_ = now();
458  }
459 
460  // Note: system_active_ is inverted since this should be in a failure
461  // condition. If another outside user actives the system again, this should not process.
462  if (system_active_ || !rclcpp::ok() || node_names_.empty()) {
463  bond_respawn_start_time_ = rclcpp::Time(0);
464  bond_respawn_timer_.reset();
465  return;
466  }
467 
468  // Check number of live connections after a bond failure
469  int live_servers = 0;
470  const int max_live_servers = node_names_.size();
471  for (auto & node_name : node_names_) {
472  if (!rclcpp::ok()) {
473  return;
474  }
475 
476  try {
477  node_map_[node_name]->get_state(); // Only won't throw if the server exists
478  live_servers++;
479  } catch (...) {
480  break;
481  }
482  }
483 
484  // If all are alive, kill timer and retransition system to active
485  // Else, check if maximum timeout has occurred
486  if (live_servers == max_live_servers) {
487  message("Successfully re-established connections from server respawns, starting back up.");
488  bond_respawn_start_time_ = rclcpp::Time(0);
489  bond_respawn_timer_.reset();
490  startup();
491  } else if (now() - bond_respawn_start_time_ >= bond_respawn_max_duration_) {
492  message("Failed to re-establish connection from a server crash after maximum timeout.");
493  bond_respawn_start_time_ = rclcpp::Time(0);
494  bond_respawn_timer_.reset();
495  }
496 }
497 
498 #define ANSI_COLOR_RESET "\x1b[0m"
499 #define ANSI_COLOR_BLUE "\x1b[34m"
500 
501 void
502 LifecycleManager::message(const std::string & msg)
503 {
504  RCLCPP_INFO(get_logger(), ANSI_COLOR_BLUE "\33[1m%s\33[0m" ANSI_COLOR_RESET, msg.c_str());
505 }
506 
507 } // namespace nav2_lifecycle_manager
508 
509 #include "rclcpp_components/register_node_macro.hpp"
510 RCLCPP_COMPONENTS_REGISTER_NODE(nav2_lifecycle_manager::LifecycleManager)
Implements service interface to transition the lifecycle nodes of Nav2 stack. It receives transition ...
void createBondTimer()
Support function for creating bond timer.
void destroyLifecycleServiceClients()
Destroy all the lifecycle service clients.
void destroyBondTimer()
Support function for killing bond connections.
bool pause()
Pause all the managed nodes.
bool shutdown()
Deactivate, clean up and shut down all the managed nodes.
bool resume()
Resume all the managed nodes.
void onRclPreshutdown()
Perform preshutdown activities before our Context is shutdown. Note that this is related to our Conte...
void CreateActiveDiagnostic(diagnostic_updater::DiagnosticStatusWrapper &stat)
function to check if the Nav2 system is active
bool createBondConnection(const std::string &node_name)
Support function for creating bond connections.
void message(const std::string &msg)
Helper function to highlight the output on the console.
~LifecycleManager()
A destructor for nav2_lifecycle_manager::LifecycleManager.
bool reset(bool hard_reset=false)
Reset all the managed nodes.
void shutdownAllNodes()
Support function for shutdown.
void managerCallback(const std::shared_ptr< rmw_request_id_t > request_header, const std::shared_ptr< ManageLifecycleNodes::Request > request, std::shared_ptr< ManageLifecycleNodes::Response > response)
Lifecycle node manager callback function.
bool changeStateForNode(const std::string &node_name, std::uint8_t transition)
For a node, transition to the new target state.
bool changeStateForAllNodes(std::uint8_t transition, bool hard_change=false)
For each node in the map, transition to the new target state.
void createLifecycleServiceClients()
Support function for creating service clients.
void isActiveCallback(const std::shared_ptr< rmw_request_id_t > request_header, const std::shared_ptr< std_srvs::srv::Trigger::Request > request, std::shared_ptr< std_srvs::srv::Trigger::Response > response)
Trigger callback function checks if the managed nodes are in active state.
Helper functions to interact with a lifecycle node.