emqx_cluster_rescue 5.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187
  1. #!/usr/bin/env bash
  2. set -euo pipefail
  3. # ==================================
  4. # RESCUE THE UNBOOTABLE EMQX CLUSTER
  5. # ==================================
  6. ## Global Vars
  7. # Steal from emqx_ctl
  8. THIS_DIR="$(cd "$(dirname "$(readlink "$0" || echo "$0")")" || true; pwd -P)"
  9. usage() {
  10. local Script
  11. Script=$(basename "$0")
  12. echo "
  13. RESCUE THE UNBOOTABLE EMQX CLUSTER
  14. Use this script only when the entire cluster is stuck at booting & loading.
  15. This script provides a list of methods to *hack* the DB of EMQX to bring back
  16. the cluster back to service but MAY come with some side effects including:
  17. - Data loss
  18. - Inconsistent data in the cluster
  19. - Other undefined behaviors
  20. *DO NOT* use this script unless you understand the consequences.
  21. *DO NOT* use this script when EMQX cluster is partitioned.
  22. Use Case:
  23. - Lost one node due to unrecoverable failures (hardware, cloud resource outage)
  24. and this node prevents other nodes in the cluster from starting.
  25. Usage:
  26. # For troubleshooting, find out all the tables that are pending at loading
  27. $Script pending-tables
  28. # For troubleshooting, debug print detailed table info that is pending at loading.
  29. $Script table-details
  30. # Force load one [Tab] or all pending tables from node local storage to bring this node up
  31. # Use local data as the data source for the pending tables, should bring up the node immediately and
  32. # spread the data to other nodes in the cluster.
  33. #
  34. # * Take effect immediately
  35. # * This is a node local change but the change will be lost after restart.
  36. $Script force-load [Tab]
  37. # Remove Node from mnesia cluster.
  38. # Most likely will fail if the remote Node is unreachable.
  39. #
  40. # * This is a cluster wide schema change.
  41. $Script remove-node Node
  42. # Set master node for distributed DB
  43. # The master node will be the data source for pending tables.
  44. #
  45. # * This is a node local change
  46. # * Node could be a remote Erlang node in the cluster or local erlang node
  47. # * Use command: 'unset-master' to rollback
  48. $Script set-master Node
  49. # Unset master node for distributed DB, this is a node local change
  50. $Script unset-master
  51. # Cheat the local node that RemoteNode is down so that it will not wait for it to come up.
  52. # Local node will take local data as the data source for pending tables and spread the data
  53. # to the other pending nodes.
  54. #
  55. # * Check EMQX logs to find out which remote node(s) the local node is waiting for
  56. # * To take effect, restart this EMQX node
  57. # * This is a node local setting
  58. $Script lie-node-down RemoteNode
  59. Tips:
  60. - Override local node name with envvar: \$EMQX_NODE__NAME
  61. "
  62. }
  63. # Functions
  64. #
  65. print_pending_tables() {
  66. local erl_cmd='[ io:format("~p :: ~p~n", [T, maps:with([all_nodes, load_order, storage_type,
  67. active_replicas, local_content, load_by_force,
  68. load_node, load_reason, master_nodes]
  69. , maps:from_list(mnesia:table_info(T, all)))])
  70. || T <- mnesia:system_info(local_tables), unknown =:= mnesia:table_info(T, load_node) ],
  71. ok
  72. '
  73. exec "$THIS_DIR/emqx" eval "$erl_cmd"
  74. }
  75. print_details_per_table() {
  76. local erl_cmd='[ io:format("~p :: ~p~n", [T, mnesia:table_info(T, all)])
  77. || T <- mnesia:system_info(local_tables), unknown =:= mnesia:table_info(T, load_node) ],
  78. ok
  79. '
  80. exec "$THIS_DIR/emqx" eval "$erl_cmd"
  81. }
  82. force-load() {
  83. if [ $# -eq 1 ]; then
  84. local erl_cmd="mnesia:force_load_table(${1})"
  85. else
  86. local erl_cmd='[ {T, mnesia:force_load_table(T)}
  87. || T <- mnesia:system_info(local_tables),
  88. unknown =:= mnesia:table_info(T, load_node)
  89. ]
  90. '
  91. fi
  92. exec "$THIS_DIR/emqx" eval "$erl_cmd"
  93. }
  94. remove-node() {
  95. local target_node=$1
  96. local erl_cmd="
  97. case [T || T <- mnesia:system_info(local_tables), unknown =:= mnesia:table_info(T, load_node)] of
  98. [] ->
  99. io:format(\"No table need to load\\n\"),
  100. skipped;
  101. TargetTables ->
  102. io:format(\"Going to remove node ${target_node} from schema of the tables:~n~p~n\", [TargetTables]),
  103. case io:read(\"confirm? [yes.] OR Ctrl-D to skip: \") of
  104. {ok, yes} ->
  105. lists:map(fun(T) ->
  106. mnesia:force_load_table(T),
  107. {T, mnesia:del_table_copy(T, '${target_node}') }
  108. end, TargetTables);
  109. eof -> skipped;
  110. R -> {skipped, R}
  111. end
  112. end
  113. "
  114. exec "$THIS_DIR/emqx" eval "$erl_cmd"
  115. }
  116. set-master-node() {
  117. if [ $# -eq 1 ]; then
  118. local erl_cmd="mnesia:set_master_nodes(['${1}']), mnesia_recover:dump_decision_tab()"
  119. else
  120. local erl_cmd="mnesia:set_master_nodes([]), mnesia_recover:dump_decision_tab()"
  121. fi
  122. exec "$THIS_DIR/emqx" eval "$erl_cmd"
  123. }
  124. lie-node-down() {
  125. if [ $# -eq 1 ]; then
  126. local erl_cmd="mnesia_recover:log_mnesia_down('${1}'), mnesia_recover:dump_decision_tab()"
  127. exec "$THIS_DIR/emqx" eval "$erl_cmd"
  128. else
  129. usage
  130. fi
  131. }
  132. CMD=${1:-usage}
  133. [ $# -gt 0 ] && shift 1
  134. case "$CMD" in
  135. force-load)
  136. force-load "$@"
  137. ;;
  138. remove-node)
  139. remove-node "$@"
  140. ;;
  141. pending-tables)
  142. print_pending_tables
  143. ;;
  144. table-details)
  145. print_details_per_table
  146. ;;
  147. set-master)
  148. set-master-node "$@"
  149. ;;
  150. unset-master)
  151. set-master-node
  152. ;;
  153. lie-node-down)
  154. lie-node-down "$@"
  155. ;;
  156. *)
  157. usage
  158. esac