| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187 |
#!/usr/bin/env bash
# Abort on any failing command, on use of an unset variable, and on a failure
# anywhere inside a pipeline.
set -o errexit -o nounset -o pipefail

# ==================================
# RESCUE THE UNBOOTABLE EMQX CLUSTER
# ==================================

## Global Vars

# THIS_DIR is the physical directory holding this script (approach borrowed
# from emqx_ctl). readlink resolves $0 when it is a symlink; if readlink fails,
# $0 itself is used. Should the cd fail, `|| true` keeps going and `pwd -P`
# reports the current directory instead.
THIS_DIR="$(
    cd "$(dirname "$(readlink "$0" || echo "$0")")" || true
    pwd -P
)"
# usage
#
# Print the help text on stdout: the warnings, the intended use case and one
# entry per sub-command. The base name of $0 is substituted wherever a command
# line is shown.
usage() {
    local self_name
    self_name="$(basename "$0")"
    # Unquoted here-document: $self_name is expanded, \$ yields a literal '$'.
    # The empty first and last lines reproduce the blank lines framing the text.
    cat <<EOF

RESCUE THE UNBOOTABLE EMQX CLUSTER
Use this script only when the entire cluster is stuck at booting & loading.
This script provides a list of methods to *hack* the DB of EMQX to bring back
the cluster back to service but MAY come with some side effects including:
- Data loss
- Inconsistent data in the cluster
- Other undefined behaviors
*DO NOT* use this script unless you understand the consequences.
*DO NOT* use this script when EMQX cluster is partitioned.
Use Case:
- Lost one node due to unrecoverable failures (hardware, cloud resource outage)
and this node prevents other nodes in the cluster from starting.
Usage:
# For troubleshooting, find out all the tables that are pending at loading
$self_name pending-tables
# For troubleshooting, debug print detailed table info that is pending at loading.
$self_name table-details
# Force load one [Tab] or all pending tables from node local storage to bring this node up
# Use local data as the data source for the pending tables, should bring up the node immediately and
# spread the data to other nodes in the cluster.
#
# * Take effect immediately
# * This is a node local change but the change will be lost after restart.
$self_name force-load [Tab]
# Remove Node from mnesia cluster.
# Most likely will fail if the remote Node is unreachable.
#
# * This is a cluster wide schema change.
$self_name remove-node Node
# Set master node for distributed DB
# The master node will be the data source for pending tables.
#
# * This is a node local change
# * Node could be a remote Erlang node in the cluster or local erlang node
# * Use command: 'unset-master' to rollback
$self_name set-master Node
# Unset master node for distributed DB, this is a node local change
$self_name unset-master
# Cheat the local node that RemoteNode is down so that it will not wait for it to come up.
# Local node will take local data as the data source for pending tables and spread the data
# to the other pending nodes.
#
# * Check EMQX logs to find out which remote node(s) the local node is waiting for
# * To take effect, restart this EMQX node
# * This is a node local setting
$self_name lie-node-down RemoteNode
Tips:
- Override local node name with envvar: \$EMQX_NODE__NAME

EOF
}
- # Functions
- #
# print_pending_tables
#
# Summarise every local mnesia table whose load_node is still `unknown`.
# For each such table the Erlang expression prints the table name together
# with a fixed subset of its table_info (all_nodes, load_order, storage_type,
# active_replicas, local_content, load_by_force, load_node, load_reason,
# master_nodes).
#
# The expression is handed to "$THIS_DIR/emqx eval"; exec replaces the current
# shell, so this function does not return on success.
print_pending_tables() {
    exec "$THIS_DIR/emqx" eval '[ io:format("~p :: ~p~n", [T, maps:with([all_nodes, load_order, storage_type,
active_replicas, local_content, load_by_force,
load_node, load_reason, master_nodes]
, maps:from_list(mnesia:table_info(T, all)))])
|| T <- mnesia:system_info(local_tables), unknown =:= mnesia:table_info(T, load_node) ],
ok
'
}
# print_details_per_table
#
# Dump the complete mnesia:table_info(T, all) for every local table whose
# load_node is still `unknown` (the unfiltered counterpart of the
# pending-tables summary).
#
# Runs through "$THIS_DIR/emqx eval"; exec replaces the current shell, so this
# function does not return on success.
print_details_per_table() {
    local dump_expr
    dump_expr='[ io:format("~p :: ~p~n", [T, mnesia:table_info(T, all)])
|| T <- mnesia:system_info(local_tables), unknown =:= mnesia:table_info(T, load_node) ],
ok
'
    exec "$THIS_DIR/emqx" eval "$dump_expr"
}
# force-load [Tab]
#
# Force-load mnesia tables from this node's local storage via
# "$THIS_DIR/emqx eval".
#
#   no argument   force-load every local table whose load_node is `unknown`
#   one argument  force-load only table Tab; Tab is inserted verbatim into the
#                 Erlang expression, so it must be spelled as a valid Erlang
#                 atom
#
# More than one argument is rejected with a message on stderr and status 1.
# Previously any argument count other than exactly one fell through to the
# "load everything" branch, so `force-load tab_a tab_b` force-loaded ALL
# pending tables instead of the ones named.
#
# On success exec replaces the current shell, so this function does not return.
force-load() {
    local erl_cmd
    case $# in
        0)
            erl_cmd='[ {T, mnesia:force_load_table(T)}
|| T <- mnesia:system_info(local_tables),
unknown =:= mnesia:table_info(T, load_node)
]
'
            ;;
        1)
            erl_cmd="mnesia:force_load_table(${1})"
            ;;
        *)
            echo "force-load: expected at most one table name, got $#" >&2
            return 1
            ;;
    esac
    exec "$THIS_DIR/emqx" eval "$erl_cmd"
}
# remove-node Node
#
# Interactively remove Node's copies of the tables that are still pending on
# this node. The Erlang expression, run via "$THIS_DIR/emqx eval":
#   1. collects the local tables whose load_node is `unknown`;
#   2. if there are none, prints a notice and evaluates to `skipped`;
#   3. otherwise lists them and asks for confirmation (`yes.`); on
#      confirmation each table is force-loaded locally and then
#      mnesia:del_table_copy(T, Node) is called for it. EOF or any other
#      answer skips the operation.
#
# Node is required and must be non-empty. The check is done up front because
# under `set -u` a missing argument would otherwise abort with a bare
# "unbound variable" error, and an empty name would still force-load the
# tables after confirmation without removing anything useful.
#
# On success exec replaces the current shell, so this function does not return.
remove-node() {
    if [ -z "${1:-}" ]; then
        echo "remove-node: missing required argument: Node" >&2
        return 1
    fi
    local target_node=$1
    local erl_cmd="
case [T || T <- mnesia:system_info(local_tables), unknown =:= mnesia:table_info(T, load_node)] of
[] ->
io:format(\"No table need to load\\n\"),
skipped;
TargetTables ->
io:format(\"Going to remove node ${target_node} from schema of the tables:~n~p~n\", [TargetTables]),
case io:read(\"confirm? [yes.] OR Ctrl-D to skip: \") of
{ok, yes} ->
lists:map(fun(T) ->
mnesia:force_load_table(T),
{T, mnesia:del_table_copy(T, '${target_node}') }
end, TargetTables);
eof -> skipped;
R -> {skipped, R}
end
end
"
    exec "$THIS_DIR/emqx" eval "$erl_cmd"
}
# set-master-node [Node]
#
# Set or clear the mnesia master nodes on this node, then dump the decision
# table, via "$THIS_DIR/emqx eval".
#
#   one argument  mnesia:set_master_nodes(['Node'])
#   no argument   mnesia:set_master_nodes([])  (used by the unset-master command)
#
# More than one argument is rejected with a message on stderr and status 1.
# Previously any count other than exactly one took the "clear" branch, so a
# call with two node names silently unset the master nodes.
#
# On success exec replaces the current shell, so this function does not return.
set-master-node() {
    local erl_cmd
    case $# in
        0)
            erl_cmd="mnesia:set_master_nodes([]), mnesia_recover:dump_decision_tab()"
            ;;
        1)
            erl_cmd="mnesia:set_master_nodes(['${1}']), mnesia_recover:dump_decision_tab()"
            ;;
        *)
            echo "set-master-node: expected at most one node name, got $#" >&2
            return 1
            ;;
    esac
    exec "$THIS_DIR/emqx" eval "$erl_cmd"
}
# lie-node-down RemoteNode
#
# Record RemoteNode as down through mnesia_recover:log_mnesia_down/1 and dump
# the decision table, via "$THIS_DIR/emqx eval".
#
# With any argument count other than exactly one, the help text is printed
# instead and nothing is evaluated. Otherwise exec replaces the current shell,
# so this function does not return on success.
lie-node-down() {
    if [ $# -ne 1 ]; then
        usage
        return
    fi
    exec "$THIS_DIR/emqx" eval "mnesia_recover:log_mnesia_down('${1}'), mnesia_recover:dump_decision_tab()"
}
# Command dispatch.
# The first positional argument selects the sub-command (default: usage); it
# is shifted away so that "$@" holds only that sub-command's own arguments.
CMD=${1:-usage}
if [ $# -gt 0 ]; then
    shift 1
fi

case "$CMD" in
    force-load)
        force-load "$@"
        ;;
    remove-node)
        remove-node "$@"
        ;;
    pending-tables)
        print_pending_tables
        ;;
    table-details)
        print_details_per_table
        ;;
    set-master)
        # Node is mandatory here. set-master-node treats "no argument" as
        # "clear the master nodes", so forwarding an empty "$@" would silently
        # do the opposite of what the user asked for.
        if [ $# -ne 1 ]; then
            echo "set-master: expected exactly one argument: Node" >&2
            exit 1
        fi
        set-master-node "$1"
        ;;
    unset-master)
        # Deliberately called without arguments: clears the master nodes.
        set-master-node
        ;;
    lie-node-down)
        lie-node-down "$@"
        ;;
    *)
        # Unknown command, or no command at all: show the help text.
        usage
        ;;
esac
|