Login
[x]
Log in using an account from:
Fedora Account System
Red Hat Associate
Red Hat Customer
Or log in using a Red Hat Bugzilla account
Forgot Password
Login:
Hide Forgot
Create an Account
Red Hat Bugzilla – Attachment 1453211 Details for
Bug 1572886
add delay to `pcs cluster start --all` to avoid corosync JOIN flood
[?]
New
Simple Search
Advanced Search
My Links
Browse
Requests
Reports
Current State
Search
Tabular reports
Graphical reports
Duplicates
Other Reports
User Changes
Plotly Reports
Bug Status
Bug Severity
Non-Defaults
|
Product Dashboard
Help
Page Help!
Bug Writing Guidelines
What's new
Browser Support Policy
5.0.4.rh83 Release notes
FAQ
Guides index
User guide
Web Services
Contact
Legal
This site requires JavaScript to be enabled to function correctly, please enable it.
[patch]
proposed fix
do-not-start-corosync-on-all-nodes-at-once.patch (text/plain), 12.72 KB, created by
Tomas Jelinek
on 2018-06-20 12:43:17 UTC
(
hide
)
Description:
proposed fix
Filename:
MIME Type:
Creator:
Tomas Jelinek
Created:
2018-06-20 12:43:17 UTC
Size:
12.72 KB
patch
obsolete
>From 04783e919437b8eeac5c2498540ddb65347dc5bb Mon Sep 17 00:00:00 2001 >From: Tomas Jelinek <tojeline@redhat.com> >Date: Mon, 18 Jun 2018 17:01:25 +0200 >Subject: [PATCH] do not start corosync on all nodes at once > >--- > CHANGELOG.md | 3 ++ > pcs/cluster.py | 125 +++++++++++++++++++++++++++++++++++++++++++++----- > pcs/utils.py | 35 +++++++++++++- > pcsd/capabilities.xml | 8 ++++ > pcsd/remote.rb | 45 ++++++++---------- > 5 files changed, 177 insertions(+), 39 deletions(-) > >diff --git a/CHANGELOG.md b/CHANGELOG.md >index 3edb7e29..6ff0f5b1 100644 >--- a/CHANGELOG.md >+++ b/CHANGELOG.md >@@ -10,6 +10,8 @@ > - Options to display and filter failures by an operation and its interval in > `pcs resource failcount reset` and `pcs resource failcount show` commands > ([rhbz#1427273]) >+- When starting a cluster, each node is now started with a small delay to help >+ preventing JOIN flood in corosync ([rhbz#1572886]) > > ### Fixed > - `pcs cib-push diff-against=` does not consider an empty diff as an error >@@ -40,6 +42,7 @@ > [rhbz#1535967]: https://bugzilla.redhat.com/show_bug.cgi?id=1535967 > [rhbz#1566382]: https://bugzilla.redhat.com/show_bug.cgi?id=1566382 > [rhbz#1568353]: https://bugzilla.redhat.com/show_bug.cgi?id=1568353 >+[rhbz#1572886]: https://bugzilla.redhat.com/show_bug.cgi?id=1572886 > [rhbz#1574898]: https://bugzilla.redhat.com/show_bug.cgi?id=1574898 > [rhbz#1579911]: https://bugzilla.redhat.com/show_bug.cgi?id=1579911 > [rhbz#1581150]: https://bugzilla.redhat.com/show_bug.cgi?id=1581150 >diff --git a/pcs/cluster.py b/pcs/cluster.py >index 0b3b4fe3..a64fd5fd 100644 >--- a/pcs/cluster.py >+++ b/pcs/cluster.py >@@ -1053,11 +1053,37 @@ def start_cluster(argv): > wait_for_nodes_started(nodes, wait_timeout) > return > >- print("Starting Cluster...") >- service_list = [] >+ start_all = ( >+ "--pacemaker" not in utils.pcs_options >+ and >+ "--corosync" not in utils.pcs_options >+ ) >+ if start_all or "--corosync" in utils.pcs_options: >+ 
start_cluster_corosync() >+ if start_all or "--pacemaker" in utils.pcs_options: >+ start_cluster_pacemaker() >+ >+ # --wait will never succeed if only corosync is started. However, >+ # --pacemaker and --corosync is only supposed to be used from pcsd which >+ # does not use --wait. --wait is used from cli, where --corosync and >+ # --pacemaker is not supposed to be used (it's not documented). >+ if wait: >+ wait_for_nodes_started([], wait_timeout) >+ >+def start_cluster_pacemaker(): >+ print("Starting Cluster (pacemaker)...") >+ output, retval = utils.start_service("pacemaker") >+ if retval != 0: >+ print(output) >+ utils.err("unable to start pacemaker") >+ >+def start_cluster_corosync(): > if utils.is_cman_cluster(): >-# Verify that CMAN_QUORUM_TIMEOUT is set, if not, then we set it to 0 >- retval, output = getstatusoutput('source /etc/sysconfig/cman ; [ -z "$CMAN_QUORUM_TIMEOUT" ]') >+ print("Starting Cluster (cman)...") >+ # Verify that CMAN_QUORUM_TIMEOUT is set, if not, then we set it to 0 >+ retval, output = getstatusoutput( >+ 'source /etc/sysconfig/cman ; [ -z "$CMAN_QUORUM_TIMEOUT" ]' >+ ) > if retval == 0: > with open("/etc/sysconfig/cman", "a") as cman_conf_file: > cman_conf_file.write("\nCMAN_QUORUM_TIMEOUT=0\n") >@@ -1066,18 +1092,17 @@ def start_cluster(argv): > if retval != 0: > print(output) > utils.err("unable to start cman") >- else: >- service_list.append("corosync") >- if utils.need_to_handle_qdevice_service(): >- service_list.append("corosync-qdevice") >- service_list.append("pacemaker") >+ return >+ >+ print("Starting Cluster (corosync)...") >+ service_list = ["corosync"] >+ if utils.need_to_handle_qdevice_service(): >+ service_list.append("corosync-qdevice") > for service in service_list: > output, retval = utils.start_service(service) > if retval != 0: > print(output) > utils.err("unable to start {0}".format(service)) >- if wait: >- wait_for_nodes_started([], wait_timeout) > > def start_cluster_all(): > wait = False >@@ -1092,6 +1117,31 @@ 
def start_cluster_all(): > if wait: > wait_for_nodes_started(all_nodes, wait_timeout) > >+class IsComponentStartSupported(object): >+ def __init__(self, node): >+ self.node = node >+ self.supported = False >+ self.error = None >+ >+ def run(self): >+ code, output = utils.getPcsdCapabilities(self.node) >+ if code != 0: >+ message = '{0}: {1}'.format(self.node, output.strip()) >+ print(message) >+ self.error = message >+ else: >+ try: >+ data = json.loads(output) >+ if ( >+ "node.start-stop-enable-disable.start-component" >+ in >+ data["pcsd_capabilities"] >+ ): >+ self.supported = True >+ except (KeyError, ValueError): >+ # not a valid json or 404 => not supported >+ pass >+ > def start_cluster_nodes(nodes): > # Large clusters take longer time to start up. So we make the timeout longer > # for each 8 nodes: >@@ -1104,13 +1154,64 @@ def start_cluster_nodes(nodes): > timeout = int( > settings.default_request_timeout * math.ceil(len(nodes) / 8.0) > ) >+ was_error = False >+ >+ task_list = [ >+ IsComponentStartSupported(node) for node in nodes >+ ] >+ utils.run_parallel([task.run for task in task_list]) >+ nodes_supported = [] >+ nodes_not_supported = [] >+ accessible_nodes = [] >+ for task in task_list: >+ if task.error: >+ # unable to connect, unauthorized >+ was_error = True >+ print("{0}: Not starting cluster - node is unreachable".format( >+ task.node >+ )) >+ else: >+ if task.supported: >+ nodes_supported.append(task.node) >+ else: >+ nodes_not_supported.append(task.node) >+ if not nodes_supported + nodes_not_supported: >+ utils.err("unable to start all nodes") >+ > node_errors = parallel_for_nodes( >- utils.startCluster, nodes, quiet=True, timeout=timeout >+ utils.startCorosync, >+ nodes_supported, >+ quiet=True, >+ timeout=timeout, >+ __sleep_step=0.25 >+ ) >+ started_corosync_nodes = [ >+ node for node in nodes_supported if node not in node_errors.keys() >+ ] >+ if node_errors: >+ utils.err( >+ "unable to start all nodes\n" + "\n".join(node_errors.values()), >+ 
exit_after_error=not started_corosync_nodes >+ ) >+ was_error = True >+ >+ for node in node_errors: >+ print("{0}: Not starting cluster - node is unreachable".format(node)) >+ >+ # nodes not supporting separate corosync / pacemaker start will start both >+ # pacemaker and corosync >+ node_errors = parallel_for_nodes( >+ utils.startPacemaker, >+ set(started_corosync_nodes + nodes_not_supported), >+ quiet=True, >+ timeout=timeout > ) > if node_errors: > utils.err( > "unable to start all nodes\n" + "\n".join(node_errors.values()) > ) >+ if was_error: >+ utils.err("unable to start all nodes") > > def is_node_fully_started(node_status): > return ( >diff --git a/pcs/utils.py b/pcs/utils.py >index bf5cc514..eb02ca34 100644 >--- a/pcs/utils.py >+++ b/pcs/utils.py >@@ -157,6 +157,11 @@ def getPcsdInstanceSignature(node): > node, 'remote/pcsd_instance_signature', None, False, False > ) > >+def getPcsdCapabilities(node): >+ return sendHTTPRequest( >+ node, 'remote/capabilities', printResult=False, printSuccess=False >+ ) >+ > def get_uid_gid_file_name(uid, gid): > return "pcs-uidgid-%s-%s" % (uid, gid) > >@@ -282,10 +287,29 @@ def getPacemakerNodeStatus(node): > node, "remote/pacemaker_node_status", None, False, False > ) > >-def startCluster(node, quiet=False, timeout=None): >+def startPacemaker(node, quiet=False, timeout=None): >+ return startCluster( >+ node, quiet=quiet, timeout=timeout, pacemaker=True, corosync=False >+ ) >+ >+def startCorosync(node, quiet=False, timeout=None): >+ return startCluster( >+ node, quiet=quiet, timeout=timeout, pacemaker=False, corosync=True >+ ) >+ >+def startCluster( >+ node, quiet=False, timeout=None, pacemaker=True, corosync=True >+): >+ data = dict() >+ if pacemaker and not corosync: >+ data["component"] = "pacemaker" >+ elif corosync and not pacemaker: >+ data["component"] = "corosync" >+ data = urllib_urlencode(data) > return sendHTTPRequest( > node, > "remote/cluster_start", >+ data, > printResult=False, > printSuccess=not quiet, > 
timeout=timeout >@@ -1235,13 +1259,20 @@ def run_parallel(worker_list, wait_seconds=1): > > def create_task(report, action, node, *args, **kwargs): > def worker(): >+ sleep = kwargs.pop("__sleep", 0) >+ if sleep: >+ time.sleep(sleep) > returncode, output = action(node, *args, **kwargs) > report(node, returncode, output) > return worker > > def create_task_list(report, action, node_list, *args, **kwargs): >+ sleep_step = kwargs.pop("__sleep_step", 0) > return [ >- create_task(report, action, node, *args, **kwargs) for node in node_list >+ create_task( >+ report, action, node, *args, __sleep=(index * sleep_step), **kwargs >+ ) >+ for index, node in enumerate(node_list) > ] > > def parallel_for_nodes(action, node_list, *args, **kwargs): >diff --git a/pcsd/capabilities.xml b/pcsd/capabilities.xml >index 55a514d5..495fde5e 100644 >--- a/pcsd/capabilities.xml >+++ b/pcsd/capabilities.xml >@@ -429,6 +429,14 @@ > pcs commands: cluster start --wait[=timeout] > </description> > </capability> >+ <capability id="node.start-stop-enable-disable.start-component" in-pcs="0" in-pcsd="1"> >+ <description> >+ At the node level, provide means for starting services separatelly so >+ the nodes starting can be synchronized on the cluster level. 
>+ >+ daemon urls: cluster_start (param: component) >+ </description> >+ </capability> > <capability id="node.start-stop-enable-disable.stop-component" in-pcs="0" in-pcsd="1"> > <description> > At the node level, provide means for stopping services separatelly so >diff --git a/pcsd/remote.rb b/pcsd/remote.rb >index ebd4ea8d..27af41b2 100644 >--- a/pcsd/remote.rb >+++ b/pcsd/remote.rb >@@ -233,33 +233,16 @@ def cluster_status_remote(params, request, auth_user) > return JSON.generate(status) > end > >-def cluster_start(params, request, auth_user) >- if params[:name] >- code, response = send_request_with_token( >- auth_user, params[:name], 'cluster_start', true >- ) >- else >- if not allowed_for_local_cluster(auth_user, Permissions::WRITE) >- return 403, 'Permission denied' >- end >- $logger.info "Starting Daemons" >- output, stderr, retval = run_cmd(auth_user, PCS, 'cluster', 'start') >- $logger.debug output >- if retval != 0 >- return [400, (output + stderr).join] >- else >- return output >- end >+def _cluster_start_stop(action, params, request, auth_user) >+ if not ['start', 'stop'].include?(action) >+ return [400, "Action can be 'start' or 'stop', got '#{action}'"] > end >-end >- >-def cluster_stop(params, request, auth_user) > if params[:name] > params_without_name = params.reject {|key, value| > key == "name" or key == :name > } > code, response = send_request_with_token( >- auth_user, params[:name], 'cluster_stop', true, params_without_name >+ auth_user, params[:name], "cluster_#{action}", true, params_without_name > ) > else > if not allowed_for_local_cluster(auth_user, Permissions::WRITE) >@@ -273,19 +256,31 @@ def cluster_stop(params, request, auth_user) > options << "--corosync" > end > end >- options << "--force" if params["force"] >- $logger.info "Stopping Daemons" >+ if action == "stop" >+ options << "--force" if params["force"] >+ $logger.info "Stopping Daemons" >+ else >+ $logger.info "Starting Daemons" >+ end > stdout, stderr, retval = run_cmd( >- 
auth_user, PCS, "cluster", "stop", *options >+ auth_user, PCS, 'cluster', action, *options > ) > if retval != 0 >- return [400, stderr.join] >+ return [400, (stdout + stderr).join] > else > return stdout.join > end > end > end > >+def cluster_start(params, request, auth_user) >+ return _cluster_start_stop('start', params, request, auth_user) >+end >+ >+def cluster_stop(params, request, auth_user) >+ return _cluster_start_stop('stop', params, request, auth_user) >+end >+ > def config_backup(params, request, auth_user) > if params[:name] > code, response = send_request_with_token( >-- >2.11.0 >
You cannot view the attachment while viewing its details because your browser does not support IFRAMEs.
View the attachment on a separate page
.
View Attachment As Diff
View Attachment As Raw
Actions:
View
|
Diff
Attachments on
bug 1572886
: 1453211 |
1472983