Description of problem: Glusterd crashed when volume was stopped. Version-Release number of selected component (if applicable): [root@rhsqa14-vm1 ~]# rpm -qa | grep gluster glusterfs-3.7dev-0.994.gitf522001.el6.x86_64 glusterfs-devel-3.7dev-0.994.gitf522001.el6.x86_64 glusterfs-geo-replication-3.7dev-0.994.gitf522001.el6.x86_64 glusterfs-resource-agents-3.7dev-0.952.gita7f1d08.el6.noarch glusterfs-libs-3.7dev-0.994.gitf522001.el6.x86_64 glusterfs-api-3.7dev-0.994.gitf522001.el6.x86_64 glusterfs-fuse-3.7dev-0.994.gitf522001.el6.x86_64 glusterfs-extra-xlators-3.7dev-0.994.gitf522001.el6.x86_64 glusterfs-regression-tests-3.7dev-0.994.gitf522001.el6.x86_64 glusterfs-rdma-3.7dev-0.994.gitf522001.el6.x86_64 glusterfs-debuginfo-3.7dev-0.994.gitf522001.el6.x86_64 glusterfs-cli-3.7dev-0.994.gitf522001.el6.x86_64 glusterfs-server-3.7dev-0.994.gitf522001.el6.x86_64 glusterfs-api-devel-3.7dev-0.994.gitf522001.el6.x86_64 [root@rhsqa14-vm1 ~]# [root@rhsqa14-vm1 ~]# glusterfs --version glusterfs 3.7dev built on Apr 13 2015 07:14:26 Repository revision: git://git.gluster.com/glusterfs.git Copyright (c) 2006-2013 Red Hat, Inc. <http://www.redhat.com/> GlusterFS comes with ABSOLUTELY NO WARRANTY. It is licensed to you under your choice of the GNU Lesser General Public License, version 3 or any later version (LGPLv3 or later), or the GNU General Public License, version 2 (GPLv2), in all cases as published by the Free Software Foundation. [root@rhsqa14-vm1 ~]# How reproducible: easily Steps to Reproduce: 1.gluster v VOl stop 2.glusterd crashes. 3. Actual results: [root@rhsqa14-vm1 ~]# for i in { Tim Vol_test everglades mix test testing tri }; do gluster v stop $i; done Connection failed. Please check if gluster daemon is operational. Stopping volume will make its data inaccessible. Do you want to continue? (y/n) Connection failed. Please check if gluster daemon is operational. Stopping volume will make its data inaccessible. Do you want to continue? (y/n) Connection failed. 
Please check if gluster daemon is operational. Stopping volume will make its data inaccessible. Do you want to continue? (y/n) Connection failed. Please check if gluster daemon is operational. Stopping volume will make its data inaccessible. Do you want to continue? (y/n) Connection failed. Please check if gluster daemon is operational. Stopping volume will make its data inaccessible. Do you want to continue? (y/n) Connection failed. Please check if gluster daemon is operational. Connection failed. Please check if gluster daemon is operational. Stopping volume will make its data inaccessible. Do you want to continue? (y/n) Connection failed. Please check if gluster daemon is operational. Stopping volume will make its data inaccessible. Do you want to continue? (y/n) Connection failed. Please check if gluster daemon is operational. [root@rhsqa14-vm1 ~]# Additional info: core and sosreport available here: http://rhsqe-repo.lab.eng.blr.redhat.com/sosreports/107 Debug info and bt output (gdb) p svc $1 = (glusterd_svc_t *) 0x941178 (gdb) p volinfo $2 = (glusterd_volinfo_t *) 0x940d60 (gdb) p *volinfo $3 = {lock = 1, is_snap_volume = _gf_false, snapshot = 0x0, restored_from_snap = "\227\310ƥγB֑\335\355\252Q\036wI", tier_info = {cold_type = 2, cold_brick_count = 2, cold_replica_count = 2, cold_disperse_count = 0, cold_dist_leaf_count = 0, hot_type = 2, hot_brick_count = 0, hot_replica_count = 0}, parent_volname = "N/A", '\000' <repeats 252 times>, volname = "Vol_test", '\000' <repeats 247 times>, type = 5, brick_count = 6, snap_count = 1, snap_max_hard_limit = 256, vol_list = {next = 0x9bb848, prev = 0x97a958}, snapvol_list = {next = 0x940fc8, prev = 0x940fc8}, bricks = {next = 0x94e070, prev = 0x9664e0}, snap_volumes = {next = 0x4f17238, prev = 0x4f17238}, status = GLUSTERD_STATUS_STOPPED, sub_count = 2, stripe_count = 1, replica_count = 2, arbiter_count = 0, disperse_count = 0, redundancy_count = 0, subvol_count = 3, dist_leaf_count = 2, port = 0, shandle = 0x8f74f0, 
rb_shandle = 0x949690, node_state_shandle = 0x923970, quota_conf_shandle = 0x9237a0, rebal = {defrag_status = GF_DEFRAG_STATUS_NOT_STARTED, rebalance_files = 0, rebalance_data = 0, lookedup_files = 0, skipped_files = 0, defrag = 0x0, defrag_cmd = 0, rebalance_failures = 0, rebalance_id = '\000' <repeats 15 times>, rebalance_time = 0, op = GD_OP_NONE, dict = 0x0}, rep_brick = {rb_status = GF_RB_STATUS_NONE, src_brick = 0x0, dst_brick = 0x0, rb_id = '\000' <repeats 15 times>}, version = 16, quota_conf_version = 0, cksum = 2202584249, quota_conf_cksum = 1271429248, transport_type = GF_TRANSPORT_TCP, dict = 0x7f4aaa67a8a0, volume_id = "\360\275\216\237\061\035H\217\265\302\324\375y\031J\f", auth = {username = 0x8f7240 "6619500d-6c3e-4e55-84d2-779207fe0ba7", password = 0x8f72c0 "11cae2ff-d6d3-4626-af25-296cee25d609"}, logdir = 0x0, gsync_slaves = 0x7f4aaa67a92c, gsync_active_slaves = 0x7f4aaa67a9b8, decommission_in_progress = 0, xl = 0x892950, memory_accounting = _gf_false, caps = 0, op_version = 30600, client_op_version = 30600, reflock = {__data = {__lock = 0, __count = 0, __owner = 0, __nusers = 0, __kind = 0, __spins = 0, __list = {__prev = 0x0, __next = 0x0}}, __size = '\000' <repeats 39 times>, __align = 0}, refcnt = 1, quorum_status = NOT_APPLICABLE_QUORUM, snapd = {svc = {name = '\000' <repeats 4095 times>, conn = {rpc = 0x0, sockpath = '\000' <repeats 4095 times>, frame_timeout = 0, notify = 0}, proc = {name = '\000' <repeats 4095 times>, pidfile = '\000' <repeats 4095 times>, logdir = '\000' <repeats 4095 times>, logfile = '\000' <repeats 4095 times>, volfile = '\000' <repeats 4095 times>, volfileserver = '\000' <repeats 4095 times>, volfileid = '\000' <repeats 255 times>}, manager = 0, start = 0, stop = 0, online = _gf_false}, port = 0, handle = 0x923650}} (gdb) p *svc $4 = {name = '\000' <repeats 4095 times>, conn = {rpc = 0x0, sockpath = '\000' <repeats 4095 times>, frame_timeout = 0, notify = 0}, proc = {name = '\000' <repeats 4095 times>, pidfile = '\000' 
<repeats 4095 times>, logdir = '\000' <repeats 4095 times>, logfile = '\000' <repeats 4095 times>, volfile = '\000' <repeats 4095 times>, volfileserver = '\000' <repeats 4095 times>, volfileid = '\000' <repeats 255 times>}, manager = 0, start = 0, stop = 0, online = _gf_false} (gdb) bt #0 0x0000000000000000 in ?? () #1 0x00007f4aa1b64562 in glusterd_stop_volume (volinfo=0x940d60) at glusterd-volume-ops.c:2390 #2 0x00007f4aa1b64cdf in glusterd_op_stop_volume (dict=<value optimized out>) at glusterd-volume-ops.c:2430 #3 0x00007f4aa1b00b74 in glusterd_op_commit_perform (op=GD_OP_STOP_VOLUME, dict=0x7f4aaa6a882c, op_errstr=0x7f4a8d77a280, rsp_dict=0x7f4aaa6a88b8) at glusterd-op-sm.c:4999 #4 0x00007f4aa1b73d98 in gd_commit_op_phase (op=GD_OP_STOP_VOLUME, op_ctx=0x7f4aaa6a87a0, req_dict=0x7f4aaa6a882c, op_errstr=0x7f4a8d77a280, txn_opinfo=0x7f4a8d77a200) at glusterd-syncop.c:1295 #5 0x00007f4aa1b756ac in gd_sync_task_begin (op_ctx=0x7f4aaa6a87a0, req=0x8cad8c) at glusterd-syncop.c:1736 #6 0x00007f4aa1b756fb in glusterd_op_begin_synctask (req=0x8cad8c, op=<value optimized out>, dict=0x7f4aaa6a87a0) at glusterd-syncop.c:1787 #7 0x00007f4aa1b65786 in __glusterd_handle_cli_stop_volume (req=0x8cad8c) at glusterd-volume-ops.c:545 #8 0x00007f4aa1adcd7f in glusterd_big_locked_handler (req=0x8cad8c, actor_fn=0x7f4aa1b655d0 <__glusterd_handle_cli_stop_volume>) at glusterd-handler.c:83 #9 0x000000356f061c72 in synctask_wrap (old_task=<value optimized out>) at syncop.c:375 #10 0x00000036ae6438f0 in ?? () from /lib64/libc.so.6 #11 0x0000000000000000 in ?? () (gdb)
This was already reported as bug 1213295. It happened because a data structure related to snapshots in Gluster wasn't initialized when GlusterD was restarted, which caused the volume stop operation to crash. A fix has been posted for review at https://review.gluster.org/10304 . I'm closing this bug as a duplicate. Sorry for the inconvenience — I should have remembered the original bug. *** This bug has been marked as a duplicate of bug 1213295 ***