Description of problem: Bought down glusterd in a node (bricks were still up in the node). From another node, took a snapshot. Then started glusterd again. glusterd crashed with the following bt. (gdb) bt #0 0x000000303fa4812c in vfprintf () from /lib64/libc.so.6 #1 0x000000303fa6fa52 in vsnprintf () from /lib64/libc.so.6 #2 0x000000303fa4f523 in snprintf () from /lib64/libc.so.6 #3 0x00007f6647d09071 in _mk_rundir_p (volinfo=0x1df1ef0) at glusterd-utils.c:1748 #4 0x00007f6647d0938e in glusterd_volume_start_glusterfs (volinfo=0x1df1ef0, brickinfo=0x1df7e20, wait=_gf_false) at glusterd-utils.c:1794 #5 0x00007f6647d18418 in glusterd_brick_start (volinfo=0x1df1ef0, brickinfo=0x1df7e20, wait=_gf_false) at glusterd-utils.c:6616 #6 0x00007f6647d185c3 in glusterd_restart_bricks (conf=0x1de8750) at glusterd-utils.c:6648 #7 0x00007f6647d0f130 in glusterd_spawn_daemons (opaque=0x0) at glusterd-utils.c:3448 #8 0x00007f66562fb4ba in synctask_wrap (old_task=0x1e0f340) at syncop.c:333 #9 0x000000303fa43bf0 in ?? () from /lib64/libc.so.6 #10 0x0000000000000000 in ?? 
() (gdb) f 6 #6 0x00007f6647d185c3 in glusterd_restart_bricks (conf=0x1de8750) at glusterd-utils.c:6648 6648 glusterd_brick_start (volinfo, brickinfo, _gf_false); (gdb) p conf->volumes $3 = {next = 0x1de97e0, prev = 0x1de97e0} (gdb) macro define list_entry(ptr, type, member) ((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member))) (gdb) p list_entry(conf->volumes, glusterd_volinfo_t, vol_list) $4 = (glusterd_volinfo_t *) 0x1de95a8 (gdb) p volinfo $5 = (glusterd_volinfo_t *) 0x1df1ef0 (gdb) p *$4 $6 = {lock = 0, is_snap_volume = _gf_false, snapshot = 0x0, restored_from_snap = '\000' <repeats 15 times>, parent_volname = '\000' <repeats 255 times>, volname = '\000' <repeats 240 times>"\260, \271\335\001\000\000\000\000p\271\335\001\000\000\000", type = 31333456, brick_count = 0, snap_count = 31309440, snap_max_hard_limit = 140077266235408, vol_list = { next = 0x1de97e0, prev = 0x1de97e0}, snapvol_list = {next = 0x1de97f0, prev = 0x1de97f0}, bricks = {next = 0x0, prev = 0x0}, snap_volumes = {next = 0x0, prev = 0x0}, status = GLUSTERD_STATUS_NONE, sub_count = 0, stripe_count = 31375480, replica_count = 0, subvol_count = 31396408, dist_leaf_count = 0, port = 31332480, shandle = 0x0, rb_shandle = 0x1de99e0, node_state_shandle = 0x0, quota_conf_shandle = 0x32, rebal = {defrag_status = GF_DEFRAG_STATUS_NOT_STARTED, rebalance_files = 140077268159895, rebalance_data = 140077268159949, lookedup_files = 140077271316704, skipped_files = 140077483987300, defrag = 0x7f6654ac91f0, defrag_cmd = 0, rebalance_failures = 0, rebalance_id = "\240\230\336\001\000\000\000\000\240\230\336\001\000\000\000", rebalance_time = 0, op = GD_OP_NONE, dict = 0x1dea760}, rep_brick = {rb_status = 4, src_brick = 0x0, dst_brick = 0x0, rb_id = "8\225\254Tf\177\000\000\000\000\000\000\000\000\000"}, version = 0, quota_conf_version = 0, cksum = 0, quota_conf_cksum = 0, transport_type = GF_TRANSPORT_TCP, nfs_transport_type = GF_TRANSPORT_TCP, dict = 0x0, volume_id = 
"\000\000\000\000\006\000\000\000\003\000\000\000\000\000\000", auth = { username = 0x3 <Address 0x3 out of bounds>, password = 0x3 <Address 0x3 out of bounds>}, logdir = 0x1de98e8 "", gsync_slaves = 0x0, decommission_in_progress = 31398816, xl = 0x1df1ba0, memory_accounting = _gf_true, caps = 0, op_version = 31519552, client_op_version = 0, reflock = {__data = {__lock = 0, __count = 0, __owner = 31331968, __nusers = 0, __kind = 49152, __spins = 0, __list = { __prev = 0x100, __next = 0x5a}}, __size = "\000\000\000\000\000\000\000\000\200\026\336\001\000\000\000\000\000\300\000\000\000\000\000\000\000\001\000\000\000\000\000\000Z\000\000\000\000\000\000", __align = 0}, refcnt = 0, quorum_status = NOT_APPLICABLE_QUORUM} (gdb) p *$5 $7 = {lock = 1851880052, is_snap_volume = 1919905907, snapshot = 0xd00657079742d74, restored_from_snap = "\360\255\272\000\000\000\000\000A\000\000\000\000\000\000", parent_volname = "'\000\000\000\004\000\000\000\000\000\000\000\200`\335\001\000\000\000\000\276\272\376\312\000\000\000\000\000\000\000\000\066\060\060\000\r\360\255\272\000\361\340\001\000\000\000\000\377\377\377\377\000\000\000\000A\000\000\000\000\000\000\000-\000\000\000\016\000\000\000\000\000\000\000\200`\335\001\000\000\000\000\276\272\376\312\000\000\000\000\000\000\000\000frame-timeout\000\r\360\255\272\000\000\000\000\000\000a\003\000\000\000\000\000\000\330\376\330?0\000\000\000\260H\341\001\000\000\000\000glusterd\000\325\330?0", '\000' <repeats 98 times>, volname = '\000' <repeats 240 times>, "@\320\330?0\000\000\000\341\001\000\000\000\000\000", type = 1071185624, brick_count = 48, snap_count = 207229615832, snap_max_hard_limit = 256, vol_list = { next = 0x1df2128, prev = 0x1df2128}, snapvol_list = {next = 0x1df2138, prev = 0x1df2138}, bricks = {next = 0x1df2148, prev = 0x1df2148}, snap_volumes = {next = 0x1df2158, prev = 0x1df2158}, status = GLUSTERD_STATUS_STARTED, sub_count = 3, stripe_count = 1, replica_count = 3, subvol_count = 1, dist_leaf_count = 3, port 
= 0, shandle = 0x0, rb_shandle = 0x1df2650, node_state_shandle = 0x1df2750, quota_conf_shandle = 0x1df2700, rebal = {defrag_status = GF_DEFRAG_STATUS_NOT_STARTED, rebalance_files = 0, rebalance_data = 0, lookedup_files = 0, skipped_files = 0, defrag = 0x0, defrag_cmd = 0, rebalance_failures = 0, rebalance_id = '\000' <repeats 15 times>, rebalance_time = 0, op = GD_OP_NONE, dict = 0x0}, rep_brick = {rb_status = GF_RB_STATUS_NONE, src_brick = 0x0, dst_brick = 0x0, rb_id = '\000' <repeats 15 times>}, version = 2, quota_conf_version = 0, cksum = 1542204662, quota_conf_cksum = 0, transport_type = GF_TRANSPORT_TCP, nfs_transport_type = GF_TRANSPORT_TCP, dict = 0x7f6654ac927c, volume_id = "\020\066'าค\351M\357\235\001\200o\231d3\231", auth = {username = 0x1de1a70 "723ce4bc-103e-4dfc-8373-59fc77592114", password = 0x1de0f70 "1575ca73-3d21-474d-af09-b30812ea4d15"}, logdir = 0x0, gsync_slaves = 0x7f6654ac9308, decommission_in_progress = 0, xl = 0x1dd6080, memory_accounting = _gf_false, caps = 0, op_version = 2, client_op_version = 2, reflock = {__data = {__lock = 0, __count = 0, __owner = 0, __nusers = 0, __kind = -1, __spins = 0, __list = {__prev = 0x0, __next = 0x0}}, __size = '\000' <repeats 16 times>"\377, \377\377\377", '\000' <repeats 19 times>, __align = 0}, refcnt = 0, quorum_status = NOT_APPLICABLE_QUORUM} What we see here is that the conf->volumes list is corrupted. Tried checking the volumes info file and found that the info file is missing. 
[root@VM2 ~]# cd /var/lib/glusterd/vols/test_vol/ [root@VM2 test_vol]# ls bricks cksum node_state.info quota.cksum quota.conf rbstate run test_vol-fuse.vol trusted-test_vol-fuse.vol [root@VM2 test_vol]# ls -lrt total 32 -rw------- 1 root root 2053 May 22 09:39 trusted-test_vol-fuse.vol -rw------- 1 root root 1711 May 22 09:39 test_vol-fuse.vol drwxr-xr-x 2 root root 4096 May 22 09:39 run -rw------- 1 root root 12 May 22 09:39 rbstate -rw------- 1 root root 84 May 22 09:39 node_state.info -rw------- 1 root root 15 May 22 09:40 cksum -rw------- 1 root root 0 May 22 09:40 quota.conf -rw------- 1 root root 18 May 22 09:40 quota.cksum drwxr-xr-x 2 root root 4096 May 22 09:40 bricks However, when glusterd was coming back up, it seems to have read from the info file, coz the following logs are present. [2014-05-22 09:40:33.595468] D [store.c:499:gf_store_iter_new] 0-: Returning with 0 [2014-05-22 09:40:33.595518] D [store.c:608:gf_store_iter_get_next] 0-: Returning with 0 [2014-05-22 09:40:33.595535] D [glusterd-store.c:2394:glusterd_store_update_volinfo] 0-: key = type value = 2 [2014-05-22 09:40:33.595573] D [store.c:608:gf_store_iter_get_next] 0-: Returning with 0 [2014-05-22 09:40:33.595597] D [glusterd-store.c:2394:glusterd_store_update_volinfo] 0-: key = count value = 3 [2014-05-22 09:40:33.595626] D [store.c:608:gf_store_iter_get_next] 0-: Returning with 0 [2014-05-22 09:40:33.595640] D [glusterd-store.c:2394:glusterd_store_update_volinfo] 0-: key = status value = 1 [2014-05-22 09:40:33.595661] D [store.c:608:gf_store_iter_get_next] 0-: Returning with 0 [2014-05-22 09:40:33.595674] D [glusterd-store.c:2394:glusterd_store_update_volinfo] 0-: key = sub_count value = 3 [2014-05-22 09:40:33.595703] D [store.c:608:gf_store_iter_get_next] 0-: Returning with 0 [2014-05-22 09:40:33.595716] D [glusterd-store.c:2394:glusterd_store_update_volinfo] 0-: key = stripe_count value = 1 [2014-05-22 09:40:33.595737] D [store.c:608:gf_store_iter_get_next] 0-: Returning with 0 
[2014-05-22 09:40:33.595751] D [glusterd-store.c:2394:glusterd_store_update_volinfo] 0-: key = replica_count value = 3 [2014-05-22 09:40:33.595771] D [store.c:608:gf_store_iter_get_next] 0-: Returning with 0 [2014-05-22 09:40:33.595785] D [glusterd-store.c:2394:glusterd_store_update_volinfo] 0-: key = version value = 2 [2014-05-22 09:40:33.595805] D [store.c:608:gf_store_iter_get_next] 0-: Returning with 0 [2014-05-22 09:40:33.595818] D [glusterd-store.c:2394:glusterd_store_update_volinfo] 0-: key = transport-type value = 0 [2014-05-22 09:40:33.595839] D [store.c:608:gf_store_iter_get_next] 0-: Returning with 0 [2014-05-22 09:40:33.595853] D [glusterd-store.c:2394:glusterd_store_update_volinfo] 0-: key = parent_volname value = N/A [2014-05-22 09:40:33.595879] D [store.c:608:gf_store_iter_get_next] 0-: Returning with 0 [2014-05-22 09:40:33.595893] D [glusterd-store.c:2394:glusterd_store_update_volinfo] 0-: key = volume-id value = 103627d2-a4e9-4def-9d01-806f99643399 [2014-05-22 09:40:33.595927] D [store.c:608:gf_store_iter_get_next] 0-: Returning with 0 [2014-05-22 09:40:33.595941] D [glusterd-store.c:2394:glusterd_store_update_volinfo] 0-: key = username value = 723ce4bc-103e-4dfc-8373-59fc77592114 [2014-05-22 09:40:33.595968] D [store.c:608:gf_store_iter_get_next] 0-: Returning with 0 [2014-05-22 09:40:33.595984] D [glusterd-store.c:2394:glusterd_store_update_volinfo] 0-: key = password value = 1575ca73-3d21-474d-af09-b30812ea4d15 [2014-05-22 09:40:33.596046] D [store.c:608:gf_store_iter_get_next] 0-: Returning with 0 [2014-05-22 09:40:33.596063] D [glusterd-store.c:2394:glusterd_store_update_volinfo] 0-: key = op-version value = 2 [2014-05-22 09:40:33.596086] D [store.c:608:gf_store_iter_get_next] 0-: Returning with 0 [2014-05-22 09:40:33.596100] D [glusterd-store.c:2394:glusterd_store_update_volinfo] 0-: key = client-op-version value = 2 [2014-05-22 09:40:33.596122] D [store.c:608:gf_store_iter_get_next] 0-: Returning with 0 [2014-05-22 09:40:33.596136] D 
[glusterd-store.c:2394:glusterd_store_update_volinfo] 0-: key = restored_from_snap value = 00000000-0000-0000-0000-000000000000 [2014-05-22 09:40:33.596162] D [store.c:608:gf_store_iter_get_next] 0-: Returning with 0 [2014-05-22 09:40:33.596176] D [glusterd-store.c:2394:glusterd_store_update_volinfo] 0-: key = snap-max-hard-limit value = 256 [2014-05-22 09:40:33.596199] D [store.c:608:gf_store_iter_get_next] 0-: Returning with 0 [2014-05-22 09:40:33.596213] D [glusterd-store.c:2394:glusterd_store_update_volinfo] 0-: key = brick-0 value = 10.70.42.248:-brick-brick-dirs-brick1 [2014-05-22 09:40:33.596251] D [store.c:608:gf_store_iter_get_next] 0-: Returning with 0 [2014-05-22 09:40:33.596267] D [glusterd-store.c:2394:glusterd_store_update_volinfo] 0-: key = brick-1 value = 10.70.43.199:-brick-brick-dirs-brick2 [2014-05-22 09:40:33.596290] D [store.c:608:gf_store_iter_get_next] 0-: Returning with 0 [2014-05-22 09:40:33.596304] D [glusterd-store.c:2394:glusterd_store_update_volinfo] 0-: key = brick-2 value = 10.70.43.139:-brick-brick-dirs-brick3 [2014-05-22 09:40:33.596327] D [store.c:608:gf_store_iter_get_next] 0-: Returning with -1 [2014-05-22 09:40:33.596438] D [store.c:499:gf_store_iter_new] 0-: Returning with 0 [2014-05-22 09:40:33.596484] D [glusterd-utils.c:921:glusterd_brickinfo_new] 0-management: Returning 0 [2014-05-22 09:40:33.596528] D [store.c:608:gf_store_iter_get_next] 0-: Returning with 0 [2014-05-22 09:40:33.596556] D [store.c:608:gf_store_iter_get_next] 0-: Returning with 0 [2014-05-22 09:40:33.596576] D [store.c:608:gf_store_iter_get_next] 0-: Returning with 0 Version-Release number of selected component (if applicable): How reproducible: Quite rarely. Steps to Reproduce: 1. Create a volume and start it. 2. pkill glusterd on one node. 3. Run the snap create command (gluster snap create <snap-name> <vol-name>) from another node, with the force option. 4. After the snap command is successful, start glusterd on the first node.
Actual results: glusterd crashes Expected results: glusterd should not crash Additional info:
As I discussed with Avra, this was happening because the quorum check was not present in snapshot. The current implementation will not allow the operation if any one of the bricks is not online (even when the bricks themselves are online, if glusterd is not running on that node we still consider those bricks offline). So I am closing this bug; please reopen it if it occurs again.