Description of problem: Created a replicate volume and mounted it via fuse and nfs clients. Started running some tests (sanity script) on the fuse mount, added 2 more bricks (thus making the volume 2x2 distributed replicate), and started a rebalance. The gluster-rebalance process crashed with the following backtrace. Program terminated with signal 6, Aborted. #0 0x00000034c2232905 in raise () from /lib64/libc.so.6 Missing separate debuginfos, use: debuginfo-install glibc-2.12-1.25.el6_1.3.x86_64 libgcc-4.4.5-6.el6.x86_64 (gdb) bt #0 0x00000034c2232905 in raise () from /lib64/libc.so.6 #1 0x00000034c22340e5 in abort () from /lib64/libc.so.6 #2 0x00000034c222b9be in __assert_fail_base () from /lib64/libc.so.6 #3 0x00000034c222ba80 in __assert_fail () from /lib64/libc.so.6 #4 0x00007fab81cafa21 in client3_1_entrylk (frame=0x7fab84e37c58, this=0x10fd200, data=0x7fffa77095e0) at ../../../../../xlators/protocol/client/src/client3_1-fops.c:4763 #5 0x00007fab81c95b79 in client_entrylk (frame=0x7fab84e37c58, this=0x10fd200, volume=0x10fffc0 "mirror-replicate-1", loc=0x7fab801f2e50, basename=0x11ba2d0 "coverage", cmd=ENTRYLK_LOCK_NB, type=ENTRYLK_WRLCK) at ../../../../../xlators/protocol/client/src/client.c:1665 #6 0x00007fab81a539a7 in afr_nonblocking_entrylk (frame=0x7fab84c2a028, this=0x1100820) at ../../../../../xlators/cluster/afr/src/afr-lk-common.c:1300 #7 0x00007fab81a31d59 in afr_lock_rec (frame=0x7fab84c2a028, this=0x1100820) at ../../../../../xlators/cluster/afr/src/afr-transaction.c:1156 #8 0x00007fab81a31dc6 in afr_lock (frame=0x7fab84c2a028, this=0x1100820) at ../../../../../xlators/cluster/afr/src/afr-transaction.c:1175 #9 0x00007fab81a3208a in afr_transaction (frame=0x7fab84c2a028, this=0x1100820, type=AFR_ENTRY_TRANSACTION) at ../../../../../xlators/cluster/afr/src/afr-transaction.c:1266 #10 0x00007fab81a10b37 in afr_mkdir (frame=0x7fab84e37bac, this=0x1100820, loc=0x7fab80713054, mode=16877, params=0x10f56e0) at 
../../../../../xlators/cluster/afr/src/afr-dir-write.c:770 #11 0x00007fab817b3b7b in dht_selfheal_dir_mkdir (frame=0x7fab84e370ec, loc=0x7fab80713054, layout=0x11ba040, force=0) at ../../../../../xlators/cluster/dht/src/dht-selfheal.c:434 #12 0x00007fab817b4fbc in dht_selfheal_directory (frame=0x7fab84e370ec, dir_cbk=0x7fab817bddbb <dht_lookup_selfheal_cbk>, loc=0x7fab80713054, layout=0x11ba040) at ../../../../../xlators/cluster/dht/src/dht-selfheal.c:855 #13 0x00007fab817c00d6 in dht_lookup_dir_cbk (frame=0x7fab84e370ec, cookie=0x7fab84e379a8, this=0x11014a0, op_ret=-1, op_errno=2, inode=0x0, stbuf=0x7fab801ee6a4, xattr=0x0, postparent=0x7fab801ee714) at ../../../../../xlators/cluster/dht/src/dht-common.c:504 #14 0x00007fab81a6161e in afr_lookup_done (frame=0x7fab84e379a8, this=0x1100820) at ../../../../../xlators/cluster/afr/src/afr-common.c:1743 #15 0x00007fab81a61d20 in afr_lookup_cbk (frame=0x7fab84e379a8, cookie=0x1, this=0x1100820, op_ret=-1, op_errno=2, inode=0x7fab792ee174, buf=0x7fffa7709cd0, xattr=0x0, postparent=0x7fffa7709c60) at ../../../../../xlators/cluster/afr/src/afr-common.c:1906 #16 0x00007fab81ca4367 in client3_1_lookup_cbk (req=0x7fab800d604c, iov=0x7fab800d608c, count=1, myframe=0x7fab84e37b00) at ../../../../../xlators/protocol/client/src/client3_1-fops.c:2185 #17 0x00007fab85ddb919 in rpc_clnt_handle_reply (clnt=0x11383d0, pollin=0x10f3980) at ../../../../rpc/rpc-lib/src/rpc-clnt.c:796 #18 0x00007fab85ddbcb6 in rpc_clnt_notify (trans=0x1147eb0, mydata=0x1138400, event=RPC_TRANSPORT_MSG_RECEIVED, data=0x10f3980) at ../../../../rpc/rpc-lib/src/rpc-clnt.c:915 #19 0x00007fab85dd7da8 in rpc_transport_notify (this=0x1147eb0, event=RPC_TRANSPORT_MSG_RECEIVED, data=0x10f3980) at ../../../../rpc/rpc-lib/src/rpc-transport.c:498 #20 0x00007fab82ae8270 in socket_event_poll_in (this=0x1147eb0) at ../../../../../rpc/rpc-transport/socket/src/socket.c:1686 #21 0x00007fab82ae87f4 in socket_event_handler (fd=11, idx=4, data=0x1147eb0, poll_in=1, poll_out=0, 
poll_err=0) at ../../../../../rpc/rpc-transport/socket/src/socket.c:1801 #22 0x00007fab86032030 in event_dispatch_epoll_handler (event_pool=0x10dac20, events=0x10f4870, i=0) at ../../../libglusterfs/src/event.c:794 #23 0x00007fab86032253 in event_dispatch_epoll (event_pool=0x10dac20) at ../../../libglusterfs/src/event.c:856 #24 0x00007fab860325de in event_dispatch (event_pool=0x10dac20) at ../../../libglusterfs/src/event.c:956 #25 0x0000000000407dcc in main (argc=19, argv=0x7fffa770a3c8) at ../../../glusterfsd/src/glusterfsd.c:1612 (gdb) f 4 #4 0x00007fab81cafa21 in client3_1_entrylk (frame=0x7fab84e37c58, this=0x10fd200, data=0x7fffa77095e0) at ../../../../../xlators/protocol/client/src/client3_1-fops.c:4763 4763 GF_ASSERT_AND_GOTO_WITH_ERROR (this->name, (gdb) l 4758 if (!uuid_is_null (args->loc->inode->gfid)) 4759 memcpy (req.gfid, args->loc->inode->gfid, 16); 4760 else 4761 memcpy (req.gfid, args->loc->gfid, 16); 4762 4763 GF_ASSERT_AND_GOTO_WITH_ERROR (this->name, 4764 !uuid_is_null (*((uuid_t*)req.gfid)), 4765 unwind, op_errno, EINVAL); 4766 req.cmd = args->cmd_entrylk; 4767 req.type = args->type; (gdb) p req.gfid $1 = '\000' <repeats 15 times> (gdb) p args->loc->gfid $2 = '\000' <repeats 15 times> (gdb) p args->loc->inode->gfid $3 = '\000' <repeats 15 times> (gdb) p *args->loc $4 = {path = 0x11ba7b0 "/run19400", name = 0x0, inode = 0x7fab792ee0e0, parent = 0x0, gfid = '\000' <repeats 15 times>, pargfid = '\000' <repeats 15 times>} (gdb) f 10 #10 0x00007fab81a10b37 in afr_mkdir (frame=0x7fab84e37bac, this=0x1100820, loc=0x7fab80713054, mode=16877, params=0x10f56e0) at ../../../../../xlators/cluster/afr/src/afr-dir-write.c:770 770 afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); (gdb) p *loc $5 = {path = 0x7fab6c01c570 "/run19400/coverage", name = 0x7fab6c01c57a "coverage", inode = 0x7fab792ee174, parent = 0x7fab792ee0e0, gfid = "\347\327\323\377gfHۑ'U\256\232\216\264t", pargfid = "\320\066g\353\241\324A3\267\\G\375 f+~"} (gdb) l afr_mkdir 712 
713 714 int 715 afr_mkdir (call_frame_t *frame, xlator_t *this, 716 loc_t *loc, mode_t mode, dict_t *params) 717 { 718 afr_private_t * priv = NULL; 719 afr_local_t * local = NULL; 720 call_frame_t * transaction_frame = NULL; 721 int ret = -1; (gdb) 722 int op_errno = 0; 723 724 VALIDATE_OR_GOTO (frame, out); 725 VALIDATE_OR_GOTO (this, out); 726 VALIDATE_OR_GOTO (this->private, out); 727 728 priv = this->private; 729 730 QUORUM_CHECK(mkdir,out); 731 (gdb) 732 transaction_frame = copy_frame (frame); 733 if (!transaction_frame) { 734 op_errno = ENOMEM; 735 goto out; 736 } 737 738 AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); 739 local = transaction_frame->local; 740 741 ret = afr_local_init (local, priv, &op_errno); (gdb) 742 if (ret < 0) 743 goto out; 744 745 loc_copy (&local->loc, loc); 746 747 LOCK (&priv->read_child_lock); 748 { 749 local->read_child_index = (++priv->read_child_rr) 750 % (priv->child_count); 751 } (gdb) 752 UNLOCK (&priv->read_child_lock); 753 754 local->cont.mkdir.mode = mode; 755 if (params) 756 local->cont.mkdir.params = dict_ref (params); 757 758 local->transaction.fop = afr_mkdir_wind; 759 local->transaction.done = afr_mkdir_done; 760 local->transaction.unwind = afr_mkdir_unwind; 761 (gdb) 762 ret = afr_build_parent_loc (&local->transaction.parent_loc, loc, 763 &op_errno); 764 if (ret) 765 goto out; 766 767 local->transaction.main_frame = frame; 768 local->transaction.basename = AFR_BASENAME (loc->path); 769 770 afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); l afr_build_parent_loc 47 #include "afr.h" 48 #include "afr-transaction.h" 49 50 int 51 afr_build_parent_loc (loc_t *parent, loc_t *child, int32_t *op_errno) 52 { 53 int ret = -1; 54 char *child_path = NULL; 55 56 if (!child->parent) { (gdb) 57 if (op_errno) 58 *op_errno = EINVAL; 59 goto out; 60 } 61 62 child_path = gf_strdup (child->path); 63 if (!child_path) { 64 if (op_errno) 65 *op_errno = ENOMEM; 66 goto out; (gdb) 67 } 68 parent->path = dirname 
(child_path); 69 parent->inode = inode_ref (child->parent); 70 ret = 0; 71 out: 72 return ret; 73 } 74 75 /* {{{ create */ 76 (gdb) f 10 #10 0x00007fab81a10b37 in afr_mkdir (frame=0x7fab84e37bac, this=0x1100820, loc=0x7fab80713054, mode=16877, params=0x10f56e0) at ../../../../../xlators/cluster/afr/src/afr-dir-write.c:770 770 afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); (gdb) p local->transaction->parent_loc $6 = {path = 0x11ba7b0 "/run19400", name = 0x0, inode = 0x7fab792ee0e0, parent = 0x0, gfid = '\000' <repeats 15 times>, pargfid = '\000' <repeats 15 times>} (gdb) Version-Release number of selected component (if applicable): How reproducible: Steps to Reproduce: 1. Create a replicate volume, start it and mount it. 2. Start some tests on the mount point. 3. Add more bricks and start a rebalance. Actual results: The rebalance process crashed. Expected results: The rebalance process should not crash. Additional info: In rebalance, after the lookup we do not link the inode into the inode table (that is done by protocol/server for bricks and by fuse for the client, whereas here the inode is created for rebalance purposes only), so loc->inode and loc->parent have null gfids. When afr_mkdir builds the parent_loc, it does not take into account the loc->pargfid present in the child loc, so parent_loc->gfid stays null. protocol/client then looks for the gfid in either loc->inode->gfid or loc->gfid, and asserts if it cannot find a gfid in either of them.
CHANGE: http://review.gluster.com/2857 (cluster/afr: copy the parent's gfid from child loc while building parent loc) merged in master by Vijay Bellur (vijay)
Please update these bugs with respect to 3.3.0qa27; they need to be worked on as per the target milestone set.
This bug is no longer reproducible. Checked with glusterfs-3.3.0qa33. Since we now fill in the gfid, the issue is fixed.