Description of problem: After the bug 826080, stopped all the tests and did rm -rf on all the mount points. One of the glusterfs servers crashed with the below backtrace. (gdb) bt #0 0x0000003cdba32885 in raise () from /lib64/libc.so.6 #1 0x0000003cdba34065 in abort () from /lib64/libc.so.6 #2 0x0000003cdba2b9fe in __assert_fail_base () from /lib64/libc.so.6 #3 0x0000003cdba2bac0 in __assert_fail () from /lib64/libc.so.6 #4 0x00007f353356cb52 in server_connection_destroy (this=0xa0f7e0, conn=0xcabde0) at ../../../../../xlators/protocol/server/src/server-helpers.c:621 #5 0x00007f353356ccb3 in server_conn_unref (conn=0xcabde0) at ../../../../../xlators/protocol/server/src/server-helpers.c:657 #6 0x00007f35335651a5 in server_submit_reply (frame=0x7f353b830174, req=0x7f3532fe7f00, arg=0x7f3531782880, payload=0x0, payloadcount=0, iobref=0x7f34f41a4090, xdrproc=0x7f353c688446 <xdr_gf_common_rsp>) at ../../../../../xlators/protocol/server/src/server.c:196 #7 0x00007f35335711f3 in server_inodelk_cbk (frame=0x7f353b830174, cookie=0x7f353b958d58, this=0xa0f7e0, op_ret=0, op_errno=0, xdata=0x0) at ../../../../../xlators/protocol/server/src/server3_1-fops.c:259 #8 0x00007f35337a6893 in io_stats_inodelk_cbk (frame=0x7f353b958d58, cookie=0x7f353b8fd6ac, this=0xa0e4b0, op_ret=0, op_errno=0, xdata=0x0) at ../../../../../xlators/debug/io-stats/src/io-stats.c:1762 #9 0x00007f353cad0f22 in default_inodelk_cbk (frame=0x7f353b8fd6ac, cookie=0x7f353b9744c0, this=0xa0d160, op_ret=0, op_errno=0, xdata=0x0) at ../../../libglusterfs/src/defaults.c:371 #10 0x00007f353cad0f22 in default_inodelk_cbk (frame=0x7f353b9744c0, cookie=0x7f353b968eac, this=0xa0be70, op_ret=0, op_errno=0, xdata=0x0) at ../../../libglusterfs/src/defaults.c:371 #11 0x00007f3533df70a0 in iot_inodelk_cbk (frame=0x7f353b968eac, cookie=0x7f353b969b70, this=0xa0aca0, op_ret=0, op_errno=0, xdata=0x0) at ../../../../../xlators/performance/io-threads/src/io-threads.c:2013 #12 0x00007f3538100db1 in grant_blocked_inode_locks 
(this=0xa09ac0, pl_inode=0x7f3524000b20, dom=0x7f3524000980) at ../../../../../xlators/features/locks/src/inodelk.c:372 #13 0x00007f3538101405 in release_inode_locks_of_transport (this=0xa09ac0, dom=0x7f3524000980, inode=0x7f35329e604c, trans=0x120f340) at ../../../../../xlators/features/locks/src/inodelk.c:470 #14 0x00007f3538101b89 in pl_common_inodelk (frame=0x7f353b967880, this=0xa09ac0, volume=0x7f3534000b90 "mirror-replicate-0", inode=0x7f35329e604c, cmd=6, flock=0x7f353b5eed14, loc=0x7f353b5eeccc, fd=0x0) at ../../../../../xlators/features/locks/src/inodelk.c:611 #15 0x00007f3538101fba in pl_inodelk (frame=0x7f353b967880, this=0xa09ac0, volume=0x7f3534000b90 "mirror-replicate-0", loc=0x7f353b5eeccc, cmd=6, flock=0x7f353b5eed14) at ../../../../../xlators/features/locks/src/inodelk.c:681 #16 0x00007f3533df730d in iot_inodelk_wrapper (frame=0x7f353b932740, this=0xa0aca0, volume=0x7f3534000b90 "mirror-replicate-0", loc=0x7f353b5eeccc, cmd=6, lock=0x7f353b5eed14, xdata=0x0) at ../../../../../xlators/performance/io-threads/src/io-threads.c:2023 #17 0x00007f353caf2810 in call_resume_wind (stub=0x7f353b5eec84) at ../../../libglusterfs/src/call-stub.c:2641 #18 0x00007f353caf9f6a in call_resume (stub=0x7f353b5eec84) at ../../../libglusterfs/src/call-stub.c:4151 #19 0x00007f3533de98d6 in iot_worker (data=0xa21940) at ../../../../../xlators/performance/io-threads/src/io-threads.c:131 #20 0x0000003cdc2077f1 in start_thread () from /lib64/libpthread.so.0 #21 0x0000003cdbae592d in clone () from /lib64/libc.so.6 (gdb) f 4 #4 0x00007f353356cb52 in server_connection_destroy (this=0xa0f7e0, conn=0xcabde0) at ../../../../../xlators/protocol/server/src/server-helpers.c:621 621 GF_ASSERT (list_empty (&inodelk_lockers)); (gdb) l 616 list_splice_init (<able->entrylk_lockers, 617 &entrylk_lockers); 618 GF_FREE (ltable); 619 } 620 621 GF_ASSERT (list_empty (&inodelk_lockers)); 622 GF_ASSERT (list_empty (&entrylk_lockers)); 623 624 if (fdtable) 625 gf_fd_fdtable_destroy (fdtable); 
(gdb) p inodelk_lockers $1 = {next = 0x7f34f4179c60, prev = 0x7f34f4179c60} (gdb) p inodelk_lockers->next $2 = (struct list_head *) 0x7f34f4179c60 (gdb) p inodelk_lockers->next->next $3 = (struct list_head *) 0x7f3531782710 (gdb) p inodelk_lockers->next->next->next $4 = (struct list_head *) 0x7f34f4179c60 (gdb) p inodelk_lockers->next->next->next->next $5 = (struct list_head *) 0x7f3531782710 (gdb) p lk_heal $6 = _gf_true (gdb) In server_submit_reply, if reply submission fails, then we cleanup the connection only if lock-heal is off, otherwise we just unref the connection object. 176 gf_log_callingfn ("", GF_LOG_ERROR, "Reply submission failed"); 177 if (frame && conn && !lk_heal) { 178 server_connection_cleanup (frame->this, conn, 179 INTERNAL_LOCKS | POSIX_LOCKS); (gdb) 180 } else { 181 /* TODO: Failure of open(dir), create, inodelk, entrylk 182 or lk fops send failure must be handled specially. */ 183 ; 184 } 185 goto ret; 186 } 187 188 ret = 0; 189 ret: (gdb) 190 if (state) { 191 free_state (state); 192 } 193 194 if (frame) { 195 if (frame->root->trans) 196 server_conn_unref (frame->root->trans); 197 STACK_DESTROY (frame->root); 198 } 199 (gdb) 200 if (new_iobref) { 201 iobref_unref (iobref); 202 } 203 204 return ret; 205 } if conn-ref becomes zero we destroy the connection where it is assumed that there are no locks in the lock table. But since lock-heal is turned on server remembers the locks till grace-timeout is over upon disconnection with the client, hence lock table will not be empty. Version-Release number of selected component (if applicable): How reproducible: Steps to Reproduce: 1. 2. 3. Actual results: Expected results: Additional info: [2012-05-29 05:03:02.014357] I [server-helpers.c:741:server_connection_put] 0-mirror-server: Shutting down connection gqas004.sbu.lab. 
eng.bos.redhat.com-6159-2012/05/29-03:36:15:743236-mirror-client-0-9 [2012-05-29 05:03:02.014360] I [server-helpers.c:629:server_connection_destroy] 0-mirror-server: destroyed connection of gqas004.sbu.l ab.eng.bos.redhat.com-6159-2012/05/29-03:36:15:743236-mirror-client-0-8 [2012-05-29 05:03:02.014379] I [server-helpers.c:346:do_lock_table_cleanup] 0-mirror-server: inodelk released on / [2012-05-29 05:03:02.014430] I [server-helpers.c:474:do_fd_cleanup] 0-mirror-server: fd cleanup on / [2012-05-29 05:03:02.014443] E [rpcsvc.c:1080:rpcsvc_submit_generic] 0-rpc-service: failed to submit message (XID: 0x20425x, Program: GlusterFS 3.3.0qa43, ProgVers: 330, Proc: 29) to rpc-transport (tcp.mirror-server) [2012-05-29 05:03:02.014455] I [server-helpers.c:474:do_fd_cleanup] 0-mirror-server: fd cleanup on /run9551 [2012-05-29 05:03:02.014492] I [server-helpers.c:474:do_fd_cleanup] 0-mirror-server: fd cleanup on /run9551/linux-2.6.31.1 [2012-05-29 05:03:02.014505] E [server.c:176:server_submit_reply] (-->/usr/local/lib/libglusterfs.so.0(default_inodelk_cbk+0x158) [0x7 f353cad0f22] (-->/usr/local/lib/glusterfs/3.3.0qa43/xlator/debug/io-stats.so(io_stats_inodelk_cbk+0x245) [0x7f35337a6893] (-->/usr/loc al/lib/glusterfs/3.3.0qa43/xlator/protocol/server.so(server_inodelk_cbk+0x2f6) [0x7f35335711f3]))) 0-: Reply submission failed [2012-05-29 05:03:02.014517] I [server-helpers.c:474:do_fd_cleanup] 0-mirror-server: fd cleanup on /run9551/linux-2.6.31.1/arch [2012-05-29 05:03:02.014554] I [socket.c:2378:socket_submit_reply] 0-tcp.mirror-server: not connected (priv->connected = 255) [2012-05-29 05:03:02.014564] I [server-helpers.c:474:do_fd_cleanup] 0-mirror-server: fd cleanup on /run9551/linux-2.6.31.1/arch/powerp c [2012-05-29 05:03:02.014571] E [rpcsvc.c:1080:rpcsvc_submit_generic] 0-rpc-service: failed to submit message (XID: 0x77403x, Program: GlusterFS 3.3.0qa43, ProgVers: 330, Proc: 29) to rpc-transport (tcp.mirror-server) [2012-05-29 05:03:02.014607] I 
[server-helpers.c:474:do_fd_cleanup] 0-mirror-server: fd cleanup on /run9551/linux-2.6.31.1/arch/powerpc/platforms/ps3 [2012-05-29 05:03:02.014633] E [server.c:176:server_submit_reply] (-->/usr/local/lib/libglusterfs.so.0(default_inodelk_cbk+0x158) [0x7f353cad0f22] (-->/usr/local/lib/glusterfs/3.3.0qa43/xlator/debug/io-stats.so(io_stats_inodelk_cbk+0x245) [0x7f35337a6893] (-->/usr/local/lib/glusterfs/3.3.0qa43/xlator/protocol/server.so(server_inodelk_cbk+0x2f6) [0x7f35335711f3]))) 0-: Reply submission failed [2012-05-29 05:03:02.014639] I [server-helpers.c:474:do_fd_cleanup] 0-mirror-server: fd cleanup on /run9551/linux-2.6.31.1/arch/powerpc/platforms pending frames: patchset: git://git.gluster.com/glusterfs.git signal received: 6 time of crash: 2012-05-29 05:03:02 configuration details: argp 1 backtrace 1 dlfcn 1 fdatasync 1 libpthread 1
Need to check the 'lock-self-heal' (lk_heal) flag before doing the GF_ASSERT on the lock-table lists in server_connection_destroy: with lock-heal enabled, the server legitimately retains locks until the grace timeout expires, so the lists are not expected to be empty.
The connection ref count represents the number of requests in transit. Once the reply is sent, it should be decremented. The code hit the assert because reply-submission failure is not handled when lk_heal is on. That should be fixed so that the assert won't be hit. gf_log_callingfn ("", GF_LOG_ERROR, "Reply submission failed"); 177 if (frame && conn && !lk_heal) { 178 server_connection_cleanup (frame->this, conn, 179 INTERNAL_LOCKS | POSIX_LOCKS); (gdb) 180 } else { 181 /* TODO: Failure of open(dir), create, inodelk, entrylk 182 or lk fops send failure must be handled specially. */
git blame on xlators/protocol/server/src/server.c shows Junaid added the TODO:, reassigning it to him. ------- fdd20492 (Amar Tumballi 2010-06-21 07:00:04 +0000 175) if (ret == -1) { 0349ec85 (Amar Tumballi 2011-03-16 09:43:27 +0000 176) gf_log_callingfn ("", GF_LOG_ERROR, "Reply submission failed"); 83277598 (Mohammed Junaid 2012-03-19 19:56:21 +0530 177) if (frame && conn && !lk_heal) { 9fd44bd9 (Pranith Kumar K 2012-03-18 13:07:30 +0530 178) server_connection_cleanup (frame->this, conn, 9fd44bd9 (Pranith Kumar K 2012-03-18 13:07:30 +0530 179) INTERNAL_LOCKS | POSIX_LOCKS); 83277598 (Mohammed Junaid 2012-03-19 19:56:21 +0530 180) } else { 83277598 (Mohammed Junaid 2012-03-19 19:56:21 +0530 181) /* TODO: Failure of open(dir), create, inodelk, entrylk 83277598 (Mohammed Junaid 2012-03-19 19:56:21 +0530 182) or lk fops send failure must be handled specially. */ 83277598 (Mohammed Junaid 2012-03-19 19:56:21 +0530 183) ; 83277598 (Mohammed Junaid 2012-03-19 19:56:21 +0530 184) } fdd20492 (Amar Tumballi 2010-06-21 07:00:04 +0000 185) goto ret; fdd20492 (Amar Tumballi 2010-06-21 07:00:04 +0000 186) } --------
Have not seen this error in the last few releases. Marking it as lower priority for now.
The "pre-release" version is ambiguous and is about to be removed as a choice. If you believe this is still a bug, please change the status back to NEW and choose the appropriate, applicable version for it.