Hide Forgot
tests: 2 fuse clients One client was running sanity script and the other client was running ping_pong and find <mount_point> | xargs stat. Brought one server down, slept for some time and then started back the server. The client running sanity script had crashed. This is the backtrace. Core was generated by `/usr/local/sbin/glusterfs --volfile-id=mirror --volfile-server=10.1.11.73 /clie'. Program terminated with signal 11, Segmentation fault. #0 0x00002aaaad84901d in afr_frame_return (frame=0x2adda48bfba0) at ../../../../../xlators/cluster/afr/src/afr-common.c:904 904 call_count = --local->call_count; (gdb) bt #0 0x00002aaaad84901d in afr_frame_return (frame=0x2adda48bfba0) at ../../../../../xlators/cluster/afr/src/afr-common.c:904 #1 0x00002aaaad81bf11 in afr_openfd_fix_open_cbk (frame=0x2adda48bfba0, cookie=0x0, this=0x6dde6c0, op_ret=-1, op_errno=2, fd=0x2aaab1024044) at ../../../../../xlators/cluster/afr/src/afr-open.c:270 #2 0x00002aaaad5d094f in client3_1_open_cbk (req=0x2aaaae9ae6ec, iov=0x2aaaae9ae72c, count=1, myframe=0x2adda4630944) at ../../../../../xlators/protocol/client/src/client3_1-fops.c:368 #3 0x00002adda3218fde in rpc_clnt_handle_reply (clnt=0x6def5c0, pollin=0x2aaab80a8600) at ../../../../rpc/rpc-lib/src/rpc-clnt.c:747 #4 0x00002adda3219315 in rpc_clnt_notify (trans=0x6def770, mydata=0x6def5f0, event=RPC_TRANSPORT_MSG_RECEIVED, data=0x2aaab80a8600) at ../../../../rpc/rpc-lib/src/rpc-clnt.c:860 #5 0x00002adda32159fb in rpc_transport_notify (this=0x6def770, event=RPC_TRANSPORT_MSG_RECEIVED, data=0x2aaab80a8600) at ../../../../rpc/rpc-lib/src/rpc-transport.c:498 #6 0x00002aaaaad6dea7 in socket_event_poll_in (this=0x6def770) at ../../../../../rpc/rpc-transport/socket/src/socket.c:1676 #7 0x00002aaaaad6e3e9 in socket_event_handler (fd=7, idx=2, data=0x6def770, poll_in=1, poll_out=0, poll_err=0) at ../../../../../rpc/rpc-transport/socket/src/socket.c:1791 #8 0x00002adda2fc364c in event_dispatch_epoll_handler (event_pool=0x6dd1920, events=0x6dd65c0, i=1) at ../../../libglusterfs/src/event.c:794 #9 0x00002adda2fc3851 in event_dispatch_epoll (event_pool=0x6dd1920) at ../../../libglusterfs/src/event.c:856 #10 0x00002adda2fc3bab in event_dispatch (event_pool=0x6dd1920) at ../../../libglusterfs/src/event.c:956 #11 0x0000000000407222 in main (argc=4, argv=0x7ffffe760158) at ../../../glusterfsd/src/glusterfsd.c:1557 (gdb) f 0 #0 0x00002aaaad84901d in afr_frame_return (frame=0x2adda48bfba0) at ../../../../../xlators/cluster/afr/src/afr-common.c:904 904 call_count = --local->call_count; (gdb) p local $1 = (afr_local_t *) 0x0 (gdb) p *frame $2 = {root = 0x2adda48bfb18, parent = 0x0, next = 0x2adda4630944, prev = 0x0, local = 0x0, this = 0x6dde6c0, ret = 0, ref_count = 0, lock = 0, cookie = 0x0, complete = _gf_false, op = GF_FOP_NULL, begin = {tv_sec = 0, tv_usec = 0}, end = {tv_sec = 0, tv_usec = 0}, wind_from = 0x0, wind_to = 0x0, unwind_from = 0x0, unwind_to = 0x0} (gdb)
I ran the same tests with valgrind and found the crash again when the down server came back up. This is the backtrace. Core was generated by `'. Program terminated with signal 11, Segmentation fault. #0 0x000000317e8ccff3 in readv () from /lib64/libc.so.6 (gdb) info thr 4 Thread 29360 0x000000317f00e9dd in raise () from /lib64/libpthread.so.0 3 Thread 29361 0x000000317f00e838 in do_sigwait () from /lib64/libpthread.so.0 2 Thread 29362 0x000000317e89a541 in nanosleep () from /lib64/libc.so.6 * 1 Thread 29364 0x000000317e8ccff3 in readv () from /lib64/libc.so.6 (gdb) t 4 [Switching to thread 4 (Thread 29360)]#0 0x000000317f00e9dd in raise () from /lib64/libpthread.so.0 (gdb) bt #0 0x000000317f00e9dd in raise () from /lib64/libpthread.so.0 #1 0x0000000004c37c2f in gf_print_trace (signum=11) at ../../../libglusterfs/src/common-utils.c:411 #2 <signal handler called> #3 0x000000000a796ce0 in afr_get_resumable_calls (this=0x560c650, fd_ctx=0x1156aec0, list=0x7fefffe70) at ../../../../../xlators/cluster/afr/src/afr-open.c:229 #4 0x000000000a79708e in afr_openfd_fix_open_cbk (frame=0x534cde8, cookie=0x0, this=0x560c650, op_ret=-1, op_errno=2, fd=0xdf6f074) at ../../../../../xlators/cluster/afr/src/afr-open.c:293 #5 0x000000000a54b94f in client3_1_open_cbk (req=0xba6e4fc, iov=0xba6e53c, count=1, myframe=0x534cca0) at ../../../../../xlators/protocol/client/src/client3_1-fops.c:368 #6 0x0000000004ea9fde in rpc_clnt_handle_reply (clnt=0x56184b0, pollin=0x1111c0e0) at ../../../../rpc/rpc-lib/src/rpc-clnt.c:747 #7 0x0000000004eaa315 in rpc_clnt_notify (trans=0x56186e0, mydata=0x56184e0, event=RPC_TRANSPORT_MSG_RECEIVED, data=0x1111c0e0) at ../../../../rpc/rpc-lib/src/rpc-clnt.c:860 #8 0x0000000004ea69fb in rpc_transport_notify (this=0x56186e0, event=RPC_TRANSPORT_MSG_RECEIVED, data=0x1111c0e0) at ../../../../rpc/rpc-lib/src/rpc-transport.c:498 #9 0x0000000007313ea7 in socket_event_poll_in (this=0x56186e0) at ../../../../../rpc/rpc-transport/socket/src/socket.c:1676 #10 0x00000000073143e9 in socket_event_handler (fd=11, idx=2, data=0x56186e0, poll_in=1, poll_out=0, poll_err=0) at ../../../../../rpc/rpc-transport/socket/src/socket.c:1791 #11 0x0000000004c5364c in event_dispatch_epoll_handler (event_pool=0x52e9ee0, events=0x55de630, i=0) at ../../../libglusterfs/src/event.c:794 #12 0x0000000004c53851 in event_dispatch_epoll (event_pool=0x52e9ee0) at ../../../libglusterfs/src/event.c:856 #13 0x0000000004c53bab in event_dispatch (event_pool=0x52e9ee0) at ../../../libglusterfs/src/event.c:956 #14 0x0000000000407222 in main (argc=8, argv=0x7ff000458) at ../../../glusterfsd/src/glusterfsd.c:1557 (gdb) f 3 #3 0x000000000a796ce0 in afr_get_resumable_calls (this=0x560c650, fd_ctx=0x1156aec0, list=0x7fefffe70) at ../../../../../xlators/cluster/afr/src/afr-open.c:229 229 if (call_local->child_up[i] && (gdb) p call_local $1 = (afr_local_t *) 0x0 (gdb) l 224 list_for_each_entry_safe (paused_call, tmp, &fd_ctx->paused_calls, 225 call_list) { 226 call = _gf_true; 227 call_local = paused_call->frame->local; 228 for (i = 0; i < priv->child_count; i++) { 229 if (call_local->child_up[i] && 230 (fd_ctx->opened_on[i] == AFR_FD_OPENING)) 231 call = _gf_false; 232 } 233 (gdb) p *paused_call->frame $2 = {root = 0x5ee77ac, parent = 0x5ee7834, next = 0x0, prev = 0x5ee7834, local = 0x0, this = 0x560c650, ret = 0xa9e8f35 <wb_sync_cbk>, ref_count = 0, lock = 1, cookie = 0x534b300, complete = _gf_false, op = GF_FOP_NULL, begin = {tv_sec = 0, tv_usec = 0}, end = {tv_sec = 0, tv_usec = 0}, wind_from = 0xa9f5188 "wb_sync", wind_to = 0xa9f51f0 "FIRST_CHILD(sync_frame->this)->fops->writev", unwind_from = 0x0, unwind_to = 0xa9f521c "wb_sync_cbk"} (gdb) p *paused_call $3 = {call_list = {next = 0x113d8d50, prev = 0x1156af18}, frame = 0x534b300} (gdb)
setup: replicate setup with replica count of 2
CHANGE: http://review.gluster.com/283 (Change-Id: I861b3c4494735b0ba6e038cdc39c50b9866747a8) merged in master by Vijay Bellur (vijay)
CHANGE: http://review.gluster.com/286 (Change-Id: Ie026ebed98cf5ff75ae1a13437d29f67d0e0254a) merged in master by Vijay Bellur (vijay)
CHANGE: http://review.gluster.com/292 (Change-Id: Idce22a6266c354e327d5d717715d2e62533eec58) merged in master by Vijay Bellur (vijay)
This bug is fixed and not seen now. Similar tests did not lead to this crash.