Description of problem:
Core was generated by `/usr/local/sbin/glusterfs --volfile-id=/datastore --volfile-server=192.168.2.35'.
Program terminated with signal 11, Segmentation fault.
#0  0x000000356fa0c100 in pthread_spin_lock () from /lib64/libpthread.so.0
Missing separate debuginfos, use: debuginfo-install glibc-2.12-1.25.el6_1.3.x86_64 libgcc-4.4.5-6.el6.x86_64

(gdb) bt
#0  0x000000356fa0c100 in pthread_spin_lock () from /lib64/libpthread.so.0
#1  0x00007f674a091a40 in mem_put (ptr=0x2c41fb0) at mem-pool.c:519
#2  0x00007f6745ad7b93 in clnt_fd_lk_local_unref (this=0x21f5760, local=0x2c41fb0) at client-handshake.c:537
#3  0x00007f6745ad8144 in client_reacquire_lock_cbk (req=0x7f6744569908, iov=0x7f6744569948, count=1, myframe=0x7f6748c820b8) at client-handshake.c:697
#4  0x00007f6749e39919 in rpc_clnt_handle_reply (clnt=0x2224bb0, pollin=0x2a30330) at rpc-clnt.c:796
#5  0x00007f6749e39cb6 in rpc_clnt_notify (trans=0x2234730, mydata=0x2224be0, event=RPC_TRANSPORT_MSG_RECEIVED, data=0x2a30330) at rpc-clnt.c:915
#6  0x00007f6749e35da8 in rpc_transport_notify (this=0x2234730, event=RPC_TRANSPORT_MSG_RECEIVED, data=0x2a30330) at rpc-transport.c:498
#7  0x00007f6746914270 in socket_event_poll_in (this=0x2234730) at socket.c:1686
#8  0x00007f67469147f4 in socket_event_handler (fd=13, idx=2, data=0x2234730, poll_in=1, poll_out=0, poll_err=0) at socket.c:1801
#9  0x00007f674a09005c in event_dispatch_epoll_handler (event_pool=0x21e0370, events=0x21ee5e0, i=0) at event.c:794
#10 0x00007f674a09027f in event_dispatch_epoll (event_pool=0x21e0370) at event.c:856
#11 0x00007f674a09060a in event_dispatch (event_pool=0x21e0370) at event.c:956
#12 0x0000000000407dcc in main (argc=4, argv=0x7fff91ec3ce8) at glusterfsd.c:1612

(gdb) bt full
#0  0x000000356fa0c100 in pthread_spin_lock () from /lib64/libpthread.so.0
No symbol table info available.
#1  0x00007f674a091a40 in mem_put (ptr=0x2c41fb0) at mem-pool.c:519
        list = 0x2c41f94
        in_use = 0x0
        head = 0x2c41f94
        tmp = 0x2c41fa4
        pool = 0xcafebabe
        __FUNCTION__ = "mem_put"
#2  0x00007f6745ad7b93 in clnt_fd_lk_local_unref (this=0x21f5760, local=0x2c41fb0) at client-handshake.c:537
        ref = 0
        __FUNCTION__ = "clnt_fd_lk_local_unref"
#3  0x00007f6745ad8144 in client_reacquire_lock_cbk (req=0x7f6744569908, iov=0x7f6744569948, count=1, myframe=0x7f6748c820b8) at client-handshake.c:697
        ret = 0
        this = 0x21f5760
        rsp = {op_ret = 0, op_errno = 0, xdata = {xdata_len = 0, xdata_val = 0x0}}
        frame = 0x7f6748c820b8
        local = 0x2c41fb0
        __FUNCTION__ = "client_reacquire_lock_cbk"
#4  0x00007f6749e39919 in rpc_clnt_handle_reply (clnt=0x2224bb0, pollin=0x2a30330) at rpc-clnt.c:796
        conn = 0x2224be0
        saved_frame = 0x2224f98
        ret = 0
        req = 0x7f6744569908
        xid = 6789
        __FUNCTION__ = "rpc_clnt_handle_reply"
#5  0x00007f6749e39cb6 in rpc_clnt_notify (trans=0x2234730, mydata=0x2224be0, event=RPC_TRANSPORT_MSG_RECEIVED, data=0x2a30330) at rpc-clnt.c:915
        conn = 0x2224be0
        clnt = 0x2224bb0
        ret = -1
        req_info = 0x0
        pollin = 0x2a30330
        tv = {tv_sec = 0, tv_usec = 0}
#6  0x00007f6749e35da8 in rpc_transport_notify (this=0x2234730, event=RPC_TRANSPORT_MSG_RECEIVED, data=0x2a30330) at rpc-transport.c:498
        ret = -1
        __FUNCTION__ = "rpc_transport_notify"
---Type <return> to continue, or q <return> to quit---q
Quit

(gdb) f 1
#1  0x00007f674a091a40 in mem_put (ptr=0x2c41fb0) at mem-pool.c:519
519             LOCK (&pool->lock);
(gdb) l
514             pool = *tmp;
515             if (!pool) {
516                     gf_log ("mem-pool", GF_LOG_ERROR, "mem-pool ptr is NULL");
517                     return;
518             }
519             LOCK (&pool->lock);
520             {
521
522                     switch (__is_member (pool, ptr))
523                     {
(gdb)
524                     case 1:
525                             in_use = (head + GF_MEM_POOL_LIST_BOUNDARY +
526                                       GF_MEM_POOL_PTR);
527                             if (!is_mem_chunk_in_use(in_use)) {
528                                     gf_log_callingfn ("mem-pool", GF_LOG_CRITICAL,
529                                             "mem_put called on freed ptr %p of mem "
530                                             "pool %p", ptr, pool);
531                                     break;
532                             }
533                             pool->hot_count--;
(gdb) f 2
#2  0x00007f6745ad7b93 in clnt_fd_lk_local_unref (this=0x21f5760, local=0x2c41fb0) at client-handshake.c:537
537                     mem_put (local);
(gdb) l
532             }
533             UNLOCK (&local->lock);
534
535             if (ref == 0) {
536                     LOCK_DESTROY (&local->lock);
537                     mem_put (local);
538             }
539             ref = 0;
540     out:
541             return ref;
(gdb) p local
$1 = (clnt_fd_lk_local_t *) 0x2c41fb0
(gdb) p *local
$2 = {ref = 0, error = _gf_false, lock = 1, fdctx = 0x2274ce0}
(gdb) p *local->fdctx
$3 = {sfd_pos = {next = 0x2210898, prev = 0x2210898}, remote_fd = 0, inode = 0x7f673fb5f0e0, ino = 0, gen = 0, is_dir = 0 '\000', released = 0 '\000', flags = 32770, wbflags = 0, lk_ctx = 0x7f6738001440, mutex = {__data = {__lock = 0, __count = 0, __owner = 0, __nusers = 0, __kind = 0, __spins = 0, __list = {__prev = 0x0, __next = 0x0}}, __size = '\000' <repeats 39 times>, __align = 0}, lock_list = {next = 0x2274d50, prev = 0x2274d50}}

Version-Release number of selected component (if applicable):
mainline

How reproducible:

Steps to Reproduce:
1. create a replicate volume with 2 bricks
2. start the volume
3. create 3 gluster mounts and 1 NFS mount from the client
4. perform "dd if=/dev/urandom of=file1 bs=1M count=1024" from one gluster mount
5. perform "dd if=./file1 of=file2 bs=1M count=2048" on the NFS mount
6. perform "ping_pong -rw file2 4" from the 2nd gluster mount
7. perform "ping_pong -rw file1 4" from the 3rd gluster mount
8. gluster volume set <volumename> cluster.data-self-heal-algorithm full
9. gluster volume set <volumename> self-heal-daemon off
10. bring brick2 down while the dd is in progress
11. bring brick2 back up

The gluster clients which were running ping_pong crashed.

Actual results:
The gluster clients crashed.
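A note on the crash mechanics visible in frame #1 above: mem_put() read pool = 0xcafebabe from the words just in front of the chunk it was handed, where a genuine mem-pool chunk keeps a back-pointer to its owning pool. 0xCAFEBABE matches GF_MEM_HEADER_MAGIC, the marker GF_CALLOC writes into its own memory-accounting header, so LOCK (&pool->lock) dereferenced that magic value and died in pthread_spin_lock (frame #0). The following is a minimal standalone sketch of the mismatch; every name in it (acct_header, chunk_header, gf_calloc_demo, pool_put_demo) is a hypothetical stand-in, not GlusterFS source.

/* Sketch: releasing an accounting-allocator block through a pool
 * deallocator that trusts a header in front of the user pointer. */
#include <stdint.h>
#include <stdlib.h>
#include <pthread.h>

struct mem_pool {
        pthread_spinlock_t lock;        /* the pool lock mem_put takes  */
        int                hot_count;
};

/* Header a pool chunk carries just in front of the user pointer. */
struct chunk_header {
        struct mem_pool *pool;          /* back-pointer to owning pool  */
        int              in_use;
};

/* Header a GF_CALLOC-style accounting allocator writes instead;
 * 0xCAFEBABE stands in for GF_MEM_HEADER_MAGIC. */
struct acct_header {
        uint64_t magic;
        uint64_t size;
};

static void *gf_calloc_demo(size_t size)
{
        struct acct_header *h = calloc(1, sizeof (*h) + size);
        h->magic = 0xCAFEBABE;          /* lands exactly where a pool   */
        h->size  = size;                /* chunk keeps its pool pointer */
        return h + 1;
}

static void pool_put_demo(void *ptr)    /* plays the role of mem_put()  */
{
        struct chunk_header *hdr  = (struct chunk_header *)ptr - 1;
        struct mem_pool     *pool = hdr->pool;  /* reads 0xCAFEBABE...  */
        pthread_spin_lock(&pool->lock);         /* ...and faults here,  */
        hdr->in_use = 0;                        /* matching frame #0    */
        pool->hot_count--;
        pthread_spin_unlock(&pool->lock);
}

int main(void)
{
        void *local = gf_calloc_demo(64);  /* allocated like GF_CALLOC    */
        pool_put_demo(local);              /* freed like mem_put: SIGSEGV */
        return 0;
}

Built with "gcc -pthread", this faults inside pthread_spin_lock() on the bogus pool address 0xCAFEBABE, mirroring frames #0 and #1 of the backtrace.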
Created attachment 565285 [details] Attaching client log
CHANGE: http://review.gluster.com/2852 (protocol/client: Calling GF_FREE on memory allocated via GF_CALLOC.) merged in master by Vijay Bellur (vijay)
The issue was that a piece of memory allocated via GF_CALLOC was accidentally freed using mem_put.
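Going by the unref path quoted in the gdb listing above (client-handshake.c:535-538), the shape of the fix would be a one-line swap of the deallocator. This is an illustrative sketch only; the authoritative diff is the review.gluster.com/2852 change:

        if (ref == 0) {
                LOCK_DESTROY (&local->lock);
                GF_FREE (local);   /* was: mem_put (local); "local" was
                                    * allocated with GF_CALLOC, so it must be
                                    * released with GF_FREE, not returned to
                                    * a mem-pool it never came from. */
        }

The general rule: GF_CALLOC/GF_MALLOC pairs with GF_FREE, and mem_get/mem_get0 pairs with mem_put; crossing the pairs makes the deallocator misread whatever header it expects to find in front of the pointer.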
Verified the bug on 3.3.0qa38. The bug is fixed.
I'm running v3.4.2 and ping_pong doesn't run on two nodes with a replicated volume when running:

ping_pong -rw test 3

Any ideas why not? It looks like this bug may not be fixed? It runs on one node, but hangs on the other.

Thanks,
Rich
Sounds like a standalone issue which should have its own bug.

> I'm running v3.4.2 and ping_pong doesn't run on two nodes with a replicated
> volume running:
>
> ping_pong -rw test 3
>
> any ideas why not? It looks like this bug may not be fixed?
> It runs on one node, but hangs on the other.
> Thanks
> Rich