Bug 1019874
Summary: | Glusterfs crashes when removing a brick while data are still being written | ||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|
Product: | [Community] GlusterFS | Reporter: | Viktor Krivak <viktor.krivak> | ||||||||
Component: | fuse | Assignee: | Csaba Henk <csaba> | ||||||||
Status: | CLOSED DUPLICATE | QA Contact: | |||||||||
Severity: | unspecified | Docs Contact: | |||||||||
Priority: | unspecified | ||||||||||
Version: | 3.4.1 | CC: | gluster-bugs, jbyers, pkarampu, viktor.krivak | ||||||||
Target Milestone: | --- | ||||||||||
Target Release: | --- | ||||||||||
Hardware: | x86_64 | ||||||||||
OS: | Linux | ||||||||||
Whiteboard: | |||||||||||
Fixed In Version: | Doc Type: | Bug Fix | |||||||||
Doc Text: | Story Points: | --- | |||||||||
Clone Of: | Environment: | ||||||||||
Last Closed: | 2014-06-16 10:18:07 UTC | Type: | Bug | ||||||||
Regression: | --- | Mount Type: | --- | ||||||||
Documentation: | --- | CRM: | |||||||||
Verified Versions: | Category: | --- | |||||||||
oVirt Team: | --- | RHEL 7.3 requirements from Atomic Host: | |||||||||
Cloudforms Team: | --- | Target Upstream Version: | |||||||||
Embargoed: | |||||||||||
Attachments: |
|
Created attachment 868204 [details]
Core file from similar crash during expand/rebalance.
Created attachment 868205 [details]
Logfile file from similar crash during expand/rebalance.
Hello, Using glusterfs 3.4.2, we had a similar 'glusterfs' crash as described here. However, we were not removing bricks, instead had just expanded a volume which was in the process of rebalancing. Below are the crash details, and I've attached the full log and core file. [2014-02-26 16:59:52.686563] I [afr-common.c:2120:afr_discovery_cbk] 0-SF-Engr-replicate-2: selecting local read_child SF-Engr-client-4 [2014-02-26 16:59:52.686589] I [afr-common.c:2120:afr_discovery_cbk] 0-SF-Engr-replicate-1: selecting local read_child SF-Engr-client-2 pending frames: frame : type(0) op(0) frame : type(0) op(0) frame : type(0) op(0) frame : type(1) op(LOOKUP) frame : type(0) op(0) patchset: git://git.gluster.com/glusterfs.git signal received: 11 SIGSEGV time of crash: 2014-02-26 17:44:44configuration details: argp 1 backtrace 1 dlfcn 1 fdatasync 1 libpthread 1 llistxattr 1 setfsid 1 spinlock 1 epoll.h 1 xattr.h 1 st_atim.tv_nsec 1 package-string: glusterfs 3.4.2 /lib64/libc.so.6[0x30008329a0] /lib64/libpthread.so.0(pthread_spin_lock+0x0)[0x300100c380] /usr/local/lib/libglusterfs.so.0(fd_unref+0x36)[0x7f0dd8bb0ee6] /usr/local/lib/glusterfs/3.4.2/xlator/protocol/client.so(client_local_wipe+0x28)[0x7f0dd558bc18] /usr/local/lib/glusterfs/3.4.2/xlator/protocol/client.so(client3_3_finodelk_cbk+0x157)[0x7f0dd5597237] /usr/local/lib/libgfrpc.so.0(rpc_clnt_handle_reply+0xa5)[0x7f0dd896fa75] /usr/local/lib/libgfrpc.so.0(rpc_clnt_notify+0x11f)[0x7f0dd8970a4f] /usr/local/lib/libgfrpc.so.0(rpc_transport_notify+0x28)[0x7f0dd896c388] /usr/local/lib/glusterfs/3.4.2/rpc-transport/socket.so(+0x8da6)[0x7f0dd351eda6] /usr/local/lib/glusterfs/3.4.2/rpc-transport/socket.so(+0xa95d)[0x7f0dd352095d] /usr/local/lib/libglusterfs.so.0(+0x5cdf7)[0x7f0dd8bd4df7] /usr/local/sbin/glusterfs(main+0x5c6)[0x406706] /lib64/libc.so.6(__libc_start_main+0xfd)[0x300081ed1d] /usr/local/sbin/glusterfs[0x4045b9] Core was generated by `/usr/local/sbin/glusterfs --acl --volfile=/var/lib/glusterd/vols/SF-Engr/SF-Eng'. 
# gdb /usr/local/sbin/glusterfs /core.23276 Program terminated with signal 11, Segmentation fault. #0 pthread_spin_lock (lock=0xaaaaaac2) at ../nptl/sysdeps/i386/pthread_spin_lock.c:35 35 asm ("\n" (gdb) bt #0 pthread_spin_lock (lock=0xaaaaaac2) at ../nptl/sysdeps/i386/pthread_spin_lock.c:35 #1 0x00007f0dd8bb0ee6 in fd_unref (fd=0x174f08c) at fd.c:540 #2 0x00007f0dd558bc18 in client_local_wipe (local=0x166999c) at client-helpers.c:128 #3 0x00007f0dd5597237 in client3_3_finodelk_cbk (req=<value optimized out>, iov=<value optimized out>, count=<value optimized out>, myframe=0x7f0dd79cdef4) at client-rpc-fops.c:1593 #4 0x00007f0dd896fa75 in rpc_clnt_handle_reply (clnt=0x167de90, pollin=0x1864380) at rpc-clnt.c:771 #5 0x00007f0dd8970a4f in rpc_clnt_notify (trans=<value optimized out>, mydata=0x167dec0, event=<value optimized out>, data=<value optimized out>) at rpc-clnt.c:891 #6 0x00007f0dd896c388 in rpc_transport_notify (this=<value optimized out>, event=<value optimized out>, data=<value optimized out>) at rpc-transport.c:497 #7 0x00007f0dd351eda6 in socket_event_poll_in (this=0x168d920) at socket.c:2118 #8 0x00007f0dd352095d in socket_event_handler (fd=<value optimized out>, idx=<value optimized out>, data=0x168d920, poll_in=1, poll_out=0, poll_err=0) at socket.c:2230 #9 0x00007f0dd8bd4df7 in event_dispatch_epoll_handler (event_pool=0x163d520) at event-epoll.c:384 #10 event_dispatch_epoll (event_pool=0x163d520) at event-epoll.c:445 #11 0x0000000000406706 in main (argc=4, argv=0x7fffcf96f6b8) at glusterfsd.c:1934 (gdb) f 1 #1 0x00007f0dd8bb0ee6 in fd_unref (fd=0x174f08c) at fd.c:540 540 fd.c: No such file or directory. 
in fd.c (gdb) p *fd $1 = {pid = 25200, flags = 32770, refcount = 0, inode_list = {next = 0x174f09c, prev = 0x174f09c}, inode = 0xaaaaaaaa, lock = 1, _ctx = 0x7f0dc4000be0, xl_count = 16, lk_ctx = 0x7f0dc4090df0, anonymous = _gf_false} (gdb) f 0 #0 pthread_spin_lock (lock=0xaaaaaac2) at ../nptl/sysdeps/i386/pthread_spin_lock.c:35 35 asm ("\n" (gdb) f 1 #1 0x00007f0dd8bb0ee6 in fd_unref (fd=0x174f08c) at fd.c:540 540 fd.c: No such file or directory. in fd.c (gdb) info locals refcount = 0 __FUNCTION__ = "fd_unref" (gdb) f 0 #0 pthread_spin_lock (lock=0xaaaaaac2) at ../nptl/sysdeps/i386/pthread_spin_lock.c:35 35 asm ("\n" (gdb) p lock $4 = (volatile int *) 0xaaaaaac2 (gdb) ptype lock type = volatile int * (gdb) f 3 #3 0x00007f0dd5597237 in client3_3_finodelk_cbk (req=<value optimized out>, iov=<value optimized out>, count=<value optimized out>, myframe=0x7f0dd79cdef4) at client-rpc-fops.c:1593 1593 client-rpc-fops.c: No such file or directory. in client-rpc-fops.c (gdb) info locals __local = 0x166999c frame = 0x7f0dd79cdef4 rsp = {op_ret = 0, op_errno = 0, xdata = {xdata_len = 0, xdata_val = 0x0}} ret = <value optimized out> this = <value optimized out> xdata = 0x0 local = <value optimized out> __FUNCTION__ = "client3_3_finodelk_cbk" (gdb) p *frame $3 = {root = 0x7f0dd7803408, parent = 0x7f0dd79cd184, next = 0x7f0dd79cd184, prev = 0x7f0dd79ce0f8, local = 0x7f0dd38456e4, this = 0x1656850, ret = 0x7f0dd50ece40 <dht_revalidate_cbk>, ref_count = 2, lock = 1, cookie = 0x7f0dd79cdef4, complete = _gf_false, op = GF_FOP_NULL, begin = {tv_sec = 0, tv_usec = 0}, end = {tv_sec = 0, tv_usec = 0}, wind_from = 0x7f0dd5103607 "dht_lookup", wind_to = 0x7f0dd51025e0 "conf->subvolumes[i]->fops->lookup", unwind_from = 0x0, unwind_to = 0x7f0dd5102048 "dht_revalidate_cbk"} *** This bug has been marked as a duplicate of bug 1022510 *** |
Created attachment 812961 [details] Glusterfs log Description of problem: glusterfs crashes when I remove a brick containing a file that is still open and has data being written into it. It results in a "Transport endpoint is not connected" error. Version-Release number of selected component (if applicable): Gluster 3.4.1 Also found in 3.3.2 How reproducible: Steps to Reproduce: 1. Create a distributed volume on two bricks, start it, and mount it 2. Go to the mounted directory and start writing to some file: dd if=/dev/zero of=test.data 3. Identify on which brick the file is actually stored 4. Remove that brick with: gluster volume remove-brick instances brick1:/mnt/gluster/ start 5. Wait until the migration finishes 6. Stop dd with ctrl+c and type ls Actual results: glusterfs crashes with SIGSEGV and the volume is not accessible until you unmount and mount it again. However, no data are lost. Expected results: Data are migrated and the volume remains accessible. Additional info: There is no need to commit the brick removal; it fails before that. The same result occurs with 4 bricks or with 2x2 distributed-replicated bricks. I originally found this bug when running a KVM image stored on gluster, but that ended with a different backtrace. I have attached the glusterfs log, a gdb backtrace, and some variable dumps. GDB: Program received signal SIGSEGV, Segmentation fault. pthread_spin_lock (lock=0xaaaaaac2) at ../nptl/sysdeps/x86_64/../i386/pthread_spin_lock.c:35 35 ../nptl/sysdeps/x86_64/../i386/pthread_spin_lock.c: No such file or directory. 
(gdb) backtrace #0 pthread_spin_lock (lock=0xaaaaaac2) at ../nptl/sysdeps/x86_64/../i386/pthread_spin_lock.c:35 #1 0x00007f26bf835006 in fd_unref (fd=0x7f26c0904d4c) at fd.c:540 #2 0x00007f26ba395928 in client_local_wipe (local=local@entry=0x7f26c08bb1e4) at client-helpers.c:128 #3 0x00007f26ba3a1dee in client3_3_flush_cbk (req=<optimized out>, iov=<optimized out>, count=<optimized out>, myframe=0x7f26bda7a388) at client-rpc-fops.c:937 #4 0x00007f26bf5f21a4 in rpc_clnt_handle_reply (clnt=clnt@entry=0x7f26c08cecb0, pollin=0x7f26b000d230) at rpc-clnt.c:771 #5 0x00007f26bf5f252d in rpc_clnt_notify (trans=<optimized out>, mydata=0x7f26c08cece0, event=<optimized out>, data=<optimized out>) at rpc-clnt.c:891 #6 0x00007f26bf5eebf3 in rpc_transport_notify (this=this@entry=0x7f26c08de710, event=event@entry=RPC_TRANSPORT_MSG_RECEIVED, data=data@entry=0x7f26b000d230) at rpc-transport.c:497 #7 0x00007f26bb652f69 in socket_event_poll_in (this=this@entry=0x7f26c08de710) at socket.c:2118 #8 0x00007f26bb655d7c in socket_event_handler (fd=<optimized out>, idx=<optimized out>, data=0x7f26c08de710, poll_in=1, poll_out=0, poll_err=0) at socket.c:2230 #9 0x00007f26bf85d6ca in event_dispatch_epoll_handler (i=<optimized out>, events=0x7f26c089cf80, event_pool=0x7f26c08843f0) at event-epoll.c:384 #10 event_dispatch_epoll (event_pool=0x7f26c08843f0) at event-epoll.c:445 #11 0x00007f26bfca5fe5 in main (argc=5, argv=0x7fff791145f8) at glusterfsd.c:1934 (gdb) up #1 0x00007f26bf835006 in fd_unref (fd=0x7f26c0904d4c) at fd.c:540 540 fd.c: No such file or directory. (gdb) print *fd $1 = {pid = 29291, flags = 33345, refcount = 0, inode_list = {next = 0x7f26c0904d5c, prev = 0x7f26c0904d5c}, inode = 0xaaaaaaaa, lock = 1, _ctx = 0x7f26b0001a00, xl_count = 11, lk_ctx = 0x7f26b0001ac0, anonymous = _gf_false} (gdb) print *fd->inode Cannot access memory at address 0xaaaaaaaa