Bug 867338

Summary: glusterfsd crash
Product: [Red Hat Storage] Red Hat Gluster Storage
Component: glusterfs
Version: 2.0
Hardware: x86_64
OS: Linux
Status: CLOSED WORKSFORME
Severity: unspecified
Priority: high
Reporter: Vidya Sakar <vinaraya>
Assignee: Raghavendra Bhat <rabhat>
QA Contact: SATHEESARAN <sasundar>
CC: amarts, gluster-bugs, rfortier, rhs-bugs, shaines, tomoaki.sato, vbellur
Fixed In Version: glusterfs-3.4.0qa4
Doc Type: Bug Fix
Type: Bug
Clone Of: 860114
Bug Depends On: 860114
Last Closed: 2012-12-04 09:55:50 UTC

Description Vidya Sakar 2012-10-17 10:46:53 UTC
+++ This bug was initially created as a clone of Bug #860114 +++

Created attachment 616799: core file of glusterfsd

Description of problem:

patchset: git://git.gluster.com/glusterfs.git
signal received: 6
time of crash: 2012-09-21 15:54:50
configuration details:
argp 1
backtrace 1
dlfcn 1
fdatasync 1
libpthread 1
llistxattr 1
setfsid 1
spinlock 1
epoll.h 1
xattr.h 1
st_atim.tv_nsec 1
package-string: glusterfs 3.3.0
/lib64/libc.so.6[0x3beb432920]
/lib64/libc.so.6(gsignal+0x35)[0x3beb4328a5]
/lib64/libc.so.6(abort+0x175)[0x3beb434085]
/lib64/libc.so.6[0x3beb46fa37]
/lib64/libc.so.6[0x3beb475366]
/usr/lib64/libglusterfs.so.0(call_stub_destroy+0x1e5)[0x7f08d1ebf0e5]
/usr/lib64/glusterfs/3.3.0/xlator/performance/io-threads.so(iot_worker+0x15b)[0x7f08cd300d9b]
/lib64/libpthread.so.0[0x3beb807851]
/lib64/libc.so.6(clone+0x6d)[0x3beb4e767d]
---------

(gdb) bt
#0  0x0000003beb4328a5 in raise () from /lib64/libc.so.6
#1  0x0000003beb434085 in abort () from /lib64/libc.so.6
#2  0x0000003beb46fa37 in __libc_message () from /lib64/libc.so.6
#3  0x0000003beb475366 in malloc_printerr () from /lib64/libc.so.6
#4  0x00007f08d1ebf0e5 in call_stub_destroy_wind (stub=0x7f08d09b5828) at call-stub.c:3804
#5  call_stub_destroy (stub=0x7f08d09b5828) at call-stub.c:4126
#6  0x00007f08cd300d9b in iot_worker (data=0x20f95d0) at io-threads.c:131
#7  0x0000003beb807851 in start_thread () from /lib64/libpthread.so.0
#8  0x0000003beb4e767d in clone () from /lib64/libc.so.6
(gdb) up
#1  0x0000003beb434085 in abort () from /lib64/libc.so.6
(gdb) up
#2  0x0000003beb46fa37 in __libc_message () from /lib64/libc.so.6
(gdb) up
#3  0x0000003beb475366 in malloc_printerr () from /lib64/libc.so.6
(gdb) up
#4  0x00007f08d1ebf0e5 in call_stub_destroy_wind (stub=0x7f08d09b5828) at call-stub.c:3804
3804                            GF_FREE ((char *)stub->args.finodelk.volume);
(gdb) list
3799                    break;
3800            }
3801            case GF_FOP_FINODELK:
3802            {
3803                    if (stub->args.finodelk.volume)
3804                            GF_FREE ((char *)stub->args.finodelk.volume);
3805    
3806                    if (stub->args.finodelk.fd)
3807                            fd_unref (stub->args.finodelk.fd);
3808                    break;
(gdb) up
#5  call_stub_destroy (stub=0x7f08d09b5828) at call-stub.c:4126
4126                    call_stub_destroy_wind (stub);
(gdb) list
4121    call_stub_destroy (call_stub_t *stub)
4122    {
4123            GF_VALIDATE_OR_GOTO ("call-stub", stub, out);
4124    
4125            if (stub->wind) {
4126                    call_stub_destroy_wind (stub);
4127            } else {
4128                    call_stub_destroy_unwind (stub);
4129            }
4130    
(gdb) up
#6  0x00007f08cd300d9b in iot_worker (data=0x20f95d0) at io-threads.c:131
131                             call_resume (stub);
(gdb) list
126                             stub = __iot_dequeue (conf, &pri);
127                     }
128                     pthread_mutex_unlock (&conf->mutex);
129     
130                     if (stub) /* guard against spurious wakeups */
131                             call_resume (stub);
132     
133                     if (bye)
134                             break;
135             }
(gdb) 
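
Reading the trace: signal 6 is SIGABRT, and the __libc_message()/malloc_printerr() frames mean glibc's heap-consistency checker killed the process while call_stub_destroy_wind() was executing GF_FREE() on stub->args.finodelk.volume, i.e. that pointer had already been freed or never came from the allocator. Below is a minimal standalone sketch of that failure mode and the usual defensive fix; this is not glusterfs code, the struct and helper names are made up for illustration, and it only assumes the abort is glibc firing on a double or invalid free of the volume string.

/* Sketch (NOT glusterfs source): how a shared string pointer inside a
 * stub produces exactly this abort, and the defensive pattern that
 * avoids it (own copy on store, NULL after free). */
#include <stdlib.h>
#include <string.h>

struct finodelk_args {
        char *volume;          /* stands in for stub->args.finodelk.volume */
};

/* Buggy pattern: the stub stores the caller's pointer instead of a copy,
 * so the caller and the stub destroyer both think they own it. */
static void stub_store_unsafe (struct finodelk_args *args, char *volume)
{
        args->volume = volume;
}

/* Safe pattern: the stub duplicates the string and owns its copy. */
static void stub_store_safe (struct finodelk_args *args, const char *volume)
{
        args->volume = volume ? strdup (volume) : NULL;
}

static void stub_destroy (struct finodelk_args *args)
{
        if (args->volume) {
                free (args->volume);   /* mirrors GF_FREE (...volume) */
                args->volume = NULL;   /* guards against a second destroy */
        }
}

int main (void)
{
        struct finodelk_args args = { 0 };
        char *volume = strdup ("sdb1");

        /* The unsafe variant aborts with SIGABRT, like the backtrace:
         *   stub_store_unsafe (&args, volume);
         *   free (volume);         // first owner frees ...
         *   stub_destroy (&args);  // ... destroyer frees again -> abort
         */
        stub_store_safe (&args, volume);
        free (volume);                 /* caller releases its copy */
        stub_destroy (&args);          /* stub releases its own copy */
        return 0;
}

Whether the real defect here was a double free or a free of a pointer not allocated through GF_MALLOC/gf_strdup cannot be confirmed from this trace alone; comment 1 below only establishes that the master branch no longer reproduces the crash.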

Version-Release number of selected component (if applicable):

# rpm -qa | grep gluster
glusterfs-fuse-3.3.0-1.el6.x86_64
glusterfs-3.3.0-1.el6.x86_64
glusterfs-server-3.3.0-1.el6.x86_64
glusterfs-debuginfo-3.3.0-1.el6.x86_64
glusterfs-devel-3.3.0-1.el6.x86_64

How reproducible:

mkfs.xfs -f -i size=512 /dev/sdb1
mkdir -p /export/sdb1
mount -t xfs -o allocsize=4096 /dev/sdb1 /export/sdb1

gluster peer probe 10.100.10.10
gluster volume create sdb1 replica 2 transport tcp 10.100.10.11:/export/sdb1 10.100.10.10:/export/sdb1
gluster volume start sdb1
mkdir /mnt/sdb1
mount -t glusterfs -o direct-io-mode=enable 10.100.10.11:/sdb1 /mnt/sdb1

IN_FILE=/dev/zero
OUT_FILE=/mnt/sdb1/1GB
OUT_FILE2=/mnt/sdb1/1GB-2
OUT_FILE3=/mnt/sdb1/1GB-3
OUT_FILE4=/mnt/sdb1/1GB-4
OUT_FILE5=/mnt/sdb1/1GB-5
OUT_FILE6=/mnt/sdb1/1GB-6
OUT_FILE7=/mnt/sdb1/1GB-7
OUT_FILE8=/mnt/sdb1/1GB-8
OUT_FLAG=sync
BS=512

dd if=${IN_FILE} of=${OUT_FILE} bs=${BS} count=$(expr 1024 \* 1024 \* 1024 \/ ${BS}) oflag=${OUT_FLAG} &
dd if=${IN_FILE} of=${OUT_FILE2} bs=${BS} count=$(expr 1024 \* 1024 \* 1024 \/ ${BS}) oflag=${OUT_FLAG} &
dd if=${IN_FILE} of=${OUT_FILE3} bs=${BS} count=$(expr 1024 \* 1024 \* 1024 \/ ${BS}) oflag=${OUT_FLAG} &
dd if=${IN_FILE} of=${OUT_FILE4} bs=${BS} count=$(expr 1024 \* 1024 \* 1024 \/ ${BS}) oflag=${OUT_FLAG} &
dd if=${IN_FILE} of=${OUT_FILE5} bs=${BS} count=$(expr 1024 \* 1024 \* 1024 \/ ${BS}) oflag=${OUT_FLAG} &
dd if=${IN_FILE} of=${OUT_FILE6} bs=${BS} count=$(expr 1024 \* 1024 \* 1024 \/ ${BS}) oflag=${OUT_FLAG} &
dd if=${IN_FILE} of=${OUT_FILE7} bs=${BS} count=$(expr 1024 \* 1024 \* 1024 \/ ${BS}) oflag=${OUT_FLAG} &
dd if=${IN_FILE} of=${OUT_FILE8} bs=${BS} count=$(expr 1024 \* 1024 \* 1024 \/ ${BS}) oflag=${OUT_FLAG} &
wait

Actual results:

glusterfsd sometimes crashes.

Expected results:

The script finishes successfully.

Additional info:

Comment 1 Raghavendra Bhat 2012-12-04 09:55:50 UTC
This is not seen anymore with the master branch; similar tests have been running in the longevity test bed for more than 2 weeks without hitting the issue. Marking it as WORKSFORME (with Fixed In Version glusterfs-3.4.0qa4); please feel free to reopen if it is seen again.