Bug 809352

Summary: glusterfs client crashed while running dbench
Product: [Community] GlusterFS Reporter: Shwetha Panduranga <shwetha.h.panduranga>
Component: replicate Assignee: Pranith Kumar K <pkarampu>
Status: CLOSED NOTABUG QA Contact:
Severity: high Docs Contact:
Priority: unspecified    
Version: mainline CC: gluster-bugs
Target Milestone: ---   
Target Release: ---   
Hardware: Unspecified   
OS: Unspecified   
Whiteboard:
Fixed In Version: Doc Type: Bug Fix
Doc Text:
Story Points: ---
Clone Of: Environment:
Last Closed: 2012-04-04 11:05:35 UTC Type: Bug
Regression: --- Mount Type: ---
Documentation: --- CRM:
Verified Versions: Category: ---
oVirt Team: --- RHEL 7.3 requirements from Atomic Host:
Cloudforms Team: --- Target Upstream Version:
Embargoed:
Attachments:
Description Flags
Client Log File none

Description Shwetha Panduranga 2012-04-03 07:34:50 UTC
Description of problem:
(gdb) bt full
#0  0x0000003f71e32885 in raise () from /lib64/libc.so.6
No symbol table info available.
#1  0x0000003f71e34065 in abort () from /lib64/libc.so.6
No symbol table info available.
#2  0x0000003f71e2b9fe in __assert_fail_base () from /lib64/libc.so.6
No symbol table info available.
#3  0x0000003f71e2bac0 in __assert_fail () from /lib64/libc.so.6
No symbol table info available.
#4  0x00007fa54dfe12c9 in client_lookup (frame=0x7fa551548448, this=0x5ede830, loc=0x7fa52082333c, xdata=0x144356c) at client.c:367
        ret = -1
        conf = 0x5f207b0
        proc = 0x0
        args = {loc = 0x0, fd = 0x0, linkname = 0x0, iobref = 0x0, vector = 0x0, xattr = 0x0, stbuf = 0x0, oldloc = 0x0, newloc = 0x0, name = 0x0, flock = 0x0, 
          volume = 0x0, basename = 0x0, offset = 0, mask = 0, cmd = 0, size = 0, mode = 0, rdev = 0, flags = 0, count = 0, datasync = 0, cmd_entrylk = ENTRYLK_LOCK, 
          type = ENTRYLK_RDLCK, optype = GF_XATTROP_ADD_ARRAY, valid = 0, len = 0, umask = 0, xdata = 0x0}
        op_errno = 107
        __PRETTY_FUNCTION__ = "client_lookup"
        __FUNCTION__ = "client_lookup"
#5  0x00007fa54dd8d5e0 in afr_sh_common_lookup (frame=0x7fa551355f9c, this=0x5ee18f0, loc=0x7fa52082333c, lookup_done=0x7fa54dd9b920 <afr_sh_entry_fix>, gfid=0x0, 
    flags=3, xdata=0x0) at afr-self-heal-common.c:1794
        _new = 0x7fa551548448
        old_THIS = 0x5ee18f0
        tmp_cbk = 0x7fa54dd8b8a3 <afr_sh_common_lookup_cbk>
        local = 0x7fa520823304
        i = 0
        call_count = 3
        priv = 0x5f690c0
        xattr_req = 0x144356c
        sh = 0x7fa5208258cc
        __FUNCTION__ = "afr_sh_common_lookup"
        __PRETTY_FUNCTION__ = "afr_sh_common_lookup"
#6  0x00007fa54dd9bdc2 in afr_sh_post_nonblocking_entry_cbk (frame=0x7fa551355f9c, this=0x5ee18f0) at afr-self-heal-entry.c:2380
        int_lock = 0x7fa5208233fc
        local = 0x7fa520823304
        sh = 0x7fa5208258cc
        __FUNCTION__ = "afr_sh_post_nonblocking_entry_cbk"
---Type <return> to continue, or q <return> to quit---
#7  0x00007fa54dda42bf in afr_nonblocking_entrylk_cbk (frame=0x7fa551355f9c, cookie=0x1, this=0x5ee18f0, op_ret=0, op_errno=0, xdata=0x0) at afr-lk-common.c:1215
        int_lock = 0x7fa5208233fc
        local = 0x7fa520823304
        call_count = 0
        child_index = 1
        __FUNCTION__ = "afr_nonblocking_entrylk_cbk"
#8  0x00007fa54dff626f in client3_1_entrylk_cbk (req=0x6f9697c, iov=0x6f969bc, count=1, myframe=0x7fa55155278c) at client3_1-fops.c:1592
        fn = 0x7fa54dda4005 <afr_nonblocking_entrylk_cbk>
        _parent = 0x7fa551355f9c
        old_THIS = 0x5edf1f0
        __local = 0x0
        frame = 0x7fa55155278c
        rsp = {op_ret = 0, op_errno = 0, xdata = {xdata_len = 0, xdata_val = 0x0}}
        ret = 12
        this = 0x5edf1f0
        xdata = 0x0
        __FUNCTION__ = "client3_1_entrylk_cbk"
#9  0x00007fa5524ec9fc in rpc_clnt_handle_reply (clnt=0x5f0c980, pollin=0x5e81860) at rpc-clnt.c:797
        conn = 0x5f0c9b0
        saved_frame = 0x6bf4d4c
        ret = 0
        req = 0x6f9697c
        xid = 22161
        __FUNCTION__ = "rpc_clnt_handle_reply"
#10 0x00007fa5524ecd99 in rpc_clnt_notify (trans=0x6beb210, mydata=0x5f0c9b0, event=RPC_TRANSPORT_MSG_RECEIVED, data=0x5e81860) at rpc-clnt.c:916
        conn = 0x5f0c9b0
        clnt = 0x5f0c980
        ret = -1
        req_info = 0x0
        pollin = 0x5e81860
        tv = {tv_sec = 0, tv_usec = 0}
#11 0x00007fa5524e8e7c in rpc_transport_notify (this=0x6beb210, event=RPC_TRANSPORT_MSG_RECEIVED, data=0x5e81860) at rpc-transport.c:498
        ret = -1
        __FUNCTION__ = "rpc_transport_notify"
#12 0x00007fa54ee43270 in socket_event_poll_in (this=0x6beb210) at socket.c:1686
        ret = 0
---Type <return> to continue, or q <return> to quit---
        pollin = 0x5e81860
#13 0x00007fa54ee437f4 in socket_event_handler (fd=24, idx=2, data=0x6beb210, poll_in=1, poll_out=0, poll_err=0) at socket.c:1801
        this = 0x6beb210
        priv = 0x5f46f00
        ret = 0
        __FUNCTION__ = "socket_event_handler"
#14 0x00007fa552744628 in event_dispatch_epoll_handler (event_pool=0x14353a0, events=0x1458420, i=4) at event.c:794
        event_data = 0x1458454
        handler = 0x7fa54ee435d7 <socket_event_handler>
        data = 0x6beb210
        idx = 2
        ret = -1
        __FUNCTION__ = "event_dispatch_epoll_handler"
#15 0x00007fa55274484b in event_dispatch_epoll (event_pool=0x14353a0) at event.c:856
        events = 0x1458420
        size = 5
        i = 4
        ret = 0
        __FUNCTION__ = "event_dispatch_epoll"
#16 0x00007fa552744bd6 in event_dispatch (event_pool=0x14353a0) at event.c:956
        ret = -1
        __FUNCTION__ = "event_dispatch"
#17 0x0000000000408057 in main (argc=4, argv=0x7fff2a82cbd8) at glusterfsd.c:1650
        ctx = 0x141d010
        ret = 0
        __FUNCTION__ = "main"
(gdb) Killed


Version-Release number of selected component (if applicable):
mainline

script1.sh (to run on storage node for graph changes):-
-----------------------------------------------------
#!/bin/bash

# Reproduction script: repeatedly toggle volume options to force graph
# changes on the client while dbench / stat loops run on the mounts.

# Pause so each volume change has time to propagate before the next one.
pause() {
	sleep 30
}

for ((iteration = 1; iteration <= 10; iteration++)); do
	gluster volume set dstore stat-prefetch off
	pause
	gluster volume quota dstore enable
	pause
	gluster volume set dstore stat-prefetch on
	pause
	gluster volume quota dstore disable --mode=script
	pause
done

Steps to Reproduce:
1.create a distribute-replicate volume(3X3). start the volume
2.create 2 fuse mounts on different clients
3.run "dbench -s -F -S --stat-check 10" on one fuse mount
4.run "find . | xargs stat" in a loop on other fuse mount
5.on one of the storage node run the "script1"
6.while dbench in progress bring down bricks from each replica set. 
7.bring back the brick online. 

Actual results:
dbench failed with "Transport End Point Not Connected". 

Additional info:
[2012-04-03 17:45:50.981366] I [afr-self-heal-common.c:2045:afr_self_heal_completion_cbk] 8-dstore-replicate-0: background  entry self-heal completed on /clients/client6/~dmtmp/PM
pending frames:
frame : type(1) op(LOOKUP)
frame : type(1) op(LOOKUP)
frame : type(1) op(OPENDIR)
frame : type(1) op(OPENDIR)
frame : type(1) op(LOOKUP)
frame : type(1) op(LOOKUP)
frame : type(1) op(LOOKUP)
frame : type(1) op(LOOKUP)

patchset: git://git.gluster.com/glusterfs.git
signal received: 6
time of crash: 2012-04-03 17:45:50
configuration details:
argp 1
backtrace 1
dlfcn 1
fdatasync 1
libpthread 1

Expected results:

llistxattr 1
setfsid 1
spinlock 1
epoll.h 1
xattr.h 1
st_atim.tv_nsec 1
package-string: glusterfs 3git
/lib64/libc.so.6[0x3f71e32900]
/lib64/libc.so.6(gsignal+0x35)[0x3f71e32885]

/lib64/libc.so.6(abort+0x175)[0x3f71e34065]
/lib64/libc.so.6[0x3f71e2b9fe]
/lib64/libc.so.6(__assert_perror_fail+0x0)[0x3f71e2bac0]
/usr/local/lib/glusterfs/3git/xlator/protocol/client.so(client_lookup+0xe6)[0x7fa54dfe12c9]
/usr/local/lib/glusterfs/3git/xlator/cluster/replicate.so(afr_sh_common_lookup+0x4f4)[0x7fa54dd8d5e0]
/usr/local/lib/glusterfs/3git/xlator/cluster/replicate.so(afr_sh_post_nonblocking_entry_cbk+0x17d)[0x7fa54dd9bdc2]
/usr/local/lib/glusterfs/3git/xlator/cluster/replicate.so(+0x592bf)[0x7fa54dda42bf]
/usr/local/lib/glusterfs/3git/xlator/protocol/client.so(client3_1_entrylk_cbk+0x4ae)[0x7fa54dff626f]
/usr/local/lib/libgfrpc.so.0(rpc_clnt_handle_reply+0x211)[0x7fa5524ec9fc]
/usr/local/lib/libgfrpc.so.0(rpc_clnt_notify+0x2d3)[0x7fa5524ecd99]
/usr/local/lib/libgfrpc.so.0(rpc_transport_notify+0x130)[0x7fa5524e8e7c]
/usr/local/lib/glusterfs/3git/rpc-transport/socket.so(socket_event_poll_in+0x54)[0x7fa54ee43270]
/usr/local/lib/glusterfs/3git/rpc-transport/socket.so(socket_event_handler+0x21d)[0x7fa54ee437f4]
/usr/local/lib/libglusterfs.so.0(+0x4e628)[0x7fa552744628]
/usr/local/lib/libglusterfs.so.0(+0x4e84b)[0x7fa55274484b]
/usr/local/lib/libglusterfs.so.0(event_dispatch+0x88)[0x7fa552744bd6]
/usr/local/sbin/glusterfs(main+0x238)[0x408057]
/lib64/libc.so.6(__libc_start_main+0xfd)[0x3f71e1ecdd]
/usr/local/sbin/glusterfs[0x4040c9]

Comment 1 Shwetha Panduranga 2012-04-03 07:36:50 UTC
Created attachment 574757 [details]
Client Log File