Bug 762589 (GLUSTER-857)

Summary: Crash in afr_sh_entry_expunge_entry_cbk
Product: [Community] GlusterFS Reporter: Anush Shetty <anush>
Component: replicateAssignee: Pranith Kumar K <pkarampu>
Status: CLOSED CURRENTRELEASE QA Contact:
Severity: high Docs Contact:
Priority: low    
Version: mainlineCC: gluster-bugs, shehjart, vijay
Target Milestone: ---   
Target Release: ---   
Hardware: All   
OS: Linux   
Whiteboard:
Fixed In Version: Doc Type: Bug Fix
Doc Text:
Story Points: ---
Clone Of: Environment:
Last Closed: Type: ---
Regression: RTP Mount Type: ---
Documentation: --- CRM:
Verified Versions: Category: ---
oVirt Team: --- RHEL 7.3 requirements from Atomic Host:
Cloudforms Team: --- Target Upstream Version:

Description Anush Shetty 2010-04-26 11:44:13 UTC
While running dbench on Distribute-replicate + NFS Xlator setup with error-gen, I got this crash

(gdb) bt full
#0  0x00007fe96b13e4dc in afr_sh_entry_expunge_entry_cbk (expunge_frame=0x7fe96400c4e8, cookie=0x1, this=0x164e990, op_ret=-1, op_errno=2, inode=0x0, 
    buf=0x0, x=0x0, postparent=0x0) at afr-self-heal-entry.c:885
        priv = 0x1652b10
        expunge_local = 0x7fe964010310
        expunge_sh = 0x7fe964011b30
        source = 1
        frame = 0x7fe96400f638
        active_src = 2
        __FUNCTION__ = "afr_sh_entry_expunge_entry_cbk"
#1  0x00007fe96b359c95 in error_gen_lookup (frame=0x7fe964000e90, this=0x164bd30, loc=0x7fe964010340, xattr_req=0x0) at error-gen.c:402
        fn = 0x7fe96b13e3b5 <afr_sh_entry_expunge_entry_cbk>
        _parent = 0x7fe96400c4e8
        old_THIS = 0x164bd30
        op_errno = 2
        egp = 0x1654b60
        enable = 1
        __FUNCTION__ = "error_gen_lookup"
#2  0x00007fe96b13ecd7 in afr_sh_entry_expunge_entry (frame=0x7fe96400f638, this=0x164e990, name=0x7fe964019380 "~dmtmp") at afr-self-heal-entry.c:972
        _new = 0x7fe964000e90
        old_THIS = 0x164e990
        tmp_cbk = 0x7fe96b13e3b5 <afr_sh_entry_expunge_entry_cbk>
        priv = 0x1652b10
        local = 0x7fe964034b90
        sh = 0x7fe9640363b0
        ret = 0
        expunge_frame = 0x7fe96400c4e8
        expunge_local = 0x7fe964010310
        expunge_sh = 0x7fe964011b30
        active_src = 2
        source = 1
        op_errno = 0
        __FUNCTION__ = "afr_sh_entry_expunge_entry"
#3  0x00007fe96b13efb1 in afr_sh_entry_expunge_readdir_cbk (frame=0x7fe96400f638, cookie=0x7fe96400c360, this=0x164e990, op_ret=4, op_errno=2, 
---Type <return> to continue, or q <return> to quit---
    entries=0x7ffff18df0b0) at afr-self-heal-entry.c:1039
        priv = 0x1652b10
        local = 0x7fe964034b90
        sh = 0x7fe9640363b0
        entry = 0x7fe9640192f0
        last_offset = 2147483647
        active_src = 2
        entry_count = 4
        __FUNCTION__ = "afr_sh_entry_expunge_readdir_cbk"
#4  0x00007fe96b58a280 in client_readdir_cbk (frame=0x7fe96400c360, hdr=0x7fe96401ab70, hdrlen=551, iobuf=0x0) at client-protocol.c:4252
        fn = 0x7fe96b13ed23 <afr_sh_entry_expunge_readdir_cbk>
        _parent = 0x7fe96400f638
        old_THIS = 0x164ced0
        rsp = 0x7fe96401abdc
        op_ret = 4
        op_errno = 2
        buf_size = 439
        entries = {{list = {next = 0x7fe9640192f0, prev = 0x7fe96400ba40}, {next = 0x7fe9640192f0, prev = 0x7fe96400ba40}}, d_ino = 140737246002544, 
          d_off = 0, d_len = 0, d_type = 0, d_stat = {ia_ino = 140640516126540, ia_gen = 29, ia_dev = 140640505109752, ia_type = 4052611408, ia_prot = {
              suid = 1 '\001', sgid = 1 '\001', sticky = 1 '\001', owner = {read = 1 '\001', write = 1 '\001', exec = 1 '\001'}, group = {read = 0 '\000', 
                write = 0 '\000', exec = 0 '\000'}, other = {read = 0 '\000', write = 0 '\000', exec = 0 '\000'}}, ia_nlink = 3048, ia_uid = 0, 
            ia_gid = 4052614404, ia_rdev = 23415456, ia_size = 140640381879136, ia_blksize = 23415464, ia_blocks = 140640381951728, ia_atime = 4008636142, 
            ia_atime_nsec = 0, ia_mtime = 4052611456, ia_mtime_nsec = 32767, ia_ctime = 1800906549, ia_ctime_nsec = 32745}, 
          d_name = 0x7ffff18df140 "\320\316d\001"}
#5  0x00007fe96b58fde6 in protocol_client_interpret (this=0x164ced0, trans=0x1654500, hdr_p=0x7fe96401ab70 "", hdrlen=551, iobuf=0x0)
    at client-protocol.c:6093
        ret = -1
        frame = 0x7fe96400c360
        hdr = 0x7fe96401ab70
        callid = 3048
        type = 4
        op = 29
        conn = 0x1654a20
---Type <return> to continue, or q <return> to quit---
        __FUNCTION__ = "protocol_client_interpret"
#6  0x00007fe96b590b67 in protocol_client_pollin (this=0x164ced0, trans=0x1654500) at client-protocol.c:6391
        conf = 0x1654000
        ret = 0
        iobuf = 0x0
        hdr = 0x7fe96401ab70 ""
        hdrlen = 551
#7  0x00007fe96b591160 in notify (this=0x164ced0, event=2, data=0x1654500) at client-protocol.c:6510
        i = 0
        ret = -1
        child_down = 1
        was_not_down = 0
        trans = 0x1654500
        conn = 0x0
        conf = 0x1654000
        parent = 0x0
        __FUNCTION__ = "notify"
#8  0x00007fe96c7431c8 in xlator_notify (xl=0x164ced0, event=2, data=0x1654500) at xlator.c:919
        old_THIS = 0x7fe96c988180
        ret = 0
#9  0x00007fe969fe945a in socket_event_poll_in (this=0x1654500) at socket.c:731
        ret = 0
#10 0x00007fe969fe978d in socket_event_handler (fd=13, idx=5, data=0x1654500, poll_in=1, poll_out=0, poll_err=0) at socket.c:831
        this = 0x1654500
        priv = 0x1654950
        ret = 0
#11 0x00007fe96c76a736 in event_dispatch_epoll_handler (event_pool=0x1644310, events=0x1658400, i=2) at event.c:804
        event_data = 0x165841c
        handler = 0x7fe969fe96c9 <socket_event_handler>
        data = 0x1654500
        idx = 5
        ret = -1
        __FUNCTION__ = "event_dispatch_epoll_handler"
---Type <return> to continue, or q <return> to quit---
#12 0x00007fe96c76a928 in event_dispatch_epoll (event_pool=0x1644310) at event.c:867
        events = 0x1658400
        size = 7
        i = 2
        ret = 0
        __FUNCTION__ = "event_dispatch_epoll"
#13 0x00007fe96c76ac47 in event_dispatch (event_pool=0x1644310) at event.c:975
        ret = -1
        __FUNCTION__ = "event_dispatch"
#14 0x0000000000406cb0 in main (argc=7, argv=0x7ffff18dfd78) at glusterfsd.c:1494
        ctx = 0x1644010
        cmd_args = 0x1644010
        pool = 0x16443b0
        stbuf = {st_dev = 18865251664, st_ino = 140640523546032, st_nlink = 18446712307976241152, st_mode = 1824152088, st_uid = 32745, st_gid = 1, 
          __pad0 = 0, st_rdev = 140640521439337, st_size = 140737246000976, st_blksize = -31765909471232, st_blocks = -12709036284313600, st_atim = {
            tv_sec = 0, tv_nsec = 0}, st_mtim = {tv_sec = 0, tv_nsec = 0}, st_ctim = {tv_sec = 8, tv_nsec = 18865251666}, __unused = {140737246001904, 
            140737246002016, 140640528258368}}
        tmp_logfile = '\000' <repeats 1023 times>
        tmp_logfile_dyn = 0x0
        tmp_logfilebase = 0x0
        timestr = '\000' <repeats 255 times>
        utime = 1272281644
        tm = 0x0
        ret = 0
        lim = {rlim_cur = 18446744073709551615, rlim_max = 18446744073709551615}
        specfp = 0x16452e0
        graph = 0x1650ba0
        trav = 0x0
        fuse_volume_found = 0
        xl_count = 8
        process_mode = 1 '\001'
        pipe_fd = {6, 7}
        gf_success = 0
---Type <return> to continue, or q <return> to quit---
        gf_failure = -1
        __FUNCTION__ = "main"

NFS Server log file:

[2010-04-26 17:05:42] T [nfs3.c:1027:nfs3_lookup] nfs-nfsv3: FH to Volume: qr
[2010-04-26 17:05:42] T [nfs3-helpers.c:2766:nfs3_fh_resolve_entry_hard] nfs-nfsv3: FH hard resolution: ino: 21299501, gen: 5464390344130953834, entry: ~dmtmp, hashidx: 0
[2010-04-26 17:05:42] T [nfs3-helpers.c:2774:nfs3_fh_resolve_entry_hard] nfs-nfsv3: Entry needs lookup: /clients/client1/~dmtmp
[2010-04-26 17:05:42] T [nfs-fops.c:256:nfs_fop_lookup] nfs: Lookup: /clients/client1/~dmtmp
[2010-04-26 17:05:42] D [afr-self-heal-entry.c:1421:afr_sh_entry_impunge_mkdir] dist: creating missing directory /clients/client5/~dmtmp/PWRPNT on client1
[2010-04-26 17:05:42] T [afr-self-heal-entry.c:1033:afr_sh_entry_expunge_readdir_cbk] dist: readdir'ed 4 entries from client3
[2010-04-26 17:05:42] T [afr-self-heal-entry.c:947:afr_sh_entry_expunge_entry] dist: inspecting existance of ~dmtmp under /clients/client6
[2010-04-26 17:05:42] T [afr-self-heal-entry.c:970:afr_sh_entry_expunge_entry] dist: looking up /clients/client6/~dmtmp on errgen
[2010-04-26 17:05:42] E [error-gen.c:401:error_gen_lookup] errgen: unwind(-1, No such file or directory)
[2010-04-26 17:05:42] T [afr-self-heal-entry.c:883:afr_sh_entry_expunge_entry_cbk] dist: missing entry /clients/client6/~dmtmp on errgen
pending frames:

patchset: git://git.sv.gnu.org/gluster.git
signal received: 11
time of crash: 2010-04-26 17:05:42
configuration details:
argp 1
backtrace 1
dlfcn 1
fdatasync 1
libpthread 1
llistxattr 1
setfsid 1
spinlock 1
epoll.h 1
xattr.h 1
st_atim.tv_nsec 1
package-string: glusterfs 3.0.0git
/lib/libc.so.6[0x7fe96bfcc530]
/gluster/nfs-git/lib/glusterfs/3.0.0git/xlator/cluster/replicate.so(afr_sh_entry_expunge_entry_cbk+0x127)[0x7fe96b13e4dc]
/gluster/nfs-git/lib/glusterfs/3.0.0git/xlator/debug/error-gen.so(error_gen_lookup+0x1af)[0x7fe96b359c95]
/gluster/nfs-git/lib/glusterfs/3.0.0git/xlator/cluster/replicate.so(afr_sh_entry_expunge_entry+0x5ea)[0x7fe96b13ecd7]
/gluster/nfs-git/lib/glusterfs/3.0.0git/xlator/cluster/replicate.so(afr_sh_entry_expunge_readdir_cbk+0x28e)[0x7fe96b13efb1]
/gluster/nfs-git/lib/glusterfs/3.0.0git/xlator/protocol/client.so(client_readdir_cbk+0x1c3)[0x7fe96b58a280]
/gluster/nfs-git/lib/glusterfs/3.0.0git/xlator/protocol/client.so(protocol_client_interpret+0x1df)[0x7fe96b58fde6]
/gluster/nfs-git/lib/glusterfs/3.0.0git/xlator/protocol/client.so(protocol_client_pollin+0xc6)[0x7fe96b590b67]
/gluster/nfs-git/lib/glusterfs/3.0.0git/xlator/protocol/client.so(notify+0x158)[0x7fe96b591160]
/gluster/nfs-git/lib/libglusterfs.so.0(xlator_notify+0xd8)[0x7fe96c7431c8]
/gluster/nfs-git/lib/glusterfs/3.0.0git/transport/socket.so(socket_event_poll_in+0x46)[0x7fe969fe945a]
/gluster/nfs-git/lib/glusterfs/3.0.0git/transport/socket.so(socket_event_handler+0xc4)[0x7fe969fe978d]
/gluster/nfs-git/lib/libglusterfs.so.0[0x7fe96c76a736]
/gluster/nfs-git/lib/libglusterfs.so.0[0x7fe96c76a928]
/gluster/nfs-git/lib/libglusterfs.so.0(event_dispatch+0x74)[0x7fe96c76ac47]
/gluster/nfs-git/sbin/glusterfs(main+0x1280)[0x406cb0]
/lib/libc.so.6(__libc_start_main+0xfd)[0x7fe96bfb7abd]
/gluster/nfs-git/sbin/glusterfs[0x402bb9]

Complete core and log file moved to /share/tickets/bugid

Comment 1 Shehjar Tikoo 2010-05-04 05:58:27 UTC
Re-assigning it to myself since this was part of NFS tests and unless seen otherwise, the culprit can be assumed to be nfs. Anyway, if I need replicate help I'll ask.

Comment 2 Shehjar Tikoo 2010-05-05 05:22:19 UTC
*** Bug 879 has been marked as a duplicate of this bug. ***

Comment 3 Shehjar Tikoo 2010-05-05 05:22:24 UTC
*** Bug 858 has been marked as a duplicate of this bug. ***

Comment 4 Shehjar Tikoo 2010-05-05 05:32:31 UTC
This and another report of crash is due to error gen. I am not fixing this ASAP  because as a first step to handle error-gen generated errors numbers, I need to translate each and every error number correctly into NFSv3 error numbers. This translation does not happen accurately right now because each POSIX error number needs to be translated to a NFSv3 errno depending on the v3 request that is being handled when the POSIX errno was returned. Since most basic error numbers are handled properly, I am going to leave this translation for later.

Comment 5 Shehjar Tikoo 2010-05-05 05:39:28 UTC
*** Bug 855 has been marked as a duplicate of this bug. ***

Comment 6 Shehjar Tikoo 2010-05-05 05:42:27 UTC
I see that the crash in afr_unlink path happens even when error-gen is not loaded. This could be a pointer to different problem also but since the symptoms are similar I am marking all as duplicate of this bug to keep a track of all the info related to this problem.

Comment 7 Anand Avati 2010-05-08 10:37:31 UTC
PATCH: http://patches.gluster.com/patch/3235 in master (nfs: Handle failed parent loc filling)

Comment 8 Anand Avati 2010-05-08 10:37:37 UTC
PATCH: http://patches.gluster.com/patch/3236 in master (nfs3: Use standard macro to return ESTALE)

Comment 9 Anand Avati 2010-05-10 06:17:59 UTC
PATCH: http://patches.gluster.com/patch/3243 in master (nfs: Fine tune nfs_entry_loc_fill return values)

Comment 10 Anand Avati 2010-05-13 16:12:05 UTC
PATCH: http://patches.gluster.com/patch/3260 in master (nfs3: Final unref only on successful remove)

Comment 11 Anush Shetty 2010-05-14 03:26:17 UTC
This bug is  reproducible on a non-nfs setup too. I was able to reproduce this with error-gen

/gluster/git3.0/lib/glusterfs/3.0.0git/xlator/cluster/replicate.so(afr_sh_entry_expunge_entry_cbk+0x13c)[0x7ff9befc78dc]
/gluster/git3.0/lib/glusterfs/3.0.0git/xlator/debug/error-gen.so(error_gen_lookup+0x184)[0x7ff9bf1e1e4b]
/gluster/git3.0/lib/glusterfs/3.0.0git/xlator/cluster/replicate.so(afr_sh_entry_expunge_entry+0x545)[0x7ff9befc7fc2]
/gluster/git3.0/lib/glusterfs/3.0.0git/xlator/cluster/replicate.so(afr_sh_entry_expunge_readdir_cbk+0x28e)[0x7ff9befc829c]
/gluster/git3.0/lib/glusterfs/3.0.0git/xlator/protocol/client.so(client_readdir_cbk+0x195)[0x7ff9bf4107e0]
/gluster/git3.0/lib/glusterfs/3.0.0git/xlator/protocol/client.so(protocol_client_interpret+0x1df)[0x7ff9bf416778]
/gluster/git3.0/lib/glusterfs/3.0.0git/xlator/protocol/client.so(protocol_client_pollin+0xc6)[0x7ff9bf4174f9]
/gluster/git3.0/lib/glusterfs/3.0.0git/xlator/protocol/client.so(notify+0x158)[0x7ff9bf417af2]
/gluster/git3.0/lib/libglusterfs.so.0(xlator_notify+0xd8)[0x7ff9c05c9d7a]
/gluster/git3.0/lib/glusterfs/3.0.0git/transport/socket.so(socket_event_poll_in+0x46)[0x7ff9be15645a]
/gluster/git3.0/lib/glusterfs/3.0.0git/transport/socket.so(socket_event_handler+0xc4)[0x7ff9be15678d]
/gluster/git3.0/lib/libglusterfs.so.0[0x7ff9c05efa3e]
/gluster/git3.0/lib/libglusterfs.so.0[0x7ff9c05efc30]
/gluster/git3.0/lib/libglusterfs.so.0(event_dispatch+0x74)[0x7ff9c05eff4f]
/gluster/git3.0/sbin/glusterfs(main+0x1110)[0x40688c]
/lib/libc.so.6(__libc_start_main+0xfd)[0x7ff9bfe3eabd]
/gluster/git3.0/sbin/glusterfs[0x402ab9]

Comment 12 Anush Shetty 2010-05-14 03:31:02 UTC
Was running Dbench,doing self-heal from the client and also bringing the server up and down

Comment 13 Shehjar Tikoo 2010-05-14 04:12:35 UTC
Bugs marked dup of this one have been re-opened. I was under the impression that this too was related to bz 855 but it is not. The patches submitted as part of this bug are actually fixes for 855 and its duplicates.

Re-assigning to Vikas, as Anush us able to reproduce this for AFR only config.

Comment 14 Anand Avati 2010-08-13 03:39:05 UTC
PATCH: http://patches.gluster.com/patch/4097 in master (cluster/afr: Avoid Null de-reference when post parent is NULL)

Comment 15 Anand Avati 2010-08-13 03:39:23 UTC
PATCH: http://patches.gluster.com/patch/4096 in release-3.0 (cluster/afr: Avoid Null de-reference when post parent is NULL)

Comment 16 Pranith Kumar K 2010-08-13 10:39:58 UTC
when Error gen unwinds with postparent as NULL, afr should not de-reference it.