Bug 762421 (GLUSTER-689) - Segfault (11) on op UNLINK
Summary: Segfault (11) on op UNLINK
Keywords:
Status: CLOSED CURRENTRELEASE
Alias: GLUSTER-689
Product: GlusterFS
Classification: Community
Component: replicate
Version: 3.0.2
Hardware: All
OS: Linux
Priority: medium
Severity: medium
Target Milestone: ---
Assignee: Anand Avati
QA Contact:
URL:
Whiteboard:
Depends On:
Blocks:
TreeView+ depends on / blocked
 
Reported: 2010-03-02 01:39 UTC by Naoki
Modified: 2015-12-01 16:45 UTC (History)
3 users (show)

Fixed In Version:
Doc Type: Bug Fix
Doc Text:
Clone Of:
Environment:
Last Closed:
Regression: ---
Mount Type: ---
Documentation: ---
CRM:
Verified Versions:


Attachments (Terms of Use)

Description Vikas Gorur 2010-03-01 22:50:57 UTC
Do you still have the core file? Can you get a full backtrace by doing:

# gdb /path/to/glusterfs/binary /path/to/core

(gdb) bt full

Comment 1 Naoki 2010-03-01 23:04:31 UTC
Certainly.

(gdb) bt full
#0  afr_unlink_wind_cbk (frame=0x7f37b400d768, cookie=<value optimized out>, this=0x1ec8750, op_ret=0, 
    op_errno=<value optimized out>, preparent=0x0, postparent=0x0) at afr-dir-write.c:1645
        local = 0x7f37b4007150
        call_count = <value optimized out>
        child_index = 0
#1  0x00007f37bc7ddf21 in dht_unlink_cbk (frame=0x1ed5ef0, cookie=<value optimized out>, this=<value optimized out>, 
    op_ret=<value optimized out>, op_errno=<value optimized out>, preparent=<value optimized out>, 
    postparent=0x7fff14d3f330) at dht-common.c:1216
        fn = 0x7f37bc5a4980 <afr_unlink_wind_cbk>
        _parent = 0x7f37b400d768
        old_THIS = 0x1ec7b50
        __local = 0x1edab00
        __xl = 0x1ec7b50
        local = 0x1edab00
        this_call_cnt = <value optimized out>
        __FUNCTION__ = "dht_unlink_cbk"
#2  0x00007f37bca0cf8a in client_unlink_cbk (frame=0x1ed6e50, hdr=<value optimized out>, hdrlen=<value optimized out>, 
    iobuf=<value optimized out>) at client-protocol.c:4562
        fn = 0x7f37bc7dde50 <dht_unlink_cbk>
        _parent = 0x1ed5ef0
        old_THIS = 0x1ec6480
        rsp = <value optimized out>
        op_ret = 0
        op_errno = 0
        preparent = {st_dev = 5439812256340639918, st_ino = 4177921, st_nlink = 4, st_mode = 17901, st_uid = 511, 
          st_gid = 516, __pad0 = 0, st_rdev = 0, st_size = 4096, st_blksize = 4096, st_blocks = 16, st_atim = {
            tv_sec = 1267495031, tv_nsec = 0}, st_mtim = {tv_sec = 1267467606, tv_nsec = 0}, st_ctim = {
            tv_sec = 1267494728, tv_nsec = 0}, __unused = {0, 0, 0}}
        postparent = {st_dev = 5439812256340639918, st_ino = 4177921, st_nlink = 4, st_mode = 17901, st_uid = 511, 
          st_gid = 516, __pad0 = 0, st_rdev = 0, st_size = 4096, st_blksize = 4096, st_blocks = 16, st_atim = {
            tv_sec = 1267495031, tv_nsec = 0}, st_mtim = {tv_sec = 1267495072, tv_nsec = 0}, st_ctim = {
            tv_sec = 1267495072, tv_nsec = 0}, __unused = {0, 0, 0}}
#3  0x00007f37bc9f87ca in protocol_client_pollin (this=0x1ec6480, trans=0x1ed0360) at client-protocol.c:6809
        conf = 0x1ed02e0
        ret = 0
        iobuf = 0x0
        hdr = 0x1ed9da0 ""
        hdrlen = 268
#4  0x00007f37bca07568 in notify (this=0x7f37b4008550, event=<value optimized out>, data=0x1ed0360)
    at client-protocol.c:6928
        ret = <value optimized out>
        was_not_down = <value optimized out>
        conn = <value optimized out>
        conf = 0x1ed02e0
        parent = <value optimized out>
---Type <return> to continue, or q <return> to quit---
        __FUNCTION__ = "notify"
#5  0x00007f37bdbc23d3 in xlator_notify (xl=0x1ec6480, event=2, data=0x1ed0360) at xlator.c:923
        old_THIS = 0x7f37bddf26a0
        ret = <value optimized out>
#6  0x00007f37baf4d1e8 in socket_event_handler (fd=<value optimized out>, idx=2, data=0x1ed0360, poll_in=1, poll_out=0, 
    poll_err=<value optimized out>) at socket.c:829
        priv = 0x1ed0720
        ret = 0
#7  0x00007f37bdbdc82d in event_dispatch_epoll_handler (event_pool=0x1ec1090) at event.c:804
        data = <value optimized out>
        idx = <value optimized out>
        event_data = 0x1ed31c4
        handler = 0xe
#8  event_dispatch_epoll (event_pool=0x1ec1090) at event.c:867
        events = 0x1ed31c0
        i = 0
        ret = 1
        __FUNCTION__ = "event_dispatch_epoll"
#9  0x0000000000404132 in main (argc=<value optimized out>, argv=<value optimized out>) at glusterfsd.c:1413
        ctx = 0x1ec0010
        stbuf = {st_dev = 3620248, st_ino = 3639464, st_nlink = 1503232, st_mode = 3, st_uid = 0, st_gid = 0, __pad0 = 0, 
          st_rdev = 94208, st_size = 91468, st_blksize = 91468, st_blocks = 0, st_atim = {tv_sec = 5, tv_nsec = 2187264}, 
          st_mtim = {tv_sec = 2195456, tv_nsec = 2191968}, st_ctim = {tv_sec = 2208640, tv_nsec = 90112}, __unused = {3, 
            2105344, 2113536}}
        tmp_logfile = '\000' <repeats 1023 times>
        tmp_logfile_dyn = <value optimized out>
        tmp_logfilebase = <value optimized out>
        timestr = '\000' <repeats 255 times>
        utime = 1267494071
        tm = <value optimized out>
        ret = 0
        lim = {rlim_cur = 18446744073709551615, rlim_max = 18446744073709551615}
        specfp = <value optimized out>
        graph = 0x0
        trav = <value optimized out>
        fuse_volume_found = <value optimized out>
        xl_count = <value optimized out>
        pipe_fd = {6, 7}
        gf_success = 0
        gf_failure = -1
        __FUNCTION__ = "main"

Comment 2 Vikas Gorur 2010-03-02 00:40:44 UTC
Thanks for the info. The problem appears to be unchecked access of the preparent and postparent variables. A patch to fix this will soon be in the repository.

Comment 3 Naoki 2010-03-02 00:53:58 UTC
Excellent, if you need any additional traces or tests please let me know. Cheers.

Comment 4 Naoki 2010-03-02 01:39:35 UTC
Clients (F12) cannot delete over a RAID 10 system (six nodes).

The trace is:

frame : type(1) op(UNLINK)
frame : type(1) op(UNLINK)

patchset: v3.0.2
signal received: 11
time of crash: 2010-03-01 20:02:07
configuration details:
argp 1
backtrace 1
dlfcn 1
fdatasync 1
libpthread 1
llistxattr 1
setfsid 1
spinlock 1
epoll.h 1
xattr.h 1
st_atim.tv_nsec 1
package-string: glusterfs 3.0.2
/lib64/libc.so.6(+0x32740)[0x7f10f7641740]
/usr/lib64/glusterfs/3.0.2/xlator/cluster/replicate.so(afr_unlink_wind_cbk+0x10a)[0x7f10f679ea8a]
/usr/lib64/glusterfs/3.0.2/xlator/cluster/distribute.so(dht_unlink_cbk+0xd1)[0x7f10f69d7f21]
/usr/lib64/glusterfs/3.0.2/xlator/protocol/client.so(client_unlink_cbk+0x27a)[0x7f10f6c06f8a]
/usr/lib64/glusterfs/3.0.2/xlator/protocol/client.so(protocol_client_pollin+0xca)[0x7f10f6bf27ca]
/usr/lib64/glusterfs/3.0.2/xlator/protocol/client.so(notify+0xe8)[0x7f10f6c01568]
/usr/lib64/libglusterfs.so.0(xlator_notify+0x43)[0x7f10f7dbc3d3]
/usr/lib64/glusterfs/3.0.2/transport/socket.so(socket_event_handler+0xc8)[0x7f10f51471e8]
/usr/lib64/libglusterfs.so.0(+0x2e82d)[0x7f10f7dd682d]
/usr/sbin/glusterfs(main+0x852)[0x404132]
/lib64/libc.so.6(__libc_start_main+0xfd)[0x7f10f762db1d]
/usr/sbin/glusterfs[0x4026f9]


Same trace on both clients. Server logs show nothing abnormal.

Comment 5 Shehjar Tikoo 2010-03-02 07:50:31 UTC
Please use only a configuration generated by glusterfs-volgen (refer to the documentation).

Comment 6 Anand Avati 2010-03-04 09:37:24 UTC
PATCH: http://patches.gluster.com/patch/2873 in master (dht: preserve and return proper pre/postparent structures during unlink)

Comment 7 Anand Avati 2010-03-04 09:37:28 UTC
PATCH: http://patches.gluster.com/patch/2872 in release-3.0 (dht: preserve and return proper pre/postparent structures during unlink)


Note: You need to log in before you can comment on or make changes to this bug.