Bug 1201621 - RDMA: Crash seen during smallfile read test.
Summary: RDMA: Crash seen during smallfile read test.
Keywords:
Status: CLOSED CURRENTRELEASE
Alias: None
Product: GlusterFS
Classification: Community
Component: rdma
Version: mainline
Hardware: x86_64
OS: Linux
Severity: high
Priority: urgent
Target Milestone: ---
Assignee: Mohammed Rafi KC
QA Contact:
URL:
Whiteboard:
Depends On: 1201613
Blocks:
TreeView+ depends on / blocked
 
Reported: 2015-03-13 06:09 UTC by Mohammed Rafi KC
Modified: 2015-05-14 06:36 UTC (History)
8 users (show)

Fixed In Version:
Doc Type: Bug Fix
Doc Text:
Clone Of: 1201613
Environment:
Last Closed: 2015-05-14 06:36:56 UTC
Regression: ---
Mount Type: ---
Documentation: ---
CRM:
Verified Versions:


Attachments (Terms of Use)

Description Mohammed Rafi KC 2015-03-13 06:09:32 UTC
+++ This bug was initially created as a clone of Bug #1201613 +++

Description of problem:

During smallfile read test I saw a crash in:

Program terminated with signal 11, Segmentation fault.
#0  0x00007f04626fd6b0 in __gf_rdma_deregister_mr (context=0x7f0450009798) at rdma.c:1667
1667	                        if (tmp->mr == mr[i]) {

Version-Release number of selected component (if applicable):

3.6.0.51

How reproducible:

1 in 25 runs so far.

Steps to Reproduce:
1.  Run smallfile reads over RDMA mount
2.
3.

Actual results:

Crash

Expected results:

No crash

Additional info:

--- Additional comment from Ben Turner on 2015-03-13 00:49:45 EDT ---

I have the core on:

[root@gqac023 tmp]# hostname
gqac023.sbu.lab.eng.bos.redhat.com
[root@gqac023 tmp]# pwd
/tmp
[root@gqac023 tmp]# ll
total 613336
-rwx------ 1 root root      8118 Mar 13 00:28 bt.PID=14416UID=0
-rwx------ 1 root root 627536163 Mar 13 00:28 core.dump.PID=14416UID=0
-rw-r--r-- 1 root root       112 Mar 13 00:28 core.info.PID=14416UID=0

--- Additional comment from Ben Turner on 2015-03-13 01:16:39 EDT ---

(gdb) bt full
#0  0x00007f04626fd6b0 in __gf_rdma_deregister_mr (context=0x7f0450009798) at rdma.c:1667
        tmp = 0xbabebabe
        i = <value optimized out>
        found = 0
#1  __gf_rdma_request_context_destroy (context=0x7f0450009798) at rdma.c:1733
        peer = 0x7f045c05fce0
        priv = <value optimized out>
        device = 0x7f0450000b70
        ret = 0
        __FUNCTION__ = "__gf_rdma_request_context_destroy"
#2  0x00007f04626fe188 in gf_rdma_pollin_notify (peer=0x7f045c05fce0, post=<value optimized out>) at rdma.c:3712
        ret = <value optimized out>
        msg_type = <value optimized out>
        rpc_req = <value optimized out>
        request_context = 0x7f0450009798
        request_info = {xid = 34619545, prognum = 1298437, progver = 330, procnum = 27, rpc_req = 0x7f0460e935a4, rsp = {rsphdr = 0x0, 
            rsphdr_count = 0, rsp_payload = 0x0, rsp_payload_count = 0, rsp_iobref = 0x0}}
        priv = 0x7f045c05fcd0
        ptr = <value optimized out>
        pollin = 0x7f0430741e50
        __FUNCTION__ = "gf_rdma_pollin_notify"
#3  0x00007f04626fe431 in gf_rdma_recv_reply (peer=0x7f045c05fce0, post=0x7f0450f956e0) at rdma.c:3813
        ret = 0
        header = <value optimized out>
        reply_info = 0x7f045001247c
        wc_array = <value optimized out>
        i = <value optimized out>
        ptr = <value optimized out>
        ctx = <value optimized out>
        request_info = {xid = 34619545, prognum = 1298437, progver = 330, procnum = 27, rpc_req = 0x7f0460e935a4, rsp = {rsphdr = 0x0, 
            rsphdr_count = 0, rsp_payload = 0x0, rsp_payload_count = 0, rsp_iobref = 0x0}}
        rpc_req = <value optimized out>
        __FUNCTION__ = "gf_rdma_recv_reply"
#4  0x00007f04626fe83b in gf_rdma_process_recv (peer=0x7f045c05fce0, wc=<value optimized out>) at rdma.c:3946
        post = 0x7f0450f956e0
        readch = 0x0
        ret = <value optimized out>
        ptr = <value optimized out>
        msg_type = <value optimized out>
        header = 0x7f0452cea000
        priv = 0x7f045c05fcd0
        __FUNCTION__ = "gf_rdma_process_recv"
#5  0x00007f04626fea25 in gf_rdma_recv_completion_proc (data=0x7f0450018cb0) at rdma.c:4083
        chan = <value optimized out>
        device = 0x7f0450000b70
        post = 0x7f0450f956e0
        peer = <value optimized out>
        event_cq = 0x7f0450018cd0
        wc = {{wr_id = 139656515114720, status = IBV_WC_SUCCESS, opcode = IBV_WC_RECV, vendor_err = 0, byte_len = 52, imm_data = 0, 
            qp_num = 132, src_qp = 1, wc_flags = 0, pkey_index = 0, slid = 5, sl = 0 '\000', dlid_path_bits = 0 '\000'}, {
            wr_id = 139656515115440, status = IBV_WC_SUCCESS, opcode = IBV_WC_RECV, vendor_err = 0, byte_len = 64, imm_data = 0, 
            qp_num = 132, src_qp = 1, wc_flags = 0, pkey_index = 0, slid = 5, sl = 0 '\000', dlid_path_bits = 0 '\000'}, {
            wr_id = 139656515162224, status = IBV_WC_SUCCESS, opcode = IBV_WC_RECV, vendor_err = 0, byte_len = 176, imm_data = 0, 
            qp_num = 132, src_qp = 1, wc_flags = 0, pkey_index = 0, slid = 5, sl = 0 '\000', dlid_path_bits = 0 '\000'}, {
            wr_id = 139656515180768, status = IBV_WC_SUCCESS, opcode = IBV_WC_RECV, vendor_err = 0, byte_len = 192, imm_data = 0, 
            qp_num = 132, src_qp = 1, wc_flags = 0, pkey_index = 0, slid = 5, sl = 0 '\000', dlid_path_bits = 0 '\000'}, {
            wr_id = 139656515213152, status = IBV_WC_SUCCESS, opcode = IBV_WC_RECV, vendor_err = 0, byte_len = 72, imm_data = 0, 
            qp_num = 130, src_qp = 1, wc_flags = 0, pkey_index = 0, slid = 4, sl = 0 '\000', dlid_path_bits = 0 '\000'}, {
            wr_id = 139656515542080, status = IBV_WC_SUCCESS, opcode = IBV_WC_RECV, vendor_err = 0, byte_len = 72, imm_data = 0, 
            qp_num = 130, src_qp = 1, wc_flags = 0, pkey_index = 0, slid = 4, sl = 0 '\000', dlid_path_bits = 0 '\000'}, {
            wr_id = 139656517359392, status = IBV_WC_SUCCESS, opcode = IBV_WC_RECV, vendor_err = 0, byte_len = 192, imm_data = 0, 
            qp_num = 132, src_qp = 1, wc_flags = 0, pkey_index = 0, slid = 5, sl = 0 '\000', dlid_path_bits = 0 '\000'}, {
            wr_id = 139656517358736, status = IBV_WC_SUCCESS, opcode = IBV_WC_RECV, vendor_err = 0, byte_len = 176, imm_data = 0, 
            qp_num = 132, src_qp = 1, wc_flags = 0, pkey_index = 0, slid = 5, sl = 0 '\000', dlid_path_bits = 0 '\000'}, {
            wr_id = 139656517358208, status = IBV_WC_SUCCESS, opcode = IBV_WC_RECV, vendor_err = 0, byte_len = 176, imm_data = 0, 
            qp_num = 132, src_qp = 1, wc_flags = 0, pkey_index = 0, slid = 5, sl = 0 '\000', dlid_path_bits = 0 '\000'}, {
            wr_id = 139656517357680, status = IBV_WC_SUCCESS, opcode = IBV_WC_RECV, vendor_err = 0, byte_len = 176, imm_data = 0, 
            qp_num = 130, src_qp = 1, wc_flags = 0, pkey_index = 0, slid = 4, sl = 0 '\000', dlid_path_bits = 0 '\000'}}
        event_ctx = 0x7f0450000b70
        ret = <value optimized out>
        num_wr = 1
        index = <value optimized out>
---Type <return> to continue, or q <return> to quit---
        failed = 0 '\000'
        __FUNCTION__ = "gf_rdma_recv_completion_proc"
#6  0x0000003acfc079d1 in start_thread (arg=0x7f0455fff700) at pthread_create.c:301
        __res = <value optimized out>
        pd = 0x7f0455fff700
        now = <value optimized out>
        unwind_buf = {cancel_jmp_buf = {{jmp_buf = {139656599435008, 3437653819718065564, 252595782496, 139656599435712, 0, 3, 
                -3333697194911900260, 3441137622062161308}, mask_was_saved = 0}}, priv = {pad = {0x0, 0x0, 0x0, 0x0}, data = {prev = 0x0, 
              cleanup = 0x0, canceltype = 0}}}
        not_first_call = <value optimized out>
        pagesize_m1 = <value optimized out>
        sp = <value optimized out>
        freesize = <value optimized out>
#7  0x0000003acf8e8b6d in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:115

Comment 1 Anand Avati 2015-03-13 06:10:54 UTC
REVIEW: http://review.gluster.org/9872 (rdma:changing list iteration to safe mode) posted (#1) for review on master by mohammed rafi  kc (rkavunga@redhat.com)

Comment 3 Anand Avati 2015-03-15 16:07:08 UTC
COMMIT: http://review.gluster.org/9872 committed in master by Vijay Bellur (vbellur@redhat.com) 
------
commit b3f63120e8f2b6f99d44ebe244aafafeb6ac890e
Author: Mohammed Rafi KC <rkavunga@redhat.com>
Date:   Fri Mar 13 11:37:14 2015 +0530

    rdma:changing list iteration to safe mode
    
    Change-Id: I2299378f02a5577a8bf2874664ba79e92c3811b5
    BUG: 1201621
    Signed-off-by: Mohammed Rafi KC <rkavunga@redhat.com>
    Reviewed-on: http://review.gluster.org/9872
    Reviewed-by: Krishnan Parthasarathi <kparthas@redhat.com>
    Tested-by: Gluster Build System <jenkins@build.gluster.com>
    Reviewed-by: Raghavendra Talur <rtalur@redhat.com>
    Reviewed-by: Vijay Bellur <vbellur@redhat.com>

Comment 4 Mohammed Rafi KC 2015-05-14 06:36:56 UTC
Fix available from 3.6.3 onwards.


Note: You need to log in before you can comment on or make changes to this bug.