Bug 1201621

Summary: RDMA: Crash seen during smallfile read test.
Product: [Community] GlusterFS Reporter: Mohammed Rafi KC <rkavunga>
Component: rdma Assignee: Mohammed Rafi KC <rkavunga>
Status: CLOSED CURRENTRELEASE QA Contact:
Severity: urgent Docs Contact:
Priority: high    
Version: mainline CC: aavati, bturner, bugs, gluster-bugs, nlevinki, rkavunga, rwheeler, vbellur
Target Milestone: --- Keywords: Triaged
Target Release: ---   
Hardware: x86_64   
OS: Linux   
Whiteboard:
Fixed In Version: Doc Type: Bug Fix
Doc Text:
Story Points: ---
Clone Of: 1201613 Environment:
Last Closed: 2015-05-14 06:36:56 UTC Type: Bug
Regression: --- Mount Type: ---
Documentation: --- CRM:
Verified Versions: Category: ---
oVirt Team: --- RHEL 7.3 requirements from Atomic Host:
Cloudforms Team: --- Target Upstream Version:
Embargoed:
Bug Depends On: 1201613    
Bug Blocks:    

Description Mohammed Rafi KC 2015-03-13 06:09:32 UTC
+++ This bug was initially created as a clone of Bug #1201613 +++

Description of problem:

During smallfile read test I saw a crash in:

Program terminated with signal 11, Segmentation fault.
#0  0x00007f04626fd6b0 in __gf_rdma_deregister_mr (context=0x7f0450009798) at rdma.c:1667
1667	                        if (tmp->mr == mr[i]) {

Version-Release number of selected component (if applicable):

3.6.0.51

How reproducible:

1 in 25 runs so far.

Steps to Reproduce:
1.  Run smallfile reads over RDMA mount
2.
3.

Actual results:

Crash

Expected results:

No crash

Additional info:

--- Additional comment from Ben Turner on 2015-03-13 00:49:45 EDT ---

I have the core on:

[root@gqac023 tmp]# hostname
gqac023.sbu.lab.eng.bos.redhat.com
[root@gqac023 tmp]# pwd
/tmp
[root@gqac023 tmp]# ll
total 613336
-rwx------ 1 root root      8118 Mar 13 00:28 bt.PID=14416UID=0
-rwx------ 1 root root 627536163 Mar 13 00:28 core.dump.PID=14416UID=0
-rw-r--r-- 1 root root       112 Mar 13 00:28 core.info.PID=14416UID=0

--- Additional comment from Ben Turner on 2015-03-13 01:16:39 EDT ---

(gdb) bt full
#0  0x00007f04626fd6b0 in __gf_rdma_deregister_mr (context=0x7f0450009798) at rdma.c:1667
        tmp = 0xbabebabe
        i = <value optimized out>
        found = 0
#1  __gf_rdma_request_context_destroy (context=0x7f0450009798) at rdma.c:1733
        peer = 0x7f045c05fce0
        priv = <value optimized out>
        device = 0x7f0450000b70
        ret = 0
        __FUNCTION__ = "__gf_rdma_request_context_destroy"
#2  0x00007f04626fe188 in gf_rdma_pollin_notify (peer=0x7f045c05fce0, post=<value optimized out>) at rdma.c:3712
        ret = <value optimized out>
        msg_type = <value optimized out>
        rpc_req = <value optimized out>
        request_context = 0x7f0450009798
        request_info = {xid = 34619545, prognum = 1298437, progver = 330, procnum = 27, rpc_req = 0x7f0460e935a4, rsp = {rsphdr = 0x0, 
            rsphdr_count = 0, rsp_payload = 0x0, rsp_payload_count = 0, rsp_iobref = 0x0}}
        priv = 0x7f045c05fcd0
        ptr = <value optimized out>
        pollin = 0x7f0430741e50
        __FUNCTION__ = "gf_rdma_pollin_notify"
#3  0x00007f04626fe431 in gf_rdma_recv_reply (peer=0x7f045c05fce0, post=0x7f0450f956e0) at rdma.c:3813
        ret = 0
        header = <value optimized out>
        reply_info = 0x7f045001247c
        wc_array = <value optimized out>
        i = <value optimized out>
        ptr = <value optimized out>
        ctx = <value optimized out>
        request_info = {xid = 34619545, prognum = 1298437, progver = 330, procnum = 27, rpc_req = 0x7f0460e935a4, rsp = {rsphdr = 0x0, 
            rsphdr_count = 0, rsp_payload = 0x0, rsp_payload_count = 0, rsp_iobref = 0x0}}
        rpc_req = <value optimized out>
        __FUNCTION__ = "gf_rdma_recv_reply"
#4  0x00007f04626fe83b in gf_rdma_process_recv (peer=0x7f045c05fce0, wc=<value optimized out>) at rdma.c:3946
        post = 0x7f0450f956e0
        readch = 0x0
        ret = <value optimized out>
        ptr = <value optimized out>
        msg_type = <value optimized out>
        header = 0x7f0452cea000
        priv = 0x7f045c05fcd0
        __FUNCTION__ = "gf_rdma_process_recv"
#5  0x00007f04626fea25 in gf_rdma_recv_completion_proc (data=0x7f0450018cb0) at rdma.c:4083
        chan = <value optimized out>
        device = 0x7f0450000b70
        post = 0x7f0450f956e0
        peer = <value optimized out>
        event_cq = 0x7f0450018cd0
        wc = {{wr_id = 139656515114720, status = IBV_WC_SUCCESS, opcode = IBV_WC_RECV, vendor_err = 0, byte_len = 52, imm_data = 0, 
            qp_num = 132, src_qp = 1, wc_flags = 0, pkey_index = 0, slid = 5, sl = 0 '\000', dlid_path_bits = 0 '\000'}, {
            wr_id = 139656515115440, status = IBV_WC_SUCCESS, opcode = IBV_WC_RECV, vendor_err = 0, byte_len = 64, imm_data = 0, 
            qp_num = 132, src_qp = 1, wc_flags = 0, pkey_index = 0, slid = 5, sl = 0 '\000', dlid_path_bits = 0 '\000'}, {
            wr_id = 139656515162224, status = IBV_WC_SUCCESS, opcode = IBV_WC_RECV, vendor_err = 0, byte_len = 176, imm_data = 0, 
            qp_num = 132, src_qp = 1, wc_flags = 0, pkey_index = 0, slid = 5, sl = 0 '\000', dlid_path_bits = 0 '\000'}, {
            wr_id = 139656515180768, status = IBV_WC_SUCCESS, opcode = IBV_WC_RECV, vendor_err = 0, byte_len = 192, imm_data = 0, 
            qp_num = 132, src_qp = 1, wc_flags = 0, pkey_index = 0, slid = 5, sl = 0 '\000', dlid_path_bits = 0 '\000'}, {
            wr_id = 139656515213152, status = IBV_WC_SUCCESS, opcode = IBV_WC_RECV, vendor_err = 0, byte_len = 72, imm_data = 0, 
            qp_num = 130, src_qp = 1, wc_flags = 0, pkey_index = 0, slid = 4, sl = 0 '\000', dlid_path_bits = 0 '\000'}, {
            wr_id = 139656515542080, status = IBV_WC_SUCCESS, opcode = IBV_WC_RECV, vendor_err = 0, byte_len = 72, imm_data = 0, 
            qp_num = 130, src_qp = 1, wc_flags = 0, pkey_index = 0, slid = 4, sl = 0 '\000', dlid_path_bits = 0 '\000'}, {
            wr_id = 139656517359392, status = IBV_WC_SUCCESS, opcode = IBV_WC_RECV, vendor_err = 0, byte_len = 192, imm_data = 0, 
            qp_num = 132, src_qp = 1, wc_flags = 0, pkey_index = 0, slid = 5, sl = 0 '\000', dlid_path_bits = 0 '\000'}, {
            wr_id = 139656517358736, status = IBV_WC_SUCCESS, opcode = IBV_WC_RECV, vendor_err = 0, byte_len = 176, imm_data = 0, 
            qp_num = 132, src_qp = 1, wc_flags = 0, pkey_index = 0, slid = 5, sl = 0 '\000', dlid_path_bits = 0 '\000'}, {
            wr_id = 139656517358208, status = IBV_WC_SUCCESS, opcode = IBV_WC_RECV, vendor_err = 0, byte_len = 176, imm_data = 0, 
            qp_num = 132, src_qp = 1, wc_flags = 0, pkey_index = 0, slid = 5, sl = 0 '\000', dlid_path_bits = 0 '\000'}, {
            wr_id = 139656517357680, status = IBV_WC_SUCCESS, opcode = IBV_WC_RECV, vendor_err = 0, byte_len = 176, imm_data = 0, 
            qp_num = 130, src_qp = 1, wc_flags = 0, pkey_index = 0, slid = 4, sl = 0 '\000', dlid_path_bits = 0 '\000'}}
        event_ctx = 0x7f0450000b70
        ret = <value optimized out>
        num_wr = 1
        index = <value optimized out>
---Type <return> to continue, or q <return> to quit---
        failed = 0 '\000'
        __FUNCTION__ = "gf_rdma_recv_completion_proc"
#6  0x0000003acfc079d1 in start_thread (arg=0x7f0455fff700) at pthread_create.c:301
        __res = <value optimized out>
        pd = 0x7f0455fff700
        now = <value optimized out>
        unwind_buf = {cancel_jmp_buf = {{jmp_buf = {139656599435008, 3437653819718065564, 252595782496, 139656599435712, 0, 3, 
                -3333697194911900260, 3441137622062161308}, mask_was_saved = 0}}, priv = {pad = {0x0, 0x0, 0x0, 0x0}, data = {prev = 0x0, 
              cleanup = 0x0, canceltype = 0}}}
        not_first_call = <value optimized out>
        pagesize_m1 = <value optimized out>
        sp = <value optimized out>
        freesize = <value optimized out>
#7  0x0000003acf8e8b6d in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:115

Comment 1 Anand Avati 2015-03-13 06:10:54 UTC
REVIEW: http://review.gluster.org/9872 (rdma:changing list iteration to safe mode) posted (#1) for review on master by mohammed rafi  kc (rkavunga)

Comment 3 Anand Avati 2015-03-15 16:07:08 UTC
COMMIT: http://review.gluster.org/9872 committed in master by Vijay Bellur (vbellur) 
------
commit b3f63120e8f2b6f99d44ebe244aafafeb6ac890e
Author: Mohammed Rafi KC <rkavunga>
Date:   Fri Mar 13 11:37:14 2015 +0530

    rdma:changing list iteration to safe mode
    
    Change-Id: I2299378f02a5577a8bf2874664ba79e92c3811b5
    BUG: 1201621
    Signed-off-by: Mohammed Rafi KC <rkavunga>
    Reviewed-on: http://review.gluster.org/9872
    Reviewed-by: Krishnan Parthasarathi <kparthas>
    Tested-by: Gluster Build System <jenkins.com>
    Reviewed-by: Raghavendra Talur <rtalur>
    Reviewed-by: Vijay Bellur <vbellur>

Comment 4 Mohammed Rafi KC 2015-05-14 06:36:56 UTC
Fix available from 3.6.3 onwards.