Bug 1450698 - a core generated when running regression test /tests/bugs/glusterd/bug-1004744.t
Summary: a core generated when running regression test /tests/bugs/glusterd/bug-1004744.t
Keywords:
Status: CLOSED CURRENTRELEASE
Alias: None
Product: GlusterFS
Classification: Community
Component: distribute
Version: mainline
Hardware: Unspecified
OS: Unspecified
Priority: unspecified
Severity: unspecified
Target Milestone: ---
Assignee: Nithya Balachandran
QA Contact:
URL:
Whiteboard:
Depends On: 1452102
Blocks:
 
Reported: 2017-05-14 21:59 UTC by Zhou Zhengping
Modified: 2018-08-29 03:18 UTC
CC List: 4 users

Fixed In Version: glusterfs-4.1.3 (or higher)
Doc Type: If docs needed, set a value
Doc Text:
Clone Of:
Environment:
Last Closed: 2018-08-29 03:18:49 UTC
Regression: ---
Mount Type: ---
Documentation: ---
CRM:
Verified Versions:
Embargoed:



Description Zhou Zhengping 2017-05-14 21:59:18 UTC
Description of problem:
A core was generated when running the CentOS regression for https://review.gluster.org/#/c/17280/3

The Jenkins link:
https://build.gluster.org/job/centos6-regression/4583/consoleFull

The core backtrace info:

(gdb) bt
#0  0x00007fe7b7d69bf1 in dht_selfheal_dir_mkdir_lookup_done (frame=0x7fe7a8002d70, this=0x7fe7b80115b0)
    at /home/jenkins/root/workspace/centos6-regression/xlators/cluster/dht/src/dht-selfheal.c:1338
#1  0x00007fe7b7d6a382 in dht_selfheal_dir_mkdir_lookup_cbk (frame=0x7fe7a8002d70, cookie=0x7fe7b800f290, this=0x7fe7b80115b0, op_ret=-1, 
    op_errno=2, inode=0x7fe7a80065a0, stbuf=0x7fe7b7213840, xattr=0x0, postparent=0x7fe7b72137d0)
    at /home/jenkins/root/workspace/centos6-regression/xlators/cluster/dht/src/dht-selfheal.c:1425
#2  0x00007fe7bc0a18ec in client3_3_lookup_cbk (req=0x7fe7ac009600, iov=0x7fe7ac009640, count=1, myframe=0x7fe7ac031b30)
    at /home/jenkins/root/workspace/centos6-regression/xlators/protocol/client/src/client-rpc-fops.c:2867
#3  0x00007fe7c972484d in rpc_clnt_handle_reply (clnt=0x7fe7b80269a0, pollin=0x7fe7ac006180)
    at /home/jenkins/root/workspace/centos6-regression/rpc/rpc-lib/src/rpc-clnt.c:778
#4  0x00007fe7c9724e17 in rpc_clnt_notify (trans=0x7fe7b8026bd0, mydata=0x7fe7b80269d0, event=RPC_TRANSPORT_MSG_RECEIVED, data=0x7fe7ac006180)
    at /home/jenkins/root/workspace/centos6-regression/rpc/rpc-lib/src/rpc-clnt.c:971
#5  0x00007fe7c9720dac in rpc_transport_notify (this=0x7fe7b8026bd0, event=RPC_TRANSPORT_MSG_RECEIVED, data=0x7fe7ac006180)
    at /home/jenkins/root/workspace/centos6-regression/rpc/rpc-lib/src/rpc-transport.c:538
#6  0x00007fe7be50658a in socket_event_poll_in (this=0x7fe7b8026bd0, notify_handled=_gf_true)
    at /home/jenkins/root/workspace/centos6-regression/rpc/rpc-transport/socket/src/socket.c:2315
#7  0x00007fe7be506bd5 in socket_event_handler (fd=14, idx=4, gen=4, data=0x7fe7b8026bd0, poll_in=1, poll_out=0, poll_err=0)
    at /home/jenkins/root/workspace/centos6-regression/rpc/rpc-transport/socket/src/socket.c:2467
#8  0x00007fe7c99d025e in event_dispatch_epoll_handler (event_pool=0x18b6fc0, event=0x7fe7b7213e70)
    at /home/jenkins/root/workspace/centos6-regression/libglusterfs/src/event-epoll.c:572
#9  0x00007fe7c99d0560 in event_dispatch_epoll_worker (data=0x7fe7b80216c0)
    at /home/jenkins/root/workspace/centos6-regression/libglusterfs/src/event-epoll.c:648
#10 0x00007fe7c8c37aa1 in start_thread () from ./lib64/libpthread.so.0
#11 0x00007fe7c859fbcd in clone () from ./lib64/libc.so.6
(gdb) p frame->ref_count 
$4 = -1

The racing thread:
Thread 7 (LWP 3527):
#0  0x00007fe7c8c3f06d in open64 () from ./lib64/libpthread.so.0
#1  0x00007fe7c99e53ac in gf_mkostemp (tmpl=0x7fe7b5e0f7e0 "/tmp/btDBbLPQ", suffixlen=0, flags=2)
    at /home/jenkins/root/workspace/centos6-regression/contrib/stdlib/gf_mkostemp.c:96
#2  0x00007fe7c996e00c in gf_backtrace_fillframes (
    buf=0x7fe7a4001890 "(--> /build/install/lib/libglusterfs.so.0(synctask_yield+0x41)[0x7fe7c99a9d04] (--> /build/install/lib/libglusterfs.so.0(syncop_readdirp+0x3fd)[0x7fe7c99ac6c5] (--> /build/install/lib/glusterfs/3.12de"...)
    at /home/jenkins/root/workspace/centos6-regression/libglusterfs/src/common-utils.c:3916
#3  0x00007fe7c996e1fa in gf_backtrace_save (
    buf=0x7fe7a4001890 "(--> /build/install/lib/libglusterfs.so.0(synctask_yield+0x41)[0x7fe7c99a9d04] (--> /build/install/lib/libglusterfs.so.0(syncop_readdirp+0x3fd)[0x7fe7c99ac6c5] (--> /build/install/lib/glusterfs/3.12de"...)
    at /home/jenkins/root/workspace/centos6-regression/libglusterfs/src/common-utils.c:3979
#4  0x00007fe7c99a9d04 in synctask_yield (task=0x7fe7a4001400)
    at /home/jenkins/root/workspace/centos6-regression/libglusterfs/src/syncop.c:336
#5  0x00007fe7c99abe54 in syncop_lookup (subvol=0x7fe7b80115b0, loc=0x7fe7b5e11b30, iatt=0x7fe7b5e11a10, parent=0x0, xdata_in=0x0, 
    xdata_out=0x0) at /home/jenkins/root/workspace/centos6-regression/libglusterfs/src/syncop.c:1230
#6  0x00007fe7b7d614dd in gf_defrag_fix_layout (this=0x7fe7b80115b0, defrag=0x7fe7b80209a0, loc=0x7fe7b5e11d60, fix_layout=0x7fe7a8004c00, 
    migrate_data=0x0) at /home/jenkins/root/workspace/centos6-regression/xlators/cluster/dht/src/dht-rebalance.c:3650
#7  0x00007fe7b7d617a8 in gf_defrag_fix_layout (this=0x7fe7b80115b0, defrag=0x7fe7b80209a0, loc=0x7fe7b5e11f30, fix_layout=0x7fe7a8004c00, 
    migrate_data=0x0) at /home/jenkins/root/workspace/centos6-regression/xlators/cluster/dht/src/dht-rebalance.c:3713
#8  0x00007fe7b7d63332 in gf_defrag_start_crawl (data=0x7fe7b80115b0)
    at /home/jenkins/root/workspace/centos6-regression/xlators/cluster/dht/src/dht-rebalance.c:4281
#9  0x00007fe7c99a9e31 in synctask_wrap () at /home/jenkins/root/workspace/centos6-regression/libglusterfs/src/syncop.c:375
#10 0x00007fe7c84fa760 in ?? () from ./lib64/libc.so.6
#11 0x0000000000000000 in ?? ()


Version-Release number of selected component (if applicable):


How reproducible:


Steps to Reproduce:
1.
2.
3.

Actual results:


Expected results:


Additional info:

Comment 1 Zhou Zhengping 2017-05-14 22:00:28 UTC
Zhou Zhengping
5:40 AM

Patch Set 3:

    http://build.gluster.org/job/centos6-regression/4583/consoleFull :
    FAILED
    18:52:49 1 test(s) generated core 
    18:52:49 ./tests/bugs/glusterd/bug-1004744.t

It seems there is a bug in the dht xlator when bricks are added: there is a race condition because dht uses conf->subvolume_cnt both to set the call count and as the bound of the for loop that winds the calls, like this:

local->call_cnt = conf->subvolume_cnt; // can change while bricks are being added
for (i = 0; i < conf->subvolume_cnt; i++) {
    STACK_WIND_COOKIE (frame, dht_selfheal_dir_mkdir_lookup_cbk, ...);
}
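
For illustration only, here is a minimal, self-contained C sketch of that pattern (plain C, not GlusterFS code, and deliberately not the actual fix): it makes the interleaving deterministic so the mismatch between the callback count taken from the first read and the number of calls wound by the second read is visible. The names subvolume_cnt, add_brick and call_cnt are stand-ins for the real fields.

/* Hypothetical stand-alone demo of the race described above; build with
 * `gcc -pthread race-demo.c`.  It is not GlusterFS code. */
#include <pthread.h>
#include <stdio.h>

static volatile int subvolume_cnt = 2;   /* stands in for conf->subvolume_cnt */

/* Stand-in for a concurrent add-brick: the subvolume count grows. */
static void *add_brick (void *arg)
{
        (void) arg;
        subvolume_cnt++;
        return NULL;
}

int main (void)
{
        pthread_t t;
        int       wound = 0;
        int       i;

        /* First read: the expected number of callbacks. */
        int call_cnt = subvolume_cnt;

        /* A graph change slips in between the two reads. */
        pthread_create (&t, NULL, add_brick, NULL);
        pthread_join (t, NULL);

        /* Second read: the loop bound used to wind the calls. */
        for (i = 0; i < subvolume_cnt; i++)
                wound++;

        printf ("expected %d callbacks, wound %d calls\n", call_cnt, wound);

        /* wound > call_cnt means the frame is unwound and destroyed after
         * call_cnt callbacks while the extra lookups are still in flight,
         * which fits the freed frame and ref_count == -1 in the core.
         * Reading the count once and using that single snapshot for both
         * the callback count and the loop bound avoids the mismatch. */
        return 0;
}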

Comment 2 Mohit Agrawal 2017-05-15 02:18:55 UTC
It seems the core is generated by the test case /tests/bugs/glusterd/bug-1004744.t.

Comment 3 Zhou Zhengping 2017-05-16 03:08:25 UTC
(In reply to Mohit Agrawal from comment #2)
> It seems the core is generated by the test case
> /tests/bugs/glusterd/bug-1004744.t.

Comment 4 Nithya Balachandran 2017-05-26 04:52:00 UTC
From the core dump:

Program terminated with signal 11, Segmentation fault.
#0  0x00007fe7b7d69bf1 in dht_selfheal_dir_mkdir_lookup_done (frame=0x7fe7a8002d70, this=0x7fe7b80115b0)
    at /home/jenkins/root/workspace/centos6-regression/xlators/cluster/dht/src/dht-selfheal.c:1338
warning: Source file is more recent than executable.
1338	                        gf_msg_debug (this->name, 0,
(gdb) l
1333	
1334	        for (i = 0; i < layout->cnt; i++) {
1335	                if (layout->list[i].err == ESTALE ||
1336	                    layout->list[i].err == ENOENT ||
1337	                    local->selfheal.force_mkdir) {
1338	                        gf_msg_debug (this->name, 0,
1339	                                      "Creating directory %s on subvol %s",
1340	                                      loc->path, layout->list[i].xlator->name);
1341	
1342	                        STACK_WIND_COOKIE (frame, dht_selfheal_dir_mkdir_cbk,
(gdb) p loc->path
$1 = 0x0
(gdb) p *layout
$2 = {spread_cnt = -1379869184, cnt = 8382430, preset = 30064640, commit_hash = 8382392, gen = 41216, type = -768, ref = 32895, search_unhashed = _gf_false, 
  list = 0x7fe7ac00b330}
(gdb) p i
$3 = 469
(gdb) p loc
$4 = (loc_t *) 0x7fe7a8001d08
(gdb) p layout->list[i].xlator->name
Cannot access memory at address 0x0
(gdb) p *frame
$5 = {root = 0xfffffffffffffffd, parent = 0x0, frames = {next = 0x7fe7a8002d80, prev = 0x7fe7a8002d80}, local = 0x0, this = 0x0, ret = 0x0, ref_count = -1, lock = {
    spinlock = 0, mutex = {__data = {__lock = 0, __count = 0, __owner = 0, __nusers = 0, __kind = -1476332528, __spins = 32743, __list = {__prev = 0x7, 
          __next = 0x7fe7a800f4d0}}, __size = '\000' <repeats 16 times>, "\020\364\000\250\347\177\000\000\a\000\000\000\000\000\000\000\320\364\000\250\347\177\000", 
      __align = 0}}, cookie = 0x0, complete = _gf_false, op = GF_FOP_NULL, begin = {tv_sec = 0, tv_usec = 0}, end = {tv_sec = 0, tv_usec = 0}, wind_from = 0x0, 
  wind_to = 0x0, unwind_from = 0x0, unwind_to = 0x0}



The frame and the layout variable have already been freed because the call has unwound. 

This is similar to the issue reported in bug 1452102 and has been fixed by https://review.gluster.org/17343.

Marking this dependent on 1452102 and moving it to Modified.
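
To make the "already freed because the call has unwound" observation concrete, here is a rough stand-alone C model (illustrative only; it does not reproduce GlusterFS's actual frame or ref_count mechanics, and the names fake_frame, frame_return and lookup_cbk are invented): the callback that drives the accounted count to zero tears the frame down, so one extra, unaccounted callback arrives after teardown and drives the counter negative.

/* Hypothetical demo only, not GlusterFS internals. */
#include <stdio.h>

struct fake_frame {
        int call_cnt;    /* callbacks still accounted for */
        int destroyed;   /* set once the "done" path has run */
};

/* Decrement the outstanding-callback count and report what is left. */
static int frame_return (struct fake_frame *frame)
{
        return --frame->call_cnt;
}

static void lookup_cbk (struct fake_frame *frame)
{
        if (frame->destroyed) {
                printf ("callback ran on a destroyed frame "
                        "(use-after-free in the real code)\n");
                frame_return (frame);
                return;
        }

        if (frame_return (frame) == 0) {
                frame->destroyed = 1;   /* the real code frees the frame */
                printf ("last accounted callback: frame torn down\n");
        }
}

int main (void)
{
        struct fake_frame frame = { .call_cnt = 2, .destroyed = 0 };

        /* Two callbacks were accounted for ... */
        lookup_cbk (&frame);
        lookup_cbk (&frame);

        /* ... but the race wound one more lookup, so an extra callback
         * arrives after the frame has already been torn down. */
        lookup_cbk (&frame);

        printf ("final count = %d\n", frame.call_cnt);   /* ends up at -1 */
        return 0;
}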

