Bug 765371 (GLUSTER-3639) - [glusterfs-3.3.0qa1]: glusterfs client crashed
Summary: [glusterfs-3.3.0qa1]: glusterfs client crashed
Keywords:
Status: CLOSED CURRENTRELEASE
Alias: GLUSTER-3639
Product: GlusterFS
Classification: Community
Component: replicate
Version: pre-release
Hardware: x86_64
OS: Linux
low
high
Target Milestone: ---
Assignee: Pranith Kumar K
QA Contact:
URL:
Whiteboard:
Depends On:
Blocks:
TreeView+ depends on / blocked
 
Reported: 2011-09-27 06:16 UTC by Raghavendra Bhat
Modified: 2015-12-01 16:45 UTC (History)
1 user (show)

Fixed In Version:
Doc Type: Bug Fix
Doc Text:
Clone Of:
Environment:
Last Closed:
Regression: ---
Mount Type: ---
Documentation: ---
CRM:
Verified Versions:


Attachments (Terms of Use)

Description Raghavendra Bhat 2011-09-27 04:20:54 UTC
Running rm -rf from two clients simultaneously reproduces this bug.

Comment 1 Raghavendra Bhat 2011-09-27 06:16:59 UTC
After running the tests for bug 765369, I tried to remove everything from the mount point by running rm -rf from both the fuse and nfs clients simultaneously. The fuse client crashed with the following backtrace.

Core was generated by `/usr/local/sbin/glusterfs --volfile-id=mirror --volfile-server=10.1.11.73 /clie'.
Program terminated with signal 6, Aborted.
#0  0x00007f270a314a75 in raise () from /lib/libc.so.6
(gdb) bt
#0  0x00007f270a314a75 in raise () from /lib/libc.so.6
#1  0x00007f270a3185c0 in abort () from /lib/libc.so.6
#2  0x00007f270a30d941 in __assert_fail () from /lib/libc.so.6
#3  0x00007f2704bb93e0 in afr_inode_set_read_ctx (this=0x7f26edf52ea0, inode=0x7f269bd24d20, read_child=-1, fresh_children=0x12b8180)
    at ../../../../../xlators/cluster/afr/src/afr-common.c:417
#4  0x00007f2704ba87f1 in afr_sh_entry_fix (frame=0x7f270955a37c, this=0x7f26edf52ea0)
    at ../../../../../xlators/cluster/afr/src/afr-self-heal-entry.c:2259
#5  0x00007f2704ba88c1 in afr_sh_entry_lookup_cbk (frame=0x7f270955a37c, cookie=0x1, this=0x7f26edf52ea0, op_ret=-1, op_errno=2, 
    inode=0x7f269bd24d20, buf=0x7fffb9b6ca10, xattr=0x0, postparent=0x7fffb9b6c9a0)
    at ../../../../../xlators/cluster/afr/src/afr-self-heal-entry.c:2288
#6  0x00007f2704dfc01e in client3_1_lookup_cbk (req=0x7f269c2fc060, iov=0x7f269c2fc0a0, count=1, myframe=0x7f27097d8778)
    at ../../../../../xlators/protocol/client/src/client3_1-fops.c:2255
#7  0x00007f270acb130d in rpc_clnt_handle_reply (clnt=0x7f26edf5fc10, pollin=0x15c3da0) at ../../../../rpc/rpc-lib/src/rpc-clnt.c:789
#8  0x00007f270acb165a in rpc_clnt_notify (trans=0x7f26edf5ff40, mydata=0x7f26edf5fc40, event=RPC_TRANSPORT_MSG_RECEIVED, data=0x15c3da0)
    at ../../../../rpc/rpc-lib/src/rpc-clnt.c:902
#9  0x00007f270acad6b8 in rpc_transport_notify (this=0x7f26edf5ff40, event=RPC_TRANSPORT_MSG_RECEIVED, data=0x15c3da0)
    at ../../../../rpc/rpc-lib/src/rpc-transport.c:498
#10 0x00007f2707e327d0 in socket_event_poll_in (this=0x7f26edf5ff40) at ../../../../../rpc/rpc-transport/socket/src/socket.c:1675
#11 0x00007f2707e32d54 in socket_event_handler (fd=340, idx=221, data=0x7f26edf5ff40, poll_in=1, poll_out=0, poll_err=0)
    at ../../../../../rpc/rpc-transport/socket/src/socket.c:1790
#12 0x00007f270af072fa in event_dispatch_epoll_handler (event_pool=0xf1f7c0, events=0xf24d70, i=0) at ../../../libglusterfs/src/event.c:794
#13 0x00007f270af0751d in event_dispatch_epoll (event_pool=0xf1f7c0) at ../../../libglusterfs/src/event.c:856
#14 0x00007f270af078a8 in event_dispatch (event_pool=0xf1f7c0) at ../../../libglusterfs/src/event.c:956
#15 0x000000000040837c in main (argc=4, argv=0x7fffb9b6cfc8) at ../../../glusterfsd/src/glusterfsd.c:1592
(gdb) f 3
#3  0x00007f2704bb93e0 in afr_inode_set_read_ctx (this=0x7f26edf52ea0, inode=0x7f269bd24d20, read_child=-1, fresh_children=0x12b8180)
    at ../../../../../xlators/cluster/afr/src/afr-common.c:417
417             GF_ASSERT (read_child >= 0);
(gdb) p read_child
$1 = -1
(gdb) f 4
#4  0x00007f2704ba87f1 in afr_sh_entry_fix (frame=0x7f270955a37c, this=0x7f26edf52ea0)
    at ../../../../../xlators/cluster/afr/src/afr-self-heal-entry.c:2259
2259            afr_inode_set_read_ctx (this, sh->inode, sh->source,
(gdb) p sh->source
$2 = -1
(gdb) l
2254            sh->source = source;
2255
2256            afr_reset_children (sh->fresh_children, priv->child_count);
2257            afr_get_fresh_children (sh->success_children, sh->sources,
2258                                    sh->fresh_children, priv->child_count);
2259            afr_inode_set_read_ctx (this, sh->inode, sh->source,
2260                                    sh->fresh_children);
2261
2262
2263    heal:
(gdb) source
source command requires file name of file to source.
(gdb)  p source
$3 = -1
(gdb) l afr_sh_entry_fix
2217    }
2218
2219
2220    int
2221    afr_sh_entry_fix (call_frame_t *frame, xlator_t *this)
2222    {
2223            afr_local_t     *local = NULL;
2224            afr_self_heal_t *sh = NULL;
2225            afr_private_t   *priv = NULL;
2226            int              source = 0;
(gdb) 
2227
2228            int nsources = 0;
2229
2230            local = frame->local;
2231            sh = &local->self_heal;
2232            priv = this->private;
2233
2234            if (sh->forced_merge) {
2235                    sh->source = -1;
2236                    goto heal;
(gdb) 
2237            }
2238
2239            nsources = afr_build_sources (this, sh->xattr, sh->buf,
2240                                          sh->pending_matrix, sh->sources,
2241                                          sh->success_children,
2242                                          AFR_ENTRY_TRANSACTION);
2243            if (nsources == 0) {
2244                    gf_log (this->name, GF_LOG_TRACE,
2245                            "No self-heal needed for %s",
2246                            local->loc.path);
(gdb) 
2247
2248                    afr_sh_entry_finish (frame, this);
2249                    return 0;
2250            }
2251
2252            source = afr_sh_select_source (sh->sources, priv->child_count);
2253
2254            sh->source = source;


(gdb) l afr_build_sources
607
608     int
609     afr_build_sources (xlator_t *this, dict_t **xattr, struct iatt *bufs,
610                        int32_t **pending_matrix, int32_t *sources,
611                        int32_t *success_children, afr_transaction_type type)
612     {
613             afr_private_t           *priv = NULL;
614             afr_self_heal_type      sh_type    = AFR_SELF_HEAL_INVALID;
615             int                     nsources   = -1;
616
(gdb) 
617             priv = this->private;
618
619             if (afr_get_children_count (success_children, priv->child_count) == 0)
620                     goto out;
621
622             afr_build_pending_matrix (priv->pending_key, pending_matrix,
623                                       xattr, type, priv->child_count);
624
625             sh_type = afr_self_heal_type_for_transaction (type);
626             if (AFR_SELF_HEAL_INVALID == sh_type)
(gdb) 
627                     goto out;
628
629             afr_sh_print_pending_matrix (pending_matrix, this);
630
631             nsources = afr_mark_sources (sources, pending_matrix, bufs,
632                                          priv->child_count, sh_type,
633                                          success_children, this->name);
634     out:
635             return nsources;
636     }

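Here nsources can be -1 if afr_build_sources cannot identify the sources properly. However, afr_sh_entry_fix checks nsources only against 0 after calling afr_build_sources, so instead of returning it continues with sh->source set to -1. That value is then passed as read_child to afr_inode_set_read_ctx, where it trips GF_ASSERT (read_child >= 0).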

Comment 2 Pranith Kumar K 2011-09-28 07:14:32 UTC
The crash occurs because entry self-heal goes ahead with the self-heal even when the lookups have failed. This has already been fixed as part of the 3557 patch.
Marking it as resolved/fixed.

Comment 3 Anand Avati 2011-10-03 10:34:28 UTC
CHANGE: http://review.gluster.com/556 (Change-Id: Ifdf0db71594ce526ad85c21103726798d9aceef4) merged in master by Vijay Bellur (vijay)


Note You need to log in before you can comment on or make changes to this bug.