glusterfs client crashed due to an assert.

Setup: replicate cluster with replica count 2, and 2 fuse clients. One client was executing the sanity script. The other client was running ping_pong, and find <mount_point> | xargs stat was also running. One server was brought down and, after some time, brought back up. The client running ping_pong crashed. This is the backtrace:

Core was generated by `/usr/local/sbin/glusterfs --volfile-id=mirror --volfile-server=hyperspace /mnt/'.
Program terminated with signal 6, Aborted.
#0  0x00007fbf95a5fd05 in raise (sig=6) at ../nptl/sysdeps/unix/sysv/linux/raise.c:64
64      ../nptl/sysdeps/unix/sysv/linux/raise.c: Transport endpoint is not connected.
        in ../nptl/sysdeps/unix/sysv/linux/raise.c
(gdb) bt
#0  0x00007fbf95a5fd05 in raise (sig=6) at ../nptl/sysdeps/unix/sysv/linux/raise.c:64
#1  0x00007fbf95a63ab6 in abort () at abort.c:92
#2  0x00007fbf95a587c5 in __assert_fail (assertion=0x7fbf90b538d9 "read_child >= 0", file=<value optimized out>, line=453, function=<value optimized out>) at assert.c:81
#3  0x00007fbf90b17ec4 in afr_transaction_rm_stale_children (frame=0x7fbf94cd1938, this=0x11b7f90, inode=0x7fbf8d8ad0e0, type=AFR_DATA_TRANSACTION) at ../../../../../xlators/cluster/afr/src/afr-transaction.c:453
#4  0x00007fbf90b1807c in afr_changelog_post_op (frame=0x7fbf94cd1938, this=0x11b7f90) at ../../../../../xlators/cluster/afr/src/afr-transaction.c:521
#5  0x00007fbf90b1bf25 in afr_transaction_resume (frame=0x7fbf94cd1938, this=0x11b7f90) at ../../../../../xlators/cluster/afr/src/afr-transaction.c:1231
#6  0x00007fbf90b0d55e in afr_writev_wind_cbk (frame=0x7fbf94cd1938, cookie=0x1, this=0x11b7f90, op_ret=1, op_errno=0, prebuf=0x7fff7c9a4320, postbuf=0x7fff7c9a42b0) at ../../../../../xlators/cluster/afr/src/afr-inode-write.c:124
#7  0x00007fbf90d7d569 in client3_1_writev_cbk (req=0x7fbf8fc1a4cc, iov=0x7fbf8fc1a50c, count=1, myframe=0x7fbf94f3b568) at ../../../../../xlators/protocol/client/src/client3_1-fops.c:685
#8  0x00007fbf9641a721 in rpc_clnt_handle_reply (clnt=0x11c79c0, pollin=0x7fbf770b6670) at ../../../../rpc/rpc-lib/src/rpc-clnt.c:747
#9  0x00007fbf9641aa5d in rpc_clnt_notify (trans=0x11c7be0, mydata=0x11c79f0, event=RPC_TRANSPORT_MSG_RECEIVED, data=0x7fbf770b6670) at ../../../../rpc/rpc-lib/src/rpc-clnt.c:860
#10 0x00007fbf96416e49 in rpc_transport_notify (this=0x11c7be0, event=RPC_TRANSPORT_MSG_RECEIVED, data=0x7fbf770b6670) at ../../../../rpc/rpc-lib/src/rpc-transport.c:498
#11 0x00007fbf93db931a in socket_event_poll_in (this=0x11c7be0) at ../../../../../rpc/rpc-transport/socket/src/socket.c:1676
#12 0x00007fbf93db988e in socket_event_handler (fd=9, idx=1, data=0x11c7be0, poll_in=1, poll_out=0, poll_err=0) at ../../../../../rpc/rpc-transport/socket/src/socket.c:1791
#13 0x00007fbf9666ee31 in event_dispatch_epoll_handler (event_pool=0x11ab110, events=0x11afe50, i=0) at ../../../libglusterfs/src/event.c:794
#14 0x00007fbf9666f04b in event_dispatch_epoll (event_pool=0x11ab110) at ../../../libglusterfs/src/event.c:856
#15 0x00007fbf9666f3bd in event_dispatch (event_pool=0x11ab110) at ../../../libglusterfs/src/event.c:956
#16 0x00000000004076cf in main (argc=4, argv=0x7fff7c9a4868) at ../../../glusterfsd/src/glusterfsd.c:1557
(gdb) f 3
#3  0x00007fbf90b17ec4 in afr_transaction_rm_stale_children (frame=0x7fbf94cd1938, this=0x11b7f90, inode=0x7fbf8d8ad0e0, type=AFR_DATA_TRANSACTION) at ../../../../../xlators/cluster/afr/src/afr-transaction.c:453
453             GF_ASSERT (read_child >= 0);
(gdb) l
448                             stale_children[count++] = i;
449                     }
450             }
451
452             if (!rm_stale_children) {
453                     GF_ASSERT (read_child >= 0);
454                     goto out;
455             }
456
457             if (fresh_children[0] == -1) {
(gdb) l afr_transaction_rm_stale_children
404
405
406     void
407     afr_transaction_rm_stale_children (call_frame_t *frame, xlator_t *this,
408                                        inode_t *inode, afr_transaction_type type)
409     {
410             int             i          = -1;
411             int             count      = 0;
412             int             read_child = -1;
413             afr_private_t  *priv       = NULL;
(gdb)
414             afr_local_t    *local      = NULL;
415             int           **pending    = NULL;
416             int             idx        = 0;
417             int32_t        *stale_children = NULL;
418             int32_t        *fresh_children = NULL;
419             gf_boolean_t    rm_stale_children = _gf_false;
420
421             idx = afr_index_for_transaction_type (type);
422
423             priv = this->private;
(gdb)
424             local = frame->local;
425             pending = local->pending;
426
427             stale_children = afr_children_create (priv->child_count);
428             if (!stale_children)
429                     goto out;
430
431             fresh_children = local->fresh_children;
432             read_child = afr_inode_get_read_ctx (this, inode, fresh_children);
433
(gdb)
434             GF_ASSERT (read_child >= 0);
435
436             if (pending[read_child][idx] == 0)
437                     read_child = -1;
438
439             for (i = 0; i < priv->child_count; i++) {
440                     if (!afr_is_child_present (fresh_children,
441                                                priv->child_count, i))
442                             continue;
443                     if ((!priv->child_up[i]) || (pending[i][idx] == 0)) {
(gdb)
444                             /* child is down or op failed on it */
445                             rm_stale_children = _gf_true;
446                             afr_children_rm_child (fresh_children, i,
447                                                    priv->child_count);
448                             stale_children[count++] = i;
449                     }
450             }
451
452             if (!rm_stale_children) {
453                     GF_ASSERT (read_child >= 0);
(gdb)
454                     goto out;
455             }
456
457             if (fresh_children[0] == -1) {
458                     //All children failed. leave as-is
459                     goto out;
460             }
461
462             if (read_child == -1)
463                     read_child = fresh_children[0];
(gdb)

In this function, the only place read_child is set to -1 is:

436             if (pending[read_child][idx] == 0)
437                     read_child = -1;
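To make the hazard concrete, here is a rough standalone sketch (an illustration under assumed conditions, not GlusterFS code and not necessarily the exact sequence behind this core). priv->child_up[] is shared state that the notify path flips whenever a brick goes down or comes back up, so a post-op decision that consults it can disagree with the transaction's own pending matrix. The names child_up, pending and notify_child_up below are invented stand-ins for priv->child_up, one column of local->pending, and the CHILD_UP notify handling.

#include <stdio.h>

static int child_up[2] = { 1, 0 };   /* child 1 is down when the write is wound      */
static int pending[2]  = { 3, 0 };   /* so this fop set no changelog for child 1     */

/* stand-in for the CHILD_UP notify path running in another thread */
static void
notify_child_up (int i)
{
        child_up[i] = 1;             /* brick comes back while the fop is in flight */
}

int
main (void)
{
        /* wind phase: the write went only to child 0; pending[1] stays 0 */

        notify_child_up (1);         /* CHILD_UP event races with the transaction */

        /* post-op phase: a decision driven by child_up now sees child 1 as
         * healthy, even though this particular transaction never reached it;
         * only the pending matrix still records that fact. */
        for (int i = 0; i < 2; i++)
                printf ("child %d: child_up=%d pending=%d\n",
                        i, child_up[i], pending[i]);
        return 0;
}

The general point: child_up describes the cluster as it looks right now, while the pending matrix describes what actually happened to this fop, and the two can diverge whenever a brick flaps mid-transaction.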
CHANGE: http://review.gluster.com/293 (The code is checking for priv->child_up[i], which can change while the fop) merged in master by Vijay Bellur (vijay)
This crash is fixed: we no longer depend on priv->child_up to determine the down children; instead we make use of the pending matrix itself.
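A minimal sketch of that idea follows, assuming the stale set is computed purely from the pending matrix. This is not the actual patch from http://review.gluster.com/293; the idx dimension of the pending matrix is collapsed into a flat array, and is_child_present is a simplified stand-in for afr_is_child_present.

#include <stdio.h>

#define CHILD_COUNT 2

static int
is_child_present (const int *children, int count, int child)
{
        for (int i = 0; i < count; i++)
                if (children[i] == child)
                        return 1;
        return 0;
}

int
main (void)
{
        /* pending[i] != 0 => this fop succeeded on child i;
         * pending[i] == 0 => the fop never reached child i or failed on it. */
        int pending[CHILD_COUNT]        = { 3, 0 };
        int fresh_children[CHILD_COUNT] = { 0, 1 };
        int stale_children[CHILD_COUNT];
        int count = 0;

        for (int i = 0; i < CHILD_COUNT; i++) {
                if (!is_child_present (fresh_children, CHILD_COUNT, i))
                        continue;
                /* The decision uses only the per-transaction pending matrix;
                 * priv->child_up is never consulted, so a brick flapping
                 * during the fop cannot skew the result. */
                if (pending[i] == 0)
                        stale_children[count++] = i;
        }

        for (int i = 0; i < count; i++)
                printf ("child %d is stale for this transaction\n",
                        stale_children[i]);
        return 0;
}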