Description of problem: glusterfs volume with following configuration. sudo gluster volume info mirror Volume Name: mirror Type: Distributed-Replicate Volume ID: 3382aaa7-37d0-4fab-bd3c-dc9a7a350acf Status: Started Number of Bricks: 2 x 2 = 4 Transport-type: tcp Bricks: Brick1: hyperspace:/mnt/sda7/export3 Brick2: hyperspace:/mnt/sda8/export3 Brick3: hyperspace:/mnt/sda7/last33 Brick4: hyperspace:/mnt/sda8/last33 Options Reconfigured: performance.stat-prefetch: on geo-replication.indexing: on diagnostics.count-fop-hits: on diagnostics.latency-measurement: on features.limit-usage: /:22GB features.quota: on features.lock-heal: on Ran posix compliance, dbench (100 sec, 22 clients), untar glusterfs tarball, rm -rf *, then fileop (fileop -f 30 -t). One of the servers crashed. Backtrace. Core was generated by `/usr/local/sbin/glusterfsd -s localhost --volfile-id mirror.hyperspace.mnt-sda7'. Program terminated with signal 11, Segmentation fault. #0 0x00007f89d2cfa077 in server_readdir_cbk (frame=0x7f89d657afe0, cookie=0x0, this=0x25e3f80, op_ret=-1, op_errno=2, entries=0x0, xdata=0x0) at ../../../../../xlators/protocol/server/src/server3_1-fops.c:670 670 gf_log (this->name, GF_LOG_INFO, (gdb) bt #0 0x00007f89d2cfa077 in server_readdir_cbk (frame=0x7f89d657afe0, cookie=0x0, this=0x25e3f80, op_ret=-1, op_errno=2, entries=0x0, xdata=0x0) at ../../../../../xlators/protocol/server/src/server3_1-fops.c:670 #1 0x00007f89d2d023c4 in server_readdir_resume (frame=0x7f89d657afe0, bound_xl=0x25e2ca0) at ../../../../../xlators/protocol/server/src/server3_1-fops.c:2340 #2 0x00007f89d2cf080d in server_resolve_done (frame=0x7f89d657afe0) at ../../../../../xlators/protocol/server/src/server-resolve.c:535 #3 0x00007f89d2cf090e in server_resolve_all (frame=0x7f89d657afe0) at ../../../../../xlators/protocol/server/src/server-resolve.c:570 #4 0x00007f89d2cf07a1 in server_resolve (frame=0x7f89d657afe0) at ../../../../../xlators/protocol/server/src/server-resolve.c:517 #5 0x00007f89d2cf08e5 in 
server_resolve_all (frame=0x7f89d657afe0) at ../../../../../xlators/protocol/server/src/server-resolve.c:566 #6 0x00007f89d2ceff37 in resolve_continue (frame=0x7f89d657afe0) at ../../../../../xlators/protocol/server/src/server-resolve.c:224 #7 0x00007f89d2cefa9e in resolve_gfid_cbk (frame=0x7f89d657afe0, cookie=0x7f89d6788ae0, this=0x25e3f80, op_ret=-1, op_errno=2, inode=0x7f89d1974040, buf=0x7f89d15dfbe0, xdata=0x0, postparent=0x7f89d15dfb70) at ../../../../../xlators/protocol/server/src/server-resolve.c:163 #8 0x00007f89d2f2b023 in io_stats_lookup_cbk (frame=0x7f89d6788ae0, cookie=0x7f89d6786f54, this=0x25e2ca0, op_ret=-1, op_errno=2, inode=0x7f89d1974040, buf=0x7f89d15dfbe0, xdata=0x0, postparent=0x7f89d15dfb70) at ../../../../../xlators/debug/io-stats/src/io-stats.c:1479 #9 0x00007f89d314f1a0 in marker_lookup_cbk (frame=0x7f89d6786f54, cookie=0x7f89d6788988, this=0x25e18c0, op_ret=-1, op_errno=2, inode=0x7f89d1974040, buf=0x7f89d15dfbe0, dict=0x0, postparent=0x7f89d15dfb70) at ../../../../../xlators/features/marker/src/marker.c:2215 #10 0x00007f89d84e1169 in default_lookup_cbk (frame=0x7f89d6788988, cookie=0x7f89d6784908, this=0x25e0690, op_ret=-1, op_errno=2, inode=0x7f89d1974040, buf=0x7f89d15dfbe0, xdata=0x0, postparent=0x7f89d15dfb70) at ../../../libglusterfs/src/defaults.c:37 #11 0x00007f89d3575cdd in iot_lookup_cbk (frame=0x7f89d6784908, cookie=0x7f89d6786290, this=0x25df400, op_ret=-1, op_errno=2, inode=0x7f89d1974040, buf=0x7f89d15dfbe0, xdata=0x0, postparent=0x7f89d15dfb70) at ../../../../../xlators/performance/io-threads/src/io-threads.c:278 #12 0x00007f89d379aa92 in pl_lookup_cbk (frame=0x7f89d6786290, cookie=0x7f89d6784dbc, this=0x25de2a0, op_ret=-1, op_errno=2, inode=0x7f89d1974040, buf=0x7f89d15dfbe0, xdata=0x0, postparent=0x7f89d15dfb70) at ../../../../../xlators/features/locks/src/posix.c:1621 #13 0x00007f89d39ae902 in posix_acl_lookup_cbk (frame=0x7f89d6784dbc, cookie=0x7f89d67842fc, this=0x25dd0c0, op_ret=-1, op_errno=2, inode=0x7f89d1974040, 
buf=0x7f89d15dfbe0, xattr=0x0, postparent=0x7f89d15dfb70) at ../../../../../xlators/system/posix-acl/src/posix-acl.c:746 #14 0x00007f89d3bc2c03 in posix_lookup (frame=0x7f89d67842fc, this=0x25dbd20, loc=0x7f89d6446558, xdata=0x7f89d63f14e4) at ../../../../../xlators/storage/posix/src/posix.c:187 #15 0x00007f89d39aed2d in posix_acl_lookup (frame=0x7f89d6784dbc, this=0x25dd0c0, loc=0x7f89d6446558, xattr=0x0) at ../../../../../xlators/system/posix-acl/src/posix-acl.c:798 #16 0x00007f89d379af90 in pl_lookup (frame=0x7f89d6786290, this=0x25de2a0, loc=0x7f89d6446558, xdata=0x0) at ../../../../../xlators/features/locks/src/posix.c:1663 #17 0x00007f89d3575f20 in iot_lookup_wrapper (frame=0x7f89d6784908, this=0x25df400, loc=0x7f89d6446558, xdata=0x0) at ../../../../../xlators/performance/io-threads/src/io-threads.c:288 #18 0x00007f89d8505f19 in call_resume_wind (stub=0x7f89d6446518) at ../../../libglusterfs/src/call-stub.c:2689 #19 0x00007f89d850d36f in call_resume (stub=0x7f89d6446518) at ../../../libglusterfs/src/call-stub.c:4151 #20 0x00007f89d357590d in iot_worker (data=0x25f61d0) at ../../../../../xlators/performance/io-threads/src/io-threads.c:131 #21 0x00007f89d7e76d8c in start_thread (arg=0x7f89d15e0700) at pthread_create.c:304 #22 0x00007f89d783304d in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:112 #23 0x0000000000000000 in ?? 
() (gdb) f 0 #0 0x00007f89d2cfa077 in server_readdir_cbk (frame=0x7f89d657afe0, cookie=0x0, this=0x25e3f80, op_ret=-1, op_errno=2, entries=0x0, xdata=0x0) at ../../../../../xlators/protocol/server/src/server3_1-fops.c:670 670 gf_log (this->name, GF_LOG_INFO, (gdb) l 665 666 req = frame->local; 667 state = CALL_STATE(frame); 668 669 if (op_ret < 0) { 670 gf_log (this->name, GF_LOG_INFO, 671 "%"PRId64": READDIR %"PRId64" (%s) ==> (%s)", 672 frame->root->unique, state->resolve.fd_no, 673 uuid_utoa (state->fd->inode->gfid), 674 strerror (op_errno)); (gdb) p frame->root $1 = (call_stack_t *) 0x7f89d657a870 (gdb) p frame->root->unique $2 = 3634 (gdb) p state->resolve.fd_no $3 = 18446744073709551614 (gdb) p state->fd $4 = (fd_t *) 0x0 (gdb) p state->loc $5 = {path = 0x0, name = 0x0, inode = 0x0, parent = 0x0, gfid = '\000' <repeats 15 times>, pargfid = '\000' <repeats 15 times>} (gdb) p state->loc2 $6 = {path = 0x0, name = 0x0, inode = 0x0, parent = 0x0, gfid = '\000' <repeats 15 times>, pargfid = '\000' <repeats 15 times>} (gdb) p *state->loc_now $7 = {path = 0x0, name = 0x0, inode = 0x0, parent = 0x0, gfid = '\000' <repeats 15 times>, pargfid = '\000' <repeats 15 times>} (gdb) p *state $8 = {conn = 0x261da00, xprt = 0x26021f0, itable = 0x26039e0, resume_fn = 0x7f89d2d0210f <server_readdir_resume>, loc = {path = 0x0, name = 0x0, inode = 0x0, parent = 0x0, gfid = '\000' <repeats 15 times>, pargfid = '\000' <repeats 15 times>}, loc2 = {path = 0x0, name = 0x0, inode = 0x0, parent = 0x0, gfid = '\000' <repeats 15 times>, pargfid = '\000' <repeats 15 times>}, resolve = { type = RESOLVE_MUST, fd_no = 18446744073709551614, gfid = "\322(\272\a\213.A\247\200\024XOy\001\224", <incomplete sequence \372>, pargfid = '\000' <repeats 15 times>, path = 0x0, bname = 0x0, op_ret = -1, op_errno = 2, resolve_loc = {path = 0x0, name = 0x0, inode = 0x0, parent = 0x0, gfid = '\000' <repeats 15 times>, pargfid = '\000' <repeats 15 times>}}, resolve2 = {type = 0, fd_no = 18446744073709551615, 
gfid = '\000' <repeats 15 times>, pargfid = '\000' <repeats 15 times>, path = 0x0, bname = 0x0, op_ret = -1, op_errno = 22, resolve_loc = {path = 0x0, name = 0x0, inode = 0x0, parent = 0x0, gfid = '\000' <repeats 15 times>, pargfid = '\000' <repeats 15 times>}}, loc_now = 0x7f89cc024190, resolve_now = 0x7f89cc024258, stbuf = {ia_ino = 0, ia_gfid = '\000' <repeats 15 times>, ia_dev = 0, ia_type = IA_INVAL, ia_prot = {suid = 0 '\000', sgid = 0 '\000', sticky = 0 '\000', owner = {read = 0 '\000', write = 0 '\000', exec = 0 '\000'}, group = {read = 0 '\000', write = 0 '\000', exec = 0 '\000'}, other = { read = 0 '\000', write = 0 '\000', exec = 0 '\000'}}, ia_nlink = 0, ia_uid = 0, ia_gid = 0, ia_rdev = 0, ia_size = 0, ia_blksize = 0, ia_blocks = 0, ia_atime = 0, ia_atime_nsec = 0, ia_mtime = 0, ia_mtime_nsec = 0, ia_ctime = 0, ia_ctime_nsec = 0}, valid = 0, fd = 0x0, params = 0x0, flags = 0, wbflags = 0, payload_vector = {{iov_base = 0x0, iov_len = 0} <repeats 16 times>}, payload_count = 0, iobuf = 0x0, iobref = 0x0, size = 130944, offset = 0, mode = 0, dev = 0, nr_count = 0, cmd = 0, type = 0, name = 0x0, name_len = 0, mask = 0, is_revalidate = 0 '\000', dict = 0x0, flock = {l_type = 0, l_whence = 0, l_start = 0, l_len = 0, l_pid = 0, l_owner = {len = 0, data = '\000' <repeats 1023 times>}}, volume = 0x0, entry = 0x0, xdata = 0x0, umask = 0} (gdb) Version-Release number of selected component (if applicable): How reproducible: Steps to Reproduce: 1. 2. 3. Actual results: glusterfs server crashed. Expected results: glusterfs server should not crash. Additional info:
This was fixed by http://review.gluster.com/3437 in master; it may need a 'selective' backport. In any case, was a graph change happening while the tests were running?
Commit http://review.gluster.com/3504 fixes the issue by ensuring that, when resolve fails during opendir, the (NULL) fd is not dereferenced in the callback.