Hide Forgot
Do you still have the core file? Can you get a full backtrace by doing: # gdb /path/to/glusterfs/binary /path/to/core (gdb) bt full
Certainly. (gdb) bt full #0 afr_unlink_wind_cbk (frame=0x7f37b400d768, cookie=<value optimized out>, this=0x1ec8750, op_ret=0, op_errno=<value optimized out>, preparent=0x0, postparent=0x0) at afr-dir-write.c:1645 local = 0x7f37b4007150 call_count = <value optimized out> child_index = 0 #1 0x00007f37bc7ddf21 in dht_unlink_cbk (frame=0x1ed5ef0, cookie=<value optimized out>, this=<value optimized out>, op_ret=<value optimized out>, op_errno=<value optimized out>, preparent=<value optimized out>, postparent=0x7fff14d3f330) at dht-common.c:1216 fn = 0x7f37bc5a4980 <afr_unlink_wind_cbk> _parent = 0x7f37b400d768 old_THIS = 0x1ec7b50 __local = 0x1edab00 __xl = 0x1ec7b50 local = 0x1edab00 this_call_cnt = <value optimized out> __FUNCTION__ = "dht_unlink_cbk" #2 0x00007f37bca0cf8a in client_unlink_cbk (frame=0x1ed6e50, hdr=<value optimized out>, hdrlen=<value optimized out>, iobuf=<value optimized out>) at client-protocol.c:4562 fn = 0x7f37bc7dde50 <dht_unlink_cbk> _parent = 0x1ed5ef0 old_THIS = 0x1ec6480 rsp = <value optimized out> op_ret = 0 op_errno = 0 preparent = {st_dev = 5439812256340639918, st_ino = 4177921, st_nlink = 4, st_mode = 17901, st_uid = 511, st_gid = 516, __pad0 = 0, st_rdev = 0, st_size = 4096, st_blksize = 4096, st_blocks = 16, st_atim = { tv_sec = 1267495031, tv_nsec = 0}, st_mtim = {tv_sec = 1267467606, tv_nsec = 0}, st_ctim = { tv_sec = 1267494728, tv_nsec = 0}, __unused = {0, 0, 0}} postparent = {st_dev = 5439812256340639918, st_ino = 4177921, st_nlink = 4, st_mode = 17901, st_uid = 511, st_gid = 516, __pad0 = 0, st_rdev = 0, st_size = 4096, st_blksize = 4096, st_blocks = 16, st_atim = { tv_sec = 1267495031, tv_nsec = 0}, st_mtim = {tv_sec = 1267495072, tv_nsec = 0}, st_ctim = { tv_sec = 1267495072, tv_nsec = 0}, __unused = {0, 0, 0}} #3 0x00007f37bc9f87ca in protocol_client_pollin (this=0x1ec6480, trans=0x1ed0360) at client-protocol.c:6809 conf = 0x1ed02e0 ret = 0 iobuf = 0x0 hdr = 0x1ed9da0 "" hdrlen = 268 #4 0x00007f37bca07568 in notify 
(this=0x7f37b4008550, event=<value optimized out>, data=0x1ed0360) at client-protocol.c:6928 ret = <value optimized out> was_not_down = <value optimized out> conn = <value optimized out> conf = 0x1ed02e0 parent = <value optimized out> ---Type <return> to continue, or q <return> to quit--- __FUNCTION__ = "notify" #5 0x00007f37bdbc23d3 in xlator_notify (xl=0x1ec6480, event=2, data=0x1ed0360) at xlator.c:923 old_THIS = 0x7f37bddf26a0 ret = <value optimized out> #6 0x00007f37baf4d1e8 in socket_event_handler (fd=<value optimized out>, idx=2, data=0x1ed0360, poll_in=1, poll_out=0, poll_err=<value optimized out>) at socket.c:829 priv = 0x1ed0720 ret = 0 #7 0x00007f37bdbdc82d in event_dispatch_epoll_handler (event_pool=0x1ec1090) at event.c:804 data = <value optimized out> idx = <value optimized out> event_data = 0x1ed31c4 handler = 0xe #8 event_dispatch_epoll (event_pool=0x1ec1090) at event.c:867 events = 0x1ed31c0 i = 0 ret = 1 __FUNCTION__ = "event_dispatch_epoll" #9 0x0000000000404132 in main (argc=<value optimized out>, argv=<value optimized out>) at glusterfsd.c:1413 ctx = 0x1ec0010 stbuf = {st_dev = 3620248, st_ino = 3639464, st_nlink = 1503232, st_mode = 3, st_uid = 0, st_gid = 0, __pad0 = 0, st_rdev = 94208, st_size = 91468, st_blksize = 91468, st_blocks = 0, st_atim = {tv_sec = 5, tv_nsec = 2187264}, st_mtim = {tv_sec = 2195456, tv_nsec = 2191968}, st_ctim = {tv_sec = 2208640, tv_nsec = 90112}, __unused = {3, 2105344, 2113536}} tmp_logfile = '\000' <repeats 1023 times> tmp_logfile_dyn = <value optimized out> tmp_logfilebase = <value optimized out> timestr = '\000' <repeats 255 times> utime = 1267494071 tm = <value optimized out> ret = 0 lim = {rlim_cur = 18446744073709551615, rlim_max = 18446744073709551615} specfp = <value optimized out> graph = 0x0 trav = <value optimized out> fuse_volume_found = <value optimized out> xl_count = <value optimized out> pipe_fd = {6, 7} gf_success = 0 gf_failure = -1 __FUNCTION__ = "main"
Thanks for the info. The problem appears to be unchecked access to the preparent and postparent variables. A patch to fix this will be in the repository soon.
Excellent. If you need any additional traces or tests, please let me know. Cheers.
Clients (F12) cannot delete over a RAID 10 system (six nodes). Trace is : frame : type(1) op(UNLINK) frame : type(1) op(UNLINK) patchset: v3.0.2 signal received: 11 time of crash: 2010-03-01 20:02:07 configuration details: argp 1 backtrace 1 dlfcn 1 fdatasync 1 libpthread 1 llistxattr 1 setfsid 1 spinlock 1 epoll.h 1 xattr.h 1 st_atim.tv_nsec 1 package-string: glusterfs 3.0.2 /lib64/libc.so.6(+0x32740)[0x7f10f7641740] /usr/lib64/glusterfs/3.0.2/xlator/cluster/replicate.so(afr_unlink_wind_cbk+0x10a)[0x7f10f679ea8a] /usr/lib64/glusterfs/3.0.2/xlator/cluster/distribute.so(dht_unlink_cbk+0xd1)[0x7f10f69d7f21] /usr/lib64/glusterfs/3.0.2/xlator/protocol/client.so(client_unlink_cbk+0x27a)[0x7f10f6c06f8a] /usr/lib64/glusterfs/3.0.2/xlator/protocol/client.so(protocol_client_pollin+0xca)[0x7f10f6bf27ca] /usr/lib64/glusterfs/3.0.2/xlator/protocol/client.so(notify+0xe8)[0x7f10f6c01568] /usr/lib64/libglusterfs.so.0(xlator_notify+0x43)[0x7f10f7dbc3d3] /usr/lib64/glusterfs/3.0.2/transport/socket.so(socket_event_handler+0xc8)[0x7f10f51471e8] /usr/lib64/libglusterfs.so.0(+0x2e82d)[0x7f10f7dd682d] /usr/sbin/glusterfs(main+0x852)[0x404132] /lib64/libc.so.6(__libc_start_main+0xfd)[0x7f10f762db1d] /usr/sbin/glusterfs[0x4026f9] Same trace on both clients. Server logs show nothing abnormal.
Please use only a configuration generated by glusterfs-volgen (refer to the documentation).
PATCH: http://patches.gluster.com/patch/2873 in master (dht: preserve and return proper pre/postparent structures during unlink)
PATCH: http://patches.gluster.com/patch/2872 in release-3.0 (dht: preserve and return proper pre/postparent structures during unlink)