Hide Forgot
Two locks in picture a) inode_table_t->lock and b) pl_inode_t->lock. The process has deadlocked itself in the middle of a process state dump. - [Thread 5] State dump has first locked the inode table (a) to loop over inode list and while dumping the inode details has locked pl_inode_t->lock (b). - [Thread 1] pl_update_refkeeper() has held lock on pl_inode_t->lock (b) followed by inode_ref which locks inode_table_t->lock (a). (gdb) thread apply all bt Thread 5 (Thread 0x7fb7d4c80700 (LWP 2269)): #0 0x00007fb7d5e0f464 in __lll_lock_wait () from /lib/libpthread.so.0 #1 0x00007fb7d5e0a5d9 in _L_lock_953 () from /lib/libpthread.so.0 #2 0x00007fb7d5e0a3fb in pthread_mutex_lock () from /lib/libpthread.so.0 #3 0x00007fb7d2f41ae5 in get_entrylk_count (this=0xa4db90, inode=<value optimized out>) at entrylk.c:811 #4 0x00007fb7d2f419b6 in pl_dump_inode_priv (this=0xa4db90, inode=0x7fb7d699e88c) at posix.c:1728 #5 0x00007fb7d667b1bf in inode_dump (inode=0x7fb7d699e88c, prefix=0x7fb7d4c7dbe0 "xlator.protocol.server.conn.1.bound_xl./g01.active.544") at inode.c:1566 #6 0x00007fb7d667b4c7 in inode_table_dump (itable=0xa57220, prefix=0x7fb7d4c7ec40 "xlator.protocol.server.conn.1.bound_xl./g01") at inode.c:1623 #7 0x00007fb7d26ca686 in server_inode (this=<value optimized out>) at server.c:294 #8 0x00007fb7d6691045 in gf_proc_dump_xlator_info (this_xl=0xa521e0) at statedump.c:269 #9 0x00007fb7d6691321 in gf_proc_dump_info (signum=<value optimized out>) at statedump.c:436 #10 0x0000000000405402 in glusterfs_sigwaiter (arg=<value optimized out>) at glusterfsd.c:1271 #11 0x00007fb7d5e079ca in start_thread () from /lib/libpthread.so.0 #12 0x00007fb7d5b6470d in clone () from /lib/libc.so.6 #13 0x0000000000000000 in ?? () Thread 4 (Thread 0x7fb7d3d72700 (LWP 2271)): #0 0x00007fb7d5b2839d in nanosleep () from /lib/libc.so.6 #1 0x00007fb7d5b5d844 in usleep () from /lib/libc.so.6 #2 0x00007fb7d667946a in gf_timer_proc (ctx=0xa41010) at timer.c:181 #3 0x00007fb7d5e079ca in start_thread () from /lib/libpthread.so.0 #4 0x00007fb7d5b6470d in clone () from /lib/libc.so.6 #5 0x0000000000000000 in ?? () Thread 3 (Thread 0x7fb7d23bb700 (LWP 2290)): #0 0x00007fb7d5e0cbc9 in pthread_cond_timedwait@@GLIBC_2.3.2 () from /lib/libpthread.so.0 #1 0x00007fb7d335cbbc in janitor_get_next_fd (data=<value optimized out>) at posix.c:1408 #2 posix_janitor_thread_proc (data=<value optimized out>) at posix.c:1454 #3 0x00007fb7d5e079ca in start_thread () from /lib/libpthread.so.0 #4 0x00007fb7d5b6470d in clone () from /lib/libc.so.6 #5 0x0000000000000000 in ?? () Thread 2 (Thread 0x7fb7d1564700 (LWP 2304)): #0 0x00007fb7d5e0cbc9 in pthread_cond_timedwait@@GLIBC_2.3.2 () from /lib/libpthread.so.0 #1 0x00007fb7d2d32740 in iot_worker (data=0xa556e0) at io-threads.c:101 #2 0x00007fb7d5e079ca in start_thread () from /lib/libpthread.so.0 #3 0x00007fb7d5b6470d in clone () from /lib/libc.so.6 #4 0x0000000000000000 in ?? () Thread 1 (Thread 0x7fb7d6ac4700 (LWP 2268)): #0 0x00007fb7d5e0f464 in __lll_lock_wait () from /lib/libpthread.so.0 #1 0x00007fb7d5e0a5d9 in _L_lock_953 () from /lib/libpthread.so.0 #2 0x00007fb7d5e0a3fb in pthread_mutex_lock () from /lib/libpthread.so.0 #3 0x00007fb7d667a151 in inode_ref (inode=0x7fb7d699e88c) at inode.c:473 #4 0x00007fb7d2f3aaa2 in pl_update_refkeeper (this=<value optimized out>, inode=0x7fb7d699e88c) at common.c:416 #5 0x00007fb7d2f441b2 in pl_common_inodelk (frame=0x7fb7d4f8fa70, this=<value optimized out>, volume=0x7fb7c80114a8 " \025\001ȷ\177", inode=0xffffffffffffffff, cmd=10842656, flock=0x7fb7cc027318, loc=0x0, fd=0x7fb7d1969ee8) at inodelk.c:648 #6 0x00007fb7d2f44add in pl_finodelk (frame=0xa57220, this=0x80, volume=0x7fb7c80114a8 " \025\001ȷ\177", fd=<value optimized out>, cmd=10842656, flock=0x8dc) at inodelk.c:672 #7 0x00007fb7d666c746 in default_finodelk (frame=<value optimized out>, this=0xa4ebe0, volume=<value optimized out>, fd=0x7fb7d1969ee8, cmd=6, lock=0x7fb7cc027318) at defaults.c:1070 #8 0x00007fb7d666c746 in default_finodelk (frame=<value optimized out>, this=0xa4fca0, volume=<value optimized out>, fd=0x7fb7d1969ee8, cmd=6, lock=0x7fb7cc027318) at defaults.c:1070 #9 0x00007fb7d28f8c63 in io_stats_finodelk (frame=<value optimized out>, this=0xa50ff0, volume=<value optimized out>, fd=0x7fb7d1969ee8, cmd=6, flock=0x7fb7cc027318) at io-stats.c:1701 #10 0x00007fb7d26d7ff1 in server_finodelk_resume (frame=<value optimized out>, bound_xl=0xa50ff0) at server3_1-fops.c:1972 #11 0x00007fb7d26cacd3 in server_resolve_done (frame=0x7fb7d4d6b614) at server-resolve.c:597 #12 0x00007fb7d26cb905 in server_resolve_all (frame=0xa57220) at server-resolve.c:632 #13 0x00007fb7d26cb836 in server_resolve (frame=0x7fb7d4d6b614) at server-resolve.c:579 #14 0x00007fb7d26cb8fe in server_resolve_all (frame=0xa57220) at server-resolve.c:628 #15 0x00007fb7d26cbafb in server_resolve_fd (frame=0x7fb7d4d6b614) at server-resolve.c:540 #16 0x00007fb7d26cb74b in server_resolve (frame=0x7fb7d4d6b614) at server-resolve.c:557 #17 0x00007fb7d26cb8de in server_resolve_all (frame=0xa57220) at server-resolve.c:621 #18 0x00007fb7d26cbaa4 in resolve_and_resume (frame=0xa57220, fn=0x80) at server-resolve.c:651 #19 0x00007fb7d26da723 in server_finodelk (req=<value optimized out>) at server3_1-fops.c:4494 #20 0x00007fb7d644127f in rpcsvc_handle_rpc_call (svc=0xa53970, trans=<value optimized out>, msg=<value optimized out>) at rpcsvc.c:480 #21 0x00007fb7d64413e3 in rpcsvc_notify (trans=0x7fb7c402f170, mydata=0xa57220, event=<value optimized out>, data=0xffffffffffffffff) at rpcsvc.c:576 #22 0x00007fb7d6441d98 in rpc_transport_notify (this=0xa57220, event=<value optimized out>, data=<value optimized out>) at rpc-transport.c:919 #23 0x00007fb7d41d0044 in socket_event_poll_in (this=0x7fb7c402f170) at socket.c:1647 #24 0x00007fb7d41d0127 in socket_event_handler (fd=<value optimized out>, idx=39, data=0x7fb7c402f170, poll_in=1, poll_out=0, poll_err=<value optimized out>) at socket.c:1762 #25 0x00007fb7d668b904 in event_dispatch_epoll_handler (event_pool=0xa41350) at event.c:794 #26 event_dispatch_epoll (event_pool=0xa41350) at event.c:856 #27 0x00000000004063ce in main (argc=17, argv=0x7fff32c81658) at glusterfsd.c:1509 (gdb) thread apply all bt full Thread 5 (Thread 0x7fb7d4c80700 (LWP 2269)): #0 0x00007fb7d5e0f464 in __lll_lock_wait () from /lib/libpthread.so.0 No symbol table info available. #1 0x00007fb7d5e0a5d9 in _L_lock_953 () from /lib/libpthread.so.0 No symbol table info available. #2 0x00007fb7d5e0a3fb in pthread_mutex_lock () from /lib/libpthread.so.0 No symbol table info available. #3 0x00007fb7d2f41ae5 in get_entrylk_count (this=0xa4db90, inode=<value optimized out>) at entrylk.c:811 tmp_pl_inode = 140427311256704 ret = <value optimized out> count = 0 #4 0x00007fb7d2f419b6 in pl_dump_inode_priv (this=0xa4db90, inode=0x7fb7d699e88c) at posix.c:1728 ret = 0 tmp_pl_inode = 140427311256704 pl_inode = 0x7fb7c8011480 key = "xlator.feature.locks.inode.544079874.mandatory", '\000' <repeats 3866 times>, "\v\000\000\000\000\000\000\000\320\000\000\000\000\000\000\000\v", '\000' <repeats 16 times>, "P\210\256,\000;\t@\316\337շ\177\000\000\214\350\231ַ\177\000\000\030\000\000\000\000\000\000\000@\316\337շ\177\000\000\360\033\004\000\000\000\000\000\020\204\303\000\000\000\000\000@\316\337շ\177\000\000Tįշ\177\000\000\003B\000\000\267\177\000\000\205\000\000\000\267\177\000\000\005\000\000\000\000\000\000\000d\000\000\000\267\177\000\000n\000\000\000\060\000\000\000\214\350\231ַ\177\000\000\030\000\000\000\000\000\000\000\001\000\000\000\000\000\000\000-\000\000\000\000\000\000\000\340\333\307Է\177\000" count = <value optimized out> __FUNCTION__ = "pl_dump_inode_priv" #5 0x00007fb7d667b1bf in inode_dump (inode=0x7fb7d699e88c, prefix=0x7fb7d4c7dbe0 "xlator.protocol.server.conn.1.bound_xl./g01.active.544") at inode.c:1566 key = "xlator.protocol.server.conn.1.bound_xl./g01.active.544.ia_type", '\000' <repeats 3890 times>, "\035dnҷ\177\000\000\000P\210\256,\000;\t\000\000\000\000\000\000\000\000\246\347iַ\177\000\000\026\334\307Է\177\000\000\246\347iַ\177\000\000\000\000\000\000\000\000\000\000\246\347iַ\177\000\000(\000\000\000\060\000\000\000\060\354\307Է\177\000\000`\353\307Է\177\000\000\"\niַ\177\000\000\061.bound_xl./g01", '\000' <repeats 32 times> ret = <value optimized out> i = 1 fd = <value optimized out> inode_ctx = 0xba97e0 fd_wrapper = <value optimized out> tmp = <value optimized out> fd_list = {next = 0xa7a1a8, prev = 0xa7a1a8} __FUNCTION__ = "inode_dump" #6 0x00007fb7d667b4c7 in inode_table_dump (itable=0xa57220, prefix=0x7fb7d4c7ec40 "xlator.protocol.server.conn.1.bound_xl./g01") at inode.c:1623 i = 545 inode = 0x7fb7d699e88c key = "xlator.protocol.server.conn.1.bound_xl./g01.active.544", '\000' <repeats 4041 times> ret = <value optimized out> __FUNCTION__ = "inode_table_dump" #7 0x00007fb7d26ca686 in server_inode (this=<value optimized out>) at server.c:294 conf = <value optimized out> trav = 0x7fb7bc0215d0 key = "xlator.protocol.server.conn.1.bound_xl./g01\000-bytes-write", '\000' <repeats 4039 times> i = 2 ret = <value optimized out> __FUNCTION__ = "server_inode" #8 0x00007fb7d6691045 in gf_proc_dump_xlator_info (this_xl=0xa521e0) at statedump.c:269 ctx = 0xa41010 this_xlator = <value optimized out> #9 0x00007fb7d6691321 in gf_proc_dump_info (signum=<value optimized out>) at statedump.c:436 ret = <value optimized out> ctx = <value optimized out> #10 0x0000000000405402 in glusterfs_sigwaiter (arg=<value optimized out>) at glusterfsd.c:1271 set = {__val = {18947, 0 <repeats 15 times>}} ret = <value optimized out> sig = 10 #11 0x00007fb7d5e079ca in start_thread () from /lib/libpthread.so.0 No symbol table info available. #12 0x00007fb7d5b6470d in clone () from /lib/libc.so.6 No symbol table info available. #13 0x0000000000000000 in ?? () No symbol table info available. Thread 4 (Thread 0x7fb7d3d72700 (LWP 2271)): #0 0x00007fb7d5b2839d in nanosleep () from /lib/libc.so.6 No symbol table info available. #1 0x00007fb7d5b5d844 in usleep () from /lib/libc.so.6 No symbol table info available. #2 0x00007fb7d667946a in gf_timer_proc (ctx=0xa41010) at timer.c:181 now = 1319223961961762 now_tv = {tv_sec = 1319223961, tv_usec = 961762} event = 0x7fb7b400a730 reg = 0xa45920 __FUNCTION__ = "gf_timer_proc" #3 0x00007fb7d5e079ca in start_thread () from /lib/libpthread.so.0 No symbol table info available. #4 0x00007fb7d5b6470d in clone () from /lib/libc.so.6 No symbol table info available. #5 0x0000000000000000 in ?? () No symbol table info available. Thread 3 (Thread 0x7fb7d23bb700 (LWP 2290)): #0 0x00007fb7d5e0cbc9 in pthread_cond_timedwait@@GLIBC_2.3.2 () from /lib/libpthread.so.0 No symbol table info available. #1 0x00007fb7d335cbbc in janitor_get_next_fd (data=<value optimized out>) at posix.c:1408 priv = 0xa55ae0 timeout = {tv_sec = 1319224513, tv_nsec = 0} #2 posix_janitor_thread_proc (data=<value optimized out>) at posix.c:1454 this = 0xa4b880 priv = 0xa55ae0 pfd = <value optimized out> now = 1319223913 __FUNCTION__ = "posix_janitor_thread_proc" #3 0x00007fb7d5e079ca in start_thread () from /lib/libpthread.so.0 No symbol table info available. #4 0x00007fb7d5b6470d in clone () from /lib/libc.so.6 No symbol table info available. #5 0x0000000000000000 in ?? () No symbol table info available. Thread 2 (Thread 0x7fb7d1564700 (LWP 2304)): #0 0x00007fb7d5e0cbc9 in pthread_cond_timedwait@@GLIBC_2.3.2 () from /lib/libpthread.so.0 No symbol table info available. #1 0x00007fb7d2d32740 in iot_worker (data=0xa556e0) at io-threads.c:101 this = <value optimized out> stub = 0x0 sleep_till = {tv_sec = 1319224034, tv_nsec = 0} ret = <value optimized out> timeout = 0 '\000' bye = 0 '\000' __FUNCTION__ = "iot_worker" #2 0x00007fb7d5e079ca in start_thread () from /lib/libpthread.so.0 No symbol table info available. #3 0x00007fb7d5b6470d in clone () from /lib/libc.so.6 No symbol table info available. #4 0x0000000000000000 in ?? () No symbol table info available. Thread 1 (Thread 0x7fb7d6ac4700 (LWP 2268)): #0 0x00007fb7d5e0f464 in __lll_lock_wait () from /lib/libpthread.so.0 No symbol table info available. #1 0x00007fb7d5e0a5d9 in _L_lock_953 () from /lib/libpthread.so.0 No symbol table info available. #2 0x00007fb7d5e0a3fb in pthread_mutex_lock () from /lib/libpthread.so.0 No symbol table info available. #3 0x00007fb7d667a151 in inode_ref (inode=0x7fb7d699e88c) at inode.c:473 No locals. #4 0x00007fb7d2f3aaa2 in pl_update_refkeeper (this=<value optimized out>, inode=0x7fb7d699e88c) at common.c:416 is_empty = -512 #5 0x00007fb7d2f441b2 in pl_common_inodelk (frame=0x7fb7d4f8fa70, this=<value optimized out>, volume=0x7fb7c80114a8 " \025\001ȷ\177", inode=0xffffffffffffffff, cmd=10842656, flock=0x7fb7cc027318, loc=0x0, fd=0x7fb7d1969ee8) at inodelk.c:648 op_ret = 0 op_errno = 0 ret = <value optimized out> can_block = <value optimized out> transport = 0x7fb7cc058460 client_pid = <value optimized out> owner = <value optimized out> pinode = 0x7fb7c8011480 reqlock = 0x7fb7cc058460 dom = 0x7fb7c8011520 __FUNCTION__ = "pl_common_inodelk" #6 0x00007fb7d2f44add in pl_finodelk (frame=0xa57220, this=0x80, volume=0x7fb7c80114a8 " \025\001ȷ\177", fd=<value optimized out>, cmd=10842656, flock=0x8dc) at inodelk.c:672 No locals. #7 0x00007fb7d666c746 in default_finodelk (frame=<value optimized out>, this=0xa4ebe0, volume=<value optimized out>, fd=0x7fb7d1969ee8, cmd=6, lock=0x7fb7cc027318) at defaults.c:1070 _new = <value optimized out> old_THIS = 0xa4ebe0 __FUNCTION__ = "default_finodelk" #8 0x00007fb7d666c746 in default_finodelk (frame=<value optimized out>, this=0xa4fca0, volume=<value optimized out>, fd=0x7fb7d1969ee8, cmd=6, lock=0x7fb7cc027318) at defaults.c:1070 _new = <value optimized out> old_THIS = 0xa4fca0 __FUNCTION__ = "default_finodelk" #9 0x00007fb7d28f8c63 in io_stats_finodelk (frame=<value optimized out>, this=0xa50ff0, volume=<value optimized out>, fd=0x7fb7d1969ee8, cmd=6, flock=0x7fb7cc027318) at io-stats.c:1701 _new = <value optimized out> old_THIS = 0xa50ff0 __FUNCTION__ = "io_stats_finodelk" #10 0x00007fb7d26d7ff1 in server_finodelk_resume (frame=<value optimized out>, bound_xl=0xa50ff0) at server3_1-fops.c:1972 _new = <value optimized out> old_THIS = 0xa521e0 state = 0x7fb7cc026ee0 __FUNCTION__ = "server_finodelk_resume" #11 0x00007fb7d26cacd3 in server_resolve_done (frame=0x7fb7d4d6b614) at server-resolve.c:597 state = 0x7fb7cc026ee0 bound_xl = 0xa50ff0 #12 0x00007fb7d26cb905 in server_resolve_all (frame=0xa57220) at server-resolve.c:632 state = <value optimized out> this = 0xfffffffffffffe00 __FUNCTION__ = "server_resolve_all" #13 0x00007fb7d26cb836 in server_resolve (frame=0x7fb7d4d6b614) at server-resolve.c:579 resolve = 0x7fb7cc027050 __FUNCTION__ = "server_resolve" #14 0x00007fb7d26cb8fe in server_resolve_all (frame=0xa57220) at server-resolve.c:628 state = 0x7fb7c80114a8 this = <value optimized out> __FUNCTION__ = "server_resolve_all" #15 0x00007fb7d26cbafb in server_resolve_fd (frame=0x7fb7d4d6b614) at server-resolve.c:540 state = 0x7fb7cc026ee0 resolve = 0x7fb7cc026f90 __FUNCTION__ = "server_resolve_fd" #16 0x00007fb7d26cb74b in server_resolve (frame=0x7fb7d4d6b614) at server-resolve.c:557 resolve = 0x7fb7cc026f90 __FUNCTION__ = "server_resolve" #17 0x00007fb7d26cb8de in server_resolve_all (frame=0xa57220) at server-resolve.c:621 state = 0x7fb7c80114a8 this = <value optimized out> __FUNCTION__ = "server_resolve_all" #18 0x00007fb7d26cbaa4 in resolve_and_resume (frame=0xa57220, fn=0x80) at server-resolve.c:651 No locals. #19 0x00007fb7d26da723 in server_finodelk (req=<value optimized out>) at server3_1-fops.c:4494 state = 0x7fb7cc026ee0 frame = <value optimized out> args = {gfid = '\000' <repeats 15 times>, fd = 0, cmd = 1, type = 1, flock = {type = 1, whence = 0, start = 5987348480, len = 4096, pid = 0, owner = 0}, volume = 0x7fff32c80330 "images-replicate-2"} ret = <value optimized out> #20 0x00007fb7d644127f in rpcsvc_handle_rpc_call (svc=0xa53970, trans=<value optimized out>, msg=<value optimized out>) at rpcsvc.c:480 actor = <value optimized out> req = 0x7fb7d69cee24 ret = <value optimized out> port = <value optimized out> __FUNCTION__ = "rpcsvc_handle_rpc_call" #21 0x00007fb7d64413e3 in rpcsvc_notify (trans=0x7fb7c402f170, mydata=0xa57220, event=<value optimized out>, data=0xffffffffffffffff) at rpcsvc.c:576 ret = <value optimized out> listener = <value optimized out> __FUNCTION__ = "rpcsvc_notify" #22 0x00007fb7d6441d98 in rpc_transport_notify (this=0xa57220, event=<value optimized out>, data=<value optimized out>) at rpc-transport.c:919 ret = <value optimized out> __FUNCTION__ = "rpc_transport_notify" #23 0x00007fb7d41d0044 in socket_event_poll_in (this=0x7fb7c402f170) at socket.c:1647 ret = -512 pollin = 0x7fb7cc058390 #24 0x00007fb7d41d0127 in socket_event_handler (fd=<value optimized out>, idx=39, data=0x7fb7c402f170, poll_in=1, poll_out=0, poll_err=<value optimized out>) at socket.c:1762 priv = 0x7fb7c402f440 ret = -512 __FUNCTION__ = "socket_event_handler" #25 0x00007fb7d668b904 in event_dispatch_epoll_handler (event_pool=0xa41350) at event.c:794 data = <value optimized out> idx = <value optimized out> event_data = 0x202 handler = 0x7fb7d41d0060 <socket_event_handler> #26 event_dispatch_epoll (event_pool=0xa41350) at event.c:856 events = 0xa45af0 i = 0 ret = <value optimized out> __FUNCTION__ = "event_dispatch_epoll" #27 0x00000000004063ce in main (argc=17, argv=0x7fff32c81658) at glusterfsd.c:1509 ctx = <value optimized out> ret = -512 __FUNCTION__ = "main"
CHANGE: http://review.gluster.com/629 (Performing inode_ref inside the critical section of pl_update_refkeeper) merged in release-3.2 by Vijay Bellur (vijay)
CHANGE: http://review.gluster.com/628 (Performing inode_ref inside the critical section of pl_update_refkeeper) merged in master by Vijay Bellur (vijay)
CHANGE: http://review.gluster.com/630 (Performing inode_ref inside the critical section of pl_update_refkeeper) merged in release-3.1 by Vijay Bellur (vijay)
Its fixed now. Checked with glusterfs-3.2.5qa4. Applied the below diff to both glusterfs-3.2.4 and glusterfs-3.2.5qa4. Started untarring on the mount point and at the same time took the statedump of the servers. On 3.2.4 the untarring was not at all conrinuing due to the deadlock. But in glusterfs-3.2.5qa4 untarring was continuing after the sleep time was over. tar -xvf /home/raghu/work/glusterfs_tarballs/glusterfs-3.2.1.tar.gz glusterfs-3.2.1/ glusterfs-3.2.1/NEWS git diff diff --git a/xlators/features/locks/src/common.c b/xlators/features/locks/src/common.c index 5e80c16..f63aecc 100644 --- a/xlators/features/locks/src/common.c +++ b/xlators/features/locks/src/common.c @@ -413,6 +413,8 @@ pl_update_refkeeper (xlator_t *this, inode_t *inode) pl_inode->refkeeper = NULL; } + sleep (30); + if (!is_empty && !pl_inode->refkeeper) { need_ref = 1; pl_inode->refkeeper = inode; raghu@hyperspace:~/work/src/glusterfs.git/build/xlators/features/locks/src$