Bug 1209143

Summary: Freezing task with active RPC hangs other tasks trying to do rpc
Product: Red Hat Enterprise Linux 6
Reporter: Frank Sorenson <fsorenso>
Component: kernel
Kernel sub component: NFS
Assignee: nfs-maint
QA Contact: Filesystem QE <fs-qe>
Status: CLOSED WONTFIX
Severity: high
Priority: unspecified
CC: bcodding, dwysocha, fsorenso, mdonauer, steved
Version: 6.6
Target Milestone: rc
Hardware: x86_64
OS: Linux
Doc Type: Bug Fix
Type: Bug
Last Closed: 2015-10-24 11:40:44 UTC
Bug Blocks: 1172231
Description
Frank Sorenson
2015-04-06 12:44:29 UTC
Created attachment 1011378 [details]
dmesg with rpcdebug (and a few lines of systemtap annotation)
Created attachment 1011379 [details]
systemtap output, with the result of the xprt_reserve_xprt() call and backtraces
Here are the results from 3 customer vmcores: 805293386, 305011377, 329596338

* tasks making rpc calls are blocked waiting for the rpc_task to execute
* the rpc_xprt has a backlog in the 'sending' rpc_wait_queue, and the rpc_xprt.state indicates that it is not connected
* tcp status shows the socket is in TCP_CLOSE, and analysis of previous tcpdumps indicates that the connection was being shut down
* the 'owner' of the rpc_wait_queue is a task that is being frozen
* rpc_task is stuck in the FSM in __rpc_execute (tk_runstate is 5 = RUNNING | ACTIVE, while all other rpc_tasks are 6 = QUEUED | ACTIVE; a small decoding helper follows this list)

(as this is easy to reproduce, additional vmcores can be provided if necessary)
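For reference when reading the tk_runstate values in the vmcore output below: the run-state bits in include/linux/sunrpc/sched.h of this era are RPC_TASK_RUNNING (bit 0), RPC_TASK_QUEUED (bit 1), and RPC_TASK_ACTIVE (bit 2). The following throwaway helper is illustrative only (it is not part of the original report) and simply decodes the numeric values dumped by crash under that assumption:

decode_tk_runstate:
=============================================================
#!/bin/bash
# Illustrative helper, not from the original report: decode rpc_task.tk_runstate
# values as dumped by crash, assuming the RHEL 6-era bit layout from
# include/linux/sunrpc/sched.h (RPC_TASK_RUNNING=0, RPC_TASK_QUEUED=1, RPC_TASK_ACTIVE=2).
val=$(( ${1:-5} ))
names=(RUNNING QUEUED ACTIVE)
decoded=""
for bit in 0 1 2 ; do
	(( (val >> bit) & 1 )) && decoded+="${names[$bit]} "
done
echo "tk_runstate = $val -> ${decoded:-<none>}"

# example:
#   ./decode_tk_runstate 5   ->  RUNNING ACTIVE   (the stuck owner of the transport)
#   ./decode_tk_runstate 6   ->  QUEUED ACTIVE    (everything waiting behind it)
=============================================================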
==== vmcore 805293386

crash> list -H all_clients -o rpc_clnt.cl_clients -s rpc_clnt.cl_server,cl_vers,cl_protname,cl_xprt | egrep -e ^f -e cl_server -e cl_vers -e cl_protname -e cl_program -e cl_xprt | paste -s - -d ' \n' | grep -w nfs
ffff880872099e00 cl_server = 0xffff88086f96ff68 "manetai" cl_vers = 3 cl_protname = 0xffffffffa03be651 "nfs" cl_xprt = 0xffff8808714a5000
ffff8806c9ecc800 cl_server = 0xffff88086f96ff68 "manetai" cl_vers = 3 cl_protname = 0xffffffffa03be651 "nfs" cl_xprt = 0xffff8808714a5000
ffff88086f8bae00 cl_server = 0xffff88086f96ff68 "manetai" cl_vers = 3 cl_protname = 0xffffffffa03be651 "nfs" cl_xprt = 0xffff8808714a5000
ffff88087214d000 cl_server = 0xffff88086f96ff68 "manetai" cl_vers = 3 cl_protname = 0xffffffffa03be651 "nfs" cl_xprt = 0xffff8808714a5000
ffff88086f96fe00 cl_server = 0xffff88086f96ff68 "manetai" cl_vers = 3 cl_protname = 0xffffffffa03be651 "nfs" cl_xprt = 0xffff8808714a5000

crash> rpc_xprt.address_strings 0xffff8808714a5000
address_strings = {0xffff880871930820 "10.67.6.11", 0xffff880871930840 "2049", 0xffffffffa02ffee3 "tcp", 0xffff880871930860 "0a43060b", 0xffff880871930880 " 801", 0xffffffffa02ffee3 "tcp"}

crash> rpc_xprt.sending 0xffff8808714a5000 -o
struct rpc_xprt {
  [ffff8808714a5190] struct rpc_wait_queue sending;

crash> rpc_wait_queue.owner,qlen ffff8808714a5190 -d
  owner = 20168
  qlen = 110

look for tasks which are freezing:

crash> foreach task -R flags | paste -s - -d' \n' > fe_task-flags
crash> !egrep '(4|6|c)[0-9a-f]{3},' fe_task-flags
PID: 19703  TASK: ffff8802ca22f500  CPU: 0   COMMAND: "tcsh"     flags = 0x416100, flags = 0x80,
PID: 19911  TASK: ffff881074dcb540  CPU: 9   COMMAND: "arch"     flags = 0x416000, flags = 0x80,
PID: 20167  TASK: ffff880870bdd500  CPU: 9   COMMAND: "arch"     flags = 0x416040, flags = 0x80,
PID: 20168  TASK: ffff88082a913540  CPU: 16  COMMAND: "manpath"  flags = 0x416000, flags = 0x80,

crash> ps -p 20168
PID: 0      TASK: ffffffff81a8d020  CPU: 0   COMMAND: "swapper"
PID: 1      TASK: ffff8808748d5500  CPU: 17  COMMAND: "init"
PID: 3128   TASK: ffff881072fd1500  CPU: 16  COMMAND: "sge_execd"
PID: 19685  TASK: ffff8803608f0080  CPU: 0   COMMAND: "sge_shepherd"
PID: 19703  TASK: ffff8802ca22f500  CPU: 0   COMMAND: "tcsh"
PID: 19911  TASK: ffff881074dcb540  CPU: 9   COMMAND: "arch"
PID: 20167  TASK: ffff880870bdd500  CPU: 9   COMMAND: "arch"
PID: 20168  TASK: ffff88082a913540  CPU: 16  COMMAND: "manpath"

crash> p *(((struct task_struct *)0xffff8802ca22f500)->cgroups.subsys[6])
$4 = {
  cgroup = 0xffff8802ca0e9000,
crash> p *(((struct task_struct *)0xffff881074dcb540)->cgroups.subsys[6])
$3 = {
  cgroup = 0xffff8802ca0e9000,
crash> p *(((struct task_struct *)0xffff880870bdd500)->cgroups.subsys[6])
$2 = {
  cgroup = 0xffff8802ca0e9000,
crash> p *(((struct task_struct *)0xffff88082a913540)->cgroups.subsys[6])
$1 = {
  cgroup = 0xffff8802ca0e9000,

crash> lscgroup 0xffff8802ca0e9000
CGROUP           CONTROLLER:PATH
ffff8802ca0e9000 freezer:/UGE/5891797.1

pid 20168 owns the rpc_wait_queue, and is freezing

PID: 19703  TASK: ffff8802ca22f500  CPU: 0   COMMAND: "tcsh"
 #0 [ffff88061d357c88] schedule at ffffffff81529990
 #1 [ffff88061d357d60] refrigerator at ffffffff810a8875
 #2 [ffff88061d357d90] get_signal_to_deliver at ffffffff810903b5
 #3 [ffff88061d357e30] do_signal at ffffffff8100a265
 #4 [ffff88061d357f30] do_notify_resume at ffffffff8100aa80
 #5 [ffff88061d357f50] int_signal at ffffffff8100b341

PID: 19911  TASK: ffff881074dcb540  CPU: 9   COMMAND: "arch"
 #0 [ffff881030c7dc88] schedule at ffffffff81529990
 #1 [ffff881030c7dd60] refrigerator at ffffffff810a8875
 #2 [ffff881030c7dd90] get_signal_to_deliver at ffffffff810903b5
 #3 [ffff881030c7de30] do_signal at ffffffff8100a265
 #4 [ffff881030c7df30] do_notify_resume at ffffffff8100aa80
 #5 [ffff881030c7df50] int_signal at ffffffff8100b341

PID: 20167  TASK: ffff880870bdd500  CPU: 9   COMMAND: "arch"
 #0 [ffff8801fd259c88] schedule at ffffffff81529990
 #1 [ffff8801fd259d60] refrigerator at ffffffff810a8875
 #2 [ffff8801fd259d90] get_signal_to_deliver at ffffffff810903b5
 #3 [ffff8801fd259e30] do_signal at ffffffff8100a265
 #4 [ffff8801fd259f30] do_notify_resume at ffffffff8100aa80
 #5 [ffff8801fd259f50] int_signal at ffffffff8100b341

PID: 20168  TASK: ffff88082a913540  CPU: 16  COMMAND: "manpath"
 #0 [ffff88061c3637e8] schedule at ffffffff81529990
 #1 [ffff88061c3638c0] refrigerator at ffffffff810a8875
 #2 [ffff88061c3638f0] rpc_wait_bit_killable at ffffffffa02e9455 [sunrpc]
 #3 [ffff88061c363900] __wait_on_bit at ffffffff8152ac9f
 #4 [ffff88061c363950] out_of_line_wait_on_bit at ffffffff8152ad48
 #5 [ffff88061c3639c0] __rpc_execute at ffffffffa02e9965 [sunrpc]
 #6 [ffff88061c363a30] rpc_execute at ffffffffa02e9c21 [sunrpc]
 #7 [ffff88061c363a60] rpc_run_task at ffffffffa02e0465 [sunrpc]
 #8 [ffff88061c363a80] rpc_call_sync at ffffffffa02e0582 [sunrpc]
 #9 [ffff88061c363ae0] nfs3_rpc_wrapper.clone.0 at ffffffffa039282c [nfs]
#10 [ffff88061c363b20] nfs3_proc_access at ffffffffa0392bcc [nfs]
#11 [ffff88061c363b90] nfs_do_access at ffffffffa037a82c [nfs]
#12 [ffff88061c363c20] nfs_permission at ffffffffa037a978 [nfs]
#13 [ffff88061c363c50] __link_path_walk at ffffffff8119e213
#14 [ffff88061c363d30] path_walk at ffffffff8119f41a
#15 [ffff88061c363d70] filename_lookup at ffffffff8119f62b
#16 [ffff88061c363db0] user_path_at at ffffffff811a0757
#17 [ffff88061c363e80] vfs_fstatat at ffffffff81193bd0
#18 [ffff88061c363ee0] vfs_stat at ffffffff81193d4b
#19 [ffff88061c363ef0] sys_newstat at ffffffff81193d74
#20 [ffff88061c363f80] system_call_fastpath at ffffffff8100b072

==== vmcore 305011377

list -H all_clients -o rpc_clnt.cl_clients -s rpc_clnt.cl_server,cl_vers,cl_protname,cl_xprt | egrep -e ^f -e cl_server -e cl_vers -e cl_protname -e cl_program -e cl_xprt | paste -s - -d ' \n' | grep -w nfs
ffff88086fcb3600 cl_server = 0xffff8808707dd168 "manetai" cl_vers = 3 cl_protname = 0xffffffffa03ae651 "nfs" cl_xprt = 0xffff88086ec96000
ffff8807f8f69c00 cl_server = 0xffff8808707dd168 "manetai" cl_vers = 3 cl_protname = 0xffffffffa03ae651 "nfs" cl_xprt = 0xffff88086ec96000
ffff880871678600 cl_server = 0xffff8808707dd168 "manetai" cl_vers = 3 cl_protname = 0xffffffffa03ae651 "nfs" cl_xprt = 0xffff88086ec96000
ffff8808707dd000 cl_server = 0xffff8808707dd168 "manetai" cl_vers = 3 cl_protname = 0xffffffffa03ae651 "nfs" cl_xprt = 0xffff88086ec96000

crash> rpc_clnt.cl_tasks ffff88086fcb3600 -o
struct rpc_clnt {
  [ffff88086fcb3618] struct list_head cl_tasks;
crash> list -H ffff88086fcb3618 -o rpc_task.tk_task -s rpc_task.tk_owner,tk_runstate,tk_status -d | paste -s - -d' \n'
ffff8808703199c0 tk_owner = 105297 tk_runstate = 5 tk_status = 0
ffff88039d6a0ec0 tk_owner = 104887 tk_runstate = 6 tk_status = -11
ffff8807cfc4a480 tk_owner = 105486 tk_runstate = 5 tk_status = -11
...
ffff88086ed1f7c0 tk_owner = 105252 tk_runstate = 6 tk_status = -11

crash> rpc_clnt.cl_tasks ffff8807f8f69c00 -o
struct rpc_clnt {
  [ffff8807f8f69c18] struct list_head cl_tasks;
crash> list -H ffff8807f8f69c18 -o rpc_task.tk_task -s rpc_task.tk_owner,tk_runstate,tk_status -d | paste -s - -d' \n'
ffff8807fdb1c5c0 tk_owner = 105397 tk_runstate = 6 tk_status = -11
ffff88067f9c03c0 tk_owner = 105020 tk_runstate = 6 tk_status = -11
ffff88086fe53b80 tk_owner = 104843 tk_runstate = 6 tk_status = -11
ffff8807cfc4a280 tk_owner = 105134 tk_runstate = 6 tk_status = -11
ffff8807f8d26080 tk_owner = 104712 tk_runstate = 6 tk_status = -11
ffff88086fc9e880 tk_owner = 105135 tk_runstate = 6 tk_status = -11

crash> rpc_clnt.cl_tasks ffff880871678600 -o
struct rpc_clnt {
  [ffff880871678618] struct list_head cl_tasks;
crash> list -H ffff880871678618 -o rpc_task.tk_task -s rpc_task.tk_owner,tk_runstate,tk_status -d | paste -s - -d' \n'
ffff8806101f3880 tk_owner = 104807 tk_runstate = 6 tk_status = -11
ffff88106b417d80 tk_owner = 104778 tk_runstate = 6 tk_status = -11
...
ffff8807fdb1c2c0 tk_owner = 112444 tk_runstate = 6 tk_status = -11

crash> rpc_clnt.cl_tasks ffff8808707dd000 -o
struct rpc_clnt {
  [ffff8808707dd018] struct list_head cl_tasks;
crash> list -H ffff8808707dd018 -o rpc_task.tk_task -s rpc_task.tk_owner,tk_runstate,tk_status -d | paste -s - -d' \n'
(empty)

crash> rpc_xprt.address_strings 0xffff88086ec96000
address_strings = {0xffff8808700f1980 "10.67.6.11", 0xffff8808702f6a80 "2049", 0xffffffffa02eaee3 "tcp", 0xffff8808702f67c0 "0a43060b", 0xffff88086e7a7d80 " 801", 0xffffffffa02eaee3 "tcp"}

crash> rpc_xprt.sending 0xffff88086ec96000 -o
struct rpc_xprt {
  [ffff88086ec96190] struct rpc_wait_queue sending;

crash> rpc_wait_queue.owner,qlen ffff88086ec96190 -d
  owner = 105486
  qlen = 53

look for freezing tasks:

crash> foreach task -R flags | paste -s - -d' \n' > fe_task-flags
crash> !egrep '(4|6|c)[0-9]{3},' fe_task-flags
PID: 105297  TASK: ffff8807cffa0040  CPU: 23  COMMAND: "python"  flags = 0x416100, flags = 0x84,
PID: 105486  TASK: ffff88102b315500  CPU: 0   COMMAND: "python"  flags = 0x416100, flags = 0x80,

crash> ps -p 105486 105297
PID: 0       TASK: ffffffff81a8d020  CPU: 0   COMMAND: "swapper"
PID: 1       TASK: ffff8808748d5500  CPU: 2   COMMAND: "init"
PID: 3586    TASK: ffff881070a84ae0  CPU: 0   COMMAND: "sge_execd"
PID: 104765  TASK: ffff8806837d3540  CPU: 6   COMMAND: "sge_shepherd"
PID: 105297  TASK: ffff8807cffa0040  CPU: 23  COMMAND: "python"
PID: 0       TASK: ffffffff81a8d020  CPU: 0   COMMAND: "swapper"
PID: 1       TASK: ffff8808748d5500  CPU: 2   COMMAND: "init"
PID: 3586    TASK: ffff881070a84ae0  CPU: 0   COMMAND: "sge_execd"
PID: 104778  TASK: ffff88086ff6d500  CPU: 15  COMMAND: "sge_shepherd"
PID: 105486  TASK: ffff88102b315500  CPU: 0   COMMAND: "python"

crash> p *(((struct task_struct *)0xffff8807cffa0040)->cgroups.subsys[6])
$3 = {
  cgroup = 0xffff8807faeed200,
crash> lscgroup 0xffff8807faeed200
CGROUP           CONTROLLER:PATH
ffff8807faeed200 freezer:/UGE/5628500.1

crash> p *(((struct task_struct *)0xffff88102b315500)->cgroups.subsys[6])
$2 = {
  cgroup = 0xffff8807cfb5f000,
crash> lscgroup 0xffff8807cfb5f000
CGROUP           CONTROLLER:PATH
ffff8807cfb5f000 freezer:/UGE/5628501.1

PID: 105297  TASK: ffff8807cffa0040  CPU: 23  COMMAND: "python"
 #0 [ffff88067fa55928] schedule at ffffffff81529990
 #1 [ffff88067fa55a00] refrigerator at ffffffff810a8875
 #2 [ffff88067fa55a30] rpc_wait_bit_killable at ffffffffa02d4455 [sunrpc]
 #3 [ffff88067fa55a40] __wait_on_bit at ffffffff8152ac9f
 #4 [ffff88067fa55a90] out_of_line_wait_on_bit at ffffffff8152ad48
 #5 [ffff88067fa55b00] __rpc_execute at ffffffffa02d4965 [sunrpc]
 #6 [ffff88067fa55b70] rpc_execute at ffffffffa02d4c21 [sunrpc]
 #7 [ffff88067fa55ba0] rpc_run_task at ffffffffa02cb465 [sunrpc]
 #8 [ffff88067fa55bc0] rpc_call_sync at ffffffffa02cb582 [sunrpc]
 #9 [ffff88067fa55c20] nfs3_rpc_wrapper.clone.0 at ffffffffa038282c [nfs]
#10 [ffff88067fa55c60] nfs3_do_create at ffffffffa0383393 [nfs]
#11 [ffff88067fa55c90] nfs3_proc_create at ffffffffa038387a [nfs]
#12 [ffff88067fa55ce0] nfs_create at ffffffffa036c16c [nfs]
#13 [ffff88067fa55d70] vfs_create at ffffffff8119d606
#14 [ffff88067fa55db0] do_filp_open at ffffffff811a148e
#15 [ffff88067fa55f20] do_sys_open at ffffffff8118ae17
#16 [ffff88067fa55f70] sys_open at ffffffff8118af20
#17 [ffff88067fa55f80] system_call_fastpath at ffffffff8100b072

PID: 105486  TASK: ffff88102b315500  CPU: 0   COMMAND: "python"
 #0 [ffff88102d8059b8] schedule at ffffffff81529990
 #1 [ffff88102d805a90] refrigerator at ffffffff810a8875
 #2 [ffff88102d805ac0] rpc_wait_bit_killable at ffffffffa02d4455 [sunrpc]
 #3 [ffff88102d805ad0] __wait_on_bit at ffffffff8152ac9f
 #4 [ffff88102d805b20] out_of_line_wait_on_bit at ffffffff8152ad48
 #5 [ffff88102d805b90] __rpc_execute at ffffffffa02d4965 [sunrpc]
 #6 [ffff88102d805c00] rpc_execute at ffffffffa02d4c21 [sunrpc]
 #7 [ffff88102d805c30] rpc_run_task at ffffffffa02cb465 [sunrpc]
 #8 [ffff88102d805c50] rpc_call_sync at ffffffffa02cb582 [sunrpc]
 #9 [ffff88102d805cb0] nfs3_rpc_wrapper.clone.0 at ffffffffa038282c [nfs]
#10 [ffff88102d805cf0] nfs3_proc_setattr at ffffffffa0382932 [nfs]
#11 [ffff88102d805d80] nfs_setattr at ffffffffa0371540 [nfs]
#12 [ffff88102d805dc0] notify_change at ffffffff811ad0d8
#13 [ffff88102d805e30] utimes_common at ffffffff811c15dc
#14 [ffff88102d805eb0] do_utimes at ffffffff811c1780
#15 [ffff88102d805f10] sys_futimesat at ffffffff811c1850
#16 [ffff88102d805f70] sys_utimes at ffffffff811c18e9
#17 [ffff88102d805f80] system_call_fastpath at ffffffff8100b072

==== vmcore 329596338

crash> list -H all_clients -o rpc_clnt.cl_clients -s rpc_clnt.cl_server,cl_vers,cl_protname,cl_xprt | egrep -e ^f -e cl_server -e cl_vers -e cl_protname -e cl_program -e cl_xprt | paste -s - -d ' \n' | grep -w nfs
ffff88086a992e00 cl_server = 0xffff880871f57168 "manetai" cl_vers = 0x3 cl_protname = 0xffffffffa03a9651 "nfs" cl_xprt = 0xffff880870c38000
ffff880870483400 cl_server = 0xffff880871f57168 "manetai" cl_vers = 0x3 cl_protname = 0xffffffffa03a9651 "nfs" cl_xprt = 0xffff880870c38000
ffff8808703b1400 cl_server = 0xffff880871f57168 "manetai" cl_vers = 0x3 cl_protname = 0xffffffffa03a9651 "nfs" cl_xprt = 0xffff880870c38000
ffff8806dee03e00 cl_server = 0xffff880871f57168 "manetai" cl_vers = 0x3 cl_protname = 0xffffffffa03a9651 "nfs" cl_xprt = 0xffff880870c38000
ffff88086f9fb800 cl_server = 0xffff880871f57168 "manetai" cl_vers = 0x3 cl_protname = 0xffffffffa03a9651 "nfs" cl_xprt = 0xffff880870c38000
ffff880871f57000 cl_server = 0xffff880871f57168 "manetai" cl_vers = 0x3 cl_protname = 0xffffffffa03a9651 "nfs" cl_xprt = 0xffff880870c38000

crash> rpc_clnt.cl_tasks ffff88086a992e00 -o
struct rpc_clnt {
  [ffff88086a992e18] struct list_head cl_tasks;
crash> list -H ffff88086a992e18 -o rpc_task.tk_task -s rpc_task.tk_owner,tk_runstate,tk_status | paste -s - -d' \n'
ffff8807db417e80 tk_owner = 82034 tk_runstate = 6 tk_status = -11

crash> rpc_clnt.cl_tasks ffff880870483400 -o
struct rpc_clnt {
  [ffff880870483418] struct list_head cl_tasks;
crash> rpc_clnt.cl_tasks ffff8808703b1400 -o
struct rpc_clnt {
  [ffff8808703b1418] struct list_head cl_tasks;
crash> rpc_clnt.cl_tasks ffff8806dee03e00 -o
struct rpc_clnt {
  [ffff8806dee03e18] struct list_head cl_tasks;
crash> list -H ffff8806dee03e18 -o rpc_task.tk_task -s rpc_task.tk_owner,tk_runstate,tk_status | paste -s - -d' \n'
ffff88069d5965c0 tk_owner = 84902 tk_runstate = 5 tk_status = -11
ffff881072299880 tk_owner = 62073 tk_runstate = 6 tk_status = -11
ffff8802e288cbc0 tk_owner = 75263 tk_runstate = 6 tk_status = -11
...
ffff8807e3899870 tk_owner = 77805 tk_runstate = 6 tk_status = -11

crash> rpc_clnt.cl_tasks ffff88086f9fb800 -o
struct rpc_clnt {
  [ffff88086f9fb818] struct list_head cl_tasks;
crash> list -H ffff88086f9fb818 -o rpc_task.tk_task -s rpc_task.tk_owner,tk_runstate,tk_status | paste -s - -d' \n'
ffff88074576eec0 tk_owner = 84886 tk_runstate = 6 tk_status = -11
ffff88086ad34180 tk_owner = 3606 tk_runstate = 6 tk_status = -11
...
ffff880feb3c13c0 tk_owner = 85899 tk_runstate = 6 tk_status = -11

crash> rpc_clnt.cl_tasks ffff880871f57000 -o
struct rpc_clnt {
  [ffff880871f57018] struct list_head cl_tasks;

crash> rpc_xprt.address_strings 0xffff880870c38000
address_strings = {0xffff880870638a80 "10.67.6.11", 0xffff880870638b40 "2049", 0xffffffffa02e5ee3 "tcp", 0xffff880870638840 "0a43060b", 0xffff880870638b20 " 801", 0xffffffffa02e5ee3 "tcp"}

crash> rpc_xprt.sending 0xffff880870c38000 -o
struct rpc_xprt {
  [ffff880870c38190] struct rpc_wait_queue sending;

crash> rpc_wait_queue.owner,qlen ffff880870c38190 -d
  owner = 84902
  qlen = 55

looking at freezing tasks:

crash> foreach task -R flags | paste -s - -d' \n' > fe_task-flags
crash> !egrep '(4|6|c)[0-9a-f]{3},' fe_task-flags
PID: 84902  TASK: ffff8807756ab500  CPU: 23  COMMAND: "python"  flags = 0x416100, flags = 0x80,

PID: 0      TASK: ffffffff81a8d020  CPU: 0   COMMAND: "swapper"
PID: 1      TASK: ffff8808748d5500  CPU: 31  COMMAND: "init"
PID: 3606   TASK: ffff880871ee1500  CPU: 20  COMMAND: "sge_execd"
PID: 84886  TASK: ffff88086b202aa0  CPU: 20  COMMAND: "sge_shepherd"
PID: 84902  TASK: ffff8807756ab500  CPU: 23  COMMAND: "python"

crash> p *(((struct task_struct *)0xffff8807756ab500)->cgroups.subsys[6])
$1 = {
  cgroup = 0xffff88086cb76a00,
crash> lscgroup 0xffff88086cb76a00
CGROUP           CONTROLLER:PATH
ffff88086cb76a00 freezer:/UGE/5864552.1

PID: 84902  TASK: ffff8807756ab500  CPU: 23  COMMAND: "python"
 #0 [ffff88086c8d9a08] schedule at ffffffff81529990
 #1 [ffff88086c8d9ae0] refrigerator at ffffffff810a8875
 #2 [ffff88086c8d9b10] rpc_wait_bit_killable at ffffffffa02cf455 [sunrpc]
 #3 [ffff88086c8d9b20] __wait_on_bit at ffffffff8152ac9f
 #4 [ffff88086c8d9b70] out_of_line_wait_on_bit at ffffffff8152ad48
 #5 [ffff88086c8d9be0] __rpc_execute at ffffffffa02cf965 [sunrpc]
 #6 [ffff88086c8d9c50] rpc_execute at ffffffffa02cfc21 [sunrpc]
 #7 [ffff88086c8d9c80] rpc_run_task at ffffffffa02c6465 [sunrpc]
 #8 [ffff88086c8d9ca0] rpc_call_sync at ffffffffa02c6582 [sunrpc]
 #9 [ffff88086c8d9d00] nfs3_rpc_wrapper.clone.0 at ffffffffa037d82c [nfs]
#10 [ffff88086c8d9d40] nfs3_proc_getattr at ffffffffa037ebb7 [nfs]
#11 [ffff88086c8d9d90] __nfs_revalidate_inode at ffffffffa036c073 [nfs]
#12 [ffff88086c8d9dd0] nfs_revalidate_inode at ffffffffa036cda6 [nfs]
#13 [ffff88086c8d9e00] nfs_getattr at ffffffffa036ce86 [nfs]
#14 [ffff88086c8d9e40] vfs_getattr at ffffffff81193b51
#15 [ffff88086c8d9e80] vfs_fstatat at ffffffff81193be4
#16 [ffff88086c8d9ee0] vfs_stat at ffffffff81193d4b
#17 [ffff88086c8d9ef0] sys_newstat at ffffffff81193d74
#18 [ffff88086c8d9f80] system_call_fastpath at ffffffff8100b072
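Since the report notes this is easy to reproduce, the following is a rough sketch of the kind of reproducer implied by the example at the end of this bug. It is illustrative only and not part of the original report: the NFS mount at /mnt/nfs and the cgroup name rpc_freeze_test are placeholders, and whether a particular freeze actually lands while the task holds the transport is timing-dependent.

reproducer sketch:
=============================================================
#!/bin/bash
# Rough reproducer sketch, not a supported test case.  Assumes an NFS mount at
# /mnt/nfs and the cgroup-v1 freezer mounted at /sys/fs/cgroup/freezer (both
# placeholders).
cg=/sys/fs/cgroup/freezer/rpc_freeze_test
mkdir -p $cg

# generate a steady stream of synchronous NFS RPCs from inside the cgroup
( echo $BASHPID > $cg/tasks
  while true ; do find /mnt/nfs >/dev/null 2>&1 ; done ) &
victim=$!

sleep 1
echo FROZEN > $cg/freezer.state    # may catch the task inside __rpc_execute while it holds XPRT_LOCKED

# a task *outside* the cgroup can now hang on the same rpc_xprt
timeout 30 ls /mnt/nfs >/dev/null || echo "NFS access from outside the cgroup stalled"

echo THAWED > $cg/freezer.state    # thaw to release the transport again
kill $victim 2>/dev/null
=============================================================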
net/sunrpc/xprt.c: rpc_xprt->state contains these flags, defined in include/linux/sunrpc/xprt.h:

#define XPRT_LOCKED             (0)
#define XPRT_CONNECTED          (1)
#define XPRT_CONNECTING         (2)
#define XPRT_CLOSE_WAIT         (3)
#define XPRT_BOUND              (4)
#define XPRT_BINDING            (5)
#define XPRT_CLOSING            (6)
#define XPRT_CONNECTION_ABORT   (7)
#define XPRT_CONNECTION_CLOSE   (8)

rpc state machine functions in net/sunrpc/xprt.c: both xprt_reserve_xprt() and xprt_reserve_xprt_cong() start like this:

int xprt_reserve_xprt(struct rpc_xprt *xprt, struct rpc_task *task)
	if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) {
		if (task == xprt->snd_task)
			return 1;
		goto out_sleep;
	}

^^== if xprt->state is already locked, and we're trying to run an rpc_task that isn't the snd_task, we're blocked and have to wait.
If it's locked, and we're trying to enqueue the rpc_task that's already snd_task, mission accomplished.
If it wasn't locked, we set the lock flag and enqueue the rpc_task.

There is nothing preventing this process from being frozen after setting XPRT_LOCKED, blocking all other users of the rpc_xprt.

XPRT_LOCKED is also set in:
    __xprt_lock_write_next
    __xprt_lock_write_next_cong
    xprt_conditional_disconnect
    xprt_init_autodisconnect

The XPRT_LOCKED flag is cleared again in:
    xprt_clear_locked

Note that there's also an rpc_xprt->transport_lock that appears to control access to rpc_xprt->state and other fields, such as this in xprt_force_disconnect:

	spin_lock_bh(&xprt->transport_lock);
	set_bit(XPRT_CLOSE_WAIT, &xprt->state);
	spin_unlock_bh(&xprt->transport_lock);

The process could also go into the freezer while holding the rpc_xprt->transport_lock (XPRT_LOCKED and other flags in ->state are set/cleared while ->transport_lock is held), though that's a much smaller window.

Hi Frank!  Possible we just need something like:

diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index 45c9d98..26441fc 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -755,9 +755,11 @@ static void __rpc_execute(struct rpc_task *task)
 		/* sync task: sleep here */
 		dprintk("RPC: %5u sync task going to sleep\n", task->tk_pid);
+		freezer_do_not_count();
 		status = out_of_line_wait_on_bit(&task->tk_runstate,
 				RPC_TASK_QUEUED, rpc_wait_bit_killable,
 				TASK_KILLABLE);
+		freezer_count();
 		if (status == -ERESTARTSYS) {
 			/*
 			 * When a sync task receives a signal, it exits with

You're not seeing any frozen async tasks holding up the xprt?  I'm still looking to see if that could happen.  Possibly the reproducer should be changed to perform IO or some other async RPC.

I'll try & spin up a kernel with the patch & give it a go.

I'm not sure the freezer exclusion is wide enough to catch a task between setting XPRT_LOCKED and the out_of_line_wait_on_bit() (it seems likely to reduce the chances of getting stuck, though).

I don't currently see any frozen async tasks, though I'm not really looking for that yet.  I can probably also add tests for sync & async IO to see what happens in those scenarios.

thanks!

Actually, on a second look, that (comment 6) is probably completely wrong, since we're going to end up in the freezer after coming back from waiting in freezer_count() anyway.  Not to mention that rpc_wait_bit_killable is already calling freezable_schedule().

So we either need to change the way the xprt locking is done, or insert freezer_[do_not_]count pairs a bit closer to the xprt code.
Whatever work is done here, we'll need to ensure the desired behavior from https://bugzilla.redhat.com/show_bug.cgi?id=702630 is retained.

Due to the complex dependency problems introduced by freezing a task that is doing RPC and may hold locks all the way back through VFS to userspace, it looks like making NFS tasks independently freezable is not a simple bug fix but rather a large project.  There's current discussion of this here: http://marc.info/?t=142550657200003&r=1&w=2

Let's try to solve this problem another way, if we can.  Frank, you said the customer's grid engine is putting NFS tasks into the freezer.. what grid engine?  Maybe we can think about ways to exclude freezing of NFS tasks via inspection at "freeze-time" from the grid engine itself.

The grid engine is Univa Grid Engine (UGE, previously SGE, Sun Grid Engine).

Hello, may I add some words here.  My name is Marco Donauer, Univa Support.

We don't think that this is related to any grid engine.  What we do is just use the freezer mechanism when a job has to be suspended; that is, we just echo FROZEN into the status file.  This case looks like a problem with handling locks.  The situation can also happen if you create a script which does the suspend/unsuspend via the cgroups freezer; due to the massive job count we are processing with Grid Engine, it simply happens more often for us than for other applications.

The NFS documentation already states explicitly that some applications require locking to be enabled to work correctly, and all such applications might run into the same problem when they are frozen at the time they hold the nfs lock.

It might be possible to find a way, on the Grid Engine side, to do a pre-check for whether a process is accessing nfs and possibly holding a lock, and to disallow freezing at that time, but there would still be a short window between the check and the freeze.

One post above is talking about a new kernel?  So did you already release a patch for it?

Best Regards,
Marco Donauer

Marco, we've not released a patch or done any kernel work to resolve this bug, since the linux freezer cgroup turns out to be a mechanism at odds with the locking required at several layers in the NFS client.  Work to fix this up would need to happen upstream first.

Has there been any movement on this upstream?  It looks like a complex problem and the work may have stalled?

There's not been any work upstream on this issue.  I think there would need to be a very large investment of time and work to make the cgroups freezer work with RPC tasks.. or to find another solution.  Unfortunately, that has not happened yet.

Dear team,

so am I right that there will be no solution for this problem in the near future?  Are there workarounds available to solve this problem?

I think it can also happen in other situations where "anybody" is using cgroups and processes might be frozen.  The more the cgroups feature is used, the higher the chance of running into this problem.  This issue makes the cgroups feature worthless, especially when many processes are frozen.

We would be interested in whether there is a way to find out if a process is currently holding an RPC lock.  Are there system calls which provide this information?  It would be helpful to be able to call something like has_rpc_lock(pid), which returns 0 or 1.  Or can we find this somewhere in the proc file system?

Regards,
Marco
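As a rough illustration of the kind of pre-check being asked about here (the name has_rpc_lock is taken from the question above and is purely hypothetical), something along the following lines can be built on /proc/<pid>/stack.  As the next comment explains, this is only a point-in-time snapshot, so it cannot guarantee the task is still, or is not yet, inside an RPC when the freeze actually happens, and it does not show transport-lock ownership directly.

has_rpc_lock sketch:
=============================================================
#!/bin/bash
# Illustrative sketch only; has_rpc_lock is the hypothetical helper name from
# the question above.  It reports whether a task is currently somewhere inside
# __rpc_execute according to /proc/<pid>/stack (requires root), which is a
# point-in-time approximation, not proof that the task holds XPRT_LOCKED.
has_rpc_lock() {
	local pid=$1
	grep -q '__rpc_execute' /proc/$pid/stack 2>/dev/null
}

# example: list tasks in a (placeholder) UGE job cgroup that are mid-RPC right now
cg=/sys/fs/cgroup/freezer/UGE/12345.1
for t in $(cat $cg/tasks 2>/dev/null) ; do
	has_rpc_lock $t && echo "pid $t is currently inside __rpc_execute"
done
=============================================================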
There's no reliable way to determine whether it's safe to freeze a particular cgroup, since it's a snapshot in time, and whether a process is blocked while holding the lock can change before a subsequent freeze of the cgroup.  So the only way to know is after-the-fact... in other words, "did we catch the processes in this cgroup at a bad time?"

Unfortunately, there's also no direct way for userspace to determine that a process holds such a lock.  However, I whipped up the following script, which I think should give approximately what you're looking for:

check_safely_frozen:
=============================================================
#!/bin/bash

cgroup=$1
freezer_base=/sys/fs/cgroup/freezer

[[ -z $cgroup ]] && {
	echo "No freezer cgroup specified"
	echo "usage: $(readlink -f $0) <frozen_cgroup>"
	exit 11
}
[[ -d $freezer_base && $(stat -f -c %T $freezer_base) == "cgroupfs" ]] || {
	echo "Unable to locate freezer cgroup directory at '$freezer_base'"
	exit 12
}
[[ -d $freezer_base/$cgroup ]] || {
	echo "Freezer cgroup '$cgroup' could not be found"
	exit 13
}
[[ $(cat $freezer_base/$cgroup/freezer.state) == "FROZEN" ]] || {
	echo "Freezer cgroup '$cgroup' is not frozen"
	exit 14
}

blocked=$(for t in $(cat $freezer_base/$cgroup/tasks 2>/dev/null) ; do
	awk '{printf "%s ", gensub(/(.+)+(+.+)/, "\\1", "g", $2)}' /proc/$t/stack 2>/dev/null | \
		egrep -q '^__refrigerator rpc_wait_bit_killable __rpc_execute' && echo $t
done)

[[ -n $blocked ]] && {
	echo "pids blocked: $blocked"
	exit 1
}
exit 0
=============================================================

After freezing a cgroup, you'd run the script to determine whether the freeze caught any processes in the critical section of sending an rpc; it exits with '0' if the cgroup was frozen safely, and non-zero in case of an error or an unsafe freeze.  This can then be used to unfreeze (allowing the process to exit the critical section) and reattempt the freeze again later.

Or, wrapping it all in one, the following script should be able to freeze a cgroup safely, thawing and retrying if a process is currently frozen while in the rpc_execute:

safe_freeze:
=============================================================
#!/bin/bash

cgroup=$1
freezer_base=/sys/fs/cgroup/freezer

[[ -z $cgroup ]] && {
	echo "No freezer cgroup specified"
	echo "usage: $(readlink -f $0) <frozen_cgroup>"
	exit 11
}
[[ -d $freezer_base && $(stat -f -c %T $freezer_base) == "cgroupfs" ]] || {
	echo "Unable to locate freezer cgroup directory at '$freezer_base'"
	exit 12
}
[[ -d $freezer_base/$cgroup ]] || {
	echo "Freezer cgroup '$cgroup' could not be found"
	exit 13
}

unsafe_count=0
while [[ 42 ]] ; do
	state=$(<$freezer_base/$cgroup/freezer.state)
	case $state in
		FROZEN)
			for t in $(<$freezer_base/$cgroup/tasks) ; do
				awk '{printf "%s ", gensub(/(.+)+(+.+)/, "\\1", "g", $2)} END{printf "\n"}' /proc/$t/stack 2>/dev/null
			done | egrep -q '^__refrigerator rpc_wait_bit_killable __rpc_execute'
			if [[ $? -ne 0 ]] ; then break ; fi
			unsafe_count=$(($unsafe_count + 1))
			echo THAWED > $freezer_base/$cgroup/freezer.state
			;;
		THAWED)
			echo FROZEN > $freezer_base/$cgroup/freezer.state
			;;
		FREEZING)
			;; # still freezing
	esac
	sleep 0.01
done
echo "successfully froze '$cgroup' after $unsafe_count failure(s)"
=============================================================

for example, in one terminal:

mkdir /sys/fs/cgroup/freezer/test_cgroup
echo $$ > /sys/fs/cgroup/freezer/test_cgroup/tasks
mount server:/export /mnt/nfs
while [[ 42 ]] ; do find /mnt/nfs >/dev/null ; done

then in another terminal:

# /var/tmp/safe_freeze test_cgroup
successfully froze 'test_cgroup' after 10 failures
# echo THAWED >/sys/fs/cgroup/freezer/test_cgroup/freezer.state
# /var/tmp/safe_freeze test_cgroup
successfully froze 'test_cgroup' after 26 failures
# echo THAWED >/sys/fs/cgroup/freezer/test_cgroup/freezer.state
# while [[ 42 ]] ; do /var/tmp/safe_freeze test_cgroup ; echo THAWED >/sys/fs/cgroup/freezer/test_cgroup/freezer.state ; sleep 1 ; done
successfully froze 'test_cgroup' after 0 failures
successfully froze 'test_cgroup' after 2 failures
successfully froze 'test_cgroup' after 5 failures
successfully froze 'test_cgroup' after 21 failures
successfully froze 'test_cgroup' after 0 failures
successfully froze 'test_cgroup' after 12 failures
successfully froze 'test_cgroup' after 11 failures
successfully froze 'test_cgroup' after 6 failures
successfully froze 'test_cgroup' after 9 failures
successfully froze 'test_cgroup' after 13 failures
successfully froze 'test_cgroup' after 4 failures
successfully froze 'test_cgroup' after 7 failures
successfully froze 'test_cgroup' after 6 failures
successfully froze 'test_cgroup' after 19 failures
successfully froze 'test_cgroup' after 4 failures