Description of problem: I had a 4 node cluster with one gfs being served by each node to 12 nfs clients though 12 different network interfaces. After a while doing I/O from the clients, one of the servers (taft-01) paniced. Unable to handle kernel paging request at 0000010299e8b6b0 RIP: <ffffffffa01713f1>{:sunrpc:xdr_encode_netobj+25} PML4 8063 PGD 0 Oops: 0002 [1] SMP CPU 2 Modules linked in: nfsd exportfs lockd nfs_acl lock_dlm(U) gfs(U) lock_harness(U) dlm(U) cmad Pid: 7789, comm: rpciod Not tainted 2.6.9-42.ELsmp RIP: 0010:[<ffffffffa01713f1>] <ffffffffa01713f1>{:sunrpc:xdr_encode_netobj+25} RSP: 0018:000001020c7ebda0 EFLAGS: 00010202 RAX: 0000000026c06d83 RBX: 000001021159e448 RCX: 0000000000000000 RDX: 0000000024000000 RSI: 000001021159e4d8 RDI: 00000101fee700a4 RBP: 000001021159e4e8 R08: 00000101fee70080 R09: 00000101fee700a8 R10: 000001021159e4d8 R11: ffffffffa02bd696 R12: 000001020c504778 R13: 000001020c504738 R14: 0000010037d47400 R15: ffffffffa02b8088 FS: 0000002a958a0b00(0000) GS:ffffffff804e5180(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 0000010299e8b6b0 CR3: 00000000dff9e000 CR4: 00000000000006e0 Process rpciod (pid: 7789, threadinfo 000001020c7ea000, task 000001020ce367f0) Stack: 00000101fee70080 00000101fee70080 ffffffffa02bcd8c 000001021159e420 000001020c504738 000001021159e448 ffffffffa02bd6dd 00000101fee70064 0000010200238200 ffffffffa02bd696 Call Trace:<ffffffffa02bcd8c>{:lockd:nlm4_encode_lock+132} <ffffffffa02bd6dd>{:lockd:nlm4clt <ffffffffa02bd696>{:lockd:nlm4clt_encode_testargs+0} <ffffffffa0167217>{:sunrpc:call_transmit+532} <ffffffffa016ac8f>{:sunrpc:__rpc_execut <ffffffffa016af8f>{:sunrpc:__rpc_schedule+74} <ffffffffa016b44f>{:sunrpc:rpciod+301} <ffffffff80135752>{autoremove_wake_function+0} <ffffffff8013a7e5>{do_exit+3137} <ffffffff80135752>{autoremove_wake_function+0} <ffffffff80110f47>{child_rip+8} <ffffffffa02b8088>{:lockd:lockd+0} <ffffffffa016b322>{:sunrpc:rpciod+0} 
<ffffffff80110f3f>{child_rip+0} Code: c7 04 87 00 00 00 00 8b 0e 89 c8 89 ca 25 00 ff 00 00 c1 e2 RIP <ffffffffa01713f1>{:sunrpc:xdr_encode_netobj+25} RSP <000001020c7ebda0> CR2: 0000010299e8b6b0 <0>Kernel panic - not syncing: Oops This is the I/O that was being run from each of the clients: iogen -o -m random -f sync -s write,writev -t1000b -T10000b 100000b:doiofile | doio -vDk -n 10 Version-Release number of selected component (if applicable): Kernel 2.6.9-42.ELsmp on an x86_64
How reproducible is this?
I've only seen it the one time so far.
I got a similar (as far as I can tell) problem on my cluster at RHEL4U4. Here's the kernel panic output: roth-02.lab.msp.redhat.com login: general protection fault: 0000 [1] SMP CPU 0 Modules linked in: nfsd(U) exportfs(U) lockd(U) nfs_acl(U) parport_pc(U) lp(U) parport(U) autofs4(U) i2c_dev(U) i2c_core(U) lock_dlm(U) gfs(U) lock_harness(U) dlm(U) cman(U) md5(U) ipv6(U) sunrpc(U) ds(U) yenta_socket(U) pcmcia_core(U) dm_round_robin(U) dm_multipath(U) joydev(U) button(U) battery(U) ac(U) uhci_hcd(U) ehci_hcd(U) tg3(U) dm_snapshot(U) dm_zero(U) dm_mirror(U) ext3(U) jbd(U) dm_mod(U) qla2400(U) qla2xxx(U) scsi_transport_fc(U) ata_piix(U) libata(U) sd_mod(U) scsi_mod(U) Pid: 3516, comm: rpciod Tainted: GF 2.6.9-prep RIP: 0010:[<ffffffffa0105bf9>] <ffffffffa0105bf9>{:tg3:tg3_poll+1334} RSP: 0018:ffffffff80451178 EFLAGS: 00010246 RAX: 000001007f964840 RBX: 000001007c9c4080 RCX: 0000000000000042 RDX: 000001007f9648b0 RSI: 0000bf9f0000bf98 RDI: 0000010072667a12 RBP: 0000010069e25500 R08: 000001007cae1460 R09: 000000000000007e R10: 000001007c9c4080 R11: 00000000000000e4 R12: 000001007ea05380 R13: 0000000000000042 R14: 0000000000000000 R15: 000001007bc64fc0 FS: 0000002a958a0b00(0000) GS:ffffffff804ddb80(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 0000002a9556c000 CR3: 0000000000101000 CR4: 00000000000006e0 Process rpciod (pid: 3516, threadinfo 000001007213a000, task 0000010071c7e030) Stack: ffffffff80451228 000001007ea0542c 000000006a78d012 00000000a012ac32 0000007e007f0292 0000004000010000 000001007bd74000 ffffffff8045121c 000001007ea05000 0000000000000046 Call Trace:<IRQ> <ffffffffa02c3088>{:lockd:lockd+0} <ffffffff802ab940>{net_rx_action+203} <ffffffff8013bc80>{__do_softirq+88} <ffffffff8013bd29>{do_softirq+49} <ffffffff80113247>{do_IRQ+328} <ffffffff80110833>{ret_from_intr+0} <EOI> <ffffffffa02c3088>{:lockd:lockd+0} <ffffffffa02c8696>{:lockd:nlm4clt_encode_testargs+0} <ffffffffa02c7ad9>{:lockd:nlm4_encode_cookie+60} 
<ffffffffa02c86ab>{:lockd:nlm4clt_encode_testargs+21} <ffffffffa02c8696>{:lockd:nlm4clt_encode_testargs+0} <ffffffffa016e217>{:sunrpc:call_transmit+532} <ffffffffa0171c8f>{:sunrpc:__rpc_execute+187} <ffffffff80304aa8>{thread_return+193} <ffffffffa0171f8f>{:sunrpc:__rpc_schedule+74} <ffffffffa017244f>{:sunrpc:rpciod+301} <ffffffff80134d3a>{autoremove_wake_function+0} <ffffffff80134d3a>{autoremove_wake_function+0} <ffffffff80110f47>{child_rip+8} <ffffffffa02c3088>{:lockd:lockd+0} <ffffffffa0172322>{:sunrpc:rpciod+0} <ffffffff80110f3f>{child_rip+0} Code: f3 a4 4c 89 f7 49 8b 44 24 48 48 8d 50 70 48 85 c0 48 0f 45 RIP <ffffffffa0105bf9>{:tg3:tg3_poll+1334} RSP <ffffffff80451178> <0>Kernel panic - not syncing: Oops The panic was reproduced by running this test case: ./nfs_try -R /usr/tests/sts/var/share/resource_files/roth.xml -n nfs1 -C /usr/tests/sts/var/share/resource_files/roth.xml -S QUICK Using a cluster of three x86_64 servers: roth-01, roth-02, roth-03 and four nfs clients: trin-13,14,15,16. I can get this panic about once every ten times I run the test case, and it's running on my U4 kernel compiled from source RPMs, so it should be easy to inject debug code. It seems to happen after the test is complete and deemed successful, when the nfs clients are trying to umount the nfs mounts.
We have never seen lockd panic before. However, did you notice that Bob's machine died in the tg3 driver interrupt stack? RIP: 0010:[<ffffffffa0105bf9>] <ffffffffa0105bf9>{:tg3:tg3_poll+1334} <ffffffff802ab940>{net_rx_action+203} <ffffffff8013bc80>{__do_softirq+88} <ffffffff8013bd29>{do_softirq+49} <ffffffff80113247>{do_IRQ+328} <ffffffff80110833>{ret_from_intr+0} I had one while working with a customer yesterday that was also on RHEL4 U4: Process uio_file_serv (pid: 14351, threadinfo 00000101c0330000, task 00000101cab427f0) Stack: 00000100ac7f1034 00000100e067e0c0 00000100e067e0c0 ffffffff802e27b0 ffffff00004b2000 00000003d194f700 00000100ac7f1034 00000101c86d60c0 0000010003d290b0 00000101c86d60c0 Call Trace:<IRQ> <ffffffff802e27b0>{tcp_v4_do_rcv+200} <ffffffff802e3076>{tcp_v4_rcv+1701} <ffffffff802c91f9>{ip_local_deliver+298} <ffffffff802c9944>{ip_rcv+1007} <ffffffff802b0737>{netif_receive_skb+791} <ffffffffa00dcdd7>{:tg3:tg3_poll+1812} <ffffffff802b0950>{net_rx_action+203} <ffffffff8013c738>{__do_softirq+88} <ffffffff8013c7e1>{do_softirq+49} <ffffffff80113247>{do_IRQ+328} <ffffffff80110833>{ret_from_intr+0} <EOI> The panic went away today after we turned off the floating IP in RHCS. So good ways to isolate this issue would be: 1. change to another NIC driver 2. don't use a floating IP (if you have been using one)
Thanks :) ... it does look like a client issue. Another way to isolate the issue is to avoid mixing the nfs client and server together on the same node.
This bug is 3+ years old, closing. Will reopen if seen again.
------------------------------------------------------------ Unable to handle kernel paging request at 000001043b6088b0 RIP: <ffffffffa012e591>{:sunrpc:xdr_encode_netobj+25} ------------------------------------------------------------ ------------------------------------------------------------ crash> bt PID: 4009 TASK: 103eecaf7f0 CPU: 0 COMMAND: "rpciod" #0 [103ed103be0] netpoll_start_netdump at ffffffffa0210366 #1 [103ed103c10] do_page_fault at ffffffff80124c50 #2 [103ed103cf0] error_exit at ffffffff80110e21 [exception RIP: xdr_encode_netobj+25] RIP: ffffffffa012e591 RSP: 00000103ed103da0 RFLAGS: 00010206 RAX: 000000003fcf7203 RBX: 00000103f1d38c48 RCX: 0000000000000000 RDX: 000000001c000000 RSI: 00000103f1d38cd8 RDI: 000001033c22c0a4 RBP: 00000103f1d38ce8 R8: 000001033c22c088 R9: 000001033c22c0a8 R10: 00000103f1d38cd8 R11: ffffffffa01bb8aa R12: 0000010289b90040 R13: 0000010289b90000 R14: 00000100069c6800 R15: ffffffffa01b625f ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0000 #3 [103ed103db0] nlm4_encode_lock at ffffffffa01bafa0 #4 [103ed103dd0] nlm4clt_encode_testargs at ffffffffa01bb8f1 #5 [103ed103df0] call_transmit at ffffffffa012424c #6 [103ed103e20] __rpc_execute at ffffffffa0127cfc #7 [103ed103eb0] __rpc_schedule at ffffffffa0127ffc #8 [103ed103ed0] rpciod at ffffffffa01284bc #9 [103ed103f50] kernel_thread at ffffffff80110fd7 ------------------------------------------------------------ ------------------------------------------------------------ crash> dis -l xdr_encode_netobj /usr/src/debug/kernel-2.6.9/linux-2.6.9/net/sunrpc/xdr.c: 26 0xffffffffa012e578 <xdr_encode_netobj>: push %r8 0xffffffffa012e57a <xdr_encode_netobj+2>: mov %rdi,%r9 /usr/src/debug/kernel-2.6.9/linux-2.6.9/net/sunrpc/xdr.c: 30 0xffffffffa012e57d <xdr_encode_netobj+5>: add $0x4,%r9 /usr/src/debug/kernel-2.6.9/linux-2.6.9/net/sunrpc/xdr.c: 26 0xffffffffa012e581 <xdr_encode_netobj+9>: mov %rsi,%r10 0xffffffffa012e584 <xdr_encode_netobj+12>: push %r8 
/usr/src/debug/kernel-2.6.9/linux-2.6.9/net/sunrpc/xdr.c: 27 0xffffffffa012e586 <xdr_encode_netobj+14>: mov (%rsi),%eax /usr/src/debug/kernel-2.6.9/linux-2.6.9/net/sunrpc/xdr.c: 31 0xffffffffa012e588 <xdr_encode_netobj+16>: cld /usr/src/debug/kernel-2.6.9/linux-2.6.9/net/sunrpc/xdr.c: 27 0xffffffffa012e589 <xdr_encode_netobj+17>: add $0x3,%eax 0xffffffffa012e58c <xdr_encode_netobj+20>: shr $0x2,%eax /usr/src/debug/kernel-2.6.9/linux-2.6.9/net/sunrpc/xdr.c: 29 0xffffffffa012e58f <xdr_encode_netobj+23>: mov %eax,%eax => 0xffffffffa012e591 <xdr_encode_netobj+25>: movl $0x0,(%rdi,%rax,4) /usr/src/debug/kernel-2.6.9/linux-2.6.9/net/sunrpc/xdr.c: 30 0xffffffffa012e598 <xdr_encode_netobj+32>: mov (%rsi),%ecx ------------------------------------------------------------ net/sunrpc/xdr.c: ------------------------------------------------------------ 21 /* 22 * XDR functions for basic NFS types 23 */ 24 u32 * 25 xdr_encode_netobj(u32 *p, const struct xdr_netobj *obj) 26 { 27 unsigned int quadlen = XDR_QUADLEN(obj->len); 28 => 29 p[quadlen] = 0; /* zero trailing bytes */ 30 *p++ = htonl(obj->len); 31 memcpy(p, obj->data, obj->len); 32 return p + XDR_QUADLEN(obj->len); 33 } ------------------------------------------------------------ RDI = 0x000001033c22c0a4 = u32 *p RSI = 0x00000103f1d38cd8 = struct xdr_netobj *obj quadlen = XDR_QUADLEN(0xff3dc809) = 0x3fcf7203 EAX = RAX = 0x000000003fcf7203 p[quadlen] = ( 0x000001033c22c0a4 + ( 4 * 0x3fcf7203 ) ) = 0x1043b6088b0 => And the paging request that failed was at 0x000001043b6088b0 ------------------------------------------------------------ crash> px *(struct xdr_netobj *) 0x00000103f1d38cd8 $2 = { len = 0xff3dc809, data = 0x103f1d38f10 "2518.sprint.com" } ------------------------------------------------------------ ------------------------------------------------------------ 18 #define XDR_QUADLEN(l) (((l) + 3) >> 2) crash> px ( 0xff3dc809 + 0x3 ) $3 = 0xff3dc80c crash> px ( 0xff3dc80c >> 2 ) $4 = 0x3fcf7203 
------------------------------------------------------------ ------------------------------------------------------------ 0xffffffffa012e591 <xdr_encode_netobj+25>: movl $0x0,(%rdi,%rax,4) crash> px ( 4 * 0x3fcf7203 ) $8 = 0xff3dc80c crash> px ( 0x000001033c22c0a4 + 0xff3dc80c ) $9 = 0x1043b6088b0 ------------------------------------------------------------
The above crash was on a 2.6.9-89.0.18.ELsmp kernel.