Bug 172696
| Summary: | kernel panic after a few hours/days of operation with pulse | ||||||
|---|---|---|---|---|---|---|---|
| Product: | Red Hat Enterprise Linux 4 | Reporter: | Mariano Fernandez <mfernandez> | ||||
| Component: | kernel | Assignee: | David Miller <davem> | ||||
| Status: | CLOSED DUPLICATE | QA Contact: | Cluster QE <mspqa-list> | ||||
| Severity: | high | Docs Contact: | |||||
| Priority: | medium | ||||||
| Version: | 4.0 | CC: | ian, jbaron, lhh, tgraf | ||||
| Target Milestone: | --- | ||||||
| Target Release: | --- | ||||||
| Hardware: | i686 | ||||||
| OS: | Linux | ||||||
| Whiteboard: | |||||||
| Fixed In Version: | Doc Type: | Bug Fix | |||||
| Doc Text: | Story Points: | --- | |||||
| Clone Of: | Environment: | ||||||
| Last Closed: | 2006-05-05 10:48:20 UTC | Type: | --- | ||||
| Regression: | --- | Mount Type: | --- | ||||
| Documentation: | --- | CRM: | |||||
| Verified Versions: | Category: | --- | |||||
| oVirt Team: | --- | RHEL 7.3 requirements from Atomic Host: | |||||
| Cloudforms Team: | --- | Target Upstream Version: | |||||
| Embargoed: | |||||||
| Bug Depends On: | |||||||
| Bug Blocks: | 181409 | ||||||
| Attachments: |
|
||||||
This looks like a bug in the ipvs code in the kernel, not with ipvsadm (ipvsadm should not cause a panic in the kernel - ever). A copy of your LVS config will help. Here is the config file (real IP addresses changed):
serial_no = 89
primary = 192.168.1.1
primary_private = 10.0.0.1
service = lvs
backup_active = 1
backup = 192.168.1.2
backup_private = 10.0.0.2
heartbeat = 1
heartbeat_port = 539
keepalive = 4
deadtime = 12
network = nat
nat_router = 10.0.0.3 eth1:1
nat_nmask = 255.255.255.0
debug_level = NONE
monitor_links = 1
virtual http {
active = 0
address = 192.168.1.3 eth0:3
vip_nmask = 255.255.255.240
port = 80
persistent = 300
send = "GET / HTTP/1.0\r\n\r\n"
expect = "HTTP"
use_regex = 0
load_monitor = none
scheduler = wlc
protocol = tcp
timeout = 6
reentry = 15
quiesce_server = 0
server p-srv01 {
address = 10.0.0.1
active = 1
weight = 1
}
server p-srv02 {
address = 10.0.0.2
active = 1
weight = 1
}
}
virtual imaps {
active = 1
address = 192.168.1.3 eth0:3
vip_nmask = 255.255.255.240
port = 993
persistent = 300
expect = "OK"
use_regex = 0
send_program = "/etc/sysconfig/ha/service_check.sh imaps %h"
load_monitor = none
scheduler = wlc
protocol = tcp
timeout = 6
reentry = 15
quiesce_server = 0
server p-srv01-imaps {
address = 10.0.0.1
active = 1
weight = 1
}
server p-srv02-imaps {
address = 10.0.0.2
active = 1
weight = 1
}
}
virtual webmail {
active = 1
address = 192.168.1.3 eth0:3
vip_nmask = 255.255.255.240
port = 443
persistent = 300
expect = "OK"
use_regex = 0
send_program = "/etc/sysconfig/ha/service_check.sh webmail %h"
load_monitor = none
scheduler = wlc
protocol = tcp
timeout = 6
reentry = 15
quiesce_server = 0
server p-srv01-https {
address = 10.0.0.1
active = 1
weight = 1
}
server p-srv02-https {
address = 10.0.0.2
active = 1
weight = 1
}
}
virtual pops {
active = 1
address = 192.168.1.3 eth0:3
vip_nmask = 255.255.255.240
port = 995
persistent = 300
expect = "OK"
use_regex = 0
send_program = "/etc/sysconfig/ha/service_check.sh pops %h"
load_monitor = none
scheduler = wlc
protocol = tcp
timeout = 6
reentry = 15
quiesce_server = 0
server p-srv01-pops {
address = 10.0.0.1
active = 1
weight = 1
}
server p-srv02-pops {
address = 10.0.0.2
active = 1
weight = 1
}
}
virtual smtp {
active = 1
address = 192.168.1.3 eth0:3
vip_nmask = 255.255.255.240
port = 25
persistent = 300
expect = "OK"
use_regex = 0
send_program = "/etc/sysconfig/ha/service_check.sh smtp %h"
load_monitor = none
scheduler = wlc
protocol = tcp
timeout = 6
reentry = 15
quiesce_server = 0
server p-srv01-smtp {
address = 10.0.0.1
active = 1
weight = 1
}
server p-srv02-smtp {
address = 10.0.0.2
active = 1
weight = 1
}
}
virtual dns {
active = 1
address = 192.168.1.3 eth0:3
vip_nmask = 255.255.255.240
port = 53
persistent = 0
expect = "OK"
use_regex = 0
send_program = "/etc/sysconfig/ha/service_check.sh dns %h"
load_monitor = none
scheduler = wlc
protocol = udp
timeout = 6
reentry = 15
quiesce_server = 0
server p-srv01-dns {
address = 10.0.0.1
active = 1
weight = 1
}
server p-srv02-dns {
address = 10.0.0.2
active = 1
weight = 1
}
}
Also, here is the script used to test services (in case you need it).
#!/bin/sh
# service_check.sh — health-check helper invoked by pulse/nanny via the
# send_program directive in lvs.cf.  Prints "OK" on stdout when the service
# on HOST answers as expected, "FAIL" otherwise; nanny matches the printed
# string against the 'expect' value in lvs.cf, so the exit status is
# always 0 regardless of the probe result.
#
# Usage: service_check.sh SERVICE HOST
#   SERVICE  one of: smtp imap imaps pop pops http https webmail dns
#   HOST     real-server address to probe
SERVICE=$1
HOST=$2
#echo 0:$0 s:$SERVICE h:$HOST >> /tmp/execution.log

# Map the service name to a probe port, a command to send, and a string to
# expect in the reply.
# NOTE(review): imaps/pops/https are probed on the plaintext backend ports
# (143/110/80), not 993/995/443 — this assumes the SSL listeners share fate
# with the plaintext ones; confirm that assumption holds for these servers.
case "$SERVICE" in
    smtp)
        PORT=25
        COMMAND='QUIT'
        EXPECT='Postfix'
        ;;
    imap|imaps)
        PORT=143
        COMMAND='* LOGOUT'
        EXPECT='OK'
        ;;
    pop|pops)
        PORT=110
        COMMAND='QUIT'
        EXPECT='OK'
        ;;
    http|https)
        PORT=80
        COMMAND='GET / HTTP/1.0'
        EXPECT='HTTP'
        ;;
    webmail)
        # Webmail is only healthy when IMAP on the same host is healthy
        # too: re-invoke this script recursively for the imaps check.
        if [ "$($0 imaps "$HOST")" = "OK" ]; then
            PORT=80
            COMMAND='GET / HTTP/1.0'
            EXPECT='HTTP'
        else
            echo "FAIL"
            exit 0
        fi
        ;;
    dns)
        # Single short query; only the resolver's reachability matters.
        # Test dig's exit status directly — the original "&>/dev/null"
        # is a bashism that, under a POSIX /bin/sh, backgrounds dig and
        # makes the $? check always succeed.
        if dig www.latinsourcetech.com @"$HOST" +time=2 +tries=1 >/dev/null 2>&1; then
            echo "OK"
        else
            echo "FAIL"
        fi
        exit 0
        ;;
    *)
        # Unknown/unmapped service name.
        echo "ERR: NO SERVICE"
        exit 0
        ;;
esac

# Generic TCP probe: send COMMAND to HOST:PORT and look for EXPECT anywhere
# in the reply (grep -q is equivalent to the original "grep -c ... -ge 1").
if (echo "$COMMAND" | nc "$HOST" "$PORT") | grep -q -- "$EXPECT"; then
    echo "OK"
else
    echo "FAIL"
fi
I have found a patch written by Andrew Morton in a forum.
Recompiled a kernel with that patch (attached) and it run for a while, but got a
kernel panic also. The message this time is different:
------------[ cut here ]------------
kernel BUG at kernel/timer.c:420!
invalid operand: 0000 [#1]
Modules linked in: nfs lockd ip_vs_wlc ip_vs parport_pc lp parport netconsole
netdump md5 ipv6 autofs4 i2c_dev i2c_core sunrpc ipt_REJECT ipt_state
ip_conntrack ipt_multiport iptable_filter ipt_MARK iptable_mangle ip_tables
button battery ac ohci_hcd snd_intel8x0 snd_ac97_codec snd_pcm_oss snd_mixer_oss
snd_pcm snd_timer snd_page_alloc snd_mpu401_uart snd_rawmidi snd_seq_device snd
soundcore sis900 3c59x mii dm_snapshot dm_zero dm_mirror ext3 jbd dm_mod
CPU: 0
EIP: 0060:[<c012a510>] Not tainted VLI
EFLAGS: 00010007 (2.6.9-22.0.1.1.EL)
EIP is at cascade+0x18/0x37
eax: c035cde8 ebx: c035d320 ecx: c035cde8 edx: eea09d24
esi: eea09d24 edi: c035c940 ebp: 00000038 esp: c03e4fb8
ds: 007b es: 007b ss: 0068
Process swapper (pid: 0, threadinfo=c03e4000 task=c0358be0)
Stack: 00000000 c0409528 c035c940 00000246 c012a9e0 c03e4fd0 c03e4fd0 c03e4fd0
f5d79780 00000001 c0409528 0000000a 00000000 c0126769 c03b7f90 00000046
c03e5000 c0109338
Call Trace:
[<c012a9e0>] run_timer_softirq+0xc8/0x2d4
[<c0126769>] __do_softirq+0x35/0x79
[<c0109338>] do_softirq+0x46/0x4d
=======================
[<c01088fc>] do_IRQ+0x2b3/0x2bf
[<c030fa10>] common_interrupt+0x18/0x20
[<c01040d3>] mwait_idle+0x32/0x40
[<c010408c>] cpu_idle+0x1f/0x34
[<c03b86b9>] start_kernel+0x214/0x216
Code: ff ff c7 41 04 00 00 00 00 56 9d b8 01 00 00 00 5b 5e c3 55 89 cd 57 89 c7
56 53 8d 1c ca 8b 33 39 de 74 1a 39 7e 30 89 f2 74 08 <0f> 0b a4 01 33 ee 31 c0
8b 36 89 f8 e8 f9 f7 ff ff eb e2 89 1b
Created attachment 121130 [details]
Patch from Andrew Morton for IPVS deadlock
I'm seeing this same problem on 2.6.9-22.0.2.ELsmp as well as 2.6.9-28.EL.jwltest.105. I will attach a serial console while running 2.6.9-22.0.2.ELsmp and post here; nothing is logged in /var/log/messages. hi, I believe this is fixed in the latest beta kernels, can you please try: http://people.redhat.com/~jbaron/rhel4/ thanks, -Jason Hi Jason, I'll give it a try next week, since this one I cannot take those servers down. Hopefully I will come back with good news by the end of next week. Thanks, Mariano *** This bug has been marked as a duplicate of 174990 *** Jason, I have installed the beta kernel last Tuesday and the servers are still up and running without any panic.... I will keep you posted about the behavior next week. -- mf errata tool clean up, add to U4 CANFIX list for tracking purposes. |
From Bugzilla Helper: User-Agent: Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.7.12) Gecko/20050921 Red Hat/1.0.7-1.4.1 Firefox/1.0.7 Description of problem: There are two servers running RHEL v4 and CS. One is the primary pulse server and the other the backup. Both have last kernel update (2.6.9-22.0.1.EL) and last update of every package. The server that has pulse operational works for a few hours/days and then it panics with the following error: Kernel panic - not syncing: fs/block_dev.c:396: spin_lock(fs/block_dev.c:c035fc40) already locked by fs/block_dev.c/396 ------------[ cut here ]------------ kernel BUG at kernel/panic.c:74! invalid operand: 0000 [#1] Modules linked in: ip_vs_wlc ip_vs nfs lockd parport_pc lp parport netconsole netdump md5 ipv6 autofs4 i2c_dev i2c_core sunrpc ipt_REJECT ipt_state ip_conntrack ipt_multiport iptable_filter ipt_MARK iptable_mangle ip_tables button battery ac ohci_hcd ehci_hcd snd_intel8x0 snd_ac97_codec snd_pcm_oss snd_mixer_oss snd_pcm snd_timer snd_page_alloc snd_mpu401_uart snd_rawmidi snd_seq_device snd soundcore sis900 3c59x mii dm_snapshot dm_zero dm_mirror ext3 jbd dm_mod CPU: 0 EIP: 0060:[<c0120a96>] Not tainted VLI EFLAGS: 00010282 (2.6.9-22.0.1.EL) EIP is at panic+0x47/0x13d eax: 0000007c ebx: c03e4f7c ecx: c031e7de edx: c03e4f44 esi: ffffffff edi: c035c940 ebp: 00000246 esp: c03e4f4c ds: 007b es: 007b ss: 0068 Process service_check.s (pid: 4548, threadinfo=c03e4000 task=f6bb8cd0) Stack: c03e4f7c c0170c32 c03181ff c0321a55 0000018c c0321a55 c035fc40 c0321a55 0000018c c014bd94 00000000 f8b74d7b f66e9100 f6742bd0 f6742bd0 1350bfbc 0003d3d2 0003049e 00000000 0000000f c03e8120 f6742bd0 00000000 f8b86f20 Call Trace: [<c0170c32>] nr_blockdev_pages+0x6f/0xfa [<c014bd94>] si_meminfo+0x1f/0x3b [<f8b74d7b>] update_defense_level+0xf/0x332 [ip_vs] [<f8b7509e>] defense_timer_handler+0x0/0x29 [ip_vs] [<f8b750a3>] defense_timer_handler+0x5/0x29 [ip_vs] [<c012ab03>] run_timer_softirq+0x1eb/0x2d4 [<c0126769>] __do_softirq+0x35/0x79 
[<c0109338>] do_softirq+0x46/0x4d ======================= [<c01088fc>] do_IRQ+0x2b3/0x2bf [<c030fa10>] common_interrupt+0x18/0x20 [<c0170c72>] nr_blockdev_pages+0xaf/0xfa [<c014bd94>] si_meminfo+0x1f/0x3b [<c01a7e57>] meminfo_read_proc+0x41/0x191 [<c018103f>] dput+0x33/0x423 [<c014b7a5>] buffered_rmqueue+0x1c4/0x1e7 [<c014b89d>] __alloc_pages+0xd5/0x2f7 [<c01a5ac7>] proc_file_read+0xd1/0x225 [<c016803a>] vfs_read+0xb6/0xe2 [<c016824d>] sys_read+0x3c/0x62 [<c030f8cb>] syscall_call+0x7/0xb Code: 3e c0 e8 93 44 0c 00 68 c0 8a 3e c0 68 de e7 31 c0 e8 60 0b 00 00 83 c4 0c 83 3d 9c b3 40 c0 00 75 09 83 3d 98 b3 40 c0 00 74 08 <0f> 0b 4a 00 01 e8 31 c0 31 c0 e8 d7 99 ff ff 31 d2 b9 c0 8a 3e Version-Release number of selected component (if applicable): How reproducible: Sometimes Steps to Reproduce: 1. Server gets pulse working on it 2. Works for a period of time 3. Finally it panics Additional info: Both servers are also real servers on the balancing trhough the private IP address. The cluster is managing imap/pop/smtp/web services. Both servers access data (mailbox) through NFS and automount. Services authenticate through an LDAP server.