Bug 182454 - clustat hangs when one of nodes is in rebooting
clustat hangs when one of nodes is in rebooting
Status: CLOSED ERRATA
Product: Red Hat Cluster Suite
Classification: Red Hat
Component: rgmanager (Show other bugs)
4
All Linux
medium Severity high
: ---
: ---
Assigned To: Lon Hohberger
Cluster QE
:
: 190234 190408 (view as bug list)
Depends On:
Blocks: 180185
  Show dependency treegraph
 
Reported: 2006-02-22 12:37 EST by dex chen
Modified: 2009-04-16 16:19 EDT (History)
2 users (show)

See Also:
Fixed In Version: RHBA-2006-0557
Doc Type: Bug Fix
Doc Text:
Story Points: ---
Clone Of:
Environment:
Last Closed: 2006-08-10 17:21:29 EDT
Type: ---
Regression: ---
Mount Type: ---
Documentation: ---
CRM:
Verified Versions:
Category: ---
oVirt Team: ---
RHEL 7.3 requirements from Atomic Host:
Cloudforms Team: ---


Attachments (Terms of Use)

  None (edit)
Description dex chen 2006-02-22 12:37:53 EST
Description of problem:
We have a 4-node cluster using GFS and DLM. The clustat command on any of the
other nodes never returns while one of the nodes is rebooting, which renders the
cluster unusable. This problem is very reproducible (every time in my attempts).

We also observed that clurgmgrd did not get started after the rebooted node came
up again.

We are aware of bug#17746. The symptoms are similar, but unlike the setup in
bug#17746, we do not use gulm.

We are also aware of the fix to clustat posted at
http://sourceware.org/ml/cluster-cvs/2006-q1/msg00062.html. Do you think this
fix will resolve our problem?

Version-Release number of selected component (if applicable):

rgmanager-1.9.43-0
rgmanager-debuginfo-1.9.43-0



How reproducible:

every time

Steps to Reproduce:
1) set up a 4 node cluster using DLM
2) reboot one of the nodes with 'reboot -f', e.g. on node1
3) on one of nodes 2-4, run 'clustat'
  
Actual results:

clustat never returns.

Expected results:
clustat returns cleanly.


Additional info:

We ran strace and gdb on the hanging clustat. Their output is pasted
here:
>>> strace >>>>
execve("/usr/sbin/clustat", ["clustat"], [/* 23 vars */]) = 0
uname({sys="Linux", node="sqazero03", ...}) = 0
brk(0)                                  = 0x50c000
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 
0x2a95556000
access("/etc/ld.so.preload", R_OK)      = -1 ENOENT (No such file or directory)
open("/etc/ld.so.cache", O_RDONLY)      = 3
fstat(3, {st_mode=S_IFREG|0644, st_size=183328, ...}) = 0
mmap(NULL, 183328, PROT_READ, MAP_PRIVATE, 3, 0) = 0x2a95557000
close(3)                                = 0
open("/lib64/tls/libpthread.so.0", O_RDONLY) = 3
read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\340UP\356"..., 640) = 
640
fstat(3, {st_mode=S_IFREG|0755, st_size=106105, ...}) = 0
mmap(0x33ee500000, 1131384, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 
0) = 0x33ee500000
mprotect(0x33ee510000, 1065848, PROT_NONE) = 0
mmap(0x33ee60f000, 8192, PROT_READ|PROT_WRITE, 
MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0xf000) = 0x33ee60f000
mmap(0x33ee611000, 13176, PROT_READ|PROT_WRITE, 
MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x33ee611000
close(3)                                = 0
open("/lib64/libdl.so.2", O_RDONLY)     = 3
read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\200\17\240"..., 640) = 
640
fstat(3, {st_mode=S_IFREG|0755, st_size=17943, ...}) = 0
mmap(0x33eda00000, 1056968, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 
0) = 0x33eda00000
mprotect(0x33eda02000, 1048776, PROT_NONE) = 0
mmap(0x33edb01000, 8192, PROT_READ|PROT_WRITE, 
MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x1000) = 0x33edb01000
close(3)                                = 0
open("/usr/lib64/libncurses.so.5", O_RDONLY) = 3
read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\240\370"..., 640) = 640
fstat(3, {st_mode=S_IFREG|0755, st_size=1018858, ...}) = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 
0x2a95584000
mmap(0x33f5e00000, 1421240, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 
0) = 0x33f5e00000
mprotect(0x33f5e4d000, 1105848, PROT_NONE) = 0
mmap(0x33f5f4c000, 61440, PROT_READ|PROT_WRITE, 
MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x4c000) = 0x33f5f4c000
close(3)                                = 0
open("/lib64/tls/libc.so.6", O_RDONLY)  = 3
read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0`\305\301"..., 640) = 640
lseek(3, 624, SEEK_SET)                 = 624
read(3, "\4\0\0\0\20\0\0\0\1\0\0\0GNU\0\0\0\0\0\2\0\0\0\4\0\0\0"..., 32) = 32
fstat(3, {st_mode=S_IFREG|0755, st_size=1489097, ...}) = 0
mmap(0x33edc00000, 2305992, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 
0) = 0x33edc00000
mprotect(0x33edd2a000, 1085384, PROT_NONE) = 0
mmap(0x33ede29000, 24576, PROT_READ|PROT_WRITE, 
MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x129000) = 0x33ede29000
mmap(0x33ede2f000, 16328, PROT_READ|PROT_WRITE, 
MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x33ede2f000
close(3)                                = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 
0x2a95585000
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 
0x2a95586000
mprotect(0x33ede29000, 12288, PROT_READ) = 0
arch_prctl(ARCH_SET_FS, 0x2a95585a00)   = 0
munmap(0x2a95557000, 183328)            = 0
set_tid_address(0x2a95585a90)           = 17453
rt_sigaction(SIGRTMIN, {0x33ee505190, [], SA_RESTORER|SA_SIGINFO, 
0x33ee50c320}, NULL, 8) = 0
rt_sigaction(SIGRT_1, {0x33ee505210, [], SA_RESTORER|SA_RESTART|SA_SIGINFO, 
0x33ee50c320}, NULL, 8) = 0
rt_sigprocmask(SIG_UNBLOCK, [RTMIN RT_1], NULL, 8) = 0
getrlimit(RLIMIT_STACK, {rlim_cur=10240*1024, rlim_max=RLIM_INFINITY}) = 0
_sysctl({{CTL_KERN, KERN_VERSION, 0, 0, 0, 0, 20bd1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, 2, 0x7fbffff870, 34, (nil), 
0}) = 0
open("/lib64/magma", O_RDONLY|O_NONBLOCK|O_DIRECTORY) = 3
fstat(3, {st_mode=S_IFDIR|0755, st_size=4096, ...}) = 0
fcntl(3, F_SETFD, FD_CLOEXEC)           = 0
brk(0)                                  = 0x50c000
brk(0x52e000)                           = 0x52e000
getdents64(3, /* 4 entries */, 4096)    = 120
getdents64(3, /* 0 entries */, 4096)    = 0
lseek(3, 0, SEEK_SET)                   = 0
getdents64(3, /* 4 entries */, 4096)    = 120
getdents64(3, /* 0 entries */, 4096)    = 0
close(3)                                = 0
stat("/lib64/magma/..", {st_mode=S_IFDIR|0755, st_size=4096, ...}) = 0
stat("/lib64/magma/.", {st_mode=S_IFDIR|0755, st_size=4096, ...}) = 0
stat("/lib64/magma/magma_gulm.so", {st_mode=S_IFREG|0755, st_size=16888, ...}) 
= 0
futex(0x33edb020c4, FUTEX_WAKE, 2147483647) = 0
open("/lib64/magma/magma_gulm.so", O_RDONLY) = 3
read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\340\23\0"..., 640) = 640
fstat(3, {st_mode=S_IFREG|0755, st_size=16888, ...}) = 0
mmap(NULL, 1063648, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 
0x2a95587000
mprotect(0x2a9558b000, 1047264, PROT_NONE) = 0
mmap(0x2a9568a000, 4096, PROT_READ|PROT_WRITE, 
MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x3000) = 0x2a9568a000
close(3)                                = 0
open("/etc/ld.so.cache", O_RDONLY)      = 3
fstat(3, {st_mode=S_IFREG|0644, st_size=183328, ...}) = 0
mmap(NULL, 183328, PROT_READ, MAP_PRIVATE, 3, 0) = 0x2a9568b000
close(3)                                = 0
open("/usr/lib64/libgulm.so.1", O_RDONLY) = 3
read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0 \27\20\356"..., 640) = 
640
fstat(3, {st_mode=S_IFREG|0644, st_size=54015, ...}) = 0
mmap(0x33ee100000, 1069912, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 
0) = 0x33ee100000
mprotect(0x33ee106000, 1045336, PROT_NONE) = 0
mmap(0x33ee205000, 4096, PROT_READ|PROT_WRITE, 
MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x5000) = 0x33ee205000
close(3)                                = 0
munmap(0x2a9568b000, 183328)            = 0
gettid()                                = 17453
socket(PF_INET6, SOCK_STREAM, IPPROTO_IP) = -1 EAFNOSUPPORT (Address family not 
supported by protocol)
munmap(0x2a95587000, 1063648)           = 0
munmap(0x33ee100000, 1069912)           = 0
stat("/lib64/magma/magma_sm.so", {st_mode=S_IFREG|0755, st_size=23248, ...}) = 0
open("/lib64/magma/magma_sm.so", O_RDONLY) = 3
read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\220\26\0"..., 640) = 640
fstat(3, {st_mode=S_IFREG|0755, st_size=23248, ...}) = 0
mmap(NULL, 1070016, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 
0x2a95587000
mprotect(0x2a9558d000, 1045440, PROT_NONE) = 0
mmap(0x2a9568c000, 4096, PROT_READ|PROT_WRITE, 
MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x5000) = 0x2a9568c000
close(3)                                = 0
socket(0x1e /* PF_??? */, SOCK_DGRAM, 3) = 3
rt_sigaction(SIGINT, {0x4021f8, [INT], SA_RESTORER|SA_RESTART, 0x33edc2e410}, 
{SIG_DFL}, 8) = 0
rt_sigaction(SIGTERM, {0x4021f8, [TERM], SA_RESTORER|SA_RESTART, 0x33edc2e410}, 
{SIG_DFL}, 8) = 0
ioctl(3, 0x7805, 0)                     = 1
ioctl(3, 0x80107803, 0)                 = 4
ioctl(3, 0x80107803, 0x7fbffff9b0)      = 4
open("/proc/cluster/services", O_RDONLY) = 4
read(4, "Service          Name           "..., 4096) = 156
read(4, "", 4096)                       = 0
close(4)                                = 0
getuid()                                = 0
ioctl(3, 0xffffffff80107803, 0)         = 4
ioctl(3, 0xffffffff80107803, 0x7fbffff9a0) = 4
socket(PF_FILE, SOCK_STREAM, 0)         = 4
connect(4, {sa_family=AF_FILE, path="/var/run/cluster/ccsd.sock"}, 110) = 0
write(4, "\1\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0", 20) = 20
read(4, "\1\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0", 20) = 20
close(4)                                = 0
socket(PF_FILE, SOCK_STREAM, 0)         = 4
connect(4, {sa_family=AF_FILE, path="/var/run/cluster/ccsd.sock"}, 110) = 0
write(4, "\3\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0+\0\0\0/cluster/clu"..., 63) = 63
read(4, "\3\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\n\0\0\0", 20) = 20
read(4, "sqazero01\0", 10)              = 10
close(4)                                = 0
socket(PF_FILE, SOCK_STREAM, 0)         = 4
connect(4, {sa_family=AF_FILE, path="/var/run/cluster/ccsd.sock"}, 110) = 0
write(4, "\3\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0+\0\0\0/cluster/clu"..., 63) = 63
read(4, "\3\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\n\0\0\0", 20) = 20
read(4, "sqazero02\0", 10)              = 10
close(4)                                = 0
socket(PF_FILE, SOCK_STREAM, 0)         = 4
connect(4, {sa_family=AF_FILE, path="/var/run/cluster/ccsd.sock"}, 110) = 0
write(4, "\3\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0+\0\0\0/cluster/clu"..., 63) = 63
read(4, "\3\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\n\0\0\0", 20) = 20
read(4, "sqazero03\0", 10)              = 10
close(4)                                = 0
socket(PF_FILE, SOCK_STREAM, 0)         = 4
connect(4, {sa_family=AF_FILE, path="/var/run/cluster/ccsd.sock"}, 110) = 0
write(4, "\3\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0+\0\0\0/cluster/clu"..., 63) = 63
read(4, "\3\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\n\0\0\0", 20) = 20
read(4, "sqazero04\0", 10)              = 10
close(4)                                = 0
socket(PF_FILE, SOCK_STREAM, 0)         = 4
connect(4, {sa_family=AF_FILE, path="/var/run/cluster/ccsd.sock"}, 110) = 0
write(4, "\3\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0+\0\0\0/cluster/clu"..., 63) = 63
read(4, "\3\0\0\0\0\0\0\0\0\0\0\0\303\377\377\377\0\0\0\0", 20) = 20
close(4)                                = 0
socket(PF_FILE, SOCK_STREAM, 0)         = 4
connect(4, {sa_family=AF_FILE, path="/var/run/cluster/ccsd.sock"}, 110) = 0
write(4, "\2\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0", 20) = 20
read(4, "\2\0\0\0\0\0\0\0\377\377\377\377\0\0\0\0\0\0\0\0", 20) = 20
close(4)                                = 0
ioctl(3, 0xffffffff80107803, 0)         = 4
ioctl(3, 0xffffffff80107803, 0x7fbffff930) = 4
socket(PF_NETLINK, SOCK_DGRAM, 0)       = 4
sendto(4, "\21\0\0\0\26\0\1\3\0\0\0\0\0\0\0\0\n", 17, 0, {sa_family=AF_NETLINK, 
pid=0, groups=00000000}, 12) = 17
recvfrom(4, "D\0\0\0\24\0\2\0\0\0\0\0-D\0\0\2\10\200\376\1\0\0\0\10"..., 10240, 
0, NULL, NULL) = 272
close(4)                                = 0
socket(PF_NETLINK, SOCK_DGRAM, 0)       = 4
sendto(4, "\21\0\0\0\26\0\1\3\0\0\0\0\0\0\0\0\2", 17, 0, {sa_family=AF_NETLINK, 
pid=0, groups=00000000}, 12) = 17
recvfrom(4, "D\0\0\0\24\0\2\0\0\0\0\0-D\0\0\2\10\200\376\1\0\0\0\10"..., 10240, 
0, NULL, NULL) = 272
close(4)                                = 0
socket(PF_NETLINK, SOCK_RAW, 0)         = 4
bind(4, {sa_family=AF_NETLINK, pid=0, groups=00000000}, 12) = 0
getsockname(4, {sa_family=AF_NETLINK, pid=17453, groups=00000000}, [12]) = 0
time(NULL)                              = 1140218206
sendto(4, "\24\0\0\0\26\0\1\3^Y\366C\0\0\0\0\0\0\0\0", 20, 0, 
{sa_family=AF_NETLINK, pid=0, groups=00000000}, 12) = 20
recvmsg(4, {msg_name(12)={sa_family=AF_NETLINK, pid=0, groups=00000000}, msg_iov
(1)=[{"D\0\0\0\24\0\2\0^Y\366C-D\0\0\2\10\200\376\1\0\0\0\10\0"..., 4096}], 
msg_controllen=0, msg_flags=0}, 0) = 272
recvmsg(4, {msg_name(12)={sa_family=AF_NETLINK, pid=0, groups=00000000}, msg_iov
(1)=[{"\24\0\0\0\3\0\2\0^Y\366C-D\0\0\0\0\0\0\1\0\0\0\10\0\1\0"..., 4096}], 
msg_controllen=0, msg_flags=0}, 0) = 20
close(4)                                = 0
gettimeofday({1140218206, 722032}, NULL) = 0
open("/etc/resolv.conf", O_RDONLY)      = 4
fstat(4, {st_mode=S_IFREG|0644, st_size=74, ...}) = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 
0x2a9568d000
read(4, "search lab01.local jrbm.local\nna"..., 4096) = 74
read(4, "", 4096)                       = 0
close(4)                                = 0
munmap(0x2a9568d000, 4096)              = 0
socket(PF_FILE, SOCK_STREAM, 0)         = 4
fcntl(4, F_GETFL)                       = 0x2 (flags O_RDWR)
fcntl(4, F_SETFL, O_RDWR|O_NONBLOCK)    = 0
connect(4, {sa_family=AF_FILE, path="/var/run/nscd/socket"}, 110) = -1 ENOENT 
(No such file or directory)
close(4)                                = 0
socket(PF_FILE, SOCK_STREAM, 0)         = 4
fcntl(4, F_GETFL)                       = 0x2 (flags O_RDWR)
fcntl(4, F_SETFL, O_RDWR|O_NONBLOCK)    = 0
connect(4, {sa_family=AF_FILE, path="/var/run/nscd/socket"}, 110) = -1 ENOENT 
(No such file or directory)
close(4)                                = 0
open("/etc/nsswitch.conf", O_RDONLY)    = 4
fstat(4, {st_mode=S_IFREG|0644, st_size=1701, ...}) = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 
0x2a9568d000
read(4, "#\n# /etc/nsswitch.conf\n#\n# An ex"..., 4096) = 1701
read(4, "", 4096)                       = 0
close(4)                                = 0
munmap(0x2a9568d000, 4096)              = 0
open("/etc/ld.so.cache", O_RDONLY)      = 4
fstat(4, {st_mode=S_IFREG|0644, st_size=183328, ...}) = 0
mmap(NULL, 183328, PROT_READ, MAP_PRIVATE, 4, 0) = 0x2a9568d000
close(4)                                = 0
open("/lib64/libnss_files.so.2", O_RDONLY) = 4
read(4, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0@\"\0\0\0"..., 640) = 640
fstat(4, {st_mode=S_IFREG|0755, st_size=56791, ...}) = 0
mmap(NULL, 1094952, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 4, 0) = 
0x2a956ba000
mprotect(0x2a956c4000, 1053992, PROT_NONE) = 0
mmap(0x2a957c4000, 8192, PROT_READ|PROT_WRITE, 
MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 4, 0xa000) = 0x2a957c4000
close(4)                                = 0
munmap(0x2a9568d000, 183328)            = 0
open("/etc/host.conf", O_RDONLY)        = 4
fstat(4, {st_mode=S_IFREG|0644, st_size=17, ...}) = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 
0x2a9568d000
read(4, "order hosts,bind\n", 4096)     = 17
read(4, "", 4096)                       = 0
close(4)                                = 0
munmap(0x2a9568d000, 4096)              = 0
futex(0x33ede31a40, FUTEX_WAKE, 2147483647) = 0
open("/etc/hosts", O_RDONLY)            = 4
fcntl(4, F_GETFD)                       = 0
fcntl(4, F_SETFD, FD_CLOEXEC)           = 0
fstat(4, {st_mode=S_IFREG|0644, st_size=759, ...}) = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 
0x2a9568d000
read(4, "# Do not remove the following li"..., 4096) = 759
close(4)                                = 0
munmap(0x2a9568d000, 4096)              = 0
socket(PF_INET, SOCK_DGRAM, IPPROTO_IP) = 4
connect(4, {sa_family=AF_INET, sin_port=htons(0), sin_addr=inet_addr
("10.10.10.9")}, 16) = 0
getsockname(4, {sa_family=AF_INET, sin_port=htons(32785), sin_addr=inet_addr
("10.10.10.8")}, [22733639854850064]) = 0
close(4)                                = 0
socket(PF_NETLINK, SOCK_DGRAM, 0)       = 4
sendto(4, "\21\0\0\0\26\0\1\3\0\0\0\0\0\0\0\0\n", 17, 0, {sa_family=AF_NETLINK, 
pid=0, groups=00000000}, 12) = 17
recvfrom(4, "D\0\0\0\24\0\2\0\0\0\0\0-D\0\0\2\10\200\376\1\0\0\0\10"..., 10240, 
0, NULL, NULL) = 272
close(4)                                = 0
socket(PF_NETLINK, SOCK_DGRAM, 0)       = 4
sendto(4, "\21\0\0\0\26\0\1\3\0\0\0\0\0\0\0\0\2", 17, 0, {sa_family=AF_NETLINK, 
pid=0, groups=00000000}, 12) = 17
recvfrom(4, "D\0\0\0\24\0\2\0\0\0\0\0-D\0\0\2\10\200\376\1\0\0\0\10"..., 10240, 
0, NULL, NULL) = 272
close(4)                                = 0
socket(PF_NETLINK, SOCK_RAW, 0)         = 4
bind(4, {sa_family=AF_NETLINK, pid=0, groups=00000000}, 12) = 0
getsockname(4, {sa_family=AF_NETLINK, pid=17453, groups=00000000}, [12]) = 0
time(NULL)                              = 1140218206
sendto(4, "\24\0\0\0\26\0\1\3^Y\366C\0\0\0\0\0\0\0\0", 20, 0, 
{sa_family=AF_NETLINK, pid=0, groups=00000000}, 12) = 20
recvmsg(4, {msg_name(12)={sa_family=AF_NETLINK, pid=0, groups=00000000}, msg_iov
(1)=[{"D\0\0\0\24\0\2\0^Y\366C-D\0\0\2\10\200\376\1\0\0\0\10\0"..., 4096}], 
msg_controllen=0, msg_flags=0}, 0) = 272
recvmsg(4, {msg_name(12)={sa_family=AF_NETLINK, pid=0, groups=00000000}, msg_iov
(1)=[{"\24\0\0\0\3\0\2\0^Y\366C-D\0\0\0\0\0\0\1\0\0\0\10\0\1\0"..., 4096}], 
msg_controllen=0, msg_flags=0}, 0) = 20
close(4)                                = 0
open("/etc/hosts", O_RDONLY)            = 4
fcntl(4, F_GETFD)                       = 0
fcntl(4, F_SETFD, FD_CLOEXEC)           = 0
fstat(4, {st_mode=S_IFREG|0644, st_size=759, ...}) = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 
0x2a9568d000
read(4, "# Do not remove the following li"..., 4096) = 759
close(4)                                = 0
munmap(0x2a9568d000, 4096)              = 0
socket(PF_INET, SOCK_DGRAM, IPPROTO_IP) = 4
connect(4, {sa_family=AF_INET, sin_port=htons(0), sin_addr=inet_addr
("10.10.10.7")}, 16) = 0
getsockname(4, {sa_family=AF_INET, sin_port=htons(32785), sin_addr=inet_addr
("10.10.10.8")}, [22733639854850064]) = 0
close(4)                                = 0
socket(PF_NETLINK, SOCK_DGRAM, 0)       = 4
sendto(4, "\21\0\0\0\26\0\1\3\0\0\0\0\0\0\0\0\n", 17, 0, {sa_family=AF_NETLINK, 
pid=0, groups=00000000}, 12) = 17
recvfrom(4, "D\0\0\0\24\0\2\0\0\0\0\0-D\0\0\2\10\200\376\1\0\0\0\10"..., 10240, 
0, NULL, NULL) = 272
close(4)                                = 0
socket(PF_NETLINK, SOCK_DGRAM, 0)       = 4
sendto(4, "\21\0\0\0\26\0\1\3\0\0\0\0\0\0\0\0\2", 17, 0, {sa_family=AF_NETLINK, 
pid=0, groups=00000000}, 12) = 17
recvfrom(4, "D\0\0\0\24\0\2\0\0\0\0\0-D\0\0\2\10\200\376\1\0\0\0\10"..., 10240, 
0, NULL, NULL) = 272
close(4)                                = 0
socket(PF_NETLINK, SOCK_RAW, 0)         = 4
bind(4, {sa_family=AF_NETLINK, pid=0, groups=00000000}, 12) = 0
getsockname(4, {sa_family=AF_NETLINK, pid=17453, groups=00000000}, [12]) = 0
time(NULL)                              = 1140218206
sendto(4, "\24\0\0\0\26\0\1\3^Y\366C\0\0\0\0\0\0\0\0", 20, 0, 
{sa_family=AF_NETLINK, pid=0, groups=00000000}, 12) = 20
recvmsg(4, {msg_name(12)={sa_family=AF_NETLINK, pid=0, groups=00000000}, msg_iov
(1)=[{"D\0\0\0\24\0\2\0^Y\366C-D\0\0\2\10\200\376\1\0\0\0\10\0"..., 4096}], 
msg_controllen=0, msg_flags=0}, 0) = 272
recvmsg(4, {msg_name(12)={sa_family=AF_NETLINK, pid=0, groups=00000000}, msg_iov
(1)=[{"\24\0\0\0\3\0\2\0^Y\366C-D\0\0\0\0\0\0\1\0\0\0\10\0\1\0"..., 4096}], 
msg_controllen=0, msg_flags=0}, 0) = 20
close(4)                                = 0
open("/etc/hosts", O_RDONLY)            = 4
fcntl(4, F_GETFD)                       = 0
fcntl(4, F_SETFD, FD_CLOEXEC)           = 0
fstat(4, {st_mode=S_IFREG|0644, st_size=759, ...}) = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 
0x2a9568d000
read(4, "# Do not remove the following li"..., 4096) = 759
close(4)                                = 0
munmap(0x2a9568d000, 4096)              = 0
socket(PF_INET, SOCK_DGRAM, IPPROTO_IP) = 4
connect(4, {sa_family=AF_INET, sin_port=htons(0), sin_addr=inet_addr
("10.10.10.6")}, 16) = 0
getsockname(4, {sa_family=AF_INET, sin_port=htons(32785), sin_addr=inet_addr
("10.10.10.8")}, [22733639854850064]) = 0
close(4)                                = 0
socket(PF_NETLINK, SOCK_DGRAM, 0)       = 4
sendto(4, "\21\0\0\0\26\0\1\3\0\0\0\0\0\0\0\0\n", 17, 0, {sa_family=AF_NETLINK, 
pid=0, groups=00000000}, 12) = 17
recvfrom(4, "D\0\0\0\24\0\2\0\0\0\0\0-D\0\0\2\10\200\376\1\0\0\0\10"..., 10240, 
0, NULL, NULL) = 272
close(4)                                = 0
socket(PF_NETLINK, SOCK_DGRAM, 0)       = 4
sendto(4, "\21\0\0\0\26\0\1\3\0\0\0\0\0\0\0\0\2", 17, 0, {sa_family=AF_NETLINK, 
pid=0, groups=00000000}, 12) = 17
recvfrom(4, "D\0\0\0\24\0\2\0\0\0\0\0-D\0\0\2\10\200\376\1\0\0\0\10"..., 10240, 
0, NULL, NULL) = 272
close(4)                                = 0
socket(PF_NETLINK, SOCK_RAW, 0)         = 4
bind(4, {sa_family=AF_NETLINK, pid=0, groups=00000000}, 12) = 0
getsockname(4, {sa_family=AF_NETLINK, pid=17453, groups=00000000}, [12]) = 0
time(NULL)                              = 1140218206
sendto(4, "\24\0\0\0\26\0\1\3^Y\366C\0\0\0\0\0\0\0\0", 20, 0, 
{sa_family=AF_NETLINK, pid=0, groups=00000000}, 12) = 20
recvmsg(4, {msg_name(12)={sa_family=AF_NETLINK, pid=0, groups=00000000}, msg_iov
(1)=[{"D\0\0\0\24\0\2\0^Y\366C-D\0\0\2\10\200\376\1\0\0\0\10\0"..., 4096}], 
msg_controllen=0, msg_flags=0}, 0) = 272
recvmsg(4, {msg_name(12)={sa_family=AF_NETLINK, pid=0, groups=00000000}, msg_iov
(1)=[{"\24\0\0\0\3\0\2\0^Y\366C-D\0\0\0\0\0\0\1\0\0\0\10\0\1\0"..., 4096}], 
msg_controllen=0, msg_flags=0}, 0) = 20
close(4)                                = 0
open("/etc/hosts", O_RDONLY)            = 4
fcntl(4, F_GETFD)                       = 0
fcntl(4, F_SETFD, FD_CLOEXEC)           = 0
fstat(4, {st_mode=S_IFREG|0644, st_size=759, ...}) = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 
0x2a9568d000
read(4, "# Do not remove the following li"..., 4096) = 759
close(4)                                = 0
munmap(0x2a9568d000, 4096)              = 0
socket(PF_INET, SOCK_DGRAM, IPPROTO_IP) = 4
connect(4, {sa_family=AF_INET, sin_port=htons(0), sin_addr=inet_addr
("10.10.10.8")}, 16) = 0
getsockname(4, {sa_family=AF_INET, sin_port=htons(32785), sin_addr=inet_addr
("10.10.10.8")}, [22733639854850064]) = 0
close(4)                                = 0
ioctl(3, 0x80107803, 0)                 = 4
ioctl(3, 0x80107803, 0x7fbffff930)      = 4
open("/proc/cluster/services", O_RDONLY) = 4
read(4, "Service          Name           "..., 4096) = 156
read(4, "", 4096)                       = 0
close(4)                                = 0
open("/etc/hosts", O_RDONLY)            = 4
fcntl(4, F_GETFD)                       = 0
fcntl(4, F_SETFD, FD_CLOEXEC)           = 0
fstat(4, {st_mode=S_IFREG|0644, st_size=759, ...}) = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 
0x2a9568d000
read(4, "# Do not remove the following li"..., 4096) = 759
read(4, "", 4096)                       = 0
close(4)                                = 0
munmap(0x2a9568d000, 4096)              = 0
open("/etc/hosts", O_RDONLY)            = 4
fcntl(4, F_GETFD)                       = 0
fcntl(4, F_SETFD, FD_CLOEXEC)           = 0
fstat(4, {st_mode=S_IFREG|0644, st_size=759, ...}) = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 
0x2a9568d000
read(4, "# Do not remove the following li"..., 4096) = 759
close(4)                                = 0
munmap(0x2a9568d000, 4096)              = 0
open("/etc/hosts", O_RDONLY)            = 4
fcntl(4, F_GETFD)                       = 0
fcntl(4, F_SETFD, FD_CLOEXEC)           = 0
fstat(4, {st_mode=S_IFREG|0644, st_size=759, ...}) = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 
0x2a9568d000
read(4, "# Do not remove the following li"..., 4096) = 759
close(4)                                = 0
munmap(0x2a9568d000, 4096)              = 0
socket(PF_INET, SOCK_STREAM, IPPROTO_IP) = 4
setsockopt(4, SOL_SOCKET, SO_KEEPALIVE, [1], 4) = 0
fcntl(4, F_GETFL)                       = 0x2 (flags O_RDWR)
fcntl(4, F_SETFL, O_RDWR|O_NONBLOCK)    = 0
connect(4, {sa_family=AF_INET, sin_port=htons(41966), sin_addr=inet_addr
("10.10.10.8")}, 16) = -1 EINPROGRESS (Operation now in progress)
select(5, [4], [4], NULL, {10, 0})      = 2 (in [4], out [4], left {10, 0})
getsockopt(4, SOL_SOCKET, SO_ERROR, [8589934703], [4]) = 0
close(4)                                = 0
close(4)                                = -1 EBADF (Bad file descriptor)
fstat(1, {st_mode=S_IFREG|0644, st_size=22849, ...}) = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 
0x2a9568d000
close(3)                                = 0
munmap(0x2a95587000, 1070016)           = 0
write(1, "Member Status: Quorate\n\nResource"..., 403Member Status: Quorate

Resource Group Manager not running; no service information available.

  Member Name                              Status
  ------ ----                              ------
  sqazero01                                Online
  sqazero02                                Online
  sqazero03                                Online, Local
  sqazero04                                Online

) = 403
munmap(0x2a9568d000, 4096)              = 0
exit_group(0)                           = ?
Process 17453 detached


>>>gdb>>>>>
[root@sqazero02 ~]# gdb /usr/sbin/clustat
GNU gdb Red Hat Linux (6.3.0.0-1.63rh)
Copyright 2004 Free Software Foundation, Inc.
GDB is free software, covered by the GNU General Public License, and you are
welcome to change it and/or distribute copies of it under certain conditions.
Type "show copying" to see the conditions.
There is absolutely no warranty for GDB.  Type "show warranty" for details.
This GDB was configured as "x86_64-redhat-linux-gnu"...Using host libthread_db 
library "/lib64/tls/libthread_db.so.1".

(gdb) attach 6732
Attaching to program: /usr/sbin/clustat, process 6732
Reading symbols from /lib64/tls/libpthread.so.0...done.
[Thread debugging using libthread_db enabled]
[New Thread 182894221824 (LWP 6732)]
Loaded symbols for /lib64/tls/libpthread.so.0
Reading symbols from /lib64/libdl.so.2...done.
Loaded symbols for /lib64/libdl.so.2
Reading symbols from /usr/lib64/libncurses.so.5...done.
Loaded symbols for /usr/lib64/libncurses.so.5
Reading symbols from /lib64/tls/libc.so.6...done.
Loaded symbols for /lib64/tls/libc.so.6
Reading symbols from /lib64/ld-linux-x86-64.so.2...done.
Loaded symbols for /lib64/ld-linux-x86-64.so.2
Reading symbols from /lib64/magma/magma_sm.so...Reading symbols 
from /usr/lib/debug/lib64/magma/magma_sm.so.debug...done.
done.
Loaded symbols for /lib64/magma/magma_sm.so
Reading symbols from /lib64/libnss_files.so.2...done.
Loaded symbols for /lib64/libnss_files.so.2
0x0000003b5400b404 in recv () from /lib64/tls/libpthread.so.0
(gdb) bt
#0  0x0000003b5400b404 in recv () from /lib64/tls/libpthread.so.0
#1  0x0000000000404539 in msg_peek ()
#2  0x0000000000407e87 in msg_receive_simple (fd=4, buf=0x7fbffff888,
    timeout=10) at msgsimple.c:86
#3  0x00000000004022cb in rg_state_list (local_node_id=2) at clustat.c:60
#4  0x0000000000403495 in main (argc=1, argv=0x7fbffffa08) at clustat.c:620
#5  0x0000003b5371c4bb in __libc_start_main () from /lib64/tls/libc.so.6
#6  0x000000000040216a in _start ()
#7  0x0000007fbffff9f8 in ?? ()
#8  0x000000000000001c in ?? ()
#9  0x0000000000000001 in ?? ()
#10 0x0000007fbffffbe9 in ?? ()
#11 0x0000000000000000 in ?? ()
Comment 1 Lon Hohberger 2006-02-22 13:42:05 EST
Those patches in the mail to cluster-cvs fix a whole lot of problems, and are
likely to fix this one too.

Note that if the lock subsystem stops responding - but does not return errors -
things will still break.
Comment 2 Lon Hohberger 2006-05-12 13:05:45 EDT
*** Bug 190408 has been marked as a duplicate of this bug. ***
Comment 3 Lon Hohberger 2006-05-12 13:06:44 EDT
*** Bug 190234 has been marked as a duplicate of this bug. ***
Comment 4 Lon Hohberger 2006-05-12 13:09:21 EDT
These bugs all seem to be related to lock contention during node transitions
- fixes here:

http://people.redhat.com/lhh/rgmanager-1.9.46-1.3speed.x86_64.rpm
http://people.redhat.com/lhh/rgmanager-1.9.46-1.3speed.i386.rpm
http://people.redhat.com/lhh/rgmanager-1.9.46-1.3speed.src.rpm
Comment 7 Red Hat Bugzilla 2006-08-10 17:21:38 EDT
An advisory has been issued which should help the problem
described in this bug report. This report is therefore being
closed with a resolution of ERRATA. For more information
on the solution and/or where to find the updated files,
please follow the link below. You may reopen this bug report
if the solution does not work for you.

http://rhn.redhat.com/errata/RHBA-2006-0557.html

Note You need to log in before you can comment on or make changes to this bug.