Description of problem: I don't have a lot to go on yet, but I have seen this a few times now and want it documented. I'll try and gather more info. [root@morph-01 ~]# ps -C pvcreate PID TTY TIME CMD 8787 ? 00:00:00 pvcreate [root@morph-01 ~]# strace -p 8787 Process 8787 attached - interrupt to quit read(3, [hang] [root@morph-01 ~]# lsof -p 8787 COMMAND PID USER FD TYPE DEVICE SIZE NODE NAME pvcreate 8787 root cwd DIR 253,0 4096 2 / pvcreate 8787 root rtd DIR 253,0 4096 2 / pvcreate 8787 root txt REG 253,0 484672 1002724 /usr/sbin/lvm pvcreate 8787 root mem REG 253,0 106397 1890945 /lib/ld-2.3.4.so pvcreate 8787 root mem REG 253,0 1454546 1890946 /lib/tls/libc-2.3.4.so pvcreate 8787 root mem REG 253,0 15324 1890950 /lib/libdl-2.3.4.so pvcreate 8787 root mem REG 253,0 57968 1891294 /lib/libdevmapper.so.1.02 pvcreate 8787 root mem REG 253,0 7956 998100 /usr/lib/liblvm2clusterlock.so.2.02 pvcreate 8787 root mem REG 253,0 56320 1890956 /lib/libselinux.so.1 pvcreate 8787 root mem REG 253,0 879961 1003874 /usr/lib/libncurses.so.5.4 pvcreate 8787 root mem REG 253,0 170320 1000584 /usr/lib/libreadline.so.4.3 pvcreate 8787 root mem REG 253,0 48528848 1002983 /usr/lib/locale/locale-archive pvcreate 8787 root 0u IPv4 135147 TCP morph-01.lab.msp.redhat.com:44410->lithium.msp.redhat.com:5010 (ESTABLISHED) pvcreate 8787 root 1u IPv4 135148 TCP morph-01.lab.msp.redhat.com:44411->lithium.msp.redhat.com:5011 (ESTABLISHED) pvcreate 8787 root 2u IPv4 135149 TCP morph-01.lab.msp.redhat.com:44412->lithium.msp.redhat.com:5012 (ESTABLISHED) pvcreate 8787 root 3u unix 0xedc6a080 135158 socket [root@morph-01 ~]# gulm_tool getstats $(hostname) I_am = Master quorum_has = 3 quorum_needs = 2 rank = 0 quorate = true GenerationID = 1141229678267220 run time = 1853 pid = 7570 verbosity = Default failover = enabled [root@morph-02 ~]# gulm_tool getstats $(hostname) I_am = Client Master = morph-01.lab.msp.redhat.com rank = -1 quorate = true GenerationID = 1141229678267220 run time = 1851 pid = 25985 
verbosity = Default failover = enabled [root@morph-03 ~]# gulm_tool getstats $(hostname) I_am = Slave Master = morph-01.lab.msp.redhat.com rank = 1 quorate = true GenerationID = 1141229678267220 run time = 1848 pid = 27789 verbosity = Default failover = enabled [root@morph-04 ~]# gulm_tool getstats $(hostname) I_am = Slave Master = morph-01.lab.msp.redhat.com rank = 2 quorate = true GenerationID = 1141229678267220 run time = 1846 pid = 25611 verbosity = Default failover = enabled [root@morph-01 ~]# gulm_tool nodelist localhost Name: morph-03 ip = ::ffff:10.15.89.63 state = Logged in last state = Logged out mode = Slave missed beats = 0 last beat = 1141231941207771 delay avg = 10000509 max delay = 12802905 Name: morph-04 ip = ::ffff:10.15.89.64 state = Logged in last state = Logged out mode = Slave missed beats = 0 last beat = 1141231943389284 delay avg = 10002058 max delay = 10638248 Name: morph-02 ip = ::ffff:10.15.89.62 state = Logged in last state = Logged out mode = Client missed beats = 0 last beat = 1141231941463385 delay avg = 10000541 max delay = 12548367 Name: morph-01 ip = ::ffff:10.15.89.61 state = Logged in last state = Was Logged in mode = Master missed beats = 0 last beat = 1141231938511252 delay avg = 10000689 max delay = 10002480 [root@morph-01 ~]# gulm_tool servicelist $(hostname):core LTPX Magma::5503 clvmd Mar 1 10:48:14 morph-01 kernel: lock_gulmd R running 2724 7570 1 7574 5503 (NOTLB) Mar 1 10:48:14 morph-01 kernel: lock_gulmd R running 2724 7574 1 8767 7570 (NOTLB) Mar 1 10:48:14 morph-01 kernel: clvmd R running 2844 8767 1 8770 7574 (NOTLB) Mar 1 10:48:14 morph-01 kernel: clvmd S C2EF3960 3020 8770 1 8788 8767 (NOTLB) Mar 1 10:48:14 morph-01 kernel: ee4a2e94 00000082 c0148025 c2ef3960 f1466700 0000263e 01b32d82 00003c1d Mar 1 10:48:14 morph-01 kernel: ede4a1b0 ede4a33c 00000000 7fffffff ee4a2000 ee4a2ef0 c030fd85 00000001 Mar 1 10:48:14 morph-01 kernel: ee4a2ef8 ee4a2ef8 ee4a2ef8 f7e9e0b0 ee4a2ef8 c013ab98 1d244b3c 00000000 Mar 1 10:48:14 
morph-01 kernel: Call Trace: Mar 1 10:48:14 morph-01 kernel: [<c0148025>] find_get_page+0x6b/0xdd Mar 1 10:48:14 morph-01 kernel: [<c030fd85>] schedule_timeout+0x50/0x10c Mar 1 10:48:14 morph-01 kernel: [<c013ab98>] queue_me+0x59/0x121 Mar 1 10:48:14 morph-01 kernel: [<c013aed4>] futex_wait+0x133/0x196 Mar 1 10:48:14 morph-01 kernel: [<c011d062>] default_wake_function+0x0/0xc Mar 1 10:48:14 morph-01 kernel: [<c011d062>] default_wake_function+0x0/0xc Mar 1 10:48:14 morph-01 kernel: [<c013b179>] do_futex+0x29/0x5a Mar 1 10:48:14 morph-01 kernel: [<c013b2ab>] sys_futex+0x101/0x10c Mar 1 10:48:14 morph-01 kernel: [<c0311443>] syscall_call+0x7/0xb Mar 1 10:48:14 morph-01 kernel: clvmd S C011E867 3000 8788 1 8770 (NOTLB) Mar 1 10:48:14 morph-01 kernel: f7286e94 00000082 ede4ad50 c011e867 ede4a780 000015d5 fd49d214 00003a94 Mar 1 10:48:14 morph-01 kernel: ede4ad50 ede4aedc 00000000 7fffffff f7286000 f7286ef0 c030fd85 00000000 Mar 1 10:48:14 morph-01 kernel: f7286ef8 f7286ef8 f7286ef8 f7e9e0b0 f7286ef8 c013ab98 1d244b3c 00000000 Mar 1 10:48:14 morph-01 kernel: Call Trace: Mar 1 10:48:14 morph-01 kernel: [<c011e867>] autoremove_wake_function+0x0/0x2d Mar 1 10:48:14 morph-01 kernel: [<c030fd85>] schedule_timeout+0x50/0x10c Mar 1 10:48:14 morph-01 kernel: [<c013ab98>] queue_me+0x59/0x121 Mar 1 10:48:14 morph-01 kernel: [<c013aed4>] futex_wait+0x133/0x196 Mar 1 10:48:14 morph-01 kernel: [<c011d062>] default_wake_function+0x0/0xc Mar 1 10:48:14 morph-01 kernel: [<c011d062>] default_wake_function+0x0/0xc Mar 1 10:48:14 morph-01 kernel: [<c011c442>] recalc_task_prio+0x128/0x133 Mar 1 10:48:14 morph-01 kernel: [<c013b179>] do_futex+0x29/0x5a Mar 1 10:48:14 morph-01 kernel: [<c013b2ab>] sys_futex+0x101/0x10c Mar 1 10:48:14 morph-01 kernel: [<c0311443>] syscall_call+0x7/0xb Mar 1 10:48:14 morph-01 kernel: qarshd R running 3024 8786 2282 8787 1811 (NOTLB) Mar 1 10:48:14 morph-01 kernel: pvcreate S 00003C04 2952 8787 8786 (NOTLB) Mar 1 10:48:14 morph-01 kernel: c3299d80 00000082 
32236054 00003c04 ef0c31a0 00001629 322479b7 00003c04 Mar 1 10:48:14 morph-01 kernel: ede4a780 ede4a90c edc6a080 7fffffff c3299de8 00000012 c030fd85 c3299db0 Mar 1 10:48:14 morph-01 kernel: c011c442 00000009 00000001 00f010c8 c6a2e8da f46e8500 ef6b20b0 ef6b20b0 Mar 1 10:48:14 morph-01 kernel: Call Trace: Mar 1 10:48:14 morph-01 kernel: [<c030fd85>] schedule_timeout+0x50/0x10c Mar 1 10:48:14 morph-01 kernel: [<c011c442>] recalc_task_prio+0x128/0x133 Mar 1 10:48:14 morph-01 kernel: [<c011c4a0>] activate_task+0x53/0x5f Mar 1 10:48:14 morph-01 kernel: [<c030a61c>] unix_stream_data_wait+0x93/0xb7 Mar 1 10:48:14 morph-01 kernel: [<c011e867>] autoremove_wake_function+0x0/0x2d Mar 1 10:48:14 morph-01 kernel: [<c011e867>] autoremove_wake_function+0x0/0x2d Mar 1 10:48:14 morph-01 kernel: [<c030a79e>] unix_stream_recvmsg+0x15e/0x398 Mar 1 10:48:14 morph-01 kernel: [<c02ac415>] sock_aio_read+0x10d/0x11b Mar 1 10:48:14 morph-01 kernel: [<c030ec6f>] schedule+0x4c3/0x5ea Mar 1 10:48:14 morph-01 kernel: [<c012f405>] get_signal_to_deliver+0x1e9/0x771 Mar 1 10:48:14 morph-01 kernel: [<c030ebe4>] schedule+0x438/0x5ea Mar 1 10:48:14 morph-01 kernel: [<c0168a6a>] do_sync_read+0x97/0xc9 Mar 1 10:48:14 morph-01 kernel: [<c012ed23>] ptrace_notify+0x112/0x1ab Mar 1 10:48:14 morph-01 kernel: [<c0223977>] tty_write+0x381/0x38b Mar 1 10:48:14 morph-01 kernel: [<c011e867>] autoremove_wake_function+0x0/0x2d Mar 1 10:48:14 morph-01 kernel: [<c0168b62>] vfs_read+0xc6/0xe2 Mar 1 10:48:14 morph-01 kernel: [<c0168d65>] sys_read+0x3c/0x62 Mar 1 10:48:14 morph-01 kernel: [<c0311443>] syscall_call+0x7/0xb Version-Release number of selected component (if applicable): [root@morph-01 ~]# uname -ar Linux morph-01 2.6.9-34.EL #1 Fri Feb 24 16:44:51 EST 2006 i686 i686 i386 GNU/Linux How reproducible: few times now
This is looking like a Gulm issue: [root@morph-01 ~]# tail -f /var/log/messages Mar 1 10:55:03 morph-01 lock_gulmd_LTPX[7574]: Cannot connect morph-01 ::ffff:10.15.89.61 (Connection refused) Mar 1 10:55:03 morph-01 lock_gulmd_core[7570]: Got heartbeat from morph-04 at 1141232103407222 (last:10001059 max:10638248 avg:10001058) Mar 1 10:55:04 morph-01 lock_gulmd_LTPX[7574]: Cannot connect morph-01 ::ffff:10.15.89.61 (Connection refused) Mar 1 10:55:08 morph-01 last message repeated 4 times Mar 1 10:55:08 morph-01 lock_gulmd_core[7570]: Got heartbeat from morph-01 at 1141232108530404 (last:10001478 max:10002480 avg:10001149) Mar 1 10:55:09 morph-01 lock_gulmd_LTPX[7574]: Cannot connect morph-01 ::ffff:10.15.89.61 (Connection refused) Mar 1 10:55:10 morph-01 lock_gulmd_LTPX[7574]: Cannot connect morph-01 ::ffff:10.15.89.61 (Connection refused) Mar 1 10:55:11 morph-01 lock_gulmd_core[7570]: Got heartbeat from morph-03 at 1141232111218135 (last:10000429 max:12802905 avg:10000516) Mar 1 10:55:11 morph-01 lock_gulmd_LTPX[7574]: Cannot connect morph-01 ::ffff:10.15.89.61 (Connection refused) Mar 1 10:55:11 morph-01 lock_gulmd_core[7570]: Got heartbeat from morph-02 at 1141232111474380 (last:10000431 max:12548367 avg:10000550) Mar 1 10:55:12 morph-01 lock_gulmd_LTPX[7574]: Cannot connect morph-01 ::ffff:10.15.89.61 (Connection refused) Mar 1 10:55:13 morph-01 lock_gulmd_LTPX[7574]: Cannot connect morph-01 ::ffff:10.15.89.61 (Connection refused) Mar 1 10:55:13 morph-01 lock_gulmd_core[7570]: Got heartbeat from morph-04 at 1141232113408278 (last:10001056 max:10638248 avg:10001058) Mar 1 10:55:14 morph-01 lock_gulmd_LTPX[7574]: Cannot connect morph-01 ::ffff:10.15.89.61 (Connection refused) [root@morph-02 ~]# tail -f /var/log/messages Mar 1 10:52:31 morph-02 lock_gulmd_core[25985]: Sending heartbeat to Core Master at 1141231951272364, last was 1141231941271885 Mar 1 10:52:32 morph-02 lock_gulmd_LTPX[25993]: Cannot connect morph-01.lab.msp.redhat.com ::ffff:10.15.89.61 
(Connection refused) Mar 1 10:52:41 morph-02 last message repeated 9 times Mar 1 10:52:41 morph-02 lock_gulmd_core[25985]: Sending heartbeat to Core Master at 1141231961272844, last was 1141231951272364 Mar 1 10:52:42 morph-02 lock_gulmd_LTPX[25993]: Cannot connect morph-01.lab.msp.redhat.com ::ffff:10.15.89.61 (Connection refused) Mar 1 10:52:51 morph-02 last message repeated 9 times Mar 1 10:52:51 morph-02 lock_gulmd_core[25985]: Sending heartbeat to Core Master at 1141231971273324, last was 1141231961272844 Mar 1 10:52:52 morph-02 lock_gulmd_LTPX[25993]: Cannot connect morph-01.lab.msp.redhat.com ::ffff:10.15.89.61 (Connection refused) Mar 1 10:53:01 morph-02 last message repeated 9 times Mar 1 10:53:01 morph-02 lock_gulmd_core[25985]: Sending heartbeat to Core Master at 1141231981273803, last was 1141231971273324 Mar 1 10:53:02 morph-02 lock_gulmd_LTPX[25993]: Cannot connect morph-01.lab.msp.redhat.com ::ffff:10.15.89.61 (Connection refused) [root@morph-03 ~]# tail -f /var/log/messages Mar 1 10:57:22 morph-03 lock_gulmd_LT000[27793]: Trying to log into Master morph-01.lab.msp.redhat.com ::ffff:10.15.89.61 Mar 1 10:57:22 morph-03 lock_gulmd_LT000[27793]: Cannot connect to morph-01.lab.msp.redhat.com ::ffff:10.15.89.61 (Connection refused) Mar 1 10:57:22 morph-03 lock_gulmd_LTPX[27797]: Cannot connect morph-01.lab.msp.redhat.com ::ffff:10.15.89.61 (Connection refused) Mar 1 10:57:23 morph-03 lock_gulmd_LT000[27793]: Trying to log into Master morph-01.lab.msp.redhat.com ::ffff:10.15.89.61 Mar 1 10:57:23 morph-03 lock_gulmd_LT000[27793]: Cannot connect to morph-01.lab.msp.redhat.com ::ffff:10.15.89.61 (Connection refused) Mar 1 10:57:23 morph-03 lock_gulmd_LTPX[27797]: Cannot connect morph-01.lab.msp.redhat.com ::ffff:10.15.89.61 (Connection refused) Mar 1 10:57:24 morph-03 lock_gulmd_LT000[27793]: Trying to log into Master morph-01.lab.msp.redhat.com ::ffff:10.15.89.61 Mar 1 10:57:24 morph-03 lock_gulmd_LT000[27793]: Cannot connect to morph-01.lab.msp.redhat.com 
::ffff:10.15.89.61 (Connection refused) Mar 1 10:57:24 morph-03 lock_gulmd_core[27789]: Sending heartbeat to Core Master at 1141232244370484, last was 1141232234370004 Mar 1 10:57:24 morph-03 lock_gulmd_LTPX[27797]: Cannot connect morph-01.lab.msp.redhat.com ::ffff:10.15.89.61 (Connection refused) Mar 1 10:57:25 morph-03 lock_gulmd_LT000[27793]: Trying to log into Master morph-01.lab.msp.redhat.com ::ffff:10.15.89.61 Mar 1 10:57:25 morph-03 lock_gulmd_LT000[27793]: Cannot connect to morph-01.lab.msp.redhat.com ::ffff:10.15.89.61 (Connection refused) Mar 1 10:57:25 morph-03 lock_gulmd_LTPX[27797]: Cannot connect morph-01.lab.msp.redhat.com ::ffff:10.15.89.61 (Connection refused) [root@morph-04 ~]# tail -f /var/log/messages Mar 1 10:54:39 morph-04 lock_gulmd_LT000[25615]: Cannot connect to morph-01.lab.msp.redhat.com ::ffff:10.15.89.61 (Connection refused) Mar 1 10:54:39 morph-04 lock_gulmd_LTPX[25619]: Cannot connect morph-01.lab.msp.redhat.com ::ffff:10.15.89.61 (Connection refused) Mar 1 10:54:40 morph-04 lock_gulmd_LT000[25615]: Trying to log into Master morph-01.lab.msp.redhat.com ::ffff:10.15.89.61 Mar 1 10:54:40 morph-04 lock_gulmd_LT000[25615]: Cannot connect to morph-01.lab.msp.redhat.com ::ffff:10.15.89.61 (Connection refused) Mar 1 10:54:40 morph-04 lock_gulmd_LTPX[25619]: Cannot connect morph-01.lab.msp.redhat.com ::ffff:10.15.89.61 (Connection refused) Mar 1 10:54:41 morph-04 lock_gulmd_LT000[25615]: Trying to log into Master morph-01.lab.msp.redhat.com ::ffff:10.15.89.61 Mar 1 10:54:41 morph-04 lock_gulmd_LT000[25615]: Cannot connect to morph-01.lab.msp.redhat.com ::ffff:10.15.89.61 (Connection refused) Mar 1 10:54:41 morph-04 lock_gulmd_LTPX[25619]: Cannot connect morph-01.lab.msp.redhat.com ::ffff:10.15.89.61 (Connection refused) Mar 1 10:54:42 morph-04 lock_gulmd_LT000[25615]: Trying to log into Master morph-01.lab.msp.redhat.com ::ffff:10.15.89.61 Mar 1 10:54:42 morph-04 lock_gulmd_LT000[25615]: Cannot connect to morph-01.lab.msp.redhat.com 
::ffff:10.15.89.61 (Connection refused) Mar 1 10:54:42 morph-04 lock_gulmd_LTPX[25619]: Cannot connect morph-01.lab.msp.redhat.com ::ffff:10.15.89.61 (Connection refused) What is going on here? gulm_tool getstats reports that the cluster is fine, yet I see all these "Connection refused" messages. I don't think this is another FQDN issue, because everyone is using the short name. [root@morph-01 ~]# uname -ar Linux morph-01 2.6.9-34.EL #1 Fri Feb 24 16:44:51 EST 2006 i686 i686 i386 GNU/Linux [root@morph-01 ~]# cat /etc/cluster/cluster.conf <?xml version="1.0"?> <cluster name="morph-cluster" config_version="1"> <gulm> <lockserver name="morph-01"/> <lockserver name="morph-03"/> <lockserver name="morph-04"/> </gulm> <clusternodes> <clusternode name="morph-01"> <fence> <method name="single"> <device name="apc" switch="1" port="1"/> </method> </fence> </clusternode> <clusternode name="morph-02"> <fence> <method name="single"> <device name="apc" switch="1" port="2"/> </method> </fence> </clusternode> <clusternode name="morph-03"> <fence> <method name="single"> <device name="apc" switch="1" port="3"/> </method> </fence> </clusternode> <clusternode name="morph-04"> <fence> <method name="single"> <device name="apc" switch="1" port="4"/> </method> </fence> </clusternode> </clusternodes> <fencedevices> <fencedevice name="apc" agent="fence_apc" ipaddr="morph-apc" login="apc" passwd="apc"/> </fencedevices> </cluster>
Looks like gulm LT is getting started (and failing) before gulm core starts. Mar 1 10:14:35 morph-01 lock_gulmd_main[5806]: Forked lock_gulmd_core. Mar 1 10:14:36 morph-01 lock_gulmd_main[5806]: Forked lock_gulmd_LT. Mar 1 10:14:36 morph-01 lock_gulmd_LT[7569]: Starting lock_gulmd_LT 1.0.6. (built Feb 20 2006 13:34:52) Copyright (C) 2004 Red Ha t, Inc. All rights reserved. Mar 1 10:14:36 morph-01 lock_gulmd_LT[7569]: I am running in Fail-over mode. Mar 1 10:14:36 morph-01 lock_gulmd_LT[7569]: I am (morph-01) with ip (::ffff:10.15.89.61) Mar 1 10:14:36 morph-01 lock_gulmd_LT[7569]: This is cluster morph-cluster Mar 1 10:14:36 morph-01 lock_gulmd_LT000[7569]: Locktable 0 started. Mar 1 10:14:36 morph-01 lock_gulmd_LT000[7569]: ERROR [src/lock_io.c:531] Failed to connect to core. 111:Connection refused Mar 1 10:14:36 morph-01 lock_gulmd_core[7570]: Starting lock_gulmd_core 1.0.6. (built Feb 20 2006 13:34:52) Copyright (C) 2004 Re d Hat, Inc. All rights reserved. Mar 1 10:14:36 morph-01 lock_gulmd_core[7570]: I am running in Fail-over mode. Mar 1 10:14:36 morph-01 lock_gulmd_core[7570]: I am (morph-01) with ip (::ffff:10.15.89.61) Mar 1 10:14:36 morph-01 lock_gulmd_core[7570]: This is cluster morph-cluster Mar 1 10:14:36 morph-01 lock_gulmd_core[7570]: In state: Pending Mar 1 10:14:37 morph-01 lock_gulmd_core[7570]: New Service "Magma::5503" connected. idx:1 fd:6 Mar 1 10:14:37 morph-01 lock_gulmd_core[7570]: EOF on xdr (Magma::5503 ::1 idx:1 fd:6) Mar 1 10:14:37 morph-01 lock_gulmd_core[7570]: Closing connection idx:1, fd:6 to Magma::5503 Mar 1 10:14:37 morph-01 lock_gulmd_main[5806]: Forked lock_gulmd_LTPX. Mar 1 10:14:37 morph-01 lock_gulmd_LTPX[7574]: Starting lock_gulmd_LTPX 1.0.6. (built Feb 20 2006 13:34:52) Copyright (C) 2004 Re d Hat, Inc. All rights reserved.
Restarting the gulm daemon on the affected master fixed this issue.
Changing ownership to cfeist as this is a gulm issue.
This appears to be an issue with the gulm processes not starting up at the correct times.
Modified gulm_lt & gulm_ltpx to retry if they are unable to connect the first time. Fix should be in the next gulm build.
An advisory has been issued which should help the problem described in this bug report. This report is therefore being closed with a resolution of ERRATA. For more information on the solution and/or where to find the updated files, please follow the link below. You may reopen this bug report if the solution does not work for you. http://rhn.redhat.com/errata/RHBA-2006-0553.html