Description of problem: Ever since I loaded the latest lvm rpms I've been seeing failures when trying to do lv creation and removal. I don't have the gulm mod loaded or running or in the .conf file. The create operations do happen but fail because clvm can't talk to the other nodes, so the activation fails. [root@morph-01 root]# rpm -qa | grep lvm lvm2-cluster-2.01.01-1.1.RHEL4 lvm2-2.01.00-1.0.RHEL4 [root@morph-01 root]# rpm -qa | grep device-mapper device-mapper-1.01.00-1.RHEL4 [root@morph-01 root]# cat /proc/cluster/nodes Node Votes Exp Sts Name 1 1 5 M morph-01 2 1 5 M morph-02 3 1 5 M morph-03 4 1 5 M morph-05 5 1 5 M morph-04 [root@morph-01 root]# cat /proc/cluster/services Service Name GID LID State Code Fence Domain: "default" 1 2 run - [1 2 3 5 4] DLM Lock Space: "clvmd" 3 4 run - [1 2 3 5 4] Here's a remove attempt that failed: [root@morph-02 root]# /usr/sbin/lvremove -v /dev/gfs/gfs0 Loaded external locking library /usr/lib/liblvm2clusterlock.so Using logical volume(s) on command line Archiving volume group "gfs" metadata. cluster request failed: Host is down Unable to deactivate logical volume "gfs0" Version-Release number of selected component (if applicable): [root@morph-03 root]# clvmd -V Cluster LVM daemon version: 2.01.01 (2005-01-19) Protocol version: 0.2.1 How reproducible: Sometimes
here's an strace of a failed create: . . . uname({sys="Linux", node="morph-02.lab.msp.redhat.com", ...}) = 0 open("/etc/lvm/backup/.lvm_morph-02.lab.msp.redhat.com_4176_90563467", O_WRONLY|O_APPEND|O_CREAT|O_EXCL|O_LARGEFILE, 0666) = 9 fcntl64(9, F_SETLK64, {type=F_WRLCK, whence=SEEK_SET, start=0, len=0}, 0xbfe6f380) = 0 fcntl64(9, F_GETFL) = 0x8401 (flags O_WRONLY|O_APPEND|O_LARGEFILE) fstat64(9, {st_mode=S_IFREG|0600, st_size=0, ...}) = 0 mmap2(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0xb7ded000 _llseek(9, 0, [0], SEEK_CUR) = 0 time(NULL) = 1106259355 fsync(9) = 0 write(9, "# Generated by LVM2: Thu Jan 20 "..., 2778) = 2778 close(9) = 0 munmap(0xb7ded000, 4096) = 0 rename("/etc/lvm/backup/.lvm_morph-02.lab.msp.redhat.com_4176_90563467", "/etc/lvm/backup/gfs.tmp") = 0 rename("/etc/lvm/backup/gfs.tmp", "/etc/lvm/backup/gfs") = 0 stat64("/etc/lvm/backup/gfs.tmp", 0xbfe71440) = -1 ENOENT (No such file or directory) open("/etc/lvm/backup", O_RDONLY|O_LARGEFILE) = 9 fsync(9) = 0 close(9) = 0 _llseek(5, 2048, [2048], SEEK_SET) = 0 read(5, "\22\34\247\256 LVM2 x[5A%r0N*>\1\0\0\0\0\10\0\0\0\0\0\0"..., 512) = 512 _llseek(5, 5120, [5120], SEEK_SET) = 0 read(5, "gfs {\nid = \"CK0Wb6-jm0T-Iy4i-4dp"..., 512) = 512 _llseek(5, 2048, [2048], SEEK_SET) = 0 read(5, "\22\34\247\256 LVM2 x[5A%r0N*>\1\0\0\0\0\10\0\0\0\0\0\0"..., 512) = 512 _llseek(5, 2048, [2048], SEEK_SET) = 0 write(5, "\205r\202\266 LVM2 x[5A%r0N*>\1\0\0\0\0\10\0\0\0\0\0\0"..., 512) = 512 close(5) = 0 _llseek(6, 2048, [2048], SEEK_SET) = 0 read(6, "\22\34\247\256 LVM2 x[5A%r0N*>\1\0\0\0\0\10\0\0\0\0\0\0"..., 512) = 512 _llseek(6, 5120, [5120], SEEK_SET) = 0 read(6, "gfs {\nid = \"CK0Wb6-jm0T-Iy4i-4dp"..., 512) = 512 _llseek(6, 2048, [2048], SEEK_SET) = 0 read(6, "\22\34\247\256 LVM2 x[5A%r0N*>\1\0\0\0\0\10\0\0\0\0\0\0"..., 512) = 512 _llseek(6, 2048, [2048], SEEK_SET) = 0 write(6, "\205r\202\266 LVM2 x[5A%r0N*>\1\0\0\0\0\10\0\0\0\0\0\0"..., 512) = 512 close(6) = 0 _llseek(7, 2048, 
[2048], SEEK_SET) = 0 read(7, "\22\34\247\256 LVM2 x[5A%r0N*>\1\0\0\0\0\10\0\0\0\0\0\0"..., 512) = 512 _llseek(7, 5120, [5120], SEEK_SET) = 0 read(7, "gfs {\nid = \"CK0Wb6-jm0T-Iy4i-4dp"..., 512) = 512 _llseek(7, 2048, [2048], SEEK_SET) = 0 read(7, "\22\34\247\256 LVM2 x[5A%r0N*>\1\0\0\0\0\10\0\0\0\0\0\0"..., 512) = 512 _llseek(7, 2048, [2048], SEEK_SET) = 0 write(7, "\205r\202\266 LVM2 x[5A%r0N*>\1\0\0\0\0\10\0\0\0\0\0\0"..., 512) = 512 close(7) = 0 _llseek(8, 2048, [2048], SEEK_SET) = 0 read(8, "\22\34\247\256 LVM2 x[5A%r0N*>\1\0\0\0\0\10\0\0\0\0\0\0"..., 512) = 512 _llseek(8, 5120, [5120], SEEK_SET) = 0 read(8, "gfs {\nid = \"CK0Wb6-jm0T-Iy4i-4dp"..., 512) = 512 _llseek(8, 2048, [2048], SEEK_SET) = 0 read(8, "\22\34\247\256 LVM2 x[5A%r0N*>\1\0\0\0\0\10\0\0\0\0\0\0"..., 512) = 512 _llseek(8, 2048, [2048], SEEK_SET) = 0 write(8, "\205r\202\266 LVM2 x[5A%r0N*>\1\0\0\0\0\10\0\0\0\0\0\0"..., 512) = 512 close(8) = 0 write(3, "2\0\0\0\0\0\0\0\0\0\0\0C\0\0\0\0\31\0CK0Wb6jm0TIy4"..., 85) = 85 read(3, "\1g\320\0\310g\320\0\220\377\377\377^\0\0\0\0m", 18) = 18 read(3, "orph-03\0\220\377\377\377CLVMD not running\0mo"..., 94) = 94 open("/usr/share/locale/locale.alias", O_RDONLY) = 5 fstat64(5, {st_mode=S_IFREG|0644, st_size=2528, ...}) = 0 mmap2(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0xb7ded000 read(5, "# Locale name alias data base.\n#"..., 4096) = 2528 read(5, "", 4096) = 0 close(5) = 0 munmap(0xb7ded000, 4096) = 0 open("/usr/share/locale/en_US.UTF-8/LC_MESSAGES/libc.mo", O_RDONLY) = -1 ENOENT (No such file or directory) open("/usr/share/locale/en_US.utf8/LC_MESSAGES/libc.mo", O_RDONLY) = -1 ENOENT (No such file or directory) open("/usr/share/locale/en_US/LC_MESSAGES/libc.mo", O_RDONLY) = -1 ENOENT (No such file or directory) open("/usr/share/locale/en.UTF-8/LC_MESSAGES/libc.mo", O_RDONLY) = -1 ENOENT (No such file or directory) open("/usr/share/locale/en.utf8/LC_MESSAGES/libc.mo", O_RDONLY) = -1 ENOENT (No such file or directory) 
open("/usr/share/locale/en/LC_MESSAGES/libc.mo", O_RDONLY) = -1 ENOENT (No such file or directory) write(2, " ", 2 ) = 2 write(2, "cluster request failed: Host is "..., 36cluster request failed: Host is down) = 36 write(2, "\n", 1 ) = 1 write(2, " ", 2 ) = 2 write(2, "Failed to activate new LV.", 26Failed to activate new LV.) = 26 write(2, "\n", 1 ) = 1 stat64("/proc/lvm/VGs/gfs", 0xbfe723d0) = -1 ENOENT (No such file or directory) write(3, "3\1pf\0\0\0\0\0\0\0\0\10\0\0\0\0\6\0V_gfs\0W", 26) = 26 read(3, "\1j\320\0008j\320\0\0\0\0\0\17\0\0\0\0m", 18) = 18 read(3, "orph-02\0\0\0\0\0\0\0\0", 15) = 15 rt_sigprocmask(SIG_SETMASK, [], NULL, 8) = 0 close(3) = 0 munmap(0xb7e000, 10480) = 0 open("/etc/lvm/.cache", O_WRONLY|O_CREAT|O_TRUNC|O_LARGEFILE, 0666) = 3 fstat64(3, {st_mode=S_IFREG|0600, st_size=0, ...}) = 0 mmap2(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0xb7ded000 write(3, "# This file is automatically mai"..., 201) = 201 close(3) = 0 munmap(0xb7ded000, 4096) = 0 close(4) = 0 brk(0) = 0x9145000 brk(0) = 0x9145000 brk(0x9142000) = 0x9142000 brk(0) = 0x9142000 exit_group(5) = ?
daemon output: /usr/sbin/lvremove -f /dev/gfs/gfs0 cluster request failed: Host is down Unable to deactivate logical volume "gfs0" [root@morph-01 root]# clvmd -d CLVMD[4237]: 1106260908 CLVMD started CLVMD[4237]: 1106260912 Cluster ready, doing some more initialisation CLVMD[4237]: 1106260912 starting LVM thread CLVMD[4237]: 1106260912 clvmd ready for work CLVMD[4237]: 1106260912 Using timeout of 60 seconds CLVMD[4237]: 1106260912 LVM thread function started File descriptor 3 left open File descriptor 4 left open File descriptor 5 left open File descriptor 6 left open CLVMD[4237]: 1106260912 LVM thread waiting for work CLVMD[4237]: 1106260947 Got new connection on fd 7 CLVMD[4237]: 1106260947 Read on local socket 7, len = 26 CLVMD[4237]: 1106260947 creating pipe, [8, 9] CLVMD[4237]: 1106260947 Creating pre&post thread CLVMD[4237]: 1106260947 Created pre&post thread, state = 0 CLVMD[4237]: 1106260947 in sub thread: client = 0x835d128 CLVMD[4237]: 1106260947 doing PRE command LOCK_VG 'V_gfs' at 1 (client=0x835d128) CLVMD[4237]: 1106260947 sync_lock: 'V_gfs' mode:1 flags=0 CLVMD[4237]: 1106260947 sync_lock: returning lkid 102ca CLVMD[4237]: 1106260947 Writing status 0 down pipe 9 CLVMD[4237]: 1106260947 read on PIPE 8: 4 bytes: status: 0 CLVMD[4237]: 1106260947 background routine status was 0, sock_client=0x835d128 CLVMD[4237]: 1106260947 distribute command: XID = 0 CLVMD[4237]: 1106260947 add_to_lvmqueue: cmd=0x835d388. client=0x835d128, msg=0x835d1e0, len=26, csid=(nil), xid=0 CLVMD[4237]: 1106260947 Waiting to do post command - state = 0 CLVMD[4237]: 1106260947 process_work_item: local CLVMD[4237]: 1106260947 process_local_command: msg=0x835d3b8, msglen =26, client=0x835d128 CLVMD[4237]: 1106260947 Reply from node morph-01: 0 bytes CLVMD[4237]: 1106260947 Got 1 replies, expecting: 1 CLVMD[4237]: 1106260947 LVM thread waiting for work CLVMD[4237]: 1106260947 Got post command condition... 
CLVMD[4237]: 1106260947 read on PIPE 8: 4 bytes: status: 0 CLVMD[4237]: 1106260947 background routine status was 0, sock_client=0x835d128 CLVMD[4237]: 1106260947 Send local reply CLVMD[4237]: 1106260947 Waiting for next pre command CLVMD[4237]: 1106260947 Read on local socket 7, len = 26 CLVMD[4237]: 1106260947 Got pre command condition... CLVMD[4237]: 1106260947 doing PRE command LOCK_VG 'V_gfs' at 6 (client=0x835d128) CLVMD[4237]: 1106260947 sync_unlock: 'V_gfs' lkid:102ca CLVMD[4237]: 1106260947 Writing status 0 down pipe 9 CLVMD[4237]: 1106260947 read on PIPE 8: 4 bytes: status: 0 CLVMD[4237]: 1106260947 background routine status was 0, sock_client=0x835d128 CLVMD[4237]: 1106260947 distribute command: XID = 1 CLVMD[4237]: 1106260947 add_to_lvmqueue: cmd=0x835d388. client=0x835d128, msg=0x835d1e0, len=26, csid=(nil), xid=1 CLVMD[4237]: 1106260947 Waiting to do post command - state = 0 CLVMD[4237]: 1106260947 process_work_item: local CLVMD[4237]: 1106260947 process_local_command: msg=0x835d3b8, msglen =26, client=0x835d128 CLVMD[4237]: 1106260947 Reply from node morph-01: 0 bytes CLVMD[4237]: 1106260947 Got 1 replies, expecting: 1 CLVMD[4237]: 1106260947 LVM thread waiting for work CLVMD[4237]: 1106260947 Got post command condition... CLVMD[4237]: 1106260947 read on PIPE 8: 4 bytes: status: 0 CLVMD[4237]: 1106260947 background routine status was 0, sock_client=0x835d128 CLVMD[4237]: 1106260947 Send local reply CLVMD[4237]: 1106260947 Waiting for next pre command CLVMD[4237]: 1106260947 Read on local socket 7, len = 0 CLVMD[4237]: 1106260947 EOF on local socket: inprogress=0 CLVMD[4237]: 1106260947 Waiting for child thread CLVMD[4237]: 1106260947 SIGUSR2 received CLVMD[4237]: 1106260947 Got pre command condition... CLVMD[4237]: 1106260947 Subthread finished CLVMD[4237]: 1106260947 Joined child thread CLVMD[4237]: 1106260947 ret == 0, errno = 0. 
removing client CLVMD[4237]: 1106260947 Got new connection on fd 7 CLVMD[4237]: 1106260947 Read on local socket 7, len = 26 CLVMD[4237]: 1106260947 creating pipe, [8, 9] CLVMD[4237]: 1106260947 Creating pre&post thread CLVMD[4237]: 1106260947 Created pre&post thread, state = 0 CLVMD[4237]: 1106260947 in sub thread: client = 0x835d200 CLVMD[4237]: 1106260947 doing PRE command LOCK_VG 'V_gfs' at 4 (client=0x835d200) CLVMD[4237]: 1106260947 sync_lock: 'V_gfs' mode:4 flags=0 CLVMD[4237]: 1106260947 sync_lock: returning lkid 10360 CLVMD[4237]: 1106260947 Writing status 0 down pipe 9 CLVMD[4237]: 1106260947 read on PIPE 8: 4 bytes: status: 0 CLVMD[4237]: 1106260947 background routine status was 0, sock_client=0x835d200 CLVMD[4237]: 1106260947 distribute command: XID = 2 CLVMD[4237]: 1106260947 add_to_lvmqueue: cmd=0x835d6d8. client=0x835d200, msg=0x835d1e0, len=26, csid=(nil), xid=2 CLVMD[4237]: 1106260947 process_work_item: local CLVMD[4237]: 1106260947 process_local_command: msg=0x835d708, msglen =26, client=0x835d200 CLVMD[4237]: 1106260947 Reply from node morph-01: 0 bytes CLVMD[4237]: 1106260947 Got 1 replies, expecting: 1 CLVMD[4237]: 1106260947 LVM thread waiting for work CLVMD[4237]: 1106260947 Waiting to do post command - state = 1 CLVMD[4237]: 1106260947 Got post command condition... 
CLVMD[4237]: 1106260947 read on PIPE 8: 4 bytes: status: 0 CLVMD[4237]: 1106260947 background routine status was 0, sock_client=0x835d200 CLVMD[4237]: 1106260947 Send local reply CLVMD[4237]: 1106260947 Waiting for next pre command CLVMD[4237]: 1106260948 Read on local socket 7, len = 85 CLVMD[4237]: 1106260948 check_all_clvmds_running CLVMD[4237]: 1106260948 Reply from node morph-03: 18 bytes CLVMD[4237]: 1106260948 Got 2 replies, expecting: 0 CLVMD[4237]: 1106260948 Reply from node morph-04: 18 bytes CLVMD[4237]: 1106260948 Got 3 replies, expecting: 0 CLVMD[4237]: 1106260948 Send local reply CLVMD[4237]: 1106260948 Read on local socket 7, len = 26 CLVMD[4237]: 1106260948 Got pre command condition... CLVMD[4237]: 1106260948 doing PRE command LOCK_VG 'V_gfs' at 6 (client=0x835d200) CLVMD[4237]: 1106260948 sync_unlock: 'V_gfs' lkid:10360 CLVMD[4237]: 1106260948 Writing status 0 down pipe 9 CLVMD[4237]: 1106260948 read on PIPE 8: 4 bytes: status: 0 CLVMD[4237]: 1106260948 background routine status was 0, sock_client=0x835d200 CLVMD[4237]: 1106260948 distribute command: XID = 3 CLVMD[4237]: 1106260948 add_to_lvmqueue: cmd=0x835d6d8. client=0x835d200, msg=0x835d1e0, len=26, csid=(nil), xid=3 CLVMD[4237]: 1106260948 process_work_item: local CLVMD[4237]: 1106260948 process_local_command: msg=0x835d708, msglen =26, client=0x835d200 CLVMD[4237]: 1106260948 Reply from node morph-01: 0 bytes CLVMD[4237]: 1106260948 Got 1 replies, expecting: 1 CLVMD[4237]: 1106260948 LVM thread waiting for work CLVMD[4237]: 1106260948 Waiting to do post command - state = 1 CLVMD[4237]: 1106260948 Got post command condition... 
CLVMD[4237]: 1106260948 read on PIPE 8: 4 bytes: status: 0 CLVMD[4237]: 1106260948 background routine status was 0, sock_client=0x835d200 CLVMD[4237]: 1106260948 Send local reply CLVMD[4237]: 1106260948 Waiting for next pre command CLVMD[4237]: 1106260948 Read on local socket 7, len = 0 CLVMD[4237]: 1106260948 EOF on local socket: inprogress=0 CLVMD[4237]: 1106260948 Waiting for child thread CLVMD[4237]: 1106260948 SIGUSR2 received CLVMD[4237]: 1106260948 Got pre command condition... CLVMD[4237]: 1106260948 Subthread finished CLVMD[4237]: 1106260948 Joined child thread CLVMD[4237]: 1106260948 ret == 0, errno = 0. removing client all others are similar to morph-02's: [root@morph-02 root]# clvmd -d CLVMD[3901]: 1106260933 CLVMD started CLVMD[3901]: 1106260940 Cluster ready, doing some more initialisation CLVMD[3901]: 1106260940 starting LVM thread CLVMD[3901]: 1106260940 clvmd ready for work CLVMD[3901]: 1106260940 Using timeout of 60 seconds CLVMD[3901]: 1106260940 LVM thread function started File descriptor 3 left open File descriptor 4 left open File descriptor 5 left open File descriptor 6 left open CLVMD[3901]: 1106260940 LVM thread waiting for work
Testing is blocked on this.
It seems the gulm & cman initialisation conditions got swapped over in the amalgamation and I didn't spot it. I've also improved the error reporting so it will now tell you /which/ node it thinks is down (in this case it was all of them). Oh, and thanks for the excellent bug report :) Checking in daemons/clvmd/clvmd-cman.c; /cvs/lvm2/LVM2/daemons/clvmd/clvmd-cman.c,v <-- clvmd-cman.c new revision: 1.9; previous revision: 1.8 done Checking in daemons/clvmd/clvmd-gulm.c; /cvs/lvm2/LVM2/daemons/clvmd/clvmd-gulm.c,v <-- clvmd-gulm.c new revision: 1.7; previous revision: 1.6 done Checking in daemons/clvmd/clvmd.c; /cvs/lvm2/LVM2/daemons/clvmd/clvmd.c,v <-- clvmd.c new revision: 1.13; previous revision: 1.12 done
Fix verified.