Description of problem: This is the first time that I've ever seen this: on my seemingly healthy cluster, I can't get clvmd to start on one of the nodes, and clvmd isn't very helpful in letting me know what's wrong. [root@taft-01 ~]# cat /proc/cluster/nodes Node Votes Exp Sts Name 1 1 4 M taft-03 2 1 4 M taft-04 3 1 4 M taft-02 4 1 4 M taft-01 [root@taft-01 ~]# cat /proc/cluster/services Service Name GID LID State Code Fence Domain: "default" 1 2 run - [1 2 3 4] [root@taft-01 ~]# cat /proc/cluster/status Protocol version: 5.0.1 Config version: 4 Cluster name: TAFT_CLUSTER Cluster ID: 55252 Cluster Member: Yes Membership state: Cluster-Member Nodes: 4 Expected_votes: 4 Total_votes: 4 Quorum: 3 Active subsystems: 1 Node name: taft-01 Node ID: 4 Node addresses: 10.15.89.67 [root@taft-01 ~]# service clvmd start Starting clvmd: [FAILED] Jun 30 06:31:49 taft-01 kernel: clvmd[4388] general protection rip:34be804ffc rsp:7fbffff1500 Jun 30 06:31:49 taft-01 clvmd: clvmd startup failed Clvmd started up just fine on the other nodes. Version-Release number of selected component (if applicable): [root@taft-01 ~]# rpm -q device-mapper device-mapper-1.02.07-2.0.RHEL4 [root@taft-01 ~]# rpm -q lvm2 lvm2-2.02.06-3.0.RHEL4 [root@taft-01 ~]# rpm -q lvm2-cluster lvm2-cluster-2.02.06-1.0.RHEL4 [root@taft-01 ~]# [root@taft-01 ~]# [root@taft-01 ~]# [root@taft-01 ~]# rpm -q cmirror cmirror-1.0.0-5 [root@taft-01 ~]# rpm -q cmirror-kernel cmirror-kernel-2.6.9-8.0 [root@taft-01 ~]# uname -ar Linux taft-01 2.6.9-39.1.ELsmp #1 SMP Fri Jun 16 16:47:43 EDT 2006 x86_64 x86_64 x86_64 GNU/Linux How reproducible: Every time (once the node is in this state)
You should have guessed that filing a bug entitled "...for no apparent reason" would be returned for more information. Really, now. It looks like a (possibly DLM) kernel oops, so more kernel traceback, please.
[root@taft-01 ~]# clvmd Segmentation fault (core dumped) Jul 5 08:35:58 taft-01 kernel: clvmd[4397] general protection rip:34be804ffc rsp:7fbfffed90 error:0 [root@taft-01 ~]# strace clvmd execve("/usr/sbin/clvmd", ["clvmd"], [/* 21 vars */]) = 0 uname({sys="Linux", node="taft-01", ...}) = 0 brk(0) = 0x561000 mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x2a95556000 access("/etc/ld.so.preload", R_OK) = -1 ENOENT (No such file or directory) open("/etc/ld.so.cache", O_RDONLY) = 3 fstat(3, {st_mode=S_IFREG|0644, st_size=110734, ...}) = 0 mmap(NULL, 110734, PROT_READ, MAP_PRIVATE, 3, 0) = 0x2a95557000 close(3) = 0 open("/lib64/tls/libpthread.so.0", O_RDONLY) = 3 read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0 V0\2774"..., 832) = 832 fstat(3, {st_mode=S_IFREG|0755, st_size=106203, ...}) = 0 mmap(0x34bf300000, 1131384, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x34bf300000 mprotect(0x34bf30f000, 1069944, PROT_NONE) = 0 mmap(0x34bf40f000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0xf000) = 0x34bf40f000 mmap(0x34bf411000, 13176, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x34bf411000 close(3) = 0 open("/lib64/libdevmapper-event.so.1.02", O_RDONLY) = 3 read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0`!\220\277"..., 832) = 832 fstat(3, {st_mode=S_IFREG|0555, st_size=27352, ...}) = 0 mmap(0x34bf900000, 1071376, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x34bf900000 mprotect(0x34bf906000, 1046800, PROT_NONE) = 0 mmap(0x34bfa05000, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x5000) = 0x34bfa05000 close(3) = 0 open("/lib64/libdevmapper.so.1.02", O_RDONLY) = 3 read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0p5\20\277"..., 832) = 832 fstat(3, {st_mode=S_IFREG|0555, st_size=68680, ...}) = 0 mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x2a95573000 mmap(0x34bf100000, 1112432, PROT_READ|PROT_EXEC, 
MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x34bf100000 mprotect(0x34bf10e000, 1055088, PROT_NONE) = 0 mmap(0x34bf20e000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0xe000) = 0x34bf20e000 --- SIGSEGV (Segmentation fault) @ 0 (0) --- +++ killed by SIGSEGV (core dumped) +++
hit this again today on taft-01: Jul 12 05:16:01 taft-01 kernel: clvmd[4361]: segfault at 0000000000000008 rip 00000034be80bb98 rsp 0000007fbffff360 error 4
Does x86_64 log SEGVs in the kernel log, or is that a kernel oops? If it's a kernel oops, is there more traceback? If it's just a userland segv, can you get a gdb traceback? According to the message, it dumped core.
ISTR that agk said (some time ago now) that he had fixed some things in LVM that might have been causing odd segvs. Are these still happening?
So... are these still happening with the new LVM code?
Have not seen this bug in almost 4 months, closing. Will reopen if seen again.