Bug 217724

Summary: cman fails to start with short hostname set and fully qualified hostname in cluster.conf
Product: Red Hat Enterprise Linux 5 Reporter: Len DiMaggio <ldimaggi>
Component: cman Assignee: Christine Caulfield <ccaulfie>
Status: CLOSED CURRENTRELEASE QA Contact: Cluster QE <mspqa-list>
Severity: medium Docs Contact:
Priority: medium    
Version: 5.0 CC: cluster-maint, djansa, jlaska
Target Milestone: ---   
Target Release: ---   
Hardware: All   
OS: Linux   
Whiteboard:
Fixed In Version: RC Doc Type: Bug Fix
Doc Text:
Story Points: ---
Clone Of: Environment:
Last Closed: 2007-02-08 02:23:32 UTC Type: ---
Regression: --- Mount Type: ---
Documentation: --- CRM:
Verified Versions: Category: ---
oVirt Team: --- RHEL 7.3 requirements from Atomic Host:
Cloudforms Team: --- Target Upstream Version:
Embargoed:

Description Len DiMaggio 2006-11-29 16:36:11 UTC
Description of problem:

cman fails to start with short hostname set and fully qualified hostname in
cluster.conf

I found this inadvertently during some Conga testing. It looks as though, if a
cluster node's host name is defined without its domain but the cluster.conf
file includes the fully qualified host name - which is the only way that the
Conga/luci web app will generate the file - cman cannot start.

See below for additional information and an strace.

Version-Release number of selected component (if applicable):
cman-2.0.35-2.el5

How reproducible:
100%

Steps to Reproduce:
1. Create a cluster via Conga, with the cluster nodes configured with short
(not fully qualified) hostnames.

Actual results:
cman fails to start

Expected results:
cman should start

Additional info:

With hostname defined in /etc/sysconfig/network as:

================================
NETWORKING=yes
NETWORKING_IPV6=yes
HOSTNAME=tng3-2
================================

And reported by hostname/uname -n as:
================================
tng3-2
================================

And with this cluster definition:

================================
<?xml version="1.0"?>
<cluster alias="nodes23" config_version="1" name="nodes23">
        <fence_daemon post_fail_delay="0" post_join_delay="3"/>
        <clusternodes>
                <clusternode name="tng3-2.lab.msp.redhat.com" nodeid="1" votes="1"/>
                <clusternode name="tng3-3.lab.msp.redhat.com" nodeid="2" votes="1"/>
        </clusternodes>
        <cman expected_votes="1" two_node="1"/>
        <fencedevices/>
        <rm/>
</cluster>
================================

cman-2.0.35-2.el5 fails to start - here's an strace:

================================
execve("/usr/sbin/cman_tool", ["/usr/sbin/cman_tool", "-t", "120", "-w", "join",
"-c", "nodes23"], [/* 23 vars */]) = 0
brk(0)                                  = 0x9287000
access("/etc/ld.so.preload", R_OK)      = -1 ENOENT (No such file or directory)
open("/etc/ld.so.cache", O_RDONLY)      = 3
fstat64(3, {st_mode=S_IFREG|0644, st_size=31538, ...}) = 0
mmap2(NULL, 31538, PROT_READ, MAP_PRIVATE, 3, 0) = 0xb7fc4000
close(3)                                = 0
open("/lib/libc.so.6", O_RDONLY)        = 3
read(3, "\177ELF\1\1\1\0\0\0\0\0\0\0\0\0\3\0\3\0\1\0\0\0p`]\000"..., 512) = 512
fstat64(3, {st_mode=S_IFREG|0755, st_size=1577100, ...}) = 0
mmap2(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) =
0xb7fc3000
mmap2(0x5c0000, 1299876, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) =
0x5c0000
mmap2(0x6f8000, 12288, PROT_READ|PROT_WRITE,
MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x137) = 0x6f8000
mmap2(0x6fb000, 9636, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS,
-1, 0) = 0x6fb000
close(3)                                = 0
mmap2(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) =
0xb7fc2000
set_thread_area({entry_number:-1 -> 6, base_addr:0xb7fc26c0, limit:1048575,
seg_32bit:1, contents:0, read_exec_only:0, limit_in_pages:1, seg_not_present:0,
useable:1}) = 0
mprotect(0x6f8000, 8192, PROT_READ)     = 0
mprotect(0x5b7000, 4096, PROT_READ)     = 0
munmap(0xb7fc4000, 31538)               = 0
rt_sigaction(SIGALRM, {0x8048ea0, [ALRM], SA_RESTART}, {SIG_DFL}, 8) = 0
alarm(120)                              = 0
brk(0)                                  = 0x9287000
brk(0x92a8000)                          = 0x92a8000
getpid()                                = 3179
socket(PF_FILE, SOCK_STREAM, 0)         = 3
connect(3, {sa_family=AF_FILE, path="/var/run/cluster/ccsd.sock"}, 110) = 0
write(3, "\1\0\0\0\3\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0", 20) = 20
read(3, "\1\0\0\0\3\0\0\0w\2\0\0\0\0\0\0\0\0\0\0", 20) = 20
close(3)                                = 0
socket(PF_FILE, SOCK_STREAM, 0)         = 3
connect(3, {sa_family=AF_FILE, path="/var/run/cluster/ccsd.sock"}, 110) = 0
write(3, "\2\0\0\0\0\0\0\0w\2\0\0\0\0\0\0\0\0\0\0", 20) = 20
read(3, "\2\0\0\0\0\0\0\0\377\377\377\377\0\0\0\0\0\0\0\0", 20) = 20
close(3)                                = 0
socket(PF_FILE, SOCK_STREAM, 0)         = 3
fcntl64(3, F_SETFD, FD_CLOEXEC)         = 0
connect(3, {sa_family=AF_FILE, path="/var/run/cman_admin"}, 110) = -1 ENOENT (No
such file or directory)
close(3)                                = 0
pipe([3, 4])                            = 0
fcntl64(4, F_SETFD, 0)                  = 0
clone(child_stack=0, flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD,
child_tidptr=0xb7fc2708) = 3180
--- SIGCHLD (Child exited) @ 0 (0) ---
select(4, [3], NULL, NULL, {1, 0})      = 0 (Timeout)
socket(PF_FILE, SOCK_STREAM, 0)         = 5
fcntl64(5, F_SETFD, FD_CLOEXEC)         = 0
connect(5, {sa_family=AF_FILE, path="/var/run/cman_admin"}, 110) = -1 ENOENT (No
such file or directory)
close(5)                                = 0
select(4, [3], NULL, NULL, {1, 0})      = 1 (in [3], left {0, 601000})
read(3, "CCS does not have a nodeid for t"..., 1024) = 105
write(2, "cman not started: CCS does not h"..., 93cman not started: CCS does not
have a nodeid for this node, run 'ccs_tool addnodeids' to fix
) = 93
write(2, "/usr/sbin/cman_tool: ", 21/usr/sbin/cman_tool: )   = 21
write(2, "aisexec daemon didn\'t start\n", 28aisexec daemon didn't start
) = 28
exit_group(1)                           = ?
Process 3179 detached
================================

Comment 1 Christine Caulfield 2006-11-30 11:02:05 UTC
cman was actually truncating the node name when it looked for unqualified names, so
the nodeid lookup failed later on in the search - that explains the rather odd
error you got.

This checkin simply uses the length returned from strstr rather than truncating
the string at that point.
(note: does this need to go into RHEL50 ?? )

HEAD:
Checking in cmanccs.c;
/cvs/cluster/cluster/cman/daemon/cmanccs.c,v  <--  cmanccs.c
new revision: 1.22; previous revision: 1.21
done

-rRHEL5:
Checking in cmanccs.c;
/cvs/cluster/cluster/cman/daemon/cmanccs.c,v  <--  cmanccs.c
new revision: 1.21.2.1; previous revision: 1.21
done


Comment 2 Len DiMaggio 2006-11-30 16:29:30 UTC
(note: does this need to go into RHEL50 ?? )

I'd say yes - if I didn't know the history of other bugs (now resolved) with
luci and short-host names, I wouldn't have known how to approach debugging this.

Comment 3 RHEL Program Management 2006-12-01 15:01:49 UTC
This request was evaluated by Red Hat Product Management for inclusion in a Red
Hat Enterprise Linux major release.  Product Management has requested further
review of this request by Red Hat Engineering, for potential inclusion in a Red
Hat Enterprise Linux Major release.  This request is not yet committed for
inclusion.

Comment 4 Christine Caulfield 2006-12-15 15:17:38 UTC
Checked in for RHEL50

Checking in cmanccs.c;
/cvs/cluster/cluster/cman/daemon/cmanccs.c,v  <--  cmanccs.c
new revision: 1.21.4.1; previous revision: 1.21
done


Comment 5 Len DiMaggio 2007-01-22 13:41:22 UTC
Verified bug fix in cman-2.0.52-1.el5

Comment 6 RHEL Program Management 2007-02-08 02:23:32 UTC
A package has been built which should help the problem described in 
this bug report. This report is therefore being closed with a resolution 
of CURRENTRELEASE. You may reopen this bug report if the solution does 
not work for you.


Comment 7 Nate Straz 2007-12-13 17:22:07 UTC
Moving all RHCS ver 5 bugs to RHEL 5 so we can remove RHCS v5 which never existed.