Bug 193704

Summary: I/O hang to mirror volume after a node is recovered
Product: [Retired] Red Hat Cluster Suite Reporter: Corey Marthaler <cmarthal>
Component: cmirrorAssignee: Jonathan Earl Brassow <jbrassow>
Status: CLOSED CURRENTRELEASE QA Contact: Cluster QE <mspqa-list>
Severity: high Docs Contact:
Priority: high    
Version: 4CC: agk, dwysocha, mbroz
Target Milestone: ---   
Target Release: ---   
Hardware: All   
OS: Linux   
Whiteboard:
Fixed In Version: Doc Type: Bug Fix
Doc Text:
Story Points: ---
Clone Of: Environment:
Last Closed: 2010-04-27 14:51:54 UTC Type: ---
Regression: --- Mount Type: ---
Documentation: --- CRM:
Verified Versions: Category: ---
oVirt Team: --- RHEL 7.3 requirements from Atomic Host:
Cloudforms Team: --- Target Upstream Version:
Embargoed:
Bug Depends On:    
Bug Blocks: 180185    

Description Corey Marthaler 2006-05-31 18:14:51 UTC
Description of problem:
I had I/O going to a mirror on taft-01 (part of a 4 node cluster taft-01 -
taft-04) and taft-01 was the mirror server. I then rebooted taft-04 (which had
no I/O going at all). After taft-04 was recovered, the I/O I had going and any
more I/O that I attempted, hung.

[root@taft-03 sbin]# dd if=/dev/zero of=/dev/mirror_1/mirror_10 count=10 skip=100
10+0 records in
10+0 records out
[HANG]

[root@taft-02 sbin]# strace dd if=/dev/zero of=/dev/mirror_1/mirror_10 count=10
skip=100
execve("/bin/dd", ["dd", "if=/dev/zero", "of=/dev/mirror_1/mirror_10",
"count=10", "skip=100"], [/* 23 vars */]) = 0
uname({sys="Linux", node="taft-02", ...}) = 0
brk(0)                                  = 0x508000
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) =
0x2a95556000
access("/etc/ld.so.preload", R_OK)      = -1 ENOENT (No such file or directory)
open("/etc/ld.so.cache", O_RDONLY)      = 3
fstat(3, {st_mode=S_IFREG|0644, st_size=110042, ...}) = 0
mmap(NULL, 110042, PROT_READ, MAP_PRIVATE, 3, 0) = 0x2a95557000
close(3)                                = 0
open("/lib64/tls/libc.so.6", O_RDONLY)  = 3
read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0`\305\241"..., 832) = 832
fstat(3, {st_mode=S_IFREG|0755, st_size=1493186, ...}) = 0
mmap(0x3eefa00000, 2310056, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3,
0) = 0x3eefa00000
mprotect(0x3eefb2b000, 1085352, PROT_NONE) = 0
mmap(0x3eefc2a000, 24576, PROT_READ|PROT_WRITE,
MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x12a000) = 0x3eefc2a000
mmap(0x3eefc30000, 16296, PROT_READ|PROT_WRITE,
MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x3eefc30000
close(3)                                = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) =
0x2a95572000
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) =
0x2a95573000
mprotect(0x3eefc2a000, 12288, PROT_READ) = 0
arch_prctl(ARCH_SET_FS, 0x2a95572b00)   = 0
munmap(0x2a95557000, 110042)            = 0
brk(0)                                  = 0x508000
brk(0x529000)                           = 0x529000
open("/usr/lib/locale/locale-archive", O_RDONLY) = 3
fstat(3, {st_mode=S_IFREG|0644, st_size=48510448, ...}) = 0
mmap(NULL, 48510448, PROT_READ, MAP_PRIVATE, 3, 0) = 0x2a95574000
close(3)                                = 0
close(0)                                = 0
open("/dev/zero", O_RDONLY)             = 0
close(1)                                = 0
open("/dev/mirror_1/mirror_10", O_WRONLY|O_CREAT|O_TRUNC, 0666) = 1
rt_sigaction(SIGINT, NULL, {SIG_DFL}, 8) = 0
rt_sigaction(SIGINT, {0x401a80, [], SA_RESTORER, 0x3eefa2e380}, NULL, 8) = 0
rt_sigaction(SIGQUIT, NULL, {SIG_DFL}, 8) = 0
rt_sigaction(SIGQUIT, {0x401a80, [], SA_RESTORER, 0x3eefa2e380}, NULL, 8) = 0
rt_sigaction(SIGPIPE, NULL, {SIG_DFL}, 8) = 0
rt_sigaction(SIGPIPE, {0x401a80, [], SA_RESTORER, 0x3eefa2e380}, NULL, 8) = 0
rt_sigaction(SIGUSR1, NULL, {SIG_DFL}, 8) = 0
rt_sigaction(SIGUSR1, {0x401af0, [], SA_RESTORER, 0x3eefa2e380}, NULL, 8) = 0
ioctl(0, MGSL_IOCSTXIDLE or MTIOCGET or SNDCTL_MIDI_MPUCMD, 0x7fbffff760) = -1
ENOTTY (Inappropriate ioctl for device)
lseek(0, 51200, SEEK_CUR)               = 0
read(0, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 512) = 512
write(1, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 512) = 512
read(0, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 512) = 512
write(1, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 512) = 512
read(0, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 512) = 512
write(1, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 512) = 512
read(0, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 512) = 512
write(1, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 512) = 512
read(0, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 512) = 512
write(1, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 512) = 512
read(0, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 512) = 512
write(1, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 512) = 512
read(0, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 512) = 512
write(1, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 512) = 512
read(0, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 512) = 512
write(1, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 512) = 512
read(0, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 512) = 512
write(1, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 512) = 512
read(0, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 512) = 512
write(1, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 512) = 512
open("/usr/share/locale/locale.alias", O_RDONLY) = 3
fstat(3, {st_mode=S_IFREG|0644, st_size=2528, ...}) = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) =
0x2a983b8000
read(3, "# Locale name alias data base.\n#"..., 4096) = 2528
read(3, "", 4096)                       = 0
close(3)                                = 0
munmap(0x2a983b8000, 4096)              = 0
open("/usr/share/locale/en_US.UTF-8/LC_MESSAGES/coreutils.mo", O_RDONLY) = -1
ENOENT (No such file or directory)
open("/usr/share/locale/en_US.utf8/LC_MESSAGES/coreutils.mo", O_RDONLY) = -1
ENOENT (No such file or directory)
open("/usr/share/locale/en_US/LC_MESSAGES/coreutils.mo", O_RDONLY) = -1 ENOENT
(No such file or directory)
open("/usr/share/locale/en.UTF-8/LC_MESSAGES/coreutils.mo", O_RDONLY) = -1
ENOENT (No such file or directory)
open("/usr/share/locale/en.utf8/LC_MESSAGES/coreutils.mo", O_RDONLY) = -1 ENOENT
(No such file or directory)
open("/usr/share/locale/en/LC_MESSAGES/coreutils.mo", O_RDONLY) = -1 ENOENT (No
such file or directory)
write(2, "10+0 records in\n", 1610+0 records in
)       = 16
write(2, "10+0 records out\n", 1710+0 records out
)      = 17
close(0)                                = 0
close(1      [HANG]


[root@taft-04 sbin]# lvs
  LV        VG       Attr   LSize   Origin Snap%  Move Log            Copy%
  mirror_10 mirror_1 mwi-a- 136.23G                    mirror_10_mlog 100.00

[root@taft-04 sbin]# dmsetup status
mirror_1-mirror_10: 0 285696000 mirror 2 253:3 253:4 279001/279000 1 AA 3
clustered_disk 253:2 A
mirror_1-mirror_10_mlog: 0 8192 linear
VolGroup00-LogVol01: 0 4063232 linear
mirror_1-mirror_10_mimage_1: 0 285696000 linear
VolGroup00-LogVol00: 0 40960000 linear
mirror_1-mirror_10_mimage_0: 0 285696000 linear
    


May 31 04:20:10 taft-01 kernel: device-mapper: I'm the cluster log server for
LVM-Pq5y7k
May 31 04:20:10 taft-01 kernel: device-mapper: Disk Resume::
May 31 04:20:10 taft-01 kernel: device-mapper:   Live nodes        :: 1
May 31 04:20:10 taft-01 kernel: device-mapper:   In-Use Regions    :: 0
May 31 04:20:10 taft-01 kernel: device-mapper:   Good IUR's        :: 0
May 31 04:20:10 taft-01 kernel: device-mapper:   Bad IUR's         :: 0
May 31 04:20:10 taft-01 kernel: device-mapper:   Sync count        :: 0
May 31 04:20:10 taft-01 kernel: device-mapper:   Disk Region count ::
184467440737095515
May 31 04:20:10 taft-01 kernel: device-mapper:   Region count      :: 279000
May 31 04:20:10 taft-01 kernel: device-mapper:   NOTE:  Mapping has changed.
May 31 04:20:10 taft-01 kernel: device-mapper: Marked regions::
May 31 04:20:10 taft-01 kernel: device-mapper:   0 - -1
May 31 04:20:10 taft-01 kernel: device-mapper:   Total = -1
May 31 04:20:10 taft-01 kernel: device-mapper: Out-of-sync regions::
May 31 04:20:10 taft-01 kernel: device-mapper:   0 - -1
May 31 04:20:10 taft-01 kernel: device-mapper:   Total = -1
May 31 04:20:11 taft-01 lvm[15125]: Monitoring mirror device, mirror_1-mirror_10
for ev
May 31 04:20:30 taft-01 qarshd[15127]: Talking to peer 10.15.80.47:49294
May 31 04:20:30 taft-01 qarshd[15127]: Running cmdline:
PATH=/usr/kerberos/bin:/usr/loc
CMAN: removing node taft-04 from the cluster : Missed too many heartbeats
May 31 04:39:24 taft-01 kernel: CMAN: removing node taft-04 from the cluster :
Missed too mas
May 31 04:39:57 taft-01 fenced[14110]: taft-04 not a cluster member after 30 sec
post_fail_dy
May 31 04:39:57 taft-01 fenced[14110]: fencing node "taft-04"
May 31 04:40:01 taft-01 fenced[14110]: fence "taft-04" success
May 31 04:40:06 taft-01 kernel: device-mapper: A node has failed.
May 31 04:40:06 taft-01 kernel: device-mapper: Disk Resume::
May 31 04:40:06 taft-01 kernel: device-mapper:   Live nodes        :: 3
May 31 04:40:06 taft-01 kernel: device-mapper:   In-Use Regions    :: 2
May 31 04:40:06 taft-01 kernel: device-mapper:   Good IUR's        :: 2
May 31 04:40:06 taft-01 kernel: device-mapper:   Bad IUR's         :: 0
May 31 04:40:06 taft-01 kernel: device-mapper:   Sync count        :: 2
May 31 04:40:06 taft-01 kernel: device-mapper:   Disk Region count :: 279000
May 31 04:40:06 taft-01 kernel: device-mapper:   Region count      :: 279000
May 31 04:40:06 taft-01 kernel: device-mapper: Marked regions::
May 31 04:40:06 taft-01 kernel: device-mapper:   0 - 279000
May 31 04:40:06 taft-01 kernel: device-mapper:   Total = 279000
May 31 04:40:06 taft-01 kernel: device-mapper: Out-of-sync regions::
May 31 04:40:06 taft-01 kernel: device-mapper:   1 - 47190
May 31 04:40:06 taft-01 kernel: device-mapper:   47192 - 279000
May 31 04:40:06 taft-01 kernel: device-mapper:   Total = 278998
May 31 04:44:00 taft-01 kernel: CMAN: node taft-04 rejoining




Version-Release number of selected component (if applicable):
[root@taft-04 sbin]# rpm -q cmirror-kernel-smp
cmirror-kernel-smp-2.6.9-4.2

Comment 2 Jonathan Earl Brassow 2006-06-21 21:27:13 UTC
Please retest with kernel > 39.EL


Comment 3 Jonathan Earl Brassow 2006-07-06 04:32:44 UTC
Sorry, this should really be "modified" not needinfo

Comment 4 Corey Marthaler 2006-07-07 21:41:35 UTC
fix verified.

Comment 6 Alasdair Kergon 2010-04-27 14:51:54 UTC
Assuming this VERIFIED fix got released.  Closing.
Reopen if it's not yet resolved.