Bug 193704 - I/O hang to mirror volume after a node is recovered
Summary: I/O hang to mirror volume after a node is recovered
Keywords:
Status: CLOSED CURRENTRELEASE
Alias: None
Product: Red Hat Cluster Suite
Classification: Retired
Component: cmirror
Version: 4
Hardware: All
OS: Linux
high
high
Target Milestone: ---
Assignee: Jonathan Earl Brassow
QA Contact: Cluster QE
URL:
Whiteboard:
Depends On:
Blocks: 180185
TreeView+ depends on / blocked
 
Reported: 2006-05-31 18:14 UTC by Corey Marthaler
Modified: 2010-04-27 14:51 UTC (History)
3 users (show)

Fixed In Version:
Doc Type: Bug Fix
Doc Text:
Clone Of:
Environment:
Last Closed: 2010-04-27 14:51:54 UTC
Embargoed:


Attachments (Terms of Use)

Description Corey Marthaler 2006-05-31 18:14:51 UTC
Description of problem:
I had I/O going to a mirror on taft-01 (part of a 4 node cluster taft-01 -
taft-04) and taft-01 was the mirror server. I then rebooted taft-04 (which had
no I/O going at all). After taft-04 was recovered, the I/O I had going and any
more I/O that I attempted, hung.

[root@taft-03 sbin]# dd if=/dev/zero of=/dev/mirror_1/mirror_10 count=10 skip=100
10+0 records in
10+0 records out
[HANG]

[root@taft-02 sbin]# strace dd if=/dev/zero of=/dev/mirror_1/mirror_10 count=10
skip=100
execve("/bin/dd", ["dd", "if=/dev/zero", "of=/dev/mirror_1/mirror_10",
"count=10", "skip=100"], [/* 23 vars */]) = 0
uname({sys="Linux", node="taft-02", ...}) = 0
brk(0)                                  = 0x508000
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) =
0x2a95556000
access("/etc/ld.so.preload", R_OK)      = -1 ENOENT (No such file or directory)
open("/etc/ld.so.cache", O_RDONLY)      = 3
fstat(3, {st_mode=S_IFREG|0644, st_size=110042, ...}) = 0
mmap(NULL, 110042, PROT_READ, MAP_PRIVATE, 3, 0) = 0x2a95557000
close(3)                                = 0
open("/lib64/tls/libc.so.6", O_RDONLY)  = 3
read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0`\305\241"..., 832) = 832
fstat(3, {st_mode=S_IFREG|0755, st_size=1493186, ...}) = 0
mmap(0x3eefa00000, 2310056, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3,
0) = 0x3eefa00000
mprotect(0x3eefb2b000, 1085352, PROT_NONE) = 0
mmap(0x3eefc2a000, 24576, PROT_READ|PROT_WRITE,
MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x12a000) = 0x3eefc2a000
mmap(0x3eefc30000, 16296, PROT_READ|PROT_WRITE,
MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x3eefc30000
close(3)                                = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) =
0x2a95572000
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) =
0x2a95573000
mprotect(0x3eefc2a000, 12288, PROT_READ) = 0
arch_prctl(ARCH_SET_FS, 0x2a95572b00)   = 0
munmap(0x2a95557000, 110042)            = 0
brk(0)                                  = 0x508000
brk(0x529000)                           = 0x529000
open("/usr/lib/locale/locale-archive", O_RDONLY) = 3
fstat(3, {st_mode=S_IFREG|0644, st_size=48510448, ...}) = 0
mmap(NULL, 48510448, PROT_READ, MAP_PRIVATE, 3, 0) = 0x2a95574000
close(3)                                = 0
close(0)                                = 0
open("/dev/zero", O_RDONLY)             = 0
close(1)                                = 0
open("/dev/mirror_1/mirror_10", O_WRONLY|O_CREAT|O_TRUNC, 0666) = 1
rt_sigaction(SIGINT, NULL, {SIG_DFL}, 8) = 0
rt_sigaction(SIGINT, {0x401a80, [], SA_RESTORER, 0x3eefa2e380}, NULL, 8) = 0
rt_sigaction(SIGQUIT, NULL, {SIG_DFL}, 8) = 0
rt_sigaction(SIGQUIT, {0x401a80, [], SA_RESTORER, 0x3eefa2e380}, NULL, 8) = 0
rt_sigaction(SIGPIPE, NULL, {SIG_DFL}, 8) = 0
rt_sigaction(SIGPIPE, {0x401a80, [], SA_RESTORER, 0x3eefa2e380}, NULL, 8) = 0
rt_sigaction(SIGUSR1, NULL, {SIG_DFL}, 8) = 0
rt_sigaction(SIGUSR1, {0x401af0, [], SA_RESTORER, 0x3eefa2e380}, NULL, 8) = 0
ioctl(0, MGSL_IOCSTXIDLE or MTIOCGET or SNDCTL_MIDI_MPUCMD, 0x7fbffff760) = -1
ENOTTY (Inappropriate ioctl for device)
lseek(0, 51200, SEEK_CUR)               = 0
read(0, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 512) = 512
write(1, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 512) = 512
read(0, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 512) = 512
write(1, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 512) = 512
read(0, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 512) = 512
write(1, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 512) = 512
read(0, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 512) = 512
write(1, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 512) = 512
read(0, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 512) = 512
write(1, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 512) = 512
read(0, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 512) = 512
write(1, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 512) = 512
read(0, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 512) = 512
write(1, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 512) = 512
read(0, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 512) = 512
write(1, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 512) = 512
read(0, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 512) = 512
write(1, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 512) = 512
read(0, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 512) = 512
write(1, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 512) = 512
open("/usr/share/locale/locale.alias", O_RDONLY) = 3
fstat(3, {st_mode=S_IFREG|0644, st_size=2528, ...}) = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) =
0x2a983b8000
read(3, "# Locale name alias data base.\n#"..., 4096) = 2528
read(3, "", 4096)                       = 0
close(3)                                = 0
munmap(0x2a983b8000, 4096)              = 0
open("/usr/share/locale/en_US.UTF-8/LC_MESSAGES/coreutils.mo", O_RDONLY) = -1
ENOENT (No such file or directory)
open("/usr/share/locale/en_US.utf8/LC_MESSAGES/coreutils.mo", O_RDONLY) = -1
ENOENT (No such file or directory)
open("/usr/share/locale/en_US/LC_MESSAGES/coreutils.mo", O_RDONLY) = -1 ENOENT
(No such file or directory)
open("/usr/share/locale/en.UTF-8/LC_MESSAGES/coreutils.mo", O_RDONLY) = -1
ENOENT (No such file or directory)
open("/usr/share/locale/en.utf8/LC_MESSAGES/coreutils.mo", O_RDONLY) = -1 ENOENT
(No such file or directory)
open("/usr/share/locale/en/LC_MESSAGES/coreutils.mo", O_RDONLY) = -1 ENOENT (No
such file or directory)
write(2, "10+0 records in\n", 1610+0 records in
)       = 16
write(2, "10+0 records out\n", 1710+0 records out
)      = 17
close(0)                                = 0
close(1      [HANG]


[root@taft-04 sbin]# lvs
  LV        VG       Attr   LSize   Origin Snap%  Move Log            Copy%
  mirror_10 mirror_1 mwi-a- 136.23G                    mirror_10_mlog 100.00

[root@taft-04 sbin]# dmsetup status
mirror_1-mirror_10: 0 285696000 mirror 2 253:3 253:4 279001/279000 1 AA 3
clustered_disk 253:2 A
mirror_1-mirror_10_mlog: 0 8192 linear
VolGroup00-LogVol01: 0 4063232 linear
mirror_1-mirror_10_mimage_1: 0 285696000 linear
VolGroup00-LogVol00: 0 40960000 linear
mirror_1-mirror_10_mimage_0: 0 285696000 linear
    


May 31 04:20:10 taft-01 kernel: device-mapper: I'm the cluster log server for
LVM-Pq5y7k
May 31 04:20:10 taft-01 kernel: device-mapper: Disk Resume::
May 31 04:20:10 taft-01 kernel: device-mapper:   Live nodes        :: 1
May 31 04:20:10 taft-01 kernel: device-mapper:   In-Use Regions    :: 0
May 31 04:20:10 taft-01 kernel: device-mapper:   Good IUR's        :: 0
May 31 04:20:10 taft-01 kernel: device-mapper:   Bad IUR's         :: 0
May 31 04:20:10 taft-01 kernel: device-mapper:   Sync count        :: 0
May 31 04:20:10 taft-01 kernel: device-mapper:   Disk Region count ::
184467440737095515
May 31 04:20:10 taft-01 kernel: device-mapper:   Region count      :: 279000
May 31 04:20:10 taft-01 kernel: device-mapper:   NOTE:  Mapping has changed.
May 31 04:20:10 taft-01 kernel: device-mapper: Marked regions::
May 31 04:20:10 taft-01 kernel: device-mapper:   0 - -1
May 31 04:20:10 taft-01 kernel: device-mapper:   Total = -1
May 31 04:20:10 taft-01 kernel: device-mapper: Out-of-sync regions::
May 31 04:20:10 taft-01 kernel: device-mapper:   0 - -1
May 31 04:20:10 taft-01 kernel: device-mapper:   Total = -1
May 31 04:20:11 taft-01 lvm[15125]: Monitoring mirror device, mirror_1-mirror_10
for ev
May 31 04:20:30 taft-01 qarshd[15127]: Talking to peer 10.15.80.47:49294
May 31 04:20:30 taft-01 qarshd[15127]: Running cmdline:
PATH=/usr/kerberos/bin:/usr/loc
CMAN: removing node taft-04 from the cluster : Missed too many heartbeats
May 31 04:39:24 taft-01 kernel: CMAN: removing node taft-04 from the cluster :
Missed too mas
May 31 04:39:57 taft-01 fenced[14110]: taft-04 not a cluster member after 30 sec
post_fail_dy
May 31 04:39:57 taft-01 fenced[14110]: fencing node "taft-04"
May 31 04:40:01 taft-01 fenced[14110]: fence "taft-04" success
May 31 04:40:06 taft-01 kernel: device-mapper: A node has failed.
May 31 04:40:06 taft-01 kernel: device-mapper: Disk Resume::
May 31 04:40:06 taft-01 kernel: device-mapper:   Live nodes        :: 3
May 31 04:40:06 taft-01 kernel: device-mapper:   In-Use Regions    :: 2
May 31 04:40:06 taft-01 kernel: device-mapper:   Good IUR's        :: 2
May 31 04:40:06 taft-01 kernel: device-mapper:   Bad IUR's         :: 0
May 31 04:40:06 taft-01 kernel: device-mapper:   Sync count        :: 2
May 31 04:40:06 taft-01 kernel: device-mapper:   Disk Region count :: 279000
May 31 04:40:06 taft-01 kernel: device-mapper:   Region count      :: 279000
May 31 04:40:06 taft-01 kernel: device-mapper: Marked regions::
May 31 04:40:06 taft-01 kernel: device-mapper:   0 - 279000
May 31 04:40:06 taft-01 kernel: device-mapper:   Total = 279000
May 31 04:40:06 taft-01 kernel: device-mapper: Out-of-sync regions::
May 31 04:40:06 taft-01 kernel: device-mapper:   1 - 47190
May 31 04:40:06 taft-01 kernel: device-mapper:   47192 - 279000
May 31 04:40:06 taft-01 kernel: device-mapper:   Total = 278998
May 31 04:44:00 taft-01 kernel: CMAN: node taft-04 rejoining




Version-Release number of selected component (if applicable):
[root@taft-04 sbin]# rpm -q cmirror-kernel-smp
cmirror-kernel-smp-2.6.9-4.2

Comment 2 Jonathan Earl Brassow 2006-06-21 21:27:13 UTC
Please retest with kernel > 39.EL


Comment 3 Jonathan Earl Brassow 2006-07-06 04:32:44 UTC
Sorry, this should really be "modified" not needinfo

Comment 4 Corey Marthaler 2006-07-07 21:41:35 UTC
fix verified.

Comment 6 Alasdair Kergon 2010-04-27 14:51:54 UTC
Assuming this VERIFIED fix got released.  Closing.
Reopen if it's not yet resolved.


Note You need to log in before you can comment on or make changes to this bug.