Bug 1263142

Summary: `/bin/plymouth --details` stall
Product: Red Hat Enterprise Linux 6 Reporter: Martin Cermak <mcermak>
Component: plymouthAssignee: Ray Strode [halfline] <rstrode>
Status: CLOSED WORKSFORME QA Contact: Desktop QE <desktop-qa-list>
Severity: medium Docs Contact:
Priority: medium    
Version: 6.7   
Target Milestone: rc   
Target Release: ---   
Hardware: Unspecified   
OS: Linux   
Whiteboard:
Fixed In Version: Doc Type: Bug Fix
Doc Text:
Story Points: ---
Clone Of: Environment:
Last Closed: 2015-09-21 17:29:48 UTC Type: Bug
Regression: --- Mount Type: ---
Documentation: --- CRM:
Verified Versions: Category: ---
oVirt Team: --- RHEL 7.3 requirements from Atomic Host:
Cloudforms Team: --- Target Upstream Version:
Embargoed:

Description Martin Cermak 2015-09-15 08:19:08 UTC
On my rhel-6.7 test box I was attempting to do a `service avahi-daemon restart` and noticed it "stalls". Strace shows that the stall is caused by `/bin/plymouth --details`. Versions used: plymouth-0.8.3-27.el6_5.1.x86_64, glibc-2.12-1.166.el6.x86_64.

Strace log:

=======
strace -f /bin/plymouth --details

... stuff deleted ...

open("/lib64/libpthread.so.0", O_RDONLY) = 3
read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\340]\340&7\0\0\0"..., 832) = 832
fstat(3, {st_mode=S_IFREG|0755, st_size=145896, ...}) = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f913f9a4000
mmap(0x3726e00000, 2212848, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x3726e00000
mprotect(0x3726e17000, 2097152, PROT_NONE) = 0
mmap(0x3727017000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x17000) = 0x3727017000
mmap(0x3727019000, 13296, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x3727019000
close(3)                                = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f913f9a3000
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f913f9a2000
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f913f9a1000
arch_prctl(ARCH_SET_FS, 0x7f913f9a2700) = 0
mprotect(0x3727882000, 4096, PROT_READ) = 0
mprotect(0x3727406000, 4096, PROT_READ) = 0
mprotect(0x3726802000, 4096, PROT_READ) = 0
mprotect(0x3726d8a000, 16384, PROT_READ) = 0
mprotect(0x3727017000, 4096, PROT_READ) = 0
mprotect(0x372641f000, 4096, PROT_READ) = 0
munmap(0x7f913f9a6000, 63724)           = 0
set_tid_address(0x7f913f9a29d0)         = 14880
set_robust_list(0x7f913f9a29e0, 24)     = 0
futex(0x7fffe4fe5c8c, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0x7fffe4fe5c8c, FUTEX_WAIT_BITSET_PRIVATE|FUTEX_CLOCK_REALTIME, 1, NULL, 7f913f9a2700) = -1 EAGAIN (Resource temporarily unavailable)
rt_sigaction(SIGRTMIN, {0x3726e05c60, [], SA_RESTORER|SA_SIGINFO, 0x3726e0f790}, NULL, 8) = 0
rt_sigaction(SIGRT_1, {0x3726e05cf0, [], SA_RESTORER|SA_RESTART|SA_SIGINFO, 0x3726e0f790}, NULL, 8) = 0
rt_sigprocmask(SIG_UNBLOCK, [RTMIN RT_1], NULL, 8) = 0
getrlimit(RLIMIT_STACK, {rlim_cur=10240*1024, rlim_max=RLIM64_INFINITY}) = 0
rt_sigaction(SIGPIPE, {SIG_IGN, [PIPE], SA_RESTORER|SA_RESTART, 0x3726a326a0}, {SIG_DFL, [], 0}, 8) = 0
brk(0)                                  = 0x1192000
brk(0x11b3000)                          = 0x11b3000
epoll_create(64)                        = 3
pipe([4, 5])                            = 0
fcntl(4, F_SETFD, FD_CLOEXEC|0x800)     = 0
fcntl(5, F_SETFD, FD_CLOEXEC|0x800)     = 0
epoll_ctl(3, EPOLL_CTL_ADD, 4, {EPOLLERR|EPOLLHUP, {u32=18424048, u64=18424048}}) = 0
epoll_ctl(3, EPOLL_CTL_MOD, 4, {EPOLLIN|EPOLLERR|EPOLLHUP, {u32=18424048, u64=18424048}}) = 0
open("proc/cmdline", O_RDONLY)          = -1 ENOENT (No such file or directory)
socket(PF_LOCAL, SOCK_STREAM, 0)        = 6
fcntl(6, F_SETFD, FD_CLOEXEC|0x800)     = 0
setsockopt(6, SOL_SOCKET, SO_PASSCRED, [1], 4) = 0
connect(6, {sa_family=AF_LOCAL, sun_path=@"/ply-boot-protocol"}, 110) = 0
epoll_ctl(3, EPOLL_CTL_ADD, 6, {EPOLLERR|EPOLLHUP, {u32=18436912, u64=18436912}}) = 0
epoll_ctl(3, EPOLL_CTL_MOD, 6, {EPOLLERR|EPOLLHUP, {u32=18436912, u64=18436912}}) = 0
epoll_ctl(3, EPOLL_CTL_MOD, 6, {EPOLLOUT|EPOLLERR|EPOLLHUP, {u32=18436912, u64=18436912}}) = 0
epoll_wait(3, {{EPOLLOUT, {u32=18436912, u64=18436912}}}, 8, -1) = 1
write(6, "!\0", 2)                      = 2
epoll_ctl(3, EPOLL_CTL_MOD, 6, {EPOLLIN|EPOLLOUT|EPOLLERR|EPOLLHUP, {u32=18436912, u64=18436912}}) = 0
epoll_ctl(3, EPOLL_CTL_MOD, 6, {EPOLLIN|EPOLLERR|EPOLLHUP, {u32=18436912, u64=18436912}}) = 0
epoll_wait(3,
=======

Traceback:

=======
 6.7 S x86_64 # gdb -q -args /bin/plymouth --details
Reading symbols from /bin/plymouth...Reading symbols from /usr/lib/debug/bin/plymouth.debug...done.
done.
(gdb) r
Starting program: /bin/plymouth --details
[Thread debugging using libthread_db enabled]
^C
Program received signal SIGINT, Interrupt.
0x0000003726ae8f43 in __epoll_wait_nocancel () at ../sysdeps/unix/syscall-template.S:82
82      T_PSEUDO (SYSCALL_SYMBOL, SYSCALL_NAME, SYSCALL_NARGS)
(gdb) bt
#0  0x0000003726ae8f43 in __epoll_wait_nocancel () at ../sysdeps/unix/syscall-template.S:82
#1  0x0000003727a0534b in ply_event_loop_process_pending_events (loop=0x608010) at ply-event-loop.c:1276
#2  0x0000003727a05da0 in ply_event_loop_run (loop=0x608010) at ply-event-loop.c:1364
#3  0x000000000040369d in main (argc=<value optimized out>, argv=<value optimized out>) at ./plymouth.c:1133
(gdb)
=======

Comment 2 Martin Cermak 2015-09-15 10:14:07 UTC
I think the hang happens here in epoll_wait():

=======
(gdb) up
#1  0x0000003727a0534b in ply_event_loop_process_pending_events (loop=0x608010) at ply-event-loop.c:1276
1276         number_of_received_events = epoll_wait (loop->epoll_fd, events,
(gdb) l
1271           {
1272             timeout = (int) ((loop->wakeup_time - ply_get_timestamp ()) * 1000);
1273             timeout = MAX (timeout, 0);
1274           }
1275
1276         number_of_received_events = epoll_wait (loop->epoll_fd, events,
1277                                                 sizeof (events), timeout);
=======

Here, timeout is 0 and loop->epoll_fd points to: 

 6.7 S x86_64 # ls -l /proc/20862/fd/7
lrwx------ 1 root root 64 Sep 15 06:08 /proc/20862/fd/7 -> [eventpoll]

Comment 3 Ray Strode [halfline] 2015-09-15 13:31:44 UTC
is it looping ?  a timeout of 0 to epoll_wait() should return immediately.

plymouth --details is a command that is run when there is a failure.  I wonder if this stall is the cause of your problem or just a side effect.  Do you see anything strange in dmesg ?

Comment 4 Martin Cermak 2015-09-18 06:47:39 UTC
Unfortunately I don't have access to the reproducer box any more. Feel free to close this out if you consider it appropriate. I can possibly re-open.

Comment 5 Martin Cermak 2015-09-18 07:53:49 UTC
Got another box, but cannot reproduce.

Comment 6 Ray Strode [halfline] 2015-09-21 17:29:48 UTC
okay let's close this one out since we don't have a customer complaining and it's not clear we'll be able to debug the problem without a reproduction environment.