Bug 203233

Summary: RPM deadlocks if RPM database build is left in an uncomplete state
Product: [Fedora] Fedora Reporter: David Tonhofer <bughunt>
Component: rpmAssignee: Panu Matilainen <pmatilai>
Status: CLOSED WONTFIX QA Contact:
Severity: medium Docs Contact:
Priority: medium    
Version: 5CC: bugzilla, triage, zuirdj
Target Milestone: ---   
Target Release: ---   
Hardware: i686   
OS: Linux   
Whiteboard: bzcl34nup
Fixed In Version: Doc Type: Bug Fix
Doc Text:
Story Points: ---
Clone Of: Environment:
Last Closed: 2008-05-06 16:14:30 UTC Type: ---
Regression: --- Mount Type: ---
Documentation: --- CRM:
Verified Versions: Category: ---
oVirt Team: --- RHEL 7.3 requirements from Atomic Host:
Cloudforms Team: --- Target Upstream Version:
Embargoed:

Description David Tonhofer 2006-08-19 17:19:30 UTC
Description of problem:

1) The RPM database build (sometime after firstboot) failed because
   I had a '/var' that was too small.
2) After some time, I noticed that there was a problem with 'rpm',
   as there were a couple of cron-started 'rpm' processes hanging around
   that I had to 'kill -9'. 'rpm' did not work, seemed to be hanging (correct
   hunch, as it was waiting, see below). CTRL-C or kill were useless to stop
   'rpm', only 'kill -9' stopped the process. This occurred for 'rpm --query'
   and 'yum' and 'rpm --rebuilddb' (even after an 'rpm --initdb'). Blimey!
3) Looking further, files in '/var/lib/rpm' called '__db.XXX' seemed to 
   bother 'rpm'. Note that these files also are there after a *successful*
   database build. 'gdb' and 'strace' show that 'rpm' works a bit, then waits
   on a futex, probably thinking that some other process is still working on
   the database.
4) After removal of the /var/lib/rpm/__db.XXX files and an 'rpm --rebuilddb',
   things are working fine.

Conclusion: The mechanism for exclusive access to the RPM db needs some kind
of review.

strace of a hanging 'rpm --rebuilddb' yields:

<..snip...>
munmap(0xb7bb1000, 4096)                = 0
stat64("/var/lib/rpmrebuilddb.4128/DB_CONFIG", 0xbfd7f5cc) = -1 ENOENT (No such
file or directory)
open("/var/lib/rpmrebuilddb.4128/DB_CONFIG", O_RDONLY|O_LARGEFILE) = -1 ENOENT
(No such file or directory)
stat64("/var/lib/rpmrebuilddb.4128/__db.001", 0xbfd7f64c) = -1 ENOENT (No such
file or directory)
open("/var/lib/rpmrebuilddb.4128/__db.001", O_RDWR|O_CREAT|O_EXCL|O_LARGEFILE,
0644) = 5
fcntl64(5, F_SETFD, FD_CLOEXEC)         = 0
open("/var/lib/rpmrebuilddb.4128/__db.001", O_RDWR|O_CREAT|O_LARGEFILE, 0644) = 6
fcntl64(6, F_SETFD, FD_CLOEXEC)         = 0
_llseek(6, 0, [0], SEEK_END)            = 0
_llseek(6, 16384, [16384], SEEK_CUR)    = 0
write(6, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 8192) = 8192
mmap2(NULL, 24576, PROT_READ|PROT_WRITE, MAP_SHARED, 6, 0) = 0xb7bac000
close(6)                                = 0
gettimeofday({1156006124, 976551}, NULL) = 0
close(5)                                = 0
stat64("/var/lib/rpmrebuilddb.4128/__db.002", 0xbfd7f66c) = -1 ENOENT (No such
file or directory)
open("/var/lib/rpmrebuilddb.4128/__db.002", O_RDWR|O_CREAT|O_LARGEFILE, 0644) = 5
fcntl64(5, F_SETFD, FD_CLOEXEC)         = 0
_llseek(5, 0, [0], SEEK_END)            = 0
_llseek(5, 1310720, [1310720], SEEK_CUR) = 0
write(5, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 8192) = 8192
mmap2(NULL, 1318912, PROT_READ|PROT_WRITE, MAP_SHARED, 5, 0) = 0xb7a6a000
close(5)                                = 0
stat64("/var/lib/rpmrebuilddb.4128/Packages", 0xbfd7f46c) = -1 ENOENT (No such
file or directory)
open("/usr/lib/gconv/gconv-modules.cache", O_RDONLY) = 5
fstat64(5, {st_mode=S_IFREG|0644, st_size=25404, ...}) = 0
mmap2(NULL, 25404, PROT_READ, MAP_SHARED, 5, 0) = 0xb7a63000
close(5)                                = 0
futex(0xce3a4c, FUTEX_WAKE, 2147483647) = 0
open("/var/lib/rpmrebuilddb.4128/__db.Packages",
O_RDWR|O_CREAT|O_EXCL|O_LARGEFILE, 0644) = 5
fcntl64(5, F_SETFD, FD_CLOEXEC)         = 0
fstat64(5, {st_mode=S_IFREG|0644, st_size=0, ...}) = 0
stat64("/var/lib/rpmrebuilddb.4128/__db.Packages", {st_mode=S_IFREG|0644,
st_size=0, ...}) = 0
gettimeofday({1156006124, 979843}, NULL) = 0
_llseek(5, 0, [0], SEEK_SET)            = 0
write(5, "\0\0\0\0\1\0\0\0\0\0\0\0a\25\6\0\10\0\0\0\0\20\0\0\0\10"..., 4096) = 4096
_llseek(5, 8192, [8192], SEEK_SET)      = 0
write(5, "\0\0\0\0\1\0\0\0\2\0\0\0\0\0\0\0\0\0\0\0\0\0\0\20\0\2\0"..., 4096) = 4096
close(5)                                = 0
stat64("/var/lib/rpmrebuilddb.4128/Packages", 0xbfd7f46c) = -1 ENOENT (No such
file or directory)
rename("/var/lib/rpmrebuilddb.4128/__db.Packages",
"/var/lib/rpmrebuilddb.4128/Packages") = 0
open("/var/lib/rpmrebuilddb.4128/Packages", O_RDWR|O_LARGEFILE) = 5
fcntl64(5, F_SETFD, FD_CLOEXEC)         = 0
fstat64(5, {st_mode=S_IFREG|0644, st_size=12288, ...}) = 0
pread64(5, "\0\0\0\0\1\0\0\0\0\0\0\0a\25\6\0\10\0\0\0\0\20\0\0\0\10"..., 4096,
0) = 4096
rt_sigprocmask(SIG_BLOCK, ~[RTMIN RT_1], [], 8) = 0
rt_sigprocmask(SIG_SETMASK, [], NULL, 8) = 0
mmap2(NULL, 143360, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) =
0xb7a40000
futex(0xb7d5ecd0, FUTEX_WAIT, 2, NULL <unfinished ...>


stacktrace for 'rpm --rebuilddb'

#0  0x00d7e402 in ?? ()
#1  0x00de77ae in __lll_mutex_lock_wait () from /lib/libpthread.so.0
#2  0x00de416c in _L_mutex_lock_70 () from /lib/libpthread.so.0
#3  0x00de3fb8 in pthread_mutex_lock () from /lib/libpthread.so.0
#4  0x0047c4bf in __db_pthread_mutex_lock_rpmdb () from /usr/lib/librpmdb-4.4.so
#5  0x0051dddb in __memp_fget_rpmdb () from /usr/lib/librpmdb-4.4.so
#6  0x004ee850 in __db_goff_rpmdb () from /usr/lib/librpmdb-4.4.so
#7  0x004f615d in __db_ret_rpmdb () from /usr/lib/librpmdb-4.4.so
#8  0x004e148d in __db_c_get_rpmdb () from /usr/lib/librpmdb-4.4.so
#9  0x004e7946 in __db_c_get_pp_rpmdb () from /usr/lib/librpmdb-4.4.so
#10 0x00477946 in tagType () from /usr/lib/librpmdb-4.4.so
#11 0x00471f73 in rpmdbNextIterator () from /usr/lib/librpmdb-4.4.so
#12 0x00475d44 in rpmdbRebuild () from /usr/lib/librpmdb-4.4.so
#13 0x0069ed56 in rpmtsRebuildDB () from /usr/lib/librpm-4.4.so
#14 0x08049992 in ?? ()
#15 0x00bc97e4 in __libc_start_main () from /lib/libc.so.6
#16 0x08049231 in ?? ()

Version-Release number of selected component (if applicable):

   rpm-4.4.2-15.2

How reproducible:

   Did not try

Comment 1 Jeff Johnson 2006-08-19 23:50:42 UTC
Doing
    rm -f /var/lib/rpm/__db*
will remove stale locks left from running out of space on /var.

Yes, the file __db* files will reappear after the next rpm command run as root,
the files are expected to be there.

Comment 2 Kasper Dupont 2006-11-23 07:19:30 UTC
This problem shows up all the time on FC5 systems. Removing the __db files does
solve the problem temporarily, but after some time the problem will show up
again. I found that the address shown in the futex call is inside a mmap of
__db.002 (not much of a surprise).

I suspect this might be caused by cron jobs reading the rpm database. If a cron
job has locked the data base when you shut down, it might remain locked after a
restart. I haven't tried to verify this theory.

If I'm right about this, a solution could be to ensure that the database does
not get locked by commands that only read the database. Any access to the
database performed by a process not running as root already reads without
locking. Even when an rpmq command run as root deadlocks, the same command run
as another user works for me.

Comment 3 Jeff Johnson 2006-11-23 12:49:47 UTC
One has to be careful with vocabulary, "deadlock" is not what is happening, blocking on
a stale lock is a more accurate description usually.

You're correct that non-root does not open a dbenv and will often run to completion when
a root process (which does open a dbenv) blocks. Howvere, the non-root process running
without locks can/will segfault on inconsistent data without the ability to create a shared
lock for reading an rpmdb.

Comment 4 Jeff Johnson 2006-12-03 18:30:53 UTC
Segafualts and loss of data are likely due to removing an rpmdb environment
without correcting other problems in the rpmdb.

FYI: Most rpmdb "hangs" are now definitely fixed by purging stale read locks when opening
a database environment in rpm-4.4.8-0.4. There's more todo, but I'm quite sure that a
large class of problems with symptoms of "hang" are now corrected.

Detecting damaged by verifying when needed is well automated in rpm-4.4.8-0.4. Automatically 
correcting all possible damage is going to take more work, but a large class of problems is likely
already fixed in rpm-4.4.8-0.4 as well.

UPSTREAM

Comment 5 Red Hat Bugzilla 2007-08-21 05:24:54 UTC
User pnasrat's account has been closed

Comment 6 Panu Matilainen 2007-08-22 06:30:53 UTC
Reassigning to owner after bugzilla made a mess, sorry about the noise...

Comment 7 Bug Zapper 2008-04-04 03:33:04 UTC
Fedora apologizes that these issues have not been resolved yet. We're
sorry it's taken so long for your bug to be properly triaged and acted
on. We appreciate the time you took to report this issue and want to
make sure no important bugs slip through the cracks.

If you're currently running a version of Fedora Core between 1 and 6,
please note that Fedora no longer maintains these releases. We strongly
encourage you to upgrade to a current Fedora release. In order to
refocus our efforts as a project we are flagging all of the open bugs
for releases which are no longer maintained and closing them.
http://fedoraproject.org/wiki/LifeCycle/EOL

If this bug is still open against Fedora Core 1 through 6, thirty days
from now, it will be closed 'WONTFIX'. If you can reporduce this bug in
the latest Fedora version, please change to the respective version. If
you are unable to do this, please add a comment to this bug requesting
the change.

Thanks for your help, and we apologize again that we haven't handled
these issues to this point.

The process we are following is outlined here:
http://fedoraproject.org/wiki/BugZappers/F9CleanUp

We will be following the process here:
http://fedoraproject.org/wiki/BugZappers/HouseKeeping to ensure this
doesn't happen again.

And if you'd like to join the bug triage team to help make things
better, check out http://fedoraproject.org/wiki/BugZappers

Comment 8 Bug Zapper 2008-05-06 16:14:28 UTC
This bug is open for a Fedora version that is no longer maintained and
will not be fixed by Fedora. Therefore we are closing this bug.

If you can reproduce this bug against a currently maintained version of
Fedora please feel free to reopen thus bug against that version.

Thank you for reporting this bug and we are sorry it could not be fixed.