Bug 1909261
| Summary: | Recursive locks observed in nss-3.53.1 [rhel-7.9.z] | ||
|---|---|---|---|
| Product: | Red Hat Enterprise Linux 7 | Reporter: | Paulo Andrade <pandrade> |
| Component: | nss | Assignee: | Bob Relyea <rrelyea> |
| Status: | CLOSED ERRATA | QA Contact: | Ivan Nikolchev <inikolch> |
| Severity: | urgent | Docs Contact: | |
| Priority: | high | ||
| Version: | 7.9 | CC: | amepatil, avettath, cmurphycode, gabriel.arthur.petursson, gabriel, gunnar.gudvardarson, gunnar, hkario, inikolch, jreznik, lagordon, msauton, rrelyea, sbroz, snejoshi, ssorce, tscherf |
| Target Milestone: | rc | Keywords: | Triaged, ZStream |
| Target Release: | 7.9 | Flags: | pm-rhel:
mirror+
|
| Hardware: | All | ||
| OS: | Linux | ||
| Whiteboard: | |||
| Fixed In Version: | Doc Type: | If docs needed, set a value | |
| Doc Text: | Story Points: | --- | |
| Clone Of: | Environment: | ||
| Last Closed: | 2021-10-12 15:26:38 UTC | Type: | Bug |
| Regression: | --- | Mount Type: | --- |
| Documentation: | --- | CRM: | |
| Verified Versions: | Category: | --- | |
| oVirt Team: | --- | RHEL 7.3 requirements from Atomic Host: | |
| Cloudforms Team: | --- | Target Upstream Version: | |
| Embargoed: | |||
Same issue as reported in bz#1896808 Extra data that might help: (gdb) f 4 #4 0x00007fc357e11839 in PK11_EnterSlotMonitor (slot=slot@entry=0xd74b70) at pk11slot.c:484 484 PZ_Lock(slot->sessionLock); (gdb) p *slot $1 = {functionList = 0x7fc34cbbace0 <sftk_funcList>, module = 0xd12090, needTest = 0, isPerm = 1, isHW = 0, isInternal = 1, disabled = 0, reason = PK11_DIS_NONE, readOnly = 1, needLogin = 0, hasRandom = 1, defRWSession = 0, isThreadSafe = 0, flags = 1547, session = 1, sessionLock = 0xd12280, slotID = 1, defaultFlags = 2684370749, refCount = 22, freeListLock = 0xd74ef0, freeSymKeysWithSessionHead = 0x0, freeSymKeysHead = 0x0, keyCount = 0, maxKeyCount = 800, askpw = 0, timeout = 30, authTransact = 0, authTime = 0, minPassword = 0, maxPassword = 0, series = 2, flagSeries = 213, flagState = 0, wrapKey = 0, wrapMechanism = 4294967295, refKeys = {0}, mechanismList = 0xd78840, mechanismCount = 207, cert_array = 0x0, array_size = 1, cert_count = 0, serial = '0' <repeats 16 times>, slot_name = "NSS Internal Cryptographic Services", '\000' <repeats 29 times>, token_name = "NSS Generic Crypto Services\000\000\000\000\000", hasRootCerts = 0, hasRootTrust = 0, hasRSAInfo = 0, RSAInfoFlags = 0, protectedAuthPath = 0, isActiveCard = 0, lastLoginCheck = 0, lastState = 0, nssToken = 0xd7b9c0, tokenInfo = {label = "NSS Generic Crypto Services ", manufacturerID = "Mozilla Foundation", ' ' <repeats 14 times>, model = "NSS 3", ' ' <repeats 11 times>, serialNumber = '0' <repeats 16 times>, flags = 1539, ulMaxSessionCount = 0, ulSessionCount = 0, ulMaxRwSessionCount = 0, ulRwSessionCount = 0, ulMaxPinLen = 0, ulMinPinLen = 0, ulTotalPublicMemory = 0, ulFreePublicMemory = 0, ulTotalPrivateMemory = 0, ulFreePrivateMemory = 0, hardwareVersion = {major = 4 '\004', minor = 0 '\000'}, firmwareVersion = {major = 0 '\000', minor = 0 '\000'}, utcTime = '0' <repeats 16 times>}, mechanismBits = "\a\a\006\003\003\003\001\000\000\001\000\000\000\001\000\000\a\a\005\001\001\001\001\000\000\000\000\000\000\000\000\000\a\a\006\002\002\002\000\000\000\000\000\000\000\000\000\000\002\002\002\002\002\002\002\000\000\000\000\000\000\000\000\000\003\003\003\002\002\002\001\000\000\000\000\000\000\000\000\000ldd``ddd\000\000\000\000\000\000\000\000\f\004\f\b\b\b\000\000\000\000\000\000\000\000\000\000\f\f\f\b\000\b\b\b\000\000\000\000\000\000\000\000\b\b", '\000' <repeats 14 times>, "\b\b\b\b\b\b\b\000\000\000\000\000\000\000\000\000\b\b\000\000\000\000\b\b\b\b\b\b\b\b\b\000\b", '\000' <repeats 15 times>, "\b", '\000' <repeats 31 times>..., profileList = 0x0, profileCount = 0} (gdb) f 3 #3 0x00007fc35755feb9 in PR_Lock (lock=0xd12280) at ../../../nspr/pr/src/pthreads/ptsynch.c:176 176 rv = pthread_mutex_lock(&lock->mutex); (gdb) p *lock $2 = {mutex = {__data = {__lock = 2, __count = 0, __owner = 27353, __nusers = 1, __kind = 3, __spins = 0, __elision = 0, __list = {__prev = 0x0, __next = 0x0}}, __size = "\002\000\000\000\000\000\000\000\331j\000\000\001\000\000\000\003", '\000' <repeats 22 times>, __align = 2}, notified = {length = 0, cv = {{cv = 0x0, times = 0}, {cv = 0x0, times = 0}, {cv = 0x0, times = 0}, {cv = 0x0, times = 0}, {cv = 0x0, times = 0}, {cv = 0x0, times = 0}}, link = 0x0}, locked = 1, owner = 140476982093888} (gdb) p *context $3 = {operation = 2181038341, key = 0xdbeb90, slot = 0xd74b70, session = 7, sessionLock = 0xdbee00, ownSession = 1, cx = 0x0, savedData = 0x0, savedLength = 0, param = 0x7fc357eaaab0 <pk11_null_params>, init = 0, type = 4231, fortezzaHack = 0, simulate_message = 0, simulate_mechanism = 4231, ivCounter = 0, ivMaxCount = 0, ivLen = 0, ivFixedBits = 0, ivGen = 0} Just tested a nss build with this patch:
"""
diff -up nss-3.53.1/nss/lib/pk11wrap/pk11slot.c.orig nss-3.53.1/nss/lib/pk11wrap/pk11slot.c
--- nss-3.53.1/nss/lib/pk11wrap/pk11slot.c.orig 2020-12-18 14:58:27.838083986 -0500
+++ nss-3.53.1/nss/lib/pk11wrap/pk11slot.c 2020-12-18 15:01:02.682178935 -0500
@@ -738,8 +738,11 @@ PK11_GetRWSession(PK11SlotInfo *slot)
}
if (slot->defRWSession) {
PORT_Assert(slot->session != CK_INVALID_HANDLE);
- if (slot->session != CK_INVALID_HANDLE)
+ if (slot->session != CK_INVALID_HANDLE) {
+ if (haveMonitor)
+ PK11_ExitSlotMonitor(slot);
return slot->session;
+ }
}
crv = PK11_GETTAB(slot)->C_OpenSession(slot->slotID,
@@ -757,6 +760,8 @@ PK11_GetRWSession(PK11SlotInfo *slot)
if (slot->defRWSession) { /* we have the monitor */
slot->session = rwsession;
}
+ if (haveMonitor)
+ PK11_ExitSlotMonitor(slot);
return rwsession;
}
"""
unfortunately the problem was not corrected, so, it should be code elsewhere that
is acquiring the lock.
Also, could not trigger the issue when running under gdb, so, it might be not trivial
do debug the root cause of the problem.
Valgrind with different tools also does not tell anything.
It's pretty unlikely that PK11_GetRWSession is causing the issue. While the code is clearly incorrect, the issue would only show up in non-threadsafe tokens, which the internal crypto module is not, so it never gets the monitor in this code. Clearly we don't often try to write to non-thread safe tokens very often (defRWSession is also pretty rare these days). It sounds like a race (which won't be easy to track down). bob OK, so there is a very clear problem here:
isThreadSafe = 0
The internal crypto cypto should have isThreadSafe set to 1!
What does *slot->module show?
This might be an issue in curl, at least as now only observed the in curl.
(gdb) p *slot->module
$1 = {arena = 0x10b8570, internal = 1, loaded = 1, isFIPS = 0, dllName = 0x0,
commonName = 0x10b87f0 "NSS Internal Crypto Services", library = 0x0,
functionList = 0x7fa7e0d6ace0 <sftk_funcList>, refLock = 0x10b8880,
refCount = 2, slots = 0x10b8810, slotCount = 2, slotInfo = 0x10b87b0,
slotInfoCount = 0, moduleID = 1, isThreadSafe = 0, ssl = {0, 0},
libraryParams = 0x10b8768 "configdir='sql:/etc/pki/nssdb' tokenDescription='NSS system database' ", moduleDBFunc = 0x0, parent = 0x10b6a20, isCritical = 1,
isModuleDB = 0, moduleDBOnly = 0, trustOrder = 80, cipherOrder = 100,
evControlMask = 0, cryptokiVersion = {major = 3 '\003', minor = 0 '\000'},
flags = 1}
Running under gdb, the first time PK11_EnterSlotMonitor is called with
a 'slot->isThreadSafe = 0' is:
Breakpoint 1, PK11_EnterSlotMonitor (slot=0x6d5d30) at pk11slot.c:484
484 PZ_Lock(slot->sessionLock);
(gdb) p slot.isThreadSafe
$6 = 0
(gdb) bt
#0 PK11_EnterSlotMonitor (slot=0x6d5d30) at pk11slot.c:484
#1 0x00007ffff7417555 in PK11_InitToken (slot=slot@entry=0x6d5d30,
loadCerts=loadCerts@entry=1) at pk11slot.c:1263
#2 0x00007ffff74177a4 in PK11_InitSlot (mod=mod@entry=0x6d2330,
slotID=<optimized out>, slot=0x6d5d30) at pk11slot.c:1553
#3 0x00007ffff73fedb7 in secmod_LoadPKCS11Module (mod=mod@entry=0x6d2330,
oldModule=oldModule@entry=0x7fffffffd290) at pk11load.c:594
#4 0x00007ffff740c113 in SECMOD_LoadModule (
modulespec=0x64a450 "library=libnsspem.so name=PEM", parent=0x0, recurse=0)
at pk11pars.c:1840
#5 0x00007ffff740c319 in SECMOD_LoadUserModule (modulespec=<optimized out>,
parent=<optimized out>, recurse=<optimized out>) at pk11pars.c:1936
#6 0x00007ffff7bb448b in nss_connect_common () from /lib64/libcurl.so.4
...
Yes, The module isThreadSafe is set to zero, which is worrying. Hmm let me check SECMOD_LoadUserModule(). That might be why curl is the only one running into this because curl is the only one that uses SECMOD_LoadUserModule(). Wait, the LoadUserModule call loads the libnsspem.so. That token should have slot->isThreadSafe set to false. The place where we are hanging is referencing the lock for the internal crypto module, which should be threadSafe. So the slot->isThreadSafe is set from the module->isThreadSafe. The module isThreadSafe value is must have been set to zero at init time (it's highly unlikely that something selectively trashed both flags). This means we are setting isThreadSafe to false at init time. I suspect libcurl is initializing NSS with the NSS_INIT_PK11RELOAD flag. (either with NSS_InitContext or NSS_Initialize). If so then it's likely libcurl is correct and it's just the only application tripping over this issue because it's the only application using NSS_INIT_PK11RELOAD and loading nss multiple times. After trying several approaches to attempt to debug, finally I got one that appears
to work reliably.
I see on a log file of a running process the first lock as:
11700:0xb59c80
//wrap_pthread.so(pthread_mutex_lock+0x3a) [0x7f47acf5d98f]
/lib64/libnspr4.so(PR_Lock+0x15) [0x7f47abce05f5]
/lib64/libnss3.so(+0x39922) [0x7f47ac573922]
/lib64/libnss3.so(PK11_CreateContextBySymKey+0x3f) [0x7f47ac573a8f]
/lib64/libssl3.so(+0x100ae) [0x7f47acaa60ae]
"""
397 mech_info.pParameter = param->data;
0x00007f47ac573905 <+245>: mov 0x8(%rbp),%rax
0x00007f47ac573910 <+256>: mov %rax,0x8(%rsp)
398 mech_info.ulParameterLen = param->len;
0x00007f47ac573915 <+261>: mov 0x10(%rbp),%eax
0x00007f47ac573918 <+264>: mov %rax,0x10(%rsp)
399 PK11_EnterContextMonitor(context);
0x00007f47ac573909 <+249>: mov %rbx,%rdi
0x00007f47ac57391d <+269>: callq 0x7f47ac572e40 <PK11_EnterContextMonitor>
400 rv = pk11_context_init(context, &mech_info);
0x00007f47ac573922 <+274>: mov %rsp,%rsi
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
0x00007f47ac573925 <+277>: mov %rbx,%rdi
0x00007f47ac573928 <+280>: callq 0x7f47ac573510 <pk11_context_init>
0x00007f47ac573930 <+288>: mov %eax,%ebp
401 PK11_ExitContextMonitor(context);
0x00007f47ac57392d <+285>: mov %rbx,%rdi
0x00007f47ac573932 <+290>: callq 0x7f47ac572e70 <PK11_ExitContextMonitor>
"""
and the second, hanging lock, as in the backtrace in the first comment:
1700:0xb59c80
//wrap_pthread.so(pthread_mutex_lock+0x3a) [0x7f47acf5d98f]
/lib64/libnspr4.so(PR_Lock+0x15) [0x7f47abce05f5]
/lib64/libnss3.so(+0x6040d) [0x7f47ac59a40d]
/lib64/libnss3.so(+0x6044b) [0x7f47ac59a44b]
/lib64/libnss3.so(+0x38dfc) [0x7f47ac572dfc]
/lib64/libnss3.so(+0x39769) [0x7f47ac573769]
/lib64/libnss3.so(+0x3992d) [0x7f47ac57392d]
/lib64/libnss3.so(PK11_CreateContextBySymKey+0x3f) [0x7f47ac573a8f]
"""
Dump of assembler code for function pk11_filterSlot:
...
2287 } else {
2288 if (!slot->isThreadSafe)
0x00007f47ac59a333 <+51>: mov 0x38(%rbx),%ecx
0x00007f47ac59a336 <+54>: test %ecx,%ecx
0x00007f47ac59a338 <+56>: je 0x7f47ac59a400 <pk11_filterSlot+256>
2289 PK11_EnterSlotMonitor(slot);
0x00007f47ac59a400 <+256>: mov %rbx,%rdi
0x00007f47ac59a403 <+259>: mov %rsi,0x8(%rsp)
0x00007f47ac59a408 <+264>: callq 0x7f47ac597830 <PK11_EnterSlotMonitor>
0x00007f47ac59a40d <+269>: mov 0x8(%rsp),%rsi
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
0x00007f47ac59a412 <+274>: jmpq 0x7f47ac59a33e <pk11_filterSlot+62>
0x00007f47ac59a417 <+279>: nopw 0x0(%rax,%rax,1)
2290 crv = PK11_GETTAB(slot)->C_GetMechanismInfo(slot->slotID, mechanism,
0x00007f47ac59a33e <+62>: mov (%rbx),%rax
0x00007f47ac59a341 <+65>: lea 0x10(%rsp),%rdx
0x00007f47ac59a346 <+70>: mov 0x58(%rbx),%rdi
0x00007f47ac59a34a <+74>: callq *0x48(%rax)
...
"""
I am not familiar with the code, but it did not yet return from
pk11_context_init in the sequence:
PK11_EnterContextMonitor(context);
rv = pk11_context_init(context, &mech_info);
PK11_ExitContextMonitor(context);
and has the lock of context->slot->sessionLock.mutex held.
So, without understanding very well the code, it appears
PK11_EnterContextMonitor and PK11_ExitContextMonitor should not check
the cx->slot->isThreadSafe field or it will hang.
Likely the proper fix is down the stack to make sure
PK11_CreateContextBySymKey only calls pk11_CreateNewContextInSlot if
symKey->slot->isThreadSafe is set.
This isse appears to not happen if running the tests on a vm with a single
cpu, or if adding a very small delay between every curl initialization.
Because debugging tools like valgrind do not show any issues, maybe it is
some race condition when loading libraries and/or static initializers, or
something in the filesystem?
The problem should be a side effect of https://github.com/nss-dev/nss/commit/d733ccc3ae198d5e7e6e476be62f7213803379fa [Bug 1623374 Need to support the new PKCS #11 Message interface for AE…] Likely PK11_DoesMechanismFlag() needs to not call pk11_filterSlot() or somehow know the slot mutex might be locked by PK11_EnterContextMonitor(). Problem still being experienced by user. Significantly more difficult to reproduce,
but still happens. No reproducer right now, just coredumps.
(gdb) bt full
...
#3 0x00007f3a5ef2ceb9 in PR_Lock (lock=0xc824f0) at ../../../nspr/pr/src/pthreads/ptsynch.c:176
No locals.
#4 0x00007f3a5f7de8a9 in PK11_EnterSlotMonitor (slot=<optimized out>) at pk11slot.c:484
No locals.
#5 0x00007f3a5f7cf3dd in pk11_GetNewSession (slot=slot@entry=0xce4e40, owner=owner@entry=0xd1caa0) at pk11obj.c:361
session = 0
#6 0x00007f3a5f7d95fd in pk11_getKeyFromList (needSession=<optimized out>, slot=slot@entry=0xce4e40) at pk11skey.c:96
symKey = 0xd1ca50
#7 pk11_CreateSymKey (slot=slot@entry=0xce4e40, type=306, owner=owner@entry=0, needSession=needSession@entry=1, wincx=wincx@entry=0x0) at pk11skey.c:143
No locals.
#8 0x00007f3a5f7da32f in PK11_SymKeyFromHandle (slot=slot@entry=0xce4e40, parent=parent@entry=0x0, origin=origin@entry=PK11_OriginDerive, type=<optimized out>, keyID=9, owner=owner@entry=0, wincx=wincx@entry=0x0) at pk11skey.c:329
symKey = <optimized out>
needSession = 1
#9 0x00007f3a5f7da3ef in PK11_GetWrapKey (slot=slot@entry=0xce4e40, wrap=wrap@entry=0, type=type@entry=4294967295, series=series@entry=2, wincx=wincx@entry=0x0) at pk11skey.c:381
symKey = 0x0
#10 0x00007f3a5fcf8207 in ssl3_CacheWrappedSecret (ss=ss@entry=0xd13640, sid=0xd0ecb0, secret=0xd25020) at ssl3con.c:11646
keyLength = <optimized out>
wrapKeyIndex = 0
incarnation = 2
wrappingKey = 0xd1ca50
symKeySlot = 0xce4e40
pwArg = 0x0
rv = SECFailure
isServer = <optimized out>
mechanism = 306
#11 0x00007f3a5fcf833b in ssl3_FillInCachedSID (ss=ss@entry=0xd13640, sid=sid@entry=0xd0ecb0, secret=<optimized out>) at ssl3con.c:11895
No locals.
#12 0x00007f3a5fcfb3fd in ssl3_HandleFinished (length=12, b=0xd18684 "\344\331\306\351S(oT,\356\344\275", 'N' <repeats 75 times>, "\v", ss=0xd13640) at ssl3con.c:11835
rv = SECSuccess
isTLS = <optimized out>
hashes = {len = 48, hashAlg = ssl_hash_sha384, u = {raw = "^s:o\004\371-W\016p\351Q\342/L\310eu\324\334\363\220\066\203\244N@%\356\244c\212\367\223\250\326q\254\036r\203\221>\304\347\311g\301\375\312\317^:\177\000\000q\275.i\000\200\377\377", s = {md5 = "^s:o\004\371-W\016p\351Q\342/", <incomplete sequence \310>, sha = "eu\324\334\363\220\066\203\244N@%\356\244c\212\367\223\250", <incomplete sequence \326>}}}
sid = 0xd0ecb0
isServer = <optimized out>
#13 ssl3_HandlePostHelloHandshakeMessage (length=12, b=0xd18684 "\344\331\306\351S(oT,\356\344\275", 'N' <repeats 75 times>, "\v", ss=0xd13640) at ssl3con.c:12209
rv = <optimized out>
#14 ssl3_HandleHandshakeMessage (ss=ss@entry=0xd13640, b=b@entry=0xd18684 "\344\331\306\351S(oT,\356\344\275", 'N' <repeats 75 times>, "\v", length=12, endOfRecord=1) at ssl3con.c:12093
rv = <optimized out>
epoch = 1
#15 0x00007f3a5fcfc913 in ssl3_HandleHandshake (origBuf=0xd13900, ss=0xd13640) at ssl3con.c:12267
buf = {buf = 0xd18684 "\344\331\306\351S(oT,\356\344\275", 'N' <repeats 75 times>, "\v", len = 12, space = <optimized out>, fixed = <optimized out>}
rv = <optimized out>
#16 ssl3_HandleNonApplicationData (ss=ss@entry=0xd13640, rType=<optimized out>, epoch=<optimized out>, seqNum=<optimized out>, databuf=databuf@entry=0xd13900) at ssl3con.c:12792
rv = <optimized out>
#17 0x00007f3a5fcfceb1 in ssl3_HandleRecord (ss=ss@entry=0xd13640, cText=cText@entry=0x7fff96d14380) at ssl3con.c:13090
rv = <optimized out>
isTLS = <optimized out>
epoch = <optimized out>
spec = <optimized out>
recordSizeLimit = 16384
outOfOrderSpec = 0
rType = ssl_ct_handshake
alert = internal_error
#18 0x00007f3a5fcfe4ff in ssl3_GatherCompleteHandshake (ss=ss@entry=0xd13640, flags=flags@entry=0) at ssl3gthr.c:523
processingEarlyData = 0
ssl2gs = {isV2 = 0, padding = 0 '\000'}
ssl2gs_ptr = <optimized out>
rv = <optimized out>
cText = {seqNum = 0, hdr = 0xd13940 "\026\003\003", hdrLen = 5, buf = 0xd13928}
keepGoing = 1
#19 0x00007f3a5fd05f2b in SSL_ForceHandshake (fd=<optimized out>) at sslsecur.c:370
gatherResult = <optimized out>
ss = 0xd13640
rv = SECFailure
#20 0x00007f3a5fd06046 in SSL_ForceHandshakeWithTimeout (fd=<optimized out>, timeout=<optimized out>) at sslsecur.c:402
No locals.
#21 0x00007f3a5ff7df73 in nss_do_connect (sockindex=0, conn=0xc75f90) at nss.c:1672
data = 0xc53020
curlerr = CURLE_SSL_CONNECT_ERROR
timeout = <optimized out>
time_left = <optimized out>
connssl = 0xc76170
#22 nss_connect_common (conn=conn@entry=0xc75f90, sockindex=sockindex@entry=0, done=done@entry=0x7fff96d1464d) at nss.c:1738
connssl = 0xc76170
data = 0xc53020
blocking = false
rv = <optimized out>
#23 0x00007f3a5ff7eff5 in Curl_nss_connect_nonblocking (conn=conn@entry=0xc75f90, sockindex=sockindex@entry=0, done=done@entry=0x7fff96d1464d) at nss.c:1778
No locals.
First lock is in frame 9, line 370 below:
(gdb) f 9
#9 0x00007f3a5f7da3ef in PK11_GetWrapKey (slot=slot@entry=0xce4e40, wrap=wrap@entry=0, type=type@entry=4294967295, series=series@entry=2, wincx=wincx@entry=0x0) at pk11skey.c:381
381 symKey = PK11_SymKeyFromHandle(slot, NULL, PK11_OriginDerive,
(gdb) list PK11_GetWrapKey
362 * This function is provided for ABI compatibility; see PK11_SetWrapKey below.
363 */
364 PK11SymKey *
365 PK11_GetWrapKey(PK11SlotInfo *slot, int wrap, CK_MECHANISM_TYPE type,
366 int series, void *wincx)
367 {
368 PK11SymKey *symKey = NULL;
369
370 PK11_EnterSlotMonitor(slot);
371 if (slot->series != series ||
372 slot->refKeys[wrap] == CK_INVALID_HANDLE) {
373 PK11_ExitSlotMonitor(slot);
374 return NULL;
375 }
376
377 if (type == CKM_INVALID_MECHANISM) {
378 type = slot->wrapMechanism;
379 }
380
381 symKey = PK11_SymKeyFromHandle(slot, NULL, PK11_OriginDerive,
382 slot->wrapMechanism, slot->refKeys[wrap], PR_FALSE, wincx);
383 PK11_ExitSlotMonitor(slot);
384 return symKey;
385 }
and recursive lock happens in frame 5:
(gdb) f 5
#5 0x00007f3a5f7cf3dd in pk11_GetNewSession (slot=slot@entry=0xce4e40, owner=owner@entry=0xd1caa0) at pk11obj.c:361
361 PK11_EnterSlotMonitor(slot);
(gdb) list
356 pk11_GetNewSession(PK11SlotInfo *slot, PRBool *owner)
357 {
358 CK_SESSION_HANDLE session;
359 *owner = PR_TRUE;
360 if (!slot->isThreadSafe)
361 PK11_EnterSlotMonitor(slot);
362 if (PK11_GETTAB(slot)->C_OpenSession(slot->slotID, CKF_SERIAL_SESSION,
363 slot, pk11_notify, &session) != CKR_OK) {
364 *owner = PR_FALSE;
365 session = slot->session;
Based on comment 39, I'm removing this bug from the errata and moving it to assigned. Hi, We're also experiencing this problem in our machines with nss-3.53.1-7.el7_9.x86_64 being used by curl-7.29.0-59.el7_9.1.x86_64. In our situation, reverting to nss-3.44.0-4.el7.x86_64 resolves the issue. I have two questions: 1) @paulo said it is now "Significantly more difficult to reproduce" -- is that due to a version change/fix, or just incidental observation? 2) I found this patch which seems to resolve the recursive locking that is deadlocking for us (and looks similar to what's posted above). Can we confirm this is the same issue, and thus potentially fixed in the latest nss 3.65? https://phabricator.services.mozilla.com/D112092 (https://bugzilla.mozilla.org/show_bug.cgi?id=1705119) , release noted in https://developer.mozilla.org/en-US/docs/Mozilla/Projects/NSS/NSS_3.65_release_notes Thank you very much! > 1) @paulo said it is now "Significantly more difficult to reproduce" -- is that due to a version change/fix, or just incidental observation? The problem was only experienced by an user. The user provided a coredump, that did show the double lock. > 2) I found this patch which seems to resolve the recursive locking that is deadlocking for us (and looks similar to what's posted above). Can we confirm this is the same issue, and thus potentially fixed in the latest nss 3.65? > > https://phabricator.services.mozilla.com/D112092 (https://bugzilla.mozilla.org/show_bug.cgi?id=1705119) , release noted in https://developer.mozilla.org/en-US/docs/Mozilla/Projects/NSS/NSS_3.65_release_notes Did not test the patch, but it appears to treat the exact same root conditions that caused the dead locks (due to recursive locks) reported in this case. It appears to reorder some code, and split several lock/unlock conditions to ensure minimal code is run while locked. I see this change also: - symKey = PK11_SymKeyFromHandle(slot, NULL, PK11_OriginDerive, - slot->wrapMechanism, keyHandle, PR_FALSE, wincx); PK11_ExitSlotMonitor(slot); + symKey = PK11_SymKeyFromHandle(slot, NULL, PK11_OriginDerive, + slot->wrapMechanism, keyHandle, PR_FALSE, wincx); If I recall correctly, this was one of the main conditions that could lead to a recursive lock. Yes, The upstream patch fixes both double free issues (as well as creating a test case to detect these issues). The part of the patch you point out is the patch for the second issue whaich was not fixed in nss-3.53.1-7, but was fixed in nss-3.53.1-7.1.el7_9.0.0.hotfix.0.bz190926. This fix should be in the next nss release (nss-3.67). Since the problem described in this bug report should be resolved in a recent advisory, it has been closed with a resolution of ERRATA. For information on the advisory (nss, nss-softokn, nss-util, and nspr bug fix and enhancement update), and where to find the updated files, follow the link below. If the solution does not work for you, open a new bug report. https://access.redhat.com/errata/RHBA-2021:3793 |
User had issues with multiple hung curl commands in a script like this on latest rhel-7.9: """ #! /bin/sh for i in {0..29} do curl -k https://www.redhat.com/en & done """ Checking the backtrace we see: __lll_lock_wait () at ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:135 135 2: movl %edx, %eax (gdb) bt #0 __lll_lock_wait () at ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:135 #1 0x00007fc35732aed1 in _L_lock_1093 () from /lib64/libpthread.so.0 #2 0x00007fc35732ae72 in __GI___pthread_mutex_lock (mutex=mutex@entry=0xd12280) at ../nptl/pthread_mutex_lock.c:133 #3 0x00007fc35755feb9 in PR_Lock (lock=0xd12280) at ../../../nspr/pr/src/pthreads/ptsynch.c:176 #4 0x00007fc357e11839 in PK11_EnterSlotMonitor (slot=slot@entry=0xd74b70) at pk11slot.c:484 #5 0x00007fc357e1440d in pk11_filterSlot (slot=0xd74b70, mechanism=4231, mechanismInfoFlags=mechanismInfoFlags@entry=4, keySize=keySize@entry=0) at pk11slot.c:2289 #6 0x00007fc357e1444b in PK11_DoesMechanismFlag (slot=<optimized out>, type=<optimized out>, flags=flags@entry=4) at pk11slot.c:2126 #7 0x00007fc357decdfc in pk11_contextInitMessage (context=context@entry=0xdbec20, mech=0x7fff88424b60, initFunc=0x7fc34c99c260 <NSC_MessageDecryptInit>, flags=flags@entry=4, scrv=scrv@entry=0, key=0xdbeb90) at pk11cxt.c:175 #8 0x00007fc357ded769 in pk11_context_init (context=context@entry=0xdbec20, mech_info=mech_info@entry=0x7fff88424b60) at pk11cxt.c:235 #9 0x00007fc357ded92d in pk11_CreateNewContextInSlot (type=type@entry=4231, slot=0xd74b70, operation=operation@entry=2181038341, symKey=symKey@entry=0xdbeb90, param=param@entry=0x7fff88424c20) at pk11cxt.c:400 #10 0x00007fc357deda8f in PK11_CreateContextBySymKey (type=4231, operation=2181038341, symKey=0xdbeb90, param=param@entry=0x7fff88424c20) at pk11cxt.c:481 #11 0x00007fc3583200ae in ssl3_InitPendingContexts (spec=0xda0840, ss=0xda0ab0) at ssl3con.c:1805 #12 0x00007fc358324707 in ssl3_InitPendingCipherSpecs (ss=ss@entry=0xda0ab0, secret=secret@entry=0xdb5b90, derive=derive@entry=1) at ssl3con.c:1878 #13 0x00007fc358344629 in ssl3_SendECDHClientKeyExchange (ss=ss@entry=0xda0ab0, svrPubKey=svrPubKey@entry=0xda9a00) at ssl3ecc.c:235 #14 0x00007fc358329cb3 in ssl3_SendClientKeyExchange (ss=0xda0ab0) at ssl3con.c:6198 #15 ssl3_SendClientSecondRound (ss=ss@entry=0xda0ab0) at ssl3con.c:7856 #16 0x00007fc35832b6d6 in ssl3_HandleServerHelloDone (ss=0xda0ab0) at ssl3con.c:7783 #17 ssl3_HandlePostHelloHandshakeMessage (length=0, b=0xda5894 'N' <repeats 200 times>..., ss=0xda0ab0) at ssl3con.c:12177 #18 ssl3_HandleHandshakeMessage (ss=ss@entry=0xda0ab0, b=b@entry=0xda5894 'N' <repeats 200 times>..., length=0, endOfRecord=1) at ssl3con.c:12088 #19 0x00007fc35832e8d3 in ssl3_HandleHandshake (origBuf=0xda0d70, ss=0xda0ab0) at ssl3con.c:12262 #20 ssl3_HandleNonApplicationData (ss=ss@entry=0xda0ab0, rType=<optimized out>, epoch=<optimized out>, seqNum=<optimized out>, databuf=databuf@entry=0xda0d70) at ssl3con.c:12787 #21 0x00007fc35832ee71 in ssl3_HandleRecord (ss=ss@entry=0xda0ab0, cText=cText@entry=0x7fff884251b0) at ssl3con.c:13078 #22 0x00007fc35833047f in ssl3_GatherCompleteHandshake (ss=ss@entry=0xda0ab0, flags=flags@entry=0) at ssl3gthr.c:523 #23 0x00007fc358337eab in SSL_ForceHandshake (fd=<optimized out>) at sslsecur.c:370 #24 0x00007fc358337fc6 in SSL_ForceHandshakeWithTimeout (fd=<optimized out>, timeout=<optimized out>) at sslsecur.c:402 #25 0x00007fc3585aff73 in nss_do_connect (sockindex=0, conn=0xcf93c0) at nss.c:1672 #26 nss_connect_common (conn=conn@entry=0xcf93c0, sockindex=sockindex@entry=0, done=done@entry=0x7fff8842547d) at nss.c:1738 #27 0x00007fc3585b0ff5 in Curl_nss_connect_nonblocking (conn=conn@entry=0xcf93c0, sockindex=sockindex@entry=0, done=done@entry=0x7fff8842547d) at nss.c:1778 #28 0x00007fc3585a6d8e in Curl_ssl_connect_nonblocking (conn=conn@entry=0xcf93c0, sockindex=sockindex@entry=0, done=0x7fff8842547d) at sslgen.c:231 #29 0x00007fc35857d7ae in https_connecting (conn=0xcf93c0, done=<optimized out>) at http.c:1359 #30 0x00007fc3585a0728 in multi_runsingle (multi=multi@entry=0xce2200, now=..., easy=easy@entry=0xcf0240) at multi.c:1204 #31 0x00007fc3585a1641 in curl_multi_perform (multi_handle=multi_handle@entry=0xce2200, running_handles=running_handles@entry=0x7fff88425570) at multi.c:1763 #32 0x00007fc358598853 in curl_easy_perform (easy=easy@entry=0xcd8020) at easy.c:480 #33 0x0000000000409f46 in operate (config=config@entry=0x7fff88425940, argc=argc@entry=3, argv=argv@entry=0x7fff88425df8) at tool_operate.c:1368 #34 0x000000000040239e in main (argc=3, argv=0x7fff88425df8) at tool_main.c:107 Checking the mutex, it is already locked. On a quick inspection of source code, we see a possible mutex misuses, marked with '<<<--- ' below, in nss-3.53.1/nss/lib/pk11wrap/pk11slot.c: """ PK11_EnterSlotMonitor calls PZ_Lock PK11_ExitSlotMonitor calls PZ_Unlock """ CK_SESSION_HANDLE PK11_GetRWSession(PK11SlotInfo *slot) { CK_SESSION_HANDLE rwsession; CK_RV crv; PRBool haveMonitor = PR_FALSE; if (!slot->isThreadSafe || slot->defRWSession) { PK11_EnterSlotMonitor(slot); haveMonitor = PR_TRUE; } if (slot->defRWSession) { PORT_Assert(slot->session != CK_INVALID_HANDLE); if (slot->session != CK_INVALID_HANDLE) return slot->session; <<<--- did not release lock, will dead lock if attempting to get it again } crv = PK11_GETTAB(slot)->C_OpenSession(slot->slotID, CKF_RW_SESSION | CKF_SERIAL_SESSION, slot, pk11_notify, &rwsession); PORT_Assert(rwsession != CK_INVALID_HANDLE || crv != CKR_OK); if (crv != CKR_OK || rwsession == CK_INVALID_HANDLE) { if (crv == CKR_OK) crv = CKR_DEVICE_ERROR; if (haveMonitor) PK11_ExitSlotMonitor(slot); PORT_SetError(PK11_MapError(crv)); return CK_INVALID_HANDLE; } if (slot->defRWSession) { /* we have the monitor */ slot->session = rwsession; } return rwsession; <<<--- did not release lock, will dead lock if attempting to get it again } """