Bug 2196664
| Summary: | sssd_be segfaults | ||||||
|---|---|---|---|---|---|---|---|
| Product: | Red Hat Enterprise Linux 9 | Reporter: | Stephen Roylance <sdar> | ||||
| Component: | cyrus-sasl | Assignee: | Simo Sorce <ssorce> | ||||
| Status: | CLOSED MIGRATED | QA Contact: | BaseOS QE Security Team <qe-baseos-security> | ||||
| Severity: | medium | Docs Contact: | |||||
| Priority: | medium | ||||||
| Version: | CentOS Stream | CC: | bstinson, davide, jwboyer | ||||
| Target Milestone: | rc | Keywords: | MigratedToJIRA, Triaged | ||||
| Target Release: | --- | Flags: | pm-rhel:
mirror+
|
||||
| Hardware: | x86_64 | ||||||
| OS: | Linux | ||||||
| Whiteboard: | |||||||
| Fixed In Version: | Doc Type: | If docs needed, set a value | |||||
| Doc Text: | Story Points: | --- | |||||
| Clone Of: | Environment: | ||||||
| Last Closed: | 2023-08-28 19:27:58 UTC | Type: | Bug | ||||
| Regression: | --- | Mount Type: | --- | ||||
| Documentation: | --- | CRM: | |||||
| Verified Versions: | Category: | --- | |||||
| oVirt Team: | --- | RHEL 7.3 requirements from Atomic Host: | |||||
| Cloudforms Team: | --- | Target Upstream Version: | |||||
| Embargoed: | |||||||
| Attachments: |
|
||||||
You opened a bug against the RHEL 9 product, but the RPM you mention is an RHEL 8 rpm, did you file against the wrong product or did you copy the wrong rpm version? Sorry, the crash happened on 8. Our next update cycle will be on 9, though, so a fix in 8 won't help us in particular.
if the full backtrace is helpful, this is it with the domain name redacted:
#0 sasl_gss_encode (context=0x0, invec=<optimized out>, numiov=<optimized out>, output=0x562cff3bc538, outputlen=0x7ffd58ffb594, privacy=1) at gssapi.c:370
#1 0x00007f2f6de215ee in _sasl_encodev (conn=conn@entry=0x562cff412780, invec=invec@entry=0x7ffd58ffb560, numiov=numiov@entry=1, p_num_packets=p_num_packets@entry=0x7ffd58ffb4fc,
output=output@entry=0x562cff3bc538, outputlen=outputlen@entry=0x7ffd58ffb594) at common.c:359
#2 0x00007f2f6de23623 in sasl_encodev (conn=conn@entry=0x562cff412780, invec=invec@entry=0x7ffd58ffb560, numiov=numiov@entry=1, output=output@entry=0x562cff3bc538, outputlen=outputlen@entry=0x7ffd58ffb594)
at common.c:582
#3 0x00007f2f6de23750 in sasl_encode (conn=0x562cff412780, input=<optimized out>, inputlen=<optimized out>, output=output@entry=0x562cff3bc538, outputlen=outputlen@entry=0x7ffd58ffb594) at common.c:304
#4 0x00007f2f6e4730ca in sb_sasl_cyrus_encode (p=0x562cff3bc4b0, buf=<optimized out>, len=<optimized out>, dst=0x562cff3bc520) at cyrus.c:157
#5 0x00007f2f6e476350 in sb_sasl_generic_write (sbiod=0x562cff3b8880, buf=0x562cff419ff0, len=<optimized out>) at sasl.c:783
#6 0x00007f2f6e25585c in sb_debug_write (sbiod=0x562cff3a3050, buf=0x562cff419ff0, len=286) at sockbuf.c:854
#7 0x00007f2f6e25585c in sb_debug_write (sbiod=0x562cff3c2900, buf=0x562cff419ff0, len=286) at sockbuf.c:854
#8 0x00007f2f6e256f85 in ber_int_sb_write (sb=sb@entry=0x562cff2ef480, buf=0x562cff419ff0, len=len@entry=286) at sockbuf.c:445
#9 0x00007f2f6e253223 in ber_flush2 (sb=0x562cff2ef480, ber=0x562cff3720f0, freeit=freeit@entry=0) at io.c:246
#10 0x00007f2f6e481775 in ldap_int_flush_request (ld=ld@entry=0x562cff3d81a0, lr=lr@entry=0x562cff2ef2a0) at request.c:186
#11 0x00007f2f6e4819a7 in ldap_send_server_request (ld=ld@entry=0x562cff3d81a0, ber=ber@entry=0x562cff3720f0, msgid=msgid@entry=13, parentreq=parentreq@entry=0x0, srvlist=srvlist@entry=0x0,
lc=<optimized out>, lc@entry=0x0, bind=0x0, m_noconn=0, m_res=0) at request.c:408
#12 0x00007f2f6e481e26 in ldap_send_initial_request (ld=ld@entry=0x562cff3d81a0, msgtype=msgtype@entry=99, dn=dn@entry=0x562cff3c2f60 "cn=certmap,dc=XXX,dc=facebook,dc=com", ber=0x562cff3720f0, msgid=13)
at request.c:169
#13 0x00007f2f6e470d32 in ldap_pvt_search (ld=0x562cff3d81a0, base=0x562cff3c2f60 "cn=certmap,dc=XXX,dc=facebook,dc=com", scope=2,
filter=0x7f2f6a8afb10 "(|(&(objectClass=ipaCertMapRule)(ipaEnabledFlag=TRUE))(objectClass=ipaCertMapConfigObject))", attrs=0x7ffd58ffbd10, attrsonly=0, sctrls=0x562cff3d7990, cctrls=0x0, timeout=0x0,
sizelimit=0, deref=-1, msgidp=0x7ffd58ffba64) at search.c:128
#14 0x00007f2f6e470e14 in ldap_search_ext (ld=<optimized out>, base=<optimized out>, scope=<optimized out>, filter=<optimized out>, attrs=<optimized out>, attrsonly=<optimized out>, sctrls=0x562cff3d7990,
cctrls=0x0, timeout=0x0, sizelimit=0, msgidp=0x7ffd58ffba64) at search.c:69
#15 0x00007f2f6a1760d9 in sdap_get_generic_ext_step (req=req@entry=0x562cff3d76d0) at src/providers/ldap/sdap_async.c:1629
#16 0x00007f2f6a1765e9 in sdap_get_generic_ext_send (memctx=<optimized out>, ev=ev@entry=0x562cff2da460, opts=opts@entry=0x562cff2eab30, sh=sh@entry=0x562cff3b4dc0,
search_base=search_base@entry=0x562cff3c2f60 "cn=certmap,dc=XXX,dc=facebook,dc=com", scope=scope@entry=2,
filter=0x7f2f6a8afb10 "(|(&(objectClass=ipaCertMapRule)(ipaEnabledFlag=TRUE))(objectClass=ipaCertMapConfigObject))", attrs=0x7ffd58ffbd10, serverctrls=0x0, clientctrls=0x0, sizelimit=0, timeout=0,
parse_cb=0x7f2f6a173ae0 <sdap_get_and_parse_generic_parse_entry>, cb_data=0x562cff3dd390, flags=0) at src/providers/ldap/sdap_async.c:1567
#17 0x00007f2f6a177270 in sdap_get_and_parse_generic_send (memctx=memctx@entry=0x562cff3fa7a0, ev=ev@entry=0x562cff2da460, opts=opts@entry=0x562cff2eab30, sh=sh@entry=0x562cff3b4dc0,
search_base=search_base@entry=0x562cff3c2f60 "cn=certmap,dc=XXX,dc=facebook,dc=com", scope=scope@entry=2,
filter=0x7f2f6a8afb10 "(|(&(objectClass=ipaCertMapRule)(ipaEnabledFlag=TRUE))(objectClass=ipaCertMapConfigObject))", attrs=0x7ffd58ffbd10, map=0x0, map_num_attrs=0, attrsonly=0, serverctrls=0x0,
clientctrls=0x0, sizelimit=0, timeout=0, allow_paging=false) at src/providers/ldap/sdap_async.c:2020
#18 0x00007f2f6a177512 in sdap_get_generic_send (memctx=0x562cff3fa7a0, ev=0x562cff2da460, opts=0x562cff2eab30, sh=0x562cff3b4dc0, search_base=0x562cff3c2f60 "cn=certmap,dc=XXX,dc=facebook,dc=com", scope=2,
filter=0x7f2f6a8afb10 "(|(&(objectClass=ipaCertMapRule)(ipaEnabledFlag=TRUE))(objectClass=ipaCertMapConfigObject))", attrs=0x7ffd58ffbd10, map=0x0, map_num_attrs=0, timeout=0, allow_paging=false)
at src/providers/ldap/sdap_async.c:2121
#19 0x00007f2f6a871e52 in ipa_subdomains_refresh_ranges_done () from /usr/lib64/sssd/libsss_ipa.so
#20 0x00007f2f717b1ec2 in _tevent_req_error (req=<optimized out>, error=<optimized out>, location=<optimized out>) at ../../tevent_req.c:211
#21 0x00007f2f6a870969 in ipa_subdomains_ranges_done () from /usr/lib64/sssd/libsss_ipa.so
#22 0x00007f2f717b1ec2 in _tevent_req_error (req=req@entry=0x562cff3dfff0, error=error@entry=5, location=location@entry=0x7f2f6a1d7b20 "src/providers/ldap/sdap_ops.c:192") at ../../tevent_req.c:211
#23 0x00007f2f6a1a2a52 in sdap_search_bases_ex_done (subreq=0x0) at src/providers/ldap/sdap_ops.c:192
#24 0x00007f2f717b1ec2 in _tevent_req_error (req=<optimized out>, error=<optimized out>, location=<optimized out>) at ../../tevent_req.c:211
#25 0x00007f2f717b1ec2 in _tevent_req_error (req=req@entry=0x562cff3dd1d0, error=error@entry=5, location=location@entry=0x7f2f6a1beef0 "src/providers/ldap/sdap_async.c:1948") at ../../tevent_req.c:211
#26 0x00007f2f6a1738fe in generic_ext_search_handler (subreq=0x0, opts=<optimized out>) at src/providers/ldap/sdap_async.c:1948
#27 0x00007f2f717b1ec2 in _tevent_req_error (req=req@entry=0x562cff3d76d0, error=error@entry=5, location=location@entry=0x7f2f6a1bfdf0 "src/providers/ldap/sdap_async.c:1739") at ../../tevent_req.c:211
#28 0x00007f2f6a176b62 in sdap_get_generic_op_finished (op=<optimized out>, reply=0x0, error=5, pvt=<optimized out>) at src/providers/ldap/sdap_async.c:1739
#29 0x00007f2f6a174bff in sdap_handle_release (sh=0x562cff3b4dc0) at src/providers/ldap/sdap_async.c:143
#30 sdap_process_result (ev=<optimized out>, pvt=<optimized out>) at src/providers/ldap/sdap_async.c:245
#31 0x00007f2f717b0f97 in tevent_common_invoke_fd_handler (fde=fde@entry=0x562cff3b3f20, flags=<optimized out>, removed=removed@entry=0x0) at ../../tevent_fd.c:142
#32 0x00007f2f717b77af in epoll_event_loop (tvalp=0x7ffd58ffbfe0, epoll_ev=0x562cff2da740) at ../../tevent_epoll.c:736
#33 epoll_event_loop_once (ev=<optimized out>, location=<optimized out>) at ../../tevent_epoll.c:937
#34 0x00007f2f717b579b in std_event_loop_once (ev=0x562cff2da460, location=0x7f2f7461fff4 "src/util/server.c:744") at ../../tevent_standard.c:110
#35 0x00007f2f717b0365 in _tevent_loop_once (ev=ev@entry=0x562cff2da460, location=location@entry=0x7f2f7461fff4 "src/util/server.c:744") at ../../tevent.c:790
#36 0x00007f2f717b060b in tevent_common_loop_wait (ev=0x562cff2da460, location=0x7f2f7461fff4 "src/util/server.c:744") at ../../tevent.c:913
#37 0x00007f2f717b572b in std_event_loop_wait (ev=0x562cff2da460, location=0x7f2f7461fff4 "src/util/server.c:744") at ../../tevent_standard.c:141
#38 0x00007f2f745fda37 in server_loop (main_ctx=0x562cff2da7d0) at src/util/server.c:744
#39 0x0000562cfe3b0955 in main (argc=8, argv=<optimized out>) at src/providers/data_provider_be.c:802
Do you know if there is a way to reproduce this crash on demand, or is this happening at random? I can't trigger it on demand. It happens consistently, a few times a day, on our DGX nodes in production, and I can reliably see it happen by running all_reduce_perf from https://github.com/NVIDIA/nccl-tests for long enough on similar nodes in our test environment. Would you be able to use a test build with the patch and provide feedback on whether you see a drop in occurrences? (In reply to Simo Sorce from comment #6) > Would you be able to use a test build with the patch and provide feedback on > whether you see a drop in occurrences? yea, happy to. Will take at least a few weeks to get everything lined back up and get dedicated time on the test nodes. Created attachment 1968598 [details]
Bundle with cyrus-sasl test rpms
I attached to the bug a set of test packages to try. If they do resolve the issue I can schedule work to include this in a future RHEL update. Stephen, any news on this? (In reply to Simo Sorce from comment #10) > Stephen, > any news on this? sorry for the delay, I lost the test nodes to another project and am waiting for them to get rebuilt so I can use them. Issue migration from Bugzilla to Jira is in process at this time. This will be the last message in Jira copied from the Bugzilla bug. This BZ has been automatically migrated to the issues.redhat.com Red Hat Issue Tracker. All future work related to this report will be managed there. To find the migrated issue, look in the "Links" section for a direct link to the new issue location. The issue key will have an icon of 2 footprints next to it, and begin with "RHEL-" followed by an integer. You can also find this issue by visiting https://issues.redhat.com/issues/?jql= and searching the "Bugzilla Bug" field for this BZ's number, e.g. a search like: "Bugzilla Bug" = 1234567 In the event you have trouble locating or viewing this issue, you can file an issue by sending mail to rh-issues. |
Description of problem: sssd_be sometimes segfaults under load Version-Release number of selected component (if applicable): cyrus-sasl-2.1.27-6.el8_5.x86_64 How reproducible: on an nvidia DGX100 joined to an IPA realm running an mpi all_reduce performance test Additional info: #0 sasl_gss_encode (context=0x0, invec=<optimized out>, numiov=<optimized out>, output=0x562cff3bc538, outputlen=0x7ffd58ffb594, privacy=1) at gssapi.c:370 #1 0x00007f2f6de215ee in _sasl_encodev (conn=conn@entry=0x562cff412780, invec=invec@entry=0x7ffd58ffb560, numiov=numiov@entry=1, p_num_packets=p_num_packets@entry=0x7ffd58ffb4fc, output=output@entry=0x562cff3bc538, outputlen=outputlen@entry=0x7ffd58ffb594) at common.c:359 #2 0x00007f2f6de23623 in sasl_encodev (conn=conn@entry=0x562cff412780, invec=invec@entry=0x7ffd58ffb560, numiov=numiov@entry=1, output=output@entry=0x562cff3bc538, outputlen=outputlen@entry=0x7ffd58ffb594) at common.c:582 #3 0x00007f2f6de23750 in sasl_encode (conn=0x562cff412780, input=<optimized out>, inputlen=<optimized out>, output=output@entry=0x562cff3bc538, outputlen=outputlen@entry=0x7ffd58ffb594) at common.c:304 #4 0x00007f2f6e4730ca in sb_sasl_cyrus_encode (p=0x562cff3bc4b0, buf=<optimized out>, len=<optimized out>, dst=0x562cff3bc520) at cyrus.c:157 #5 0x00007f2f6e476350 in sb_sasl_generic_write (sbiod=0x562cff3b8880, buf=0x562cff419ff0, len=<optimized out>) at sasl.c:783 #6 0x00007f2f6e25585c in sb_debug_write (sbiod=0x562cff3a3050, buf=0x562cff419ff0, len=286) at sockbuf.c:854 #7 0x00007f2f6e25585c in sb_debug_write (sbiod=0x562cff3c2900, buf=0x562cff419ff0, len=286) at sockbuf.c:854 #8 0x00007f2f6e256f85 in ber_int_sb_write (sb=sb@entry=0x562cff2ef480, buf=0x562cff419ff0, len=len@entry=286) at sockbuf.c:445 #9 0x00007f2f6e253223 in ber_flush2 (sb=0x562cff2ef480, ber=0x562cff3720f0, freeit=freeit@entry=0) at io.c:246 #10 0x00007f2f6e481775 in ldap_int_flush_request (ld=ld@entry=0x562cff3d81a0, lr=lr@entry=0x562cff2ef2a0) at request.c:186 #11 
0x00007f2f6e4819a7 in ldap_send_server_request (ld=ld@entry=0x562cff3d81a0, ber=ber@entry=0x562cff3720f0, msgid=msgid@entry=13, parentreq=parentreq@entry=0x0, srvlist=srvlist@entry=0x0, lc=<optimized out>, lc@entry=0x0, bind=0x0, m_noconn=0, m_res=0) at request.c:408 Based on the conditions, I suspect this may be resolved with the upstream commit https://github.com/cyrusimap/cyrus-sasl/commit/df037bd4e20f7508fc36a9292d75e94c04dc8daa