Hide Forgot
Created attachment 486235 [details] Tarball containing logs, config, backtraces, etc Description of problem: Corosync crashes on multiple nodes with a segfault Version-Release number of selected component (if applicable): 1.3.0-1.fc14 How reproducible: Semi-regularly with Pacemaker CTS Additional info: Attachment will contain all logs, config files, backtraces and commands executed in the lead up to the crashes.
Created attachment 486454 [details] Core file with hopefully the same symptoms
#0 0x0000000000000000 in ?? () #1 0x00007fafba88eab7 in coroipcs_handler_dispatch (fd=<value optimized out>, revent=1, context=0x2426c90) at coroipcs.c:1662 #2 0x00007fafbaca19b2 in poll_run (handle=5902762718137417728) at coropoll.c:510 #3 0x000000000040774b in main (argc=<value optimized out>, argv=<value optimized out>, envp=<value optimized out>) at main.c:1813 (gdb) up #1 0x00007fafba88eab7 in coroipcs_handler_dispatch (fd=<value optimized out>, revent=1, context=0x2426c90) at coroipcs.c:1662 1662 api->init_fn_get (conn_info->service) (conn_info); (gdb) print conn_info->service $1 = 9 (gdb) print conn_info $2 = (struct conn_info *) 0x2426c90 (gdb) print conn_info $3 = (struct conn_info *) 0x2426c90 (gdb) print *conn_info $4 = {fd = 30, thread = 0, client_pid = 23604, thread_attr = { __size = '\000' <repeats 55 times>, __align = 0}, service = 9, state = CONN_STATE_THREAD_ACTIVE, refcount = 1, stats_handle = 0, pending_semops = 0, mutex = {__data = {__lock = 0, __count = 0, __owner = 0, __nusers = 0, __kind = 0, __spins = 0, __list = {__prev = 0x0, __next = 0x0}}, __size = '\000' <repeats 39 times>, __align = 0}, control_buffer = 0x7fafbaef4000, request_buffer = 0x7fafad40d000 "V\002", response_buffer = 0x7fafad30d000 "", dispatch_buffer = 0x7fafad10d000 "", control_size = 8192, request_size = 1048576, response_size = 1048576, dispatch_size = 1048576, outq_head = {next = 0x2426d68, prev = 0x2426d68}, private_data = 0x23f4a70, list = {next = 0x241c4e0, prev = 0x7fafbaa8fb60}, setup_msg = "\t", '\000' <repeats 15 times>, "/dev/shm/control_buffer-I4whc5", '\000' <repeats 4066 times>, "/dev/shm/request_buffer-U7rXbE", '\000' <repeats 4066 times>, "/dev/shm/response_buffer-yZJ0bd", '\000' <repeats 4065 times>, "/dev/shm/dispatch_buffer-JyoscM", '\000' <repeats 4066 times>, " \000\000\000\000\000\000\000\000\020\000\000\000\000\000\000\000\020\000\000\000\000\000\000\000\020\000\000\000\000", setup_bytes_read = 0, zcb_mapped_list_head = { next = 0x242adc8, prev = 
0x242adc8}, sending_allowed_private_data = { 0x0 <repeats 64 times>}, poll_state = 1} (gdb) print dispatch_buffer[4096] No symbol "dispatch_buffer" in current context. (gdb) print (char *)dispatch_buffer[4096] No symbol "dispatch_buffer" in current context. (gdb) print services No symbol "services" in current context. (gdb) up #2 0x00007fafbaca19b2 in poll_run (handle=5902762718137417728) at coropoll.c:510 510 res = poll_instance->poll_entries[i].dispatch_fn (handle, (gdb) p $5 = {fd = 30, thread = 0, client_pid = 23604, thread_attr = { __size = '\000' <repeats 55 times>, __align = 0}, service = 9, state = CONN_STATE_THREAD_ACTIVE, refcount = 1, stats_handle = 0, pending_semops = 0, mutex = {__data = {__lock = 0, __count = 0, __owner = 0, __nusers = 0, __kind = 0, __spins = 0, __list = {__prev = 0x0, __next = 0x0}}, __size = '\000' <repeats 39 times>, __align = 0}, control_buffer = 0x7fafbaef4000, request_buffer = 0x7fafad40d000 "V\002", response_buffer = 0x7fafad30d000 "", dispatch_buffer = 0x7fafad10d000 "", control_size = 8192, request_size = 1048576, response_size = 1048576, dispatch_size = 1048576, outq_head = {next = 0x2426d68, prev = 0x2426d68}, private_data = 0x23f4a70, list = {next = 0x241c4e0, prev = 0x7fafbaa8fb60}, setup_msg = "\t", '\000' <repeats 15 times>, "/dev/shm/control_buffer-I4whc5", '\000' <repeats 4066 times>, "/dev/shm/request_buffer-U7rXbE", '\000' <repeats 4066 times>, "/dev/shm/response_buffer-yZJ0bd", '\000' <repeats 4065 times>, "/dev/shm/dispatch_buffer-JyoscM", '\000' <repeats 4066 times>, " \000\000\000\000\000\000\000\000\020\000\000\000\000\000\000\000\020\000\000\000\000\000\000\000\020\000\000\000\000", setup_bytes_read = 0, zcb_mapped_list_head = { next = 0x242adc8, prev = 0x242adc8}, sending_allowed_private_data = { 0x0 <repeats 64 times>}, poll_state = 1} (gdb) up #3 0x000000000040774b in main (argc=<value optimized out>, argv=<value optimized out>, envp=<value optimized out>) at main.c:1813 1813 poll_run 
(corosync_poll_handle); (gdb) up Initial frame selected; you cannot go up. (gdb) print services No symbol "services" in current context. (gdb) print ais_servies No symbol "ais_servies" in current context. (gdb) print ais_service[9] $6 = (struct corosync_service_engine *) 0x7fafb58bf900 (gdb) print *ais_service[9] $7 = {name = 0x7fafb56bc818 <Address 0x7fafb56bc818 out of bounds>, id = 9, priority = 0, private_data_size = 0, flow_control = CS_LIB_FLOW_CONTROL_NOT_REQUIRED, allow_inquorate = CS_LIB_DISALLOW_INQUORATE, exec_init_fn = 0x7fafb56b5fb0, exec_exit_fn = 0, exec_dump_fn = 0, lib_init_fn = 0, lib_exit_fn = 0x7fafb56b5a10, lib_engine = 0x0, lib_engine_count = 0, exec_engine = 0x0, exec_engine_count = 0, config_init_fn = 0, confchg_fn = 0, sync_mode = CS_SYNC_V1, sync_init = 0, sync_process = 0, sync_activate = 0, sync_abort = 0} [sdake@beast corosync]$ grep -r CMAN * include/corosync/corodefs.h: CMAN_SERVICE = 9, Looks like something attempts to connect to the cman service via ipc. As I recall, cman doesn't use corosync IPC, but uses its own instead. PCMK_SERVICE = 10,
Do you have the fplay results from this crash? Thanks.
From the stable-1.0 tree: include/crm/ais_common.h:#define CRM_SERVICE 9 ... uh oh.... (That is the same service ID as CMAN_SERVICE = 9 above.)
We have a fix for this in https://bugzilla.redhat.com/attachment.cgi?id=488470&action=diff as part of trying to resolve https://bugzilla.redhat.com/show_bug.cgi?id=689418 It was posted to the mailing list, but has still not been reviewed ([Openais] [PATCH 1/1] coroipcs: Deny connect to service without initfn)
Patch now included in upstream git as 719fddd8e16b6da8694fa84dd2fafbb202401200