Description of problem: netsnmp_swrun_container_load appears to get stuck in a loop which causes snmpwalk to snmpwalk to fail along with snmpd strace output from when it happens: 26821 14:04:16.651384 write(3</var/log/snmpd.log>, "2023-05-22 14:04:16 trace: ", 27) = 27 <0.000025> 26821 14:04:16.651455 write(3</var/log/snmpd.log>, "netsnmp_swrun_container_load(): host/data_access/swrun.c, 201:\n", 63) = 63 <0.000035> 26821 14:04:16.651611 write(3</var/log/snmpd.log>, "2023-05-22 14:04:16 swrun:container:load: ", 42) = 42 <0.000027> 26821 14:04:16.651682 write(3</var/log/snmpd.log>, "load\n", 5) = 5 <0.000022> // it appears that the process gets hung in an incrementing loop here while processing /proc 26821 14:04:16.655383 write(3</var/log/snmpd.log>, "2023-05-22 14:04:16 compare:index: ", 35) = 35 <0.000030> 26821 14:04:16.655460 write(3</var/log/snmpd.log>, "compare ", 8) = 8 <0.000021> 26821 14:04:16.655539 write(3</var/log/snmpd.log>, ".1", 2) = 2 <0.000021> 26821 14:04:16.655610 write(3</var/log/snmpd.log>, " to ", 4) = 4 <0.000022> 26821 14:04:16.655675 write(3</var/log/snmpd.log>, ".2", 2) = 2 <0.000021> 26821 14:04:16.655739 write(3</var/log/snmpd.log>, "\n", 1) = 1 <0.000021> 26821 14:04:16.655884 write(3</var/log/snmpd.log>, "2023-05-22 14:04:16 compare:index: ", 35) = 35 <0.000022> 26821 14:04:16.655949 write(3</var/log/snmpd.log>, "result was -1\n", 14) = 14 <0.000027> 26821 14:04:16.657370 write(3</var/log/snmpd.log>, "2023-05-22 14:04:16 compare:index: ", 35) = 35 <0.000027> 26821 14:04:16.657448 write(3</var/log/snmpd.log>, "compare ", 8) = 8 <0.000021> 26821 14:04:16.657513 write(3</var/log/snmpd.log>, ".1", 2) = 2 <0.000021> 26821 14:04:16.657577 write(3</var/log/snmpd.log>, " to ", 4) = 4 <0.000021> 26821 14:04:16.657640 write(3</var/log/snmpd.log>, ".1", 2) = 2 <0.000021> 26821 14:04:16.657703 write(3</var/log/snmpd.log>, "\n", 1) = 1 <0.000021> 26821 14:04:16.657839 write(3</var/log/snmpd.log>, "2023-05-22 14:04:16 compare:index: ", 35) = 35 <0.000027> 26821 14:04:16.657908 write(3</var/log/snmpd.log>, "result was 0\n", 13) = 13 <0.000022> < ... snipped a lot of repetition ... > 26821 14:18:47.123124 write(3</var/log/snmpd.log>, "2023-05-22 14:18:47 compare:index: ", 35) = 35 <0.000023> 26821 14:18:47.123191 write(3</var/log/snmpd.log>, "compare ", 8) = 8 <0.000023> 26821 14:18:47.123259 write(3</var/log/snmpd.log>, ".2280", 5) = 5 <0.000024> 26821 14:18:47.123327 write(3</var/log/snmpd.log>, " to ", 4) = 4 <0.000024> 26821 14:18:47.123406 write(3</var/log/snmpd.log>, ".2358", 5) = 5 <0.000025> 26821 14:18:47.123478 write(3</var/log/snmpd.log>, "\n", 1) = 1 <0.000024> 26821 14:18:47.123626 write(3</var/log/snmpd.log>, "2023-05-22 14:18:47 compare:index: ", 35) = 35 <0.000024> 26821 14:18:47.123699 write(3</var/log/snmpd.log>, "result was -1\n", 14) = 14 <0.000024> 26821 14:18:47.123849 write(3</var/log/snmpd.log>, "2023-05-22 14:18:47 compare:index: ", 35) = 35 <0.000024> 26821 14:18:47.123920 write(3</var/log/snmpd.log>, "compare ", 8) = 8 <0.000025> 26821 14:18:47.124002 write(3</var/log/snmpd.log>, ".2280", 5) = 5 <0.000024> 26821 14:18:47.124072 write(3</var/log/snmpd.log>, " to ", 4) = 4 <0.000024> 26821 14:18:47.124142 write(3</var/log/snmpd.log>, ".2280", 5) = 5 <0.000025> 26821 14:18:47.124211 write(3</var/log/snmpd.log>, "\n", 1) = 1 <0.000023> < ... end of output to snmpd.log ... > Version: net-snmp-5.7.2-49.el7_9.2.x86_64 net-snmp-agent-libs-5.7.2-49.el7_9.2.x86_64 net-snmp-libs-5.7.2-49.el7_9.2.x86_64 net-snmp-utils-5.7.2-49.el7_9.2.x86_64
This is where it gets stuck: 26821 14:04:16.651384 write(3</var/log/snmpd.log>, "2023-05-22 14:04:16 trace: ", 27) = 27 <0.000025> 26821 14:04:16.651455 write(3</var/log/snmpd.log>, "netsnmp_swrun_container_load(): host/data_access/swrun.c, 201:\n", 63) = 63 <0.000035> 26821 14:04:16.651611 write(3</var/log/snmpd.log>, "2023-05-22 14:04:16 swrun:container:load: ", 42) = 42 <0.000027> 26821 14:04:16.651682 write(3</var/log/snmpd.log>, "load\n", 5) = 5 <0.000022> ========== net-snmp-5.7.2/agent/mibgroup/host/data_access/swrun.c /** * load swrun information in specified container * * @param container empty container to be filled. * pass NULL to have the function create one. * @param load_flags flags to modify behaviour. Examples: * NETSNMP_SWRUN_ALL_OR_NONE * * @retval NULL error * @retval !NULL pointer to container */ netsnmp_container* netsnmp_swrun_container_load(netsnmp_container* user_container, u_int load_flags) { netsnmp_container* container = user_container; int rc; DEBUGMSGTL(("swrun:container:load", "load\n")); netsnmp_assert(1 == _swrun_init); if (NULL == container) container = netsnmp_swrun_container(); if (NULL == container) { snmp_log(LOG_ERR, "no container specified/found for swrun\n"); return NULL; } rc = netsnmp_arch_swrun_container_load(container, load_flags); <=== I suspect this function is where it gets stuck, unfortunately there is no DEBUGMSGTL output until it finishes... if (0 != rc) { if (NULL == user_container) { netsnmp_swrun_container_free(container, NETSNMP_SWRUN_NOFLAGS); container = NULL; } else if (load_flags & NETSNMP_SWRUN_ALL_OR_NONE) { DEBUGMSGTL(("swrun:container:load", " discarding partial results\n")); netsnmp_swrun_container_free_items(container); } } return container; } =========== net-snmp-5.7.2/agent/mibgroup/host/data_access/swrun_procfs_status.c /* --------------------------------------------------------------------- */ int netsnmp_arch_swrun_container_load( netsnmp_container *container, u_int flags) { DIR *procdir = NULL; struct dirent *procentry_p; FILE *fp; int pid, i; unsigned long long cpu; char buf[BUFSIZ], buf2[BUFSIZ], *cp; netsnmp_swrun_entry *entry; procdir = opendir("/proc"); if ( NULL == procdir ) { snmp_log( LOG_ERR, "Failed to open /proc" ); return -1; } /* * Walk through the list of processes in the /proc tree */ while ( NULL != (procentry_p = readdir( procdir ))) { pid = atoi( procentry_p->d_name ); if ( 0 == pid ) continue; /* Presumably '.' or '..' */ entry = netsnmp_swrun_entry_create(pid); if (NULL == entry) continue; /* error already logged by function */ /* * Now extract the interesting information * from the various /proc{PID}/ interface files */ /* * Name: process name */ snprintf( buf2, BUFSIZ, "/proc/%d/status", pid ); fp = fopen( buf2, "r" ); if (!fp) { netsnmp_swrun_entry_free(entry); continue; /* file (process) probably went away */ } memset(buf, 0, sizeof(buf)); if (fgets( buf, BUFSIZ-1, fp ) == NULL) { fclose(fp); netsnmp_swrun_entry_free(entry); continue; } fclose(fp); for ( cp = buf; *cp != ':'; cp++ ) ; while (isspace(*(++cp))) /* Skip ':' and following spaces */ ; entry->hrSWRunName_len = snprintf(entry->hrSWRunName, sizeof(entry->hrSWRunName)-1, "%s", cp); if ( '\n' == entry->hrSWRunName[ entry->hrSWRunName_len-1 ]) { entry->hrSWRunName[ entry->hrSWRunName_len-1 ] = '\0'; entry->hrSWRunName_len--; /* Stamp on trailing newline */ } /* * Command Line: * argv[0] '\0' argv[1] '\0' .... */ snprintf( buf2, BUFSIZ, "/proc/%d/cmdline", pid ); fp = fopen( buf2, "r" ); if (!fp) { netsnmp_swrun_entry_free(entry); continue; /* file (process) probably went away */ } memset(buf, 0, sizeof(buf)); if ((cp = fgets( buf, BUFSIZ-1, fp )) == NULL) { fclose(fp); netsnmp_swrun_entry_free(entry); continue; } fclose(fp); /* * argv[0] is hrSWRunPath */ entry->hrSWRunPath_len = snprintf(entry->hrSWRunPath, sizeof(entry->hrSWRunPath)-1, "%s", buf); /* * Stitch together argv[1..] to construct hrSWRunParameters */ cp = buf + entry->hrSWRunPath_len+1; while ( 1 ) { while (*cp) cp++; if ( '\0' == *(cp+1)) break; /* '\0''\0' => End of command line */ *cp = ' '; } entry->hrSWRunParameters_len = sprintf(entry->hrSWRunParameters, "%.*s", (int)sizeof(entry->hrSWRunParameters) - 1, buf + entry->hrSWRunPath_len + 1); /* * XXX - No information regarding system processes vs applications */ entry->hrSWRunType = HRSWRUNTYPE_APPLICATION; /* * {xxx} {xxx} STATUS {xxx}*10 UTIME STIME {xxx}*8 RSS */ snprintf( buf, BUFSIZ, "/proc/%d/stat", pid ); fp = fopen( buf, "r" ); if (!fp) { netsnmp_swrun_entry_free(entry); continue; /* file (process) probably went away */ } if (fgets( buf, BUFSIZ-1, fp ) == NULL) { fclose(fp); netsnmp_swrun_entry_free(entry); continue; } fclose(fp); cp = buf; while ( ' ' != *(cp++)) /* Skip first field */ ; while ( ' ' != *(cp++)) /* Skip second field */ ; switch (*cp) { case 'R': entry->hrSWRunStatus = HRSWRUNSTATUS_RUNNING; break; case 'S': entry->hrSWRunStatus = HRSWRUNSTATUS_RUNNABLE; break; case 'D': case 'T': entry->hrSWRunStatus = HRSWRUNSTATUS_NOTRUNNABLE; break; case 'Z': default: entry->hrSWRunStatus = HRSWRUNSTATUS_INVALID; break; } for (i=11; i; i--) { /* Skip STATUS + 10 fields */ while (' ' != *(++cp)) ; cp++; } cpu = atol( cp ); /* utime */ while ( ' ' != *(++cp)) ; cpu += atol( cp ); /* +stime */ entry->hrSWRunPerfCPU = cpu * 100 / sc_clk_tck; for (i=9; i; i--) { /* Skip stime + 8 fields */ while (' ' != *(++cp)) ; cp++; } entry->hrSWRunPerfMem = atol( cp ); /* rss */ entry->hrSWRunPerfMem *= (pagesize/1024); /* in kB */ CONTAINER_INSERT(container, entry); <=== This is probably where the next function comes in (see below) } closedir( procdir ); DEBUGMSGTL(("swrun:load:arch"," loaded %" NETSNMP_PRIz "d entries\n", <=== Appears it never actually makes it to here as we don't see it in the debug output CONTAINER_SIZE(container))); return 0; } ========= The CONTAINER_INSERT(container, entry); function is in the same source file "net-snmp-5.7.2/snmplib/container.c" as the functions that print the output that is stuck in the loop. ========== And all the messages that look like the following come from the following code: 26821 14:04:16.655383 write(3</var/log/snmpd.log>, "2023-05-22 14:04:16 compare:index: ", 35) = 35 <0.000030> 26821 14:04:16.655460 write(3</var/log/snmpd.log>, "compare ", 8) = 8 <0.000021> 26821 14:04:16.655539 write(3</var/log/snmpd.log>, ".1", 2) = 2 <0.000021> 26821 14:04:16.655610 write(3</var/log/snmpd.log>, " to ", 4) = 4 <0.000022> 26821 14:04:16.655675 write(3</var/log/snmpd.log>, ".2", 2) = 2 <0.000021> 26821 14:04:16.655739 write(3</var/log/snmpd.log>, "\n", 1) = 1 <0.000021> 26821 14:04:16.655884 write(3</var/log/snmpd.log>, "2023-05-22 14:04:16 compare:index: ", 35) = 35 <0.000022> 26821 14:04:16.655949 write(3</var/log/snmpd.log>, "result was -1\n", 14) = 14 <0.000027> ========== net-snmp-5.7.2/snmplib/container.c : /*------------------------------------------------------------------ * * simple comparison routines * */ int netsnmp_compare_netsnmp_index(const void *lhs, const void *rhs) { int rc; netsnmp_assert((NULL != lhs) && (NULL != rhs)); DEBUGIF("compare:index") { DEBUGMSGT(("compare:index", "compare ")); DEBUGMSGSUBOID(("compare:index", ((const netsnmp_index *) lhs)->oids, ((const netsnmp_index *) lhs)->len)); DEBUGMSG(("compare:index", " to ")); DEBUGMSGSUBOID(("compare:index", ((const netsnmp_index *) rhs)->oids, ((const netsnmp_index *) rhs)->len)); DEBUGMSG(("compare:index", "\n")); } rc = snmp_oid_compare(((const netsnmp_index *) lhs)->oids, ((const netsnmp_index *) lhs)->len, ((const netsnmp_index *) rhs)->oids, ((const netsnmp_index *) rhs)->len); DEBUGMSGT(("compare:index", "result was %d\n", rc)); return rc; } int netsnmp_ncompare_netsnmp_index(const void *lhs, const void *rhs) { int rc; netsnmp_assert((NULL != lhs) && (NULL != rhs)); DEBUGIF("compare:index") { DEBUGMSGT(("compare:index", "compare ")); DEBUGMSGSUBOID(("compare:index", ((const netsnmp_index *) lhs)->oids, ((const netsnmp_index *) lhs)->len)); DEBUGMSG(("compare:index", " to ")); DEBUGMSGSUBOID(("compare:index", ((const netsnmp_index *) rhs)->oids, ((const netsnmp_index *) rhs)->len)); DEBUGMSG(("compare:index", "\n")); } rc = snmp_oid_ncompare(((const netsnmp_index *) lhs)->oids, ((const netsnmp_index *) lhs)->len, ((const netsnmp_index *) rhs)->oids, ((const netsnmp_index *) rhs)->len, ((const netsnmp_index *) rhs)->len); DEBUGMSGT(("compare:index", "result was %d\n", rc)); return rc; } ==========