Bug 2209466

Summary: netsnmp_arch_swrun_container_load hung in a loop
Product: Red Hat Enterprise Linux 7 Reporter: Chris Cheney <ccheney>
Component: net-snmpAssignee: Josef Ridky <jridky>
Status: CLOSED WONTFIX QA Contact: CS System Management SST QE <rhel-cs-system-management-subsystem-qe>
Severity: medium Docs Contact:
Priority: medium    
Version: 7.9CC: ovasik
Target Milestone: rc   
Target Release: ---   
Hardware: All   
OS: All   
Whiteboard:
Fixed In Version: Doc Type: If docs needed, set a value
Doc Text:
Story Points: ---
Clone Of: Environment:
Last Closed: 2023-08-09 08:31:57 UTC Type: Bug
Regression: --- Mount Type: ---
Documentation: --- CRM:
Verified Versions: Category: ---
oVirt Team: --- RHEL 7.3 requirements from Atomic Host:
Cloudforms Team: --- Target Upstream Version:
Embargoed:

Description Chris Cheney 2023-05-24 03:02:16 UTC
Description of problem:

netsnmp_swrun_container_load appears to get stuck in a loop which causes snmpwalk to snmpwalk to fail along with snmpd


strace output from when it happens:

26821 14:04:16.651384 write(3</var/log/snmpd.log>, "2023-05-22 14:04:16 trace: ", 27) = 27 <0.000025>
26821 14:04:16.651455 write(3</var/log/snmpd.log>, "netsnmp_swrun_container_load(): host/data_access/swrun.c, 201:\n", 63) = 63 <0.000035>
26821 14:04:16.651611 write(3</var/log/snmpd.log>, "2023-05-22 14:04:16 swrun:container:load: ", 42) = 42 <0.000027>
26821 14:04:16.651682 write(3</var/log/snmpd.log>, "load\n", 5) = 5 <0.000022>

// it appears that the process gets hung in an incrementing loop here while processing /proc

26821 14:04:16.655383 write(3</var/log/snmpd.log>, "2023-05-22 14:04:16 compare:index: ", 35) = 35 <0.000030>
26821 14:04:16.655460 write(3</var/log/snmpd.log>, "compare ", 8) = 8 <0.000021>
26821 14:04:16.655539 write(3</var/log/snmpd.log>, ".1", 2) = 2 <0.000021>
26821 14:04:16.655610 write(3</var/log/snmpd.log>, " to ", 4) = 4 <0.000022>
26821 14:04:16.655675 write(3</var/log/snmpd.log>, ".2", 2) = 2 <0.000021>
26821 14:04:16.655739 write(3</var/log/snmpd.log>, "\n", 1) = 1 <0.000021>

26821 14:04:16.655884 write(3</var/log/snmpd.log>, "2023-05-22 14:04:16 compare:index: ", 35) = 35 <0.000022>
26821 14:04:16.655949 write(3</var/log/snmpd.log>, "result was -1\n", 14) = 14 <0.000027>

26821 14:04:16.657370 write(3</var/log/snmpd.log>, "2023-05-22 14:04:16 compare:index: ", 35) = 35 <0.000027>
26821 14:04:16.657448 write(3</var/log/snmpd.log>, "compare ", 8) = 8 <0.000021>
26821 14:04:16.657513 write(3</var/log/snmpd.log>, ".1", 2) = 2 <0.000021>
26821 14:04:16.657577 write(3</var/log/snmpd.log>, " to ", 4) = 4 <0.000021>
26821 14:04:16.657640 write(3</var/log/snmpd.log>, ".1", 2) = 2 <0.000021>
26821 14:04:16.657703 write(3</var/log/snmpd.log>, "\n", 1) = 1 <0.000021>

26821 14:04:16.657839 write(3</var/log/snmpd.log>, "2023-05-22 14:04:16 compare:index: ", 35) = 35 <0.000027>
26821 14:04:16.657908 write(3</var/log/snmpd.log>, "result was 0\n", 13) = 13 <0.000022>

< ... snipped a lot of repetition ... >

26821 14:18:47.123124 write(3</var/log/snmpd.log>, "2023-05-22 14:18:47 compare:index: ", 35) = 35 <0.000023>
26821 14:18:47.123191 write(3</var/log/snmpd.log>, "compare ", 8) = 8 <0.000023>
26821 14:18:47.123259 write(3</var/log/snmpd.log>, ".2280", 5) = 5 <0.000024> 
26821 14:18:47.123327 write(3</var/log/snmpd.log>, " to ", 4) = 4 <0.000024>
26821 14:18:47.123406 write(3</var/log/snmpd.log>, ".2358", 5) = 5 <0.000025>
26821 14:18:47.123478 write(3</var/log/snmpd.log>, "\n", 1) = 1 <0.000024>

26821 14:18:47.123626 write(3</var/log/snmpd.log>, "2023-05-22 14:18:47 compare:index: ", 35) = 35 <0.000024>
26821 14:18:47.123699 write(3</var/log/snmpd.log>, "result was -1\n", 14) = 14 <0.000024>

26821 14:18:47.123849 write(3</var/log/snmpd.log>, "2023-05-22 14:18:47 compare:index: ", 35) = 35 <0.000024>
26821 14:18:47.123920 write(3</var/log/snmpd.log>, "compare ", 8) = 8 <0.000025>
26821 14:18:47.124002 write(3</var/log/snmpd.log>, ".2280", 5) = 5 <0.000024>
26821 14:18:47.124072 write(3</var/log/snmpd.log>, " to ", 4) = 4 <0.000024>
26821 14:18:47.124142 write(3</var/log/snmpd.log>, ".2280", 5) = 5 <0.000025>
26821 14:18:47.124211 write(3</var/log/snmpd.log>, "\n", 1) = 1 <0.000023>

< ... end of output to snmpd.log ... >


Version:

net-snmp-5.7.2-49.el7_9.2.x86_64
net-snmp-agent-libs-5.7.2-49.el7_9.2.x86_64
net-snmp-libs-5.7.2-49.el7_9.2.x86_64
net-snmp-utils-5.7.2-49.el7_9.2.x86_64

Comment 3 Chris Cheney 2023-05-24 03:06:14 UTC
This is where it gets stuck:

26821 14:04:16.651384 write(3</var/log/snmpd.log>, "2023-05-22 14:04:16 trace: ", 27) = 27 <0.000025>
26821 14:04:16.651455 write(3</var/log/snmpd.log>, "netsnmp_swrun_container_load(): host/data_access/swrun.c, 201:\n", 63) = 63 <0.000035>
26821 14:04:16.651611 write(3</var/log/snmpd.log>, "2023-05-22 14:04:16 swrun:container:load: ", 42) = 42 <0.000027>
26821 14:04:16.651682 write(3</var/log/snmpd.log>, "load\n", 5) = 5 <0.000022>


==========


net-snmp-5.7.2/agent/mibgroup/host/data_access/swrun.c

/**
 * load swrun information in specified container
 *
 * @param container empty container to be filled.
 *                  pass NULL to have the function create one.
 * @param load_flags flags to modify behaviour. Examples:
 *                   NETSNMP_SWRUN_ALL_OR_NONE
 *
 * @retval NULL  error
 * @retval !NULL pointer to container
 */
netsnmp_container*
netsnmp_swrun_container_load(netsnmp_container* user_container, u_int load_flags)
{
    netsnmp_container* container = user_container;
    int rc;

    DEBUGMSGTL(("swrun:container:load", "load\n"));
    netsnmp_assert(1 == _swrun_init);

    if (NULL == container)
        container = netsnmp_swrun_container();
    if (NULL == container) {
        snmp_log(LOG_ERR, "no container specified/found for swrun\n");
        return NULL;
    }

    rc =  netsnmp_arch_swrun_container_load(container, load_flags);                <=== I suspect this function is where it gets stuck, unfortunately there is no DEBUGMSGTL output until it finishes...
    if (0 != rc) {
        if (NULL == user_container) {
            netsnmp_swrun_container_free(container, NETSNMP_SWRUN_NOFLAGS);
            container = NULL;
        }
        else if (load_flags & NETSNMP_SWRUN_ALL_OR_NONE) {
            DEBUGMSGTL(("swrun:container:load",
                        " discarding partial results\n"));
            netsnmp_swrun_container_free_items(container);
        }
    }

    return container;
}


===========


net-snmp-5.7.2/agent/mibgroup/host/data_access/swrun_procfs_status.c 

/* ---------------------------------------------------------------------
 */
int
netsnmp_arch_swrun_container_load( netsnmp_container *container, u_int flags)
{
    DIR                 *procdir = NULL;
    struct dirent       *procentry_p;
    FILE                *fp;
    int                  pid, i;
    unsigned long long   cpu;
    char                 buf[BUFSIZ], buf2[BUFSIZ], *cp;
    netsnmp_swrun_entry *entry;
    
    procdir = opendir("/proc");
    if ( NULL == procdir ) {
        snmp_log( LOG_ERR, "Failed to open /proc" );
        return -1;
    }

    /*
     * Walk through the list of processes in the /proc tree
     */
    while ( NULL != (procentry_p = readdir( procdir ))) {
        pid = atoi( procentry_p->d_name );
        if ( 0 == pid )
            continue;   /* Presumably '.' or '..' */

        entry = netsnmp_swrun_entry_create(pid);
        if (NULL == entry)
            continue;   /* error already logged by function */

        /*
         * Now extract the interesting information
         *   from the various /proc{PID}/ interface files
         */

        /*
         *   Name:  process name
         */
        snprintf( buf2, BUFSIZ, "/proc/%d/status", pid );
        fp = fopen( buf2, "r" );
        if (!fp) {
            netsnmp_swrun_entry_free(entry);
            continue; /* file (process) probably went away */
        }
        memset(buf, 0, sizeof(buf));
        if (fgets( buf, BUFSIZ-1, fp ) == NULL) {
            fclose(fp);
            netsnmp_swrun_entry_free(entry);
            continue;
        }
        fclose(fp);

        for ( cp = buf; *cp != ':'; cp++ )
            ;
        while (isspace(*(++cp)))        /* Skip ':' and following spaces */
            ;
        entry->hrSWRunName_len = snprintf(entry->hrSWRunName,
                                   sizeof(entry->hrSWRunName)-1, "%s", cp);
        if ( '\n' == entry->hrSWRunName[ entry->hrSWRunName_len-1 ]) {
            entry->hrSWRunName[ entry->hrSWRunName_len-1 ] = '\0';
            entry->hrSWRunName_len--;           /* Stamp on trailing newline */
        }

        /*
         *  Command Line:
         *     argv[0] '\0' argv[1] '\0' ....
         */
        snprintf( buf2, BUFSIZ, "/proc/%d/cmdline", pid );
        fp = fopen( buf2, "r" );
        if (!fp) {
            netsnmp_swrun_entry_free(entry);
            continue; /* file (process) probably went away */
        }
        memset(buf, 0, sizeof(buf));
        if ((cp = fgets( buf, BUFSIZ-1, fp )) == NULL) {
            fclose(fp);
            netsnmp_swrun_entry_free(entry);
            continue;
        }
        fclose(fp);

        /*
         *     argv[0]   is hrSWRunPath
         */ 
        entry->hrSWRunPath_len = snprintf(entry->hrSWRunPath,
                                   sizeof(entry->hrSWRunPath)-1, "%s", buf);
        /*
         * Stitch together argv[1..] to construct hrSWRunParameters
         */
        cp = buf + entry->hrSWRunPath_len+1;
        while ( 1 ) {
            while (*cp)
                cp++;
            if ( '\0' == *(cp+1))
                break;      /* '\0''\0' => End of command line */
            *cp = ' ';
        }
        entry->hrSWRunParameters_len
            = sprintf(entry->hrSWRunParameters, "%.*s",
                      (int)sizeof(entry->hrSWRunParameters) - 1,
                      buf + entry->hrSWRunPath_len + 1);
 
        /*
         * XXX - No information regarding system processes vs applications
         */
        entry->hrSWRunType = HRSWRUNTYPE_APPLICATION;

        /*
         *   {xxx} {xxx} STATUS  {xxx}*10  UTIME STIME  {xxx}*8 RSS
         */
        snprintf( buf, BUFSIZ, "/proc/%d/stat", pid );
        fp = fopen( buf, "r" );
        if (!fp) {
            netsnmp_swrun_entry_free(entry);
            continue; /* file (process) probably went away */
        }
        if (fgets( buf, BUFSIZ-1, fp ) == NULL) {
            fclose(fp);
            netsnmp_swrun_entry_free(entry);
            continue;
        }
        fclose(fp);

        cp = buf;
        while ( ' ' != *(cp++))    /* Skip first field */
            ;
        while ( ' ' != *(cp++))    /* Skip second field */
            ;
        
        switch (*cp) {
        case 'R':  entry->hrSWRunStatus = HRSWRUNSTATUS_RUNNING;
                   break;
        case 'S':  entry->hrSWRunStatus = HRSWRUNSTATUS_RUNNABLE;
                   break;
        case 'D':
        case 'T':  entry->hrSWRunStatus = HRSWRUNSTATUS_NOTRUNNABLE;
                   break;
        case 'Z':
        default:   entry->hrSWRunStatus = HRSWRUNSTATUS_INVALID;
                   break;
        }
        for (i=11; i; i--) {   /* Skip STATUS + 10 fields */
            while (' ' != *(++cp))
                ;
            cp++;
        }
        cpu  = atol( cp );                     /*  utime */
        while ( ' ' != *(++cp))
            ;
        cpu += atol( cp );                     /* +stime */
        entry->hrSWRunPerfCPU  = cpu * 100 / sc_clk_tck;

        for (i=9; i; i--) {   /* Skip stime + 8 fields */
            while (' ' != *(++cp))
                ;
            cp++;
        }
        entry->hrSWRunPerfMem  = atol( cp );       /* rss   */
        entry->hrSWRunPerfMem *= (pagesize/1024);  /* in kB */
        CONTAINER_INSERT(container, entry);                                                     <===  This is probably where the next function comes in (see below)
    }
    closedir( procdir );

    DEBUGMSGTL(("swrun:load:arch"," loaded %" NETSNMP_PRIz "d entries\n",                       <=== Appears it never actually makes it to here as we don't see it in the debug output
                CONTAINER_SIZE(container)));

    return 0;
}


=========


The CONTAINER_INSERT(container, entry); function is in the same source file "net-snmp-5.7.2/snmplib/container.c" as the functions that print the output that is stuck in the loop.


==========


And all the messages that look like the following come from the following code:

26821 14:04:16.655383 write(3</var/log/snmpd.log>, "2023-05-22 14:04:16 compare:index: ", 35) = 35 <0.000030>
26821 14:04:16.655460 write(3</var/log/snmpd.log>, "compare ", 8) = 8 <0.000021>
26821 14:04:16.655539 write(3</var/log/snmpd.log>, ".1", 2) = 2 <0.000021>
26821 14:04:16.655610 write(3</var/log/snmpd.log>, " to ", 4) = 4 <0.000022>
26821 14:04:16.655675 write(3</var/log/snmpd.log>, ".2", 2) = 2 <0.000021>
26821 14:04:16.655739 write(3</var/log/snmpd.log>, "\n", 1) = 1 <0.000021>

26821 14:04:16.655884 write(3</var/log/snmpd.log>, "2023-05-22 14:04:16 compare:index: ", 35) = 35 <0.000022>
26821 14:04:16.655949 write(3</var/log/snmpd.log>, "result was -1\n", 14) = 14 <0.000027>


==========


net-snmp-5.7.2/snmplib/container.c :

/*------------------------------------------------------------------
 *
 * simple comparison routines
 * 
 */
int
netsnmp_compare_netsnmp_index(const void *lhs, const void *rhs)
{
    int rc;
    netsnmp_assert((NULL != lhs) && (NULL != rhs));
    DEBUGIF("compare:index") {
        DEBUGMSGT(("compare:index", "compare "));
        DEBUGMSGSUBOID(("compare:index", ((const netsnmp_index *) lhs)->oids,
                     ((const netsnmp_index *) lhs)->len));
        DEBUGMSG(("compare:index", " to "));
        DEBUGMSGSUBOID(("compare:index", ((const netsnmp_index *) rhs)->oids,
                     ((const netsnmp_index *) rhs)->len));
        DEBUGMSG(("compare:index", "\n"));
    }
    rc = snmp_oid_compare(((const netsnmp_index *) lhs)->oids,
                          ((const netsnmp_index *) lhs)->len,
                          ((const netsnmp_index *) rhs)->oids,
                          ((const netsnmp_index *) rhs)->len);
    DEBUGMSGT(("compare:index", "result was %d\n", rc));
    return rc;
}   

int
netsnmp_ncompare_netsnmp_index(const void *lhs, const void *rhs)
{
    int rc;
    netsnmp_assert((NULL != lhs) && (NULL != rhs));
    DEBUGIF("compare:index") {
        DEBUGMSGT(("compare:index", "compare "));
        DEBUGMSGSUBOID(("compare:index", ((const netsnmp_index *) lhs)->oids,
                     ((const netsnmp_index *) lhs)->len));
        DEBUGMSG(("compare:index", " to "));
        DEBUGMSGSUBOID(("compare:index", ((const netsnmp_index *) rhs)->oids,
                     ((const netsnmp_index *) rhs)->len));
        DEBUGMSG(("compare:index", "\n"));
    }
    rc = snmp_oid_ncompare(((const netsnmp_index *) lhs)->oids,
                           ((const netsnmp_index *) lhs)->len,
                           ((const netsnmp_index *) rhs)->oids,
                           ((const netsnmp_index *) rhs)->len,
                           ((const netsnmp_index *) rhs)->len);
    DEBUGMSGT(("compare:index", "result was %d\n", rc));
    return rc;
}


==========