Bug 1900261

Summary: stalld is not restarted after crash
Product: OpenShift Container Platform Reporter: OpenShift BugZilla Robot <openshift-bugzilla-robot>
Component: Node Tuning Operator    Assignee: Jiří Mencák <jmencak>
Status: CLOSED ERRATA QA Contact: Simon <skordas>
Severity: high Docs Contact:
Priority: high    
Version: 4.6.z    CC: sejug
Target Milestone: ---   
Target Release: 4.6.z   
Hardware: Unspecified   
OS: Unspecified   
Whiteboard:
Fixed In Version: Doc Type: If docs needed, set a value
Doc Text:
Story Points: ---
Clone Of: Environment:
Last Closed: 2020-12-14 13:51:18 UTC Type: ---
Regression: --- Mount Type: ---
Documentation: --- CRM:
Verified Versions: Category: ---
oVirt Team: --- RHEL 7.3 requirements from Atomic Host:
Cloudforms Team: --- Target Upstream Version:
Embargoed:
Bug Depends On: 1900196    
Bug Blocks:    

Comment 3 Simon 2020-11-30 20:04:05 UTC
$ oc get clusterversions.config.openshift.io
NAME      VERSION                             AVAILABLE   PROGRESSING   SINCE   STATUS
version   4.6.0-0.nightly-2020-11-29-153115   True        False         5h27m   Cluster version is 4.6.0-0.nightly-2020-11-29-153115

$ oc project openshift-cluster-node-tuning-operator
Now using project "openshift-cluster-node-tuning-operator" on server "https://api.skordas3011.qe.devcluster.openshift.com:6443".

$ oc get nodes
NAME                                         STATUS   ROLES    AGE     VERSION
ip-10-0-130-111.us-east-2.compute.internal   Ready    worker   5h44m   v1.19.0+1348ff8
ip-10-0-137-83.us-east-2.compute.internal    Ready    master   5h52m   v1.19.0+1348ff8
ip-10-0-165-228.us-east-2.compute.internal   Ready    worker   5h46m   v1.19.0+1348ff8
ip-10-0-179-178.us-east-2.compute.internal   Ready    master   5h51m   v1.19.0+1348ff8
ip-10-0-192-112.us-east-2.compute.internal   Ready    master   5h52m   v1.19.0+1348ff8
ip-10-0-219-138.us-east-2.compute.internal   Ready    worker   5h46m   v1.19.0+1348ff8

$ # Using worker node
$ node=ip-10-0-130-111.us-east-2.compute.internal

$ echo $node
ip-10-0-130-111.us-east-2.compute.internal

$ oc label node $node node-role.kubernetes.io/worker-rt=
node/ip-10-0-130-111.us-east-2.compute.internal labeled


$ oc create -f- <<EOF
apiVersion: machineconfiguration.openshift.io/v1
kind: MachineConfigPool
metadata:
 name: worker-rt
 labels:
   worker-rt: ""
spec:
 machineConfigSelector:
   matchExpressions:
     - {key: machineconfiguration.openshift.io/role, operator: In, values: [worker,worker-rt]}
 nodeSelector:
   matchLabels:
     node-role.kubernetes.io/worker-rt: ""
EOF
machineconfigpool.machineconfiguration.openshift.io/worker-rt created

$ oc create -f- <<EOF
apiVersion: tuned.openshift.io/v1
kind: Tuned
metadata:
 name: openshift-realtime
 namespace: openshift-cluster-node-tuning-operator
spec:
 profile:
 - data: |
     [main]
     summary=Custom OpenShift realtime profile
     include=openshift-node,realtime
     [variables]
     # isolated_cores take a list of ranges; e.g. isolated_cores=2,4-7
     isolated_cores=1
     #isolate_managed_irq=Y
     not_isolated_cores_expanded=${f:cpulist_invert:${isolated_cores_expanded}}
     [bootloader]
     cmdline_ocp_realtime=+systemd.cpu_affinity=${not_isolated_cores_expanded}
     [service]
     service.stalld=start,enable
   name: openshift-realtime

 recommend:
 - machineConfigLabels:
     machineconfiguration.openshift.io/role: "worker-rt"
   priority: 20
   profile: openshift-realtime
EOF
tuned.tuned.openshift.io/openshift-realtime created

$ oc get nodes
NAME                                         STATUS   ROLES              AGE     VERSION
ip-10-0-130-111.us-east-2.compute.internal   Ready    worker,worker-rt   5h58m   v1.19.0+1348ff8
ip-10-0-137-83.us-east-2.compute.internal    Ready    master             6h6m    v1.19.0+1348ff8
ip-10-0-165-228.us-east-2.compute.internal   Ready    worker             6h1m    v1.19.0+1348ff8
ip-10-0-179-178.us-east-2.compute.internal   Ready    master             6h6m    v1.19.0+1348ff8
ip-10-0-192-112.us-east-2.compute.internal   Ready    master             6h6m    v1.19.0+1348ff8
ip-10-0-219-138.us-east-2.compute.internal   Ready    worker             6h1m    v1.19.0+1348ff8

$ oc get mcp
NAME        CONFIG                                                UPDATED   UPDATING   DEGRADED   MACHINECOUNT   READYMACHINECOUNT   UPDATEDMACHINECOUNT   DEGRADEDMACHINECOUNT   AGE
master      rendered-master-bcd76863189cb74a9a9308f09d269ec2      True      False      False      3              3                   3                     0                      6h5m
worker      rendered-worker-a9cd4042060bde8806aa3e8ec4cfd290      True      False      False      2              2                   2                     0                      6h5m
worker-rt   rendered-worker-rt-69e07f420cd5c15e33ed5ca889787e4f   True      False      False      1              1                   1                     0                      11m

$ oc debug node/$node
Starting pod/ip-10-0-130-111us-east-2computeinternal-debug ...
To use host binaries, run `chroot /host`
Pod IP: 10.0.130.111
If you don't see a command prompt, try pressing enter.
sh-4.4# ps auxww | grep stalld
root        3495  0.4  0.0   7440  2612 ?        Ss   19:53   0:02 /usr/local/bin/stalld -p 1000000000 -r 10000 -d 3 -t 20 --log_syslog --log_kmsg --foreground --pidfile /run/stalld.pid
root        8832  0.0  0.0   9180   972 pts/0    S+   20:01   0:00 grep stalld
sh-4.4# kill 3495
sh-4.4# ps auxww | grep stalld
root        8936  1.0  0.0   7388  2412 ?        Ss   20:02   0:00 /usr/local/bin/stalld -p 1000000000 -r 10000 -d 3 -t 20 --log_syslog --log_kmsg --foreground --pidfile /run/stalld.pid
root        8978  0.0  0.0   9180  1080 pts/0    S+   20:02   0:00 grep stalld
sh-4.4# kill 8936
sh-4.4# ps auxww | grep stalld
root        9048  1.0  0.0   7388  2400 ?        Ss   20:02   0:00 /usr/local/bin/stalld -p 1000000000 -r 10000 -d 3 -t 20 --log_syslog --log_kmsg --foreground --pidfile /run/stalld.pid
root        9068  0.0  0.0   9180  1092 pts/0    S+   20:02   0:00 grep stalld
sh-4.4# exit
exit

Removing debug pod ...

^^ New stalld PID after killing the process — stalld was automatically restarted, as expected.

Comment 5 errata-xmlrpc 2020-12-14 13:51:18 UTC
Since the problem described in this bug report should be
resolved in a recent advisory, it has been closed with a
resolution of ERRATA.

For information on the advisory (Moderate: OpenShift Container Platform 4.6.8 security and bug fix update), and where to find the updated
files, follow the link below.

If the solution does not work for you, open a new bug report.

https://access.redhat.com/errata/RHSA-2020:5259