Bug 1900261 - stalld is not restarted after crash
Summary: stalld is not restarted after crash
Keywords:
Status: CLOSED ERRATA
Alias: None
Product: OpenShift Container Platform
Classification: Red Hat
Component: Node Tuning Operator
Version: 4.6.z
Hardware: Unspecified
OS: Unspecified
Priority: high
Severity: high
Target Milestone: ---
Target Release: 4.6.z
Assignee: jmencak
QA Contact: Simon
URL:
Whiteboard:
Depends On: 1900196
Blocks:
TreeView+ depends on / blocked
 
Reported: 2020-11-21 19:45 UTC by OpenShift BugZilla Robot
Modified: 2020-12-14 13:51 UTC (History)
1 user (show)

Fixed In Version:
Doc Type: If docs needed, set a value
Doc Text:
Clone Of:
Environment:
Last Closed: 2020-12-14 13:51:18 UTC
Target Upstream Version:


Attachments (Terms of Use)


Links
System ID Private Priority Status Summary Last Updated
Github openshift cluster-node-tuning-operator pull 182 0 None closed [release-4.6] Bug 1900261: Automatically restart stalld after exit. 2021-02-17 19:56:15 UTC
Red Hat Product Errata RHSA-2020:5259 0 None None None 2020-12-14 13:51:42 UTC

Comment 3 Simon 2020-11-30 20:04:05 UTC
$ oc get clusterversions.config.openshift.io
NAME      VERSION                             AVAILABLE   PROGRESSING   SINCE   STATUS
version   4.6.0-0.nightly-2020-11-29-153115   True        False         5h27m   Cluster version is 4.6.0-0.nightly-2020-11-29-153115

$ oc project openshift-cluster-node-tuning-operator
Now using project "openshift-cluster-node-tuning-operator" on server "https://api.skordas3011.qe.devcluster.openshift.com:6443".

$ oc get nodes
NAME                                         STATUS   ROLES    AGE     VERSION
ip-10-0-130-111.us-east-2.compute.internal   Ready    worker   5h44m   v1.19.0+1348ff8
ip-10-0-137-83.us-east-2.compute.internal    Ready    master   5h52m   v1.19.0+1348ff8
ip-10-0-165-228.us-east-2.compute.internal   Ready    worker   5h46m   v1.19.0+1348ff8
ip-10-0-179-178.us-east-2.compute.internal   Ready    master   5h51m   v1.19.0+1348ff8
ip-10-0-192-112.us-east-2.compute.internal   Ready    master   5h52m   v1.19.0+1348ff8
ip-10-0-219-138.us-east-2.compute.internal   Ready    worker   5h46m   v1.19.0+1348ff8

$ # Using worker node
$ node=ip-10-0-130-111.us-east-2.compute.internal

$ echo $node
ip-10-0-130-111.us-east-2.compute.internal

$ oc label node $node node-role.kubernetes.io/worker-rt=
node/ip-10-0-130-111.us-east-2.compute.internal labeled


$ oc create -f- <<EOF
apiVersion: machineconfiguration.openshift.io/v1
kind: MachineConfigPool
metadata:
 name: worker-rt
 labels:
   worker-rt: ""
spec:
 machineConfigSelector:
   matchExpressions:
     - {key: machineconfiguration.openshift.io/role, operator: In, values: [worker,worker-rt]}
 nodeSelector:
   matchLabels:
     node-role.kubernetes.io/worker-rt: ""
EOF
machineconfigpool.machineconfiguration.openshift.io/worker-rt created

$ oc create -f- <<EOF
apiVersion: tuned.openshift.io/v1
kind: Tuned
metadata:
 name: openshift-realtime
 namespace: openshift-cluster-node-tuning-operator
spec:
 profile:
 - data: |
     [main]
     summary=Custom OpenShift realtime profile
     include=openshift-node,realtime
     [variables]
     # isolated_cores take a list of ranges; e.g. isolated_cores=2,4-7
     isolated_cores=1
     #isolate_managed_irq=Y
     not_isolated_cores_expanded=${f:cpulist_invert:${isolated_cores_expanded}}
     [bootloader]
     cmdline_ocp_realtime=+systemd.cpu_affinity=${not_isolated_cores_expanded}
     [service]
     service.stalld=start,enable
   name: openshift-realtime

 recommend:
 - machineConfigLabels:
     machineconfiguration.openshift.io/role: "worker-rt"
   priority: 20
   profile: openshift-realtime
EOF
tuned.tuned.openshift.io/openshift-realtime created

$ oc get nodes
NAME                                         STATUS   ROLES              AGE     VERSION
ip-10-0-130-111.us-east-2.compute.internal   Ready    worker,worker-rt   5h58m   v1.19.0+1348ff8
ip-10-0-137-83.us-east-2.compute.internal    Ready    master             6h6m    v1.19.0+1348ff8
ip-10-0-165-228.us-east-2.compute.internal   Ready    worker             6h1m    v1.19.0+1348ff8
ip-10-0-179-178.us-east-2.compute.internal   Ready    master             6h6m    v1.19.0+1348ff8
ip-10-0-192-112.us-east-2.compute.internal   Ready    master             6h6m    v1.19.0+1348ff8
ip-10-0-219-138.us-east-2.compute.internal   Ready    worker             6h1m    v1.19.0+1348ff8

$ oc get mcp
NAME        CONFIG                                                UPDATED   UPDATING   DEGRADED   MACHINECOUNT   READYMACHINECOUNT   UPDATEDMACHINECOUNT   DEGRADEDMACHINECOUNT   AGE
master      rendered-master-bcd76863189cb74a9a9308f09d269ec2      True      False      False      3              3                   3                     0                      6h5m
worker      rendered-worker-a9cd4042060bde8806aa3e8ec4cfd290      True      False      False      2              2                   2                     0                      6h5m
worker-rt   rendered-worker-rt-69e07f420cd5c15e33ed5ca889787e4f   True      False      False      1              1                   1                     0                      11m

$ oc debug node/$node
Starting pod/ip-10-0-130-111us-east-2computeinternal-debug ...
To use host binaries, run `chroot /host`
Pod IP: 10.0.130.111
If you don't see a command prompt, try pressing enter.
sh-4.4# ps auxww | grep stalld
root        3495  0.4  0.0   7440  2612 ?        Ss   19:53   0:02 /usr/local/bin/stalld -p 1000000000 -r 10000 -d 3 -t 20 --log_syslog --log_kmsg --foreground --pidfile /run/stalld.pid
root        8832  0.0  0.0   9180   972 pts/0    S+   20:01   0:00 grep stalld
sh-4.4# kill 3495
sh-4.4# ps auxww | grep stalld
root        8936  1.0  0.0   7388  2412 ?        Ss   20:02   0:00 /usr/local/bin/stalld -p 1000000000 -r 10000 -d 3 -t 20 --log_syslog --log_kmsg --foreground --pidfile /run/stalld.pid
root        8978  0.0  0.0   9180  1080 pts/0    S+   20:02   0:00 grep stalld
sh-4.4# kill 8936
sh-4.4# ps auxww | grep stalld
root        9048  1.0  0.0   7388  2400 ?        Ss   20:02   0:00 /usr/local/bin/stalld -p 1000000000 -r 10000 -d 3 -t 20 --log_syslog --log_kmsg --foreground --pidfile /run/stalld.pid
root        9068  0.0  0.0   9180  1092 pts/0    S+   20:02   0:00 grep stalld
sh-4.4# exit
exit

Removing debug pod ...

^^ New stalld PID after killing the process

Comment 5 errata-xmlrpc 2020-12-14 13:51:18 UTC
Since the problem described in this bug report should be
resolved in a recent advisory, it has been closed with a
resolution of ERRATA.

For information on the advisory (Moderate: OpenShift Container Platform 4.6.8 security and bug fix update), and where to find the updated
files, follow the link below.

If the solution does not work for you, open a new bug report.

https://access.redhat.com/errata/RHSA-2020:5259


Note You need to log in before you can comment on or make changes to this bug.