Bug 1872584 - The crd conversion webhook pods are stuck in CrashLoopBackOff
Summary: The crd conversion webhook pods are stuck in CrashLoopBackOff
Keywords:
Status: CLOSED ERRATA
Alias: None
Product: OpenShift Container Platform
Classification: Red Hat
Component: OLM
Version: 4.6
Hardware: Unspecified
OS: Unspecified
Priority: high
Severity: high
Target Milestone: ---
Target Release: 4.6.0
Assignee: Alexander Greene
QA Contact: yhui
URL:
Whiteboard:
Duplicates: 1871250 (view as bug list)
Depends On:
Blocks:
TreeView+ depends on / blocked
 
Reported: 2020-08-26 07:02 UTC by yhui
Modified: 2020-10-27 16:34 UTC (History)
4 users (show)

Fixed In Version:
Doc Type: If docs needed, set a value
Doc Text:
Clone Of:
Environment:
Last Closed: 2020-10-27 16:34:19 UTC
Target Upstream Version:
Embargoed:


Attachments (Terms of Use)


Links
System ID Private Priority Status Summary Last Updated
Github operator-framework operator-lifecycle-manager pull 1767 0 None closed BUG 1872584: Fix conversion webhooks 2021-02-12 21:57:01 UTC
Red Hat Product Errata RHBA-2020:4196 0 None None None 2020-10-27 16:34:33 UTC

Description yhui 2020-08-26 07:02:00 UTC
Description of problem:
The crd conversion webhook pods are stuck in CrashLoopBackOff.
[root@preserve-olm-env OCP-34181]# oc get pods -n global
NAME                                     READY   STATUS             RESTARTS   AGE
crd-conversion-webhook-547f9dff-rgczw    0/1     CrashLoopBackOff   5          4m36s
crd-conversion-webhook-b8b55cfcc-qzg26   0/1     CrashLoopBackOff   5          4m37s


Version-Release number of selected component (if applicable):
[root@preserve-olm-env OCP-34181]# oc get clusterversion
NAME      VERSION                             AVAILABLE   PROGRESSING   SINCE   STATUS
version   4.6.0-0.nightly-2020-08-25-204643   True        False         92m     Cluster version is 4.6.0-0.nightly-2020-08-25-204643

[root@preserve-olm-env OCP-34181]# oc exec olm-operator-5b7d79f4c-52ffg -n openshift-operator-lifecycle-manager -- olm --version
OLM version: 0.16.0
git commit: c3852d57c86707deb80c042c2155ad82c2d9628f


How reproducible:
Always


Steps to Reproduce:
1. Prepare a CRD YAML file that includes a CRD conversion webhook and two versions of the CR, each with a different schema.
# cat crd.yaml
apiVersion: apiextensions.k8s.io/v1beta1
kind: CustomResourceDefinition
metadata:
  name: crontabs.stable.example.com
spec:
  group: stable.example.com
  versions:
  - name: v1beta1
    served: true
    storage: false
    schema:
      openAPIV3Schema:
        type: object
        properties:
          spec:
            type: object
            properties:
              hostPort:
                type: string
  - name: v1
    served: true
    storage: true
    schema:
      openAPIV3Schema:
        type: object
        properties:
          spec:
            type: object
            properties:
              host:
                type: string
              port:
                type: string
  preserveUnknownFields: false
  conversion:
    strategy: Webhook
    webhookClientConfig:
      service:
        namespace: default
        name: example-webhook-name
        path: /crdconvert 
  scope: Namespaced   
  names:
    plural: crontabs
    singular: crontab
    kind: CronTab
    shortNames:
    - ct

Then apply the CRD.
# oc apply -f crd.yaml

2. Create a new project and operatorgroup.
# cat ns.yaml
apiVersion: v1
kind: Namespace
metadata:
  name: global
# cat og.yaml
apiVersion: operators.coreos.com/v1
kind: OperatorGroup
metadata:
  name: global
  namespace: global
# oc apply -f ns.yaml
# oc apply -f og.yaml

3. Prepare the csv yaml file and apply it.
# cat csv.yaml

apiVersion: operators.coreos.com/v1alpha1
kind: ClusterServiceVersion
metadata:
  annotations:
    alm-examples: '[{"apiVersion":"serving.knative.dev/v1alpha1","kind":"KnativeServing","metadata":{"name":"knative-serving"},"spec":{"config":{"autoscaler":{"container-concurrency-target-default":"100","container-concurrency-target-percentage":"1.0","enable-scale-to-zero":"true","max-scale-up-rate":"10","panic-threshold-percentage":"200.0","panic-window":"6s","panic-window-percentage":"10.0","scale-to-zero-grace-period":"30s","stable-window":"60s","tick-interval":"2s"},"defaults":{"revision-cpu-limit":"1000m","revision-cpu-request":"400m","revision-memory-limit":"200M","revision-memory-request":"100M","revision-timeout-seconds":"300"},"deployment":{"registriesSkippingTagResolving":"ko.local,dev.local"},"gc":{"stale-revision-create-delay":"24h","stale-revision-lastpinned-debounce":"5h","stale-revision-minimum-generations":"1","stale-revision-timeout":"15h"},"logging":{"loglevel.activator":"info","loglevel.autoscaler":"info","loglevel.controller":"info","loglevel.queueproxy":"info","loglevel.webhook":"info"},"observability":{"logging.enable-var-log-collection":"false","metrics.backend-destination":"prometheus"},"tracing":{"enable":"false","sample-rate":"0.1"}}}}]'
    capabilities: Seamless Upgrades
    categories: Networking,Integration & Delivery,Cloud Provider,Developer Tools
    certified: "false"
    containerImage: quay.io/openshift-knative/serverless-operator:v1.0.0
    createdAt: "2019-07-27T17:00:00Z"
    description: |-
      Provides a collection of API's to support deploying and serving
      of serverless applications and functions.
    repository: https://github.com/openshift-knative/serverless-operator
    support: Red Hat
  name: webhook.v1.0.0
  namespace: global
spec:
  apiservicedefinitions: {}
  webhookdefinitions:
  - generateName: object.auditor.com
    type: ValidatingAdmissionWebhook
    deploymentName: "crd-conversion-webhook"
    containerPort: 443
    sideEffects: "None"
    failurePolicy: Ignore
    admissionReviewVersions:
    - "v1"
    - "v1beta1"
    webhookPath: "/validate"
    conversionCRDs: 
    - "crontabs.stable.example.com"
  customresourcedefinitions:
    owned:
    - description: Crontab is a sample Schema
      kind: Crontab
      name: crontabs.stable.example.com
      version: v1   
  description: |
    A simple Webhook.
  displayName: Simple Webhook
  install:
    spec:
      deployments:
      - name: crd-conversion-webhook
        spec:
          replicas: 1
          selector:
            matchLabels:
                name: crd-conversion-webhook
          template:
            metadata:
              labels:
                name: crd-conversion-webhook
            spec:
              containers:
                - name: crd-conversion-webhook
                  image: quay.io/madorn/crd-conversion-webhook
                  args:
                  - -tls-cert-file=/apiserver.local.config/certificates/apiserver.crt
                  - -tls-private-key-file=/apiserver.local.config/certificates/apiserver.key
    strategy: deployment
  installModes:
  - supported: false
    type: OwnNamespace
  - supported: false
    type: SingleNamespace
  - supported: false
    type: MultiNamespace
  - supported: true
    type: AllNamespaces
  keywords:
  - wbhooks
  links:
  - name: Documentation
    url: https://access.redhat.com/documentation/en-us/openshift_container_platform/4.1/html-single/serverless/index
  maintainers:
  - email: knative
    name: Serverless Team
  maturity: alpha
  provider:
    name: Red Hat
  version: 1.0.0

# oc apply -f csv.yaml

4. Check the csv and pods.
[root@preserve-olm-env OCP-34181]# oc get pods -n global
NAME                                      READY   STATUS             RESTARTS   AGE
crd-conversion-webhook-5769bb8ff6-k9c4l   0/1     CrashLoopBackOff   4          2m35s
crd-conversion-webhook-7856867854-7q54d   0/1     CrashLoopBackOff   4          2m37s

[root@preserve-olm-env OCP-34181]# oc describe pods crd-conversion-webhook-b8b55cfcc-qzg26 -n global
Name:         crd-conversion-webhook-b8b55cfcc-qzg26
Events:
  Type     Reason          Age                            From                                                Message
  ----     ------          ----                           ----                                                -------
  Normal   Scheduled       <invalid>                                                                          Successfully assigned global/crd-conversion-webhook-b8b55cfcc-qzg26 to ip-10-0-154-10.us-east-2.compute.internal
  Normal   AddedInterface  <invalid>                      multus                                              Add eth0 [10.129.2.57/23]
  Normal   Pulled          <invalid>                      kubelet, ip-10-0-154-10.us-east-2.compute.internal  Successfully pulled image "quay.io/madorn/crd-conversion-webhook" in 1.252883251s
  Normal   Pulled          <invalid>                      kubelet, ip-10-0-154-10.us-east-2.compute.internal  Successfully pulled image "quay.io/madorn/crd-conversion-webhook" in 926.69701ms
  Normal   Pulled          <invalid>                      kubelet, ip-10-0-154-10.us-east-2.compute.internal  Successfully pulled image "quay.io/madorn/crd-conversion-webhook" in 1.254066661s
  Normal   Created         <invalid> (x4 over <invalid>)  kubelet, ip-10-0-154-10.us-east-2.compute.internal  Created container crd-conversion-webhook
  Normal   Started         <invalid> (x4 over <invalid>)  kubelet, ip-10-0-154-10.us-east-2.compute.internal  Started container crd-conversion-webhook
  Normal   Pulled          <invalid>                      kubelet, ip-10-0-154-10.us-east-2.compute.internal  Successfully pulled image "quay.io/madorn/crd-conversion-webhook" in 1.429940031s
  Warning  BackOff         <invalid> (x7 over <invalid>)  kubelet, ip-10-0-154-10.us-east-2.compute.internal  Back-off restarting failed container
  Normal   Pulling         <invalid> (x5 over <invalid>)  kubelet, ip-10-0-154-10.us-east-2.compute.internal  Pulling image "quay.io/madorn/crd-conversion-webhook"
  Normal   Pulled          <invalid>                      kubelet, ip-10-0-154-10.us-east-2.compute.internal  Successfully pulled image "quay.io/madorn/crd-conversion-webhook" in 1.280870385s

[root@preserve-olm-env OCP-34181]# oc logs crd-conversion-webhook-7c87bc56f7-s9ql7 -n global
[root@preserve-olm-env OCP-34181]#
There is no output.


Actual results:
The crd conversion webhook pods are stuck in CrashLoopBackOff.

Expected results:
The crd conversion webhook pods are created successfully.

Additional info:

Comment 1 Sim Dhaliwal 2020-08-26 14:28:28 UTC
This occurs because the example webhook server deployment image (i.e. quay.io/madorn/crd-conversion-webhook) runs as the root user, which works fine in a Minikube cluster environment. To run it in an OpenShift cluster, the image has to be rebuilt to run as a non-root user. I can provide a new example image shortly.

Comment 2 yhui 2020-08-27 06:41:25 UTC
Got it. I really appreciate your support!

Comment 3 yhui 2020-08-31 06:09:39 UTC
Hi Sim,

Thanks for your reply. Could you please update the image ASAP? The story test is blocked by this issue. Thank you in advance for your support!

Comment 4 Sim Dhaliwal 2020-08-31 08:01:41 UTC
Hi Hui,

There are some changes related to APIService/Webhook ports that need to be merged in OLM's master branch first.

After these changes are merged, you will be able to use the following CSV to test this feature on OpenShift cluster:

# csv.yaml
apiVersion: v1
kind: Namespace
metadata:
  name: global
---
apiVersion: operators.coreos.com/v1
kind: OperatorGroup
metadata:
  name: crd-conversion-webhook
  namespace: global
---
apiVersion: operators.coreos.com/v1alpha1
kind: ClusterServiceVersion
metadata:
  annotations:
    alm-examples: '[{"apiVersion":"serving.knative.dev/v1alpha1","kind":"KnativeServing","metadata":{"name":"knative-serving"},"spec":{"config":{"autoscaler":{"container-concurrency-target-default":"100","container-concurrency-target-percentage":"1.0","enable-scale-to-zero":"true","max-scale-up-rate":"10","panic-threshold-percentage":"200.0","panic-window":"6s","panic-window-percentage":"10.0","scale-to-zero-grace-period":"30s","stable-window":"60s","tick-interval":"2s"},"defaults":{"revision-cpu-limit":"1000m","revision-cpu-request":"400m","revision-memory-limit":"200M","revision-memory-request":"100M","revision-timeout-seconds":"300"},"deployment":{"registriesSkippingTagResolving":"ko.local,dev.local"},"gc":{"stale-revision-create-delay":"24h","stale-revision-lastpinned-debounce":"5h","stale-revision-minimum-generations":"1","stale-revision-timeout":"15h"},"logging":{"loglevel.activator":"info","loglevel.autoscaler":"info","loglevel.controller":"info","loglevel.queueproxy":"info","loglevel.webhook":"info"},"observability":{"logging.enable-var-log-collection":"false","metrics.backend-destination":"prometheus"},"tracing":{"enable":"false","sample-rate":"0.1"}}}}]'
    capabilities: Seamless Upgrades
    categories: Networking,Integration & Delivery,Cloud Provider,Developer Tools
    certified: "false"
    containerImage: quay.io/openshift-knative/serverless-operator:v1.0.0
    createdAt: "2019-07-27T17:00:00Z"
    description: |-
      Provides a collection of API's to support deploying and serving
      of serverless applications and functions.
    repository: https://github.com/openshift-knative/serverless-operator
    support: Red Hat
  name: webhook.v1.0.0
  namespace: global
spec:
  apiservicedefinitions: {}
  webhookdefinitions:
  - generateName: sample.webhook.com
    type: ValidatingAdmissionWebhook
    deploymentName: "crd-conversion-webhook"
    containerPort: 9443
    sideEffects: "None"
    failurePolicy: Ignore
    admissionReviewVersions:
    - "v1"
    - "v1beta1"
    webhookPath: "/validate"
    conversionCRDs: 
    - "crontabs.stable.example.com"
  customresourcedefinitions:
    owned:
    - description: Crontab is a sample Schema
      kind: Crontab
      name: crontabs.stable.example.com
      version: v1beta1   
  description: |
    A simple Webhook.
  displayName: Simple Webhook
  install:
    spec:
      deployments:
      - name: crd-conversion-webhook
        spec:
          replicas: 1
          selector:
            matchLabels:
                name: crd-conversion-webhook
          template:
            metadata:
              labels:
                name: crd-conversion-webhook
            spec:
              containers:
                - name: crd-conversion-webhook
                  image: quay.io/madorn/openshift-crd-conversion-webhook:latest
                  imagePullPolicy: Always
                  args:
                  - -tls-cert-file=/apiserver.local.config/certificates/apiserver.crt
                  - -tls-private-key-file=/apiserver.local.config/certificates/apiserver.key 
    strategy: deployment
  installModes:
  - supported: false
    type: OwnNamespace
  - supported: false
    type: SingleNamespace
  - supported: false
    type: MultiNamespace
  - supported: true
    type: AllNamespaces
  keywords:
  - wbhooks
  links:
  - name: Documentation
    url: https://access.redhat.com/documentation/en-us/openshift_container_platform/4.1/html-single/serverless/index
  maintainers:
  - email: knative
    name: Serverless Team
  maturity: alpha
  provider:
    name: Red Hat
  version: 1.0.0

Comment 6 Alexander Greene 2020-09-11 20:12:39 UTC
This bug only affects Conversion webhooks on OpenShift Clusters, as shown in the e2e test case added to the GitHub PR associated with this issue [1]. We are continuing to investigate why this constraint applies to Conversion Webhooks and not Validating or Mutating Webhooks.

REF:
[1] https://github.com/operator-framework/operator-lifecycle-manager/pull/1754

Comment 7 yhui 2020-09-15 08:19:11 UTC
Hi,

I retested the bug using the csv.yaml file that Sim provided. The pods are created successfully, but the CR cannot be converted to another version.
I hit the following error:
error: crontabs.stable.example.com "cr1" could not be patched: conversion webhook for stable.example.com/v1, Kind=CronTab failed: Post "https://crd-conversion-webhook-service.memcached-operator-system.svc:443/crdconvert?timeout=30s": dial tcp 10.128.2.13:443: connect: connection refused

The procedure is as follows. Could you please help to check it again? Thanks!


Version:
[root@preserve-olm-env 1066]# oc get clusterversion
NAME      VERSION                             AVAILABLE   PROGRESSING   SINCE   STATUS
version   4.6.0-0.nightly-2020-09-14-221526   True        False         5h37m   Cluster version is 4.6.0-0.nightly-2020-09-14-221526


Steps to test:
1, Prepare a CRD YAML file that includes a CRD conversion webhook and two versions of the CR, each with a different schema.
[root@preserve-olm-env 1066]# cat crd.yaml
apiVersion: apiextensions.k8s.io/v1beta1
kind: CustomResourceDefinition
metadata:
  name: crontabs.stable.example.com
spec:
  group: stable.example.com
  versions:
  - name: v1beta1
    served: true
    storage: false
    schema:
      openAPIV3Schema:
        type: object
        properties:
          spec:
            type: object
            properties:
              hostPort:
                type: string
  - name: v1
    served: true
    storage: true
    schema:
      openAPIV3Schema:
        type: object
        properties:
          spec:
            type: object
            properties:
              host:
                type: string
              port:
                type: string
  preserveUnknownFields: false
  conversion:
    strategy: Webhook
    webhookClientConfig:
      service:
        namespace: default
        name: example-webhook-name
        path: /crdconvert 
  scope: Namespaced   
  names:
    plural: crontabs
    singular: crontab
    kind: CronTab
    shortNames:
    - ct

Then apply the CRD.
[root@preserve-olm-env 1066]# oc apply -f crd.yaml


2. Create an operatorgroup.
[root@preserve-olm-env 1066]# cat og.yaml
apiVersion: operators.coreos.com/v1
kind: OperatorGroup
metadata:
  name: global
[root@preserve-olm-env 1066]# oc apply -f og.yaml


3. Prepare the csv yaml file and apply it.
[root@preserve-olm-env 1066]# cat csv.yaml
apiVersion: operators.coreos.com/v1alpha1
kind: ClusterServiceVersion
metadata:
  annotations:
    alm-examples: '[{"apiVersion":"serving.knative.dev/v1alpha1","kind":"KnativeServing","metadata":{"name":"knative-serving"},"spec":{"config":{"autoscaler":{"container-concurrency-target-default":"100","container-concurrency-target-percentage":"1.0","enable-scale-to-zero":"true","max-scale-up-rate":"10","panic-threshold-percentage":"200.0","panic-window":"6s","panic-window-percentage":"10.0","scale-to-zero-grace-period":"30s","stable-window":"60s","tick-interval":"2s"},"defaults":{"revision-cpu-limit":"1000m","revision-cpu-request":"400m","revision-memory-limit":"200M","revision-memory-request":"100M","revision-timeout-seconds":"300"},"deployment":{"registriesSkippingTagResolving":"ko.local,dev.local"},"gc":{"stale-revision-create-delay":"24h","stale-revision-lastpinned-debounce":"5h","stale-revision-minimum-generations":"1","stale-revision-timeout":"15h"},"logging":{"loglevel.activator":"info","loglevel.autoscaler":"info","loglevel.controller":"info","loglevel.queueproxy":"info","loglevel.webhook":"info"},"observability":{"logging.enable-var-log-collection":"false","metrics.backend-destination":"prometheus"},"tracing":{"enable":"false","sample-rate":"0.1"}}}}]'
    capabilities: Seamless Upgrades
    categories: Networking,Integration & Delivery,Cloud Provider,Developer Tools
    certified: "false"
    containerImage: quay.io/openshift-knative/serverless-operator:v1.0.0
    createdAt: "2019-07-27T17:00:00Z"
    description: |-
      Provides a collection of API's to support deploying and serving
      of serverless applications and functions.
    repository: https://github.com/openshift-knative/serverless-operator
    support: Red Hat
  name: webhook.v1.0.0
spec:
  apiservicedefinitions: {}
  webhookdefinitions:
  - generateName: sample.webhook.com
    type: ValidatingAdmissionWebhook
    deploymentName: "crd-conversion-webhook"
    containerPort: 9443
    sideEffects: "None"
    failurePolicy: Ignore
    admissionReviewVersions:
    - "v1"
    - "v1beta1"
    webhookPath: "/validate"
    conversionCRDs: 
    - "crontabs.stable.example.com"
  customresourcedefinitions:
    owned:
    - description: Crontab is a sample Schema
      kind: Crontab
      name: crontabs.stable.example.com
      version: v1beta1   
  description: |
    A simple Webhook.
  displayName: Simple Webhook
  install:
    spec:
      deployments:
      - name: crd-conversion-webhook
        spec:
          replicas: 1
          selector:
            matchLabels:
                name: crd-conversion-webhook
          template:
            metadata:
              labels:
                name: crd-conversion-webhook
            spec:
              containers:
                - name: crd-conversion-webhook
                  image: quay.io/madorn/openshift-crd-conversion-webhook:latest
                  imagePullPolicy: Always
                  args:
                  - -tls-cert-file=/apiserver.local.config/certificates/apiserver.crt
                  - -tls-private-key-file=/apiserver.local.config/certificates/apiserver.key 
    strategy: deployment
  installModes:
  - supported: false
    type: OwnNamespace
  - supported: false
    type: SingleNamespace
  - supported: false
    type: MultiNamespace
  - supported: true
    type: AllNamespaces
  keywords:
  - wbhooks
  links:
  - name: Documentation
    url: https://access.redhat.com/documentation/en-us/openshift_container_platform/4.1/html-single/serverless/index
  maintainers:
  - email: knative
    name: Serverless Team
  maturity: alpha
  provider:
    name: Red Hat
  version: 1.0.0

[root@preserve-olm-env 1066]# oc apply -f csv.yaml


4. Check the csv and pods.
[root@preserve-olm-env 1066]# oc get csv
NAME             DISPLAY          VERSION   REPLACES   PHASE
webhook.v1.0.0   Simple Webhook   1.0.0                Succeeded
[root@preserve-olm-env 1066]# oc get pods
NAME                                                     READY   STATUS    RESTARTS   AGE
crd-conversion-webhook-9dd7684c9-dblgh                   1/1     Running   0          2m36s


5. Prepare the cr file and apply it.
[root@preserve-olm-env 1066]# cat cr.yaml 
apiVersion: stable.example.com/v1
kind: CronTab
metadata:
  name: cr1
spec:
  host: "localhost"
  port: "8080"
[root@preserve-olm-env 1066]# oc apply -f cr.yaml 
crontab.stable.example.com/cr1 created
The CR is created successfully.
[root@preserve-olm-env 1066]# oc get CronTab
NAME   AGE
cr1    26s


6. Edit the CRD as follows.
Version: v1beta1
storage: false  -> true

Version: v1
storage: true -> false

[root@preserve-olm-env 1066]# oc edit crd crontabs.stable.example.com
customresourcedefinition.apiextensions.k8s.io/crontabs.stable.example.com edited


7, Update the CR from v1 to v1beta1.
Edit the CR.
apiVersion: stable.example.com/v1 -> stable.example.com/v1beta1

There is an error.
[root@preserve-olm-env 1066]# oc edit CronTab cr1
error: crontabs.stable.example.com "cr1" could not be patched: conversion webhook for stable.example.com/v1, Kind=CronTab failed: Post "https://crd-conversion-webhook-service.memcached-operator-system.svc:443/crdconvert?timeout=30s": no service port 443 found for service "crd-conversion-webhook-service"
You can run `oc replace -f /tmp/oc-edit-vdk5w.yaml` to try this update again.


8, Edit the service port to 443.
[root@preserve-olm-env 1066]# oc get svc
NAME                                                    TYPE        CLUSTER-IP       EXTERNAL-IP   PORT(S)    AGE
crd-conversion-webhook-service                          ClusterIP   172.30.88.241    <none>        9443/TCP   9m54s
[root@preserve-olm-env 1066]# oc edit svc crd-conversion-webhook-service 
service/crd-conversion-webhook-service edited
[root@preserve-olm-env 1066]# oc get svc
NAME                                                    TYPE        CLUSTER-IP       EXTERNAL-IP   PORT(S)    AGE
crd-conversion-webhook-service                          ClusterIP   172.30.88.241    <none>        443/TCP    11m


9, Then edit the CR from v1 to v1beta1 again.
[root@preserve-olm-env 1066]# oc edit CronTab cr1
error: crontabs.stable.example.com "cr1" could not be patched: conversion webhook for stable.example.com/v1, Kind=CronTab failed: Post "https://crd-conversion-webhook-service.memcached-operator-system.svc:443/crdconvert?timeout=30s": dial tcp 10.128.2.13:443: connect: connection refused
You can run `oc replace -f /tmp/oc-edit-t9uiz.yaml` to try this update again.

Comment 8 Jian Zhang 2020-09-16 03:45:13 UTC
Hi, Alexander 

I changed the Target Release back to 4.6 since this is a new-feature issue. We should fix it for 4.6; otherwise, we should remove it from the 4.6 new features.

Comment 9 Alexander Greene 2020-09-16 22:26:11 UTC
Hello - I wanted to provide a quick update to this bug given its urgency.

OLM currently sets the Webhook service's port and targetPort to the value provided in the webhookDescription.containerPort field of the csv. There are instances where these two values must be different, such as the one that led to the scenario described above. I was able to successfully install an operator with a conversion webhook using OLM by:

1. Setting OLM as unmanaged
$ oc patch clusterversion version --type=merge -p '{"spec": {"overrides":[{"kind": "Deployment", "name": "olm-operator", "namespace": "openshift-operator-lifecycle-manager", "unmanaged": true, "group": "apps"},{"kind": "Deployment", "name": "catalog-operator", "namespace": "openshift-operator-lifecycle-manager", "unmanaged": true, "group": "apps"},{"kind": "CustomResourceDefinition", "name": "clusterserviceversions.operators.coreos.com", "namespace": "", "unmanaged": true, "group": "apiextensions.k8s.io"}]}}'
clusterversion.config.openshift.io/version patched

2. Deleting the CSV CRD
$ oc delete -f manifests/0000_50_olm_00-clusterserviceversions.crd.yaml 
customresourcedefinition.apiextensions.k8s.io "clusterserviceversions.operators.coreos.com" deleted

3. Applying my updated CSV CRD, Found here [1]:
$ oc apply -f manifests/0000_50_olm_00-clusterserviceversions.crd.yaml 
customresourcedefinition.apiextensions.k8s.io/clusterserviceversions.operators.coreos.com created

4. Updating the OLM deployment with this image: quay.io/agreene/olm:fix-conversion-webhooks

5. Applying a catalogsource that contains an operator with a validating, mutating, and conversion webhook:
$ cat 00_catsrc.yaml 
apiVersion: operators.coreos.com/v1alpha1
kind: CatalogSource
metadata:
  name: webhook-operator-catalog
  namespace: openshift-marketplace
spec:
  displayName: Webhook Operator Catalog
  image: quay.io/agreene/webhook-operator-index:admission-and-conversion-webhooks-fix-install-modes-with-target-port
  sourceType: grpc

$ oc apply -f 00_catsrc.yaml 
catalogsource.operators.coreos.com/webhook-operator-catalog created


6. Subscribing to the operator
$ cat 01_sub.yaml 
apiVersion: operators.coreos.com/v1alpha1
kind: Subscription
metadata:
  name: webhook-operator-subscription
  namespace: openshift-operators
spec:
  channel: "alpha"
  installPlanApproval: Automatic
  name: webhook-operator
  source: webhook-operator-catalog
  sourceNamespace: openshift-marketplace

$ oc apply -f 01_sub.yaml 
subscription.operators.coreos.com/webhook-operator-subscription created

7. This gets us a CSV in the openshift-operators namespace, but it is missing targetPorts because opm hasn't been updated. We can set them manually by adding targetPort: 4343 to each WebhookDefinition.
8. You will also need to update the Service ports to the following:
spec:
  ports:
  - name: "443"
    port: 443
    protocol: TCP
    targetPort: 4343

9. With this, the conversion webhook works as expected:
$ cat passes.validation.webhook_v1_webhooktest.yaml 
apiVersion: webhook.operators.coreos.io/v1
kind: WebhookTest
metadata:
  name: webhooktest-sample
  namespace: openshift-operators
spec:
  valid: true

$ oc get webhooktests.v1.webhook.operators.coreos.io webhooktest-sample -n openshift-operators -o yaml | yq read - spec
mutate: true
valid: true

$  oc get webhooktests.v2.webhook.operators.coreos.io webhooktest-sample -n openshift-operators -o yaml | yq read - spec
conversion:
  mutate: true
  valid: true


I will work on removing steps 7 and 8 so the install works more smoothly. I hope to have a change in within a day or so.

Ref:
[1] https://github.com/awgreene/operator-lifecycle-manager/commit/f58a187a8cb542d0c52585088df437bdc606ca58

Comment 10 Alexander Greene 2020-09-18 01:39:48 UTC
I created a PR [1] which should allow users to set ports and targetPorts and resolves the issue seen above. The CSV file above will not work because it doesn't set the new fields. I will be on vacation tomorrow but will provide a working conversion webhook on Monday.

Ref:
[1] https://github.com/operator-framework/operator-lifecycle-manager/pull/1767/files

Comment 12 Alexander Greene 2020-09-22 01:04:03 UTC
Now that OLM has been updated, please test this feature with the following process:

```bash
# Install the webhook operator
$ cat <<EOF | kubectl create -f -
apiVersion: operators.coreos.com/v1alpha1
apiVersion: operators.coreos.com/v1alpha1
kind: CatalogSource
metadata:
  name: webhook-operator-catalog
  namespace: openshift-marketplace
spec:
  displayName: Webhook Operator Catalog
  image: quay.io/olmtest/webhook-operator-index:0.0.3
  sourceType: grpc
---
apiVersion: operators.coreos.com/v1alpha1
kind: Subscription
metadata:
  name: webhook-operator-subscription
  namespace: openshift-operators
spec:
  channel: "alpha"
  installPlanApproval: Automatic
  name: webhook-operator
  source: webhook-operator-catalog
  sourceNamespace: openshift-marketplace
EOF

# Try to install an invalid webhooktest CR
$ cat <<EOF | kubectl create -f -
apiVersion: webhook.operators.coreos.io/v1
kind: WebhookTest
metadata:
  name: webhooktest-sample
  namespace: openshift-operators
spec:
  valid: false    
EOF
Error from server (WebhookTest.test.operators.coreos.com "webhooktest-sample" is invalid: spec.schedule: Invalid value: false: Spec.Valid must be true): error when creating "STDIN": admission webhook "vwebhooktest.kb.io" denied the request: WebhookTest.test.operators.coreos.com "webhooktest-sample" is invalid: spec.schedule: Invalid value: false: Spec.Valid must be true


# Try to install a valid webhooktest CR, which should succeed
$ cat <<EOF | kubectl create -f -
apiVersion: webhook.operators.coreos.io/v1
kind: WebhookTest
metadata:
  name: webhooktest-sample
  namespace: openshift-operators
spec:
  valid: true    
EOF
webhooktest.webhook.operators.coreos.io/webhooktest-sample created

# Check that the Conversion Webhook can serve v1 of the webhookTest CR and that the spec.mutate field is true
$ oc get webhooktests.v1.webhook.operators.coreos.io webhooktest-sample -n openshift-operators -o yaml | yq read - spec
mutate: true
valid: true

# Check that the Conversion Webhook can serve v2 of the webhookTest CR and that the spec.conversion.mutate field is true
$ oc get webhooktests.v2.webhook.operators.coreos.io webhooktest-sample -n openshift-operators -o yaml | yq read - spec
conversion:
  mutate: true
  valid: true
```

Comment 13 Alexander Greene 2020-09-22 01:08:39 UTC
Please note that the pod must be running for the mutating, validating, and conversion webhooks to work. Before attempting to test any webhooks, make sure that both containers in the pod are READY:
```bash
$ oc get pods -n openshift-operators
NAME                                        READY   STATUS    RESTARTS   AGE
webhook-operator-webhook-58dd76f85c-28zdt   2/2     Running   0          84s
```

Comment 14 Alexander Greene 2020-09-22 16:27:13 UTC
*** Bug 1871250 has been marked as a duplicate of this bug. ***

Comment 15 yhui 2020-09-24 16:24:49 UTC
Version:
[root@preserve-olm-env iib-manifests]# oc get clusterversion
NAME      VERSION                             AVAILABLE   PROGRESSING   SINCE   STATUS
version   4.6.0-0.nightly-2020-09-24-074159   True        False         84m     Cluster version is 4.6.0-0.nightly-2020-09-24-074159
[root@preserve-olm-env iib-manifests]# oc exec olm-operator-c8445b49f-qfps2 -n openshift-operator-lifecycle-manager -- olm --version
OLM version: 0.16.1
git commit: d0746139120f09ceaf7b18d6429751e6eb2c98a5

Steps to test:
1, Install catalogsource using the image quay.io/olmtest/webhook-operator-index:0.0.3. The image includes csv and crd (including conversion webhook definition) yaml files.
[root@preserve-olm-env iib-manifests]# cat <<EOF | kubectl create -f -
apiVersion: operators.coreos.com/v1alpha1
kind: CatalogSource
metadata:
  name: webhook-operator-catalog
  namespace: openshift-marketplace
spec:
  displayName: Webhook Operator Catalog
  image: quay.io/olmtest/webhook-operator-index:0.0.3
  sourceType: grpc
EOF

2, Create the subscription.
[root@preserve-olm-env iib-manifests]# cat <<EOF | kubectl create -f -
apiVersion: operators.coreos.com/v1alpha1
kind: Subscription
metadata:
  name: webhook-operator-subscription
  namespace: openshift-operators
spec:
  channel: "alpha"
  installPlanApproval: Automatic
  name: webhook-operator
  source: webhook-operator-catalog
  sourceNamespace: openshift-marketplace
EOF

3, Check the sub, csv and pods status.
[root@preserve-olm-env iib-manifests]# oc get sub -n openshift-operators
NAME                            PACKAGE            SOURCE                     CHANNEL
webhook-operator-subscription   webhook-operator   webhook-operator-catalog   alpha
[root@preserve-olm-env iib-manifests]# oc get csv -n openshift-operators
NAME                      DISPLAY            VERSION   REPLACES   PHASE
webhook-operator.v0.0.1   Webhook Operator   0.0.1                Succeeded
[root@preserve-olm-env iib-manifests]# oc get pods -n openshift-operators
NAME                                        READY   STATUS    RESTARTS   AGE
webhook-operator-webhook-6c44475687-ft6lr   2/2     Running   0          21m

4, Install a valid webhooktest CR
[root@preserve-olm-env iib-manifests]# cat <<EOF | kubectl create -f -
apiVersion: webhook.operators.coreos.io/v1
kind: WebhookTest
metadata:
  name: webhooktest-sample
  namespace: openshift-operators
spec:
  valid: true    
EOF
webhooktest.webhook.operators.coreos.io/webhooktest-sample created

5, Check that the Conversion Webhook can serve v1 of the webhookTest CR and that the spec.mutate field is true
[root@preserve-olm-env iib-manifests]# oc get webhooktests.v1.webhook.operators.coreos.io webhooktest-sample -n openshift-operators -o yaml
...
spec:
  mutate: true
  valid: true

Check that the Conversion Webhook can serve v2 of the webhookTest CR and that the spec.conversion.mutate field is true
[root@preserve-olm-env iib-manifests]# oc get webhooktests.v2.webhook.operators.coreos.io webhooktest-sample -n openshift-operators -o yaml
...
spec:
  conversion:
    mutate: true
    valid: true

6, Check the CR status on the cluster.
[root@preserve-olm-env iib-manifests]# oc get WebhookTest -n openshift-operators
NAME                 AGE
webhooktest-sample   66m
[root@preserve-olm-env 1868712]# oc get WebhookTest webhooktest-sample -n openshift-operators -o yaml
apiVersion: webhook.operators.coreos.io/v2
kind: WebhookTest
metadata:
  creationTimestamp: "2020-09-24T15:16:20Z"
  generation: 1
  managedFields:
  - apiVersion: webhook.operators.coreos.io/v1
    fieldsType: FieldsV1
    fieldsV1:
      f:spec:
        .: {}
        f:valid: {}
    manager: kubectl
    operation: Update
    time: "2020-09-24T15:16:20Z"
  name: webhooktest-sample
  namespace: openshift-operators
  resourceVersion: "72776"
  selfLink: /apis/webhook.operators.coreos.io/v2/namespaces/openshift-operators/webhooktests/webhooktest-sample
  uid: 11c7a5eb-a900-4975-b889-c7dddc810262
spec:
  conversion:
    mutate: true
    valid: true
status: {}

The CR has been converted to v2 successfully.

Comment 18 errata-xmlrpc 2020-10-27 16:34:19 UTC
Since the problem described in this bug report should be
resolved in a recent advisory, it has been closed with a
resolution of ERRATA.

For information on the advisory (OpenShift Container Platform 4.6 GA Images), and where to find the updated
files, follow the link below.

If the solution does not work for you, open a new bug report.

https://access.redhat.com/errata/RHBA-2020:4196


Note: You need to log in before you can comment on or make changes to this bug.