Bug 2155498
| Summary: | test_ceph_osd_stopped is failing on IBM Z | | |
|---|---|---|---|
| Product: | [Red Hat Storage] Red Hat OpenShift Data Foundation | Reporter: | Sujoy Batabyal <sbatabya> |
| Component: | rook | Assignee: | Travis Nielsen <tnielsen> |
| Status: | CLOSED NOTABUG | QA Contact: | Neha Berry <nberry> |
| Severity: | unspecified | Docs Contact: | |
| Priority: | unspecified | | |
| Version: | 4.12 | CC: | brgardne, fbalak, madam, ocs-bugs, odf-bz-bot, sbatabya |
| Target Milestone: | --- | | |
| Target Release: | --- | | |
| Hardware: | s390x | | |
| OS: | Linux | | |
| Whiteboard: | | | |
| Fixed In Version: | | Doc Type: | If docs needed, set a value |
| Doc Text: | | Story Points: | --- |
| Clone Of: | | Environment: | |
| Last Closed: | 2023-01-17 15:11:23 UTC | Type: | Bug |
| Regression: | --- | Mount Type: | --- |
| Documentation: | --- | CRM: | |
| Verified Versions: | | Category: | --- |
| oVirt Team: | --- | RHEL 7.3 requirements from Atomic Host: | |
| Cloudforms Team: | --- | Target Upstream Version: | |
| Embargoed: | | | |
| Attachments: | | | |
Description (Sujoy Batabyal, 2022-12-21 11:03:43 UTC)
Message: AssertionError: Incorrect number of CephOSDDiskNotResponding alerts (2 instead of 1 with states: ['pending']).
Alerts: [{'labels': {'alertname': 'CephOSDDiskNotResponding', 'ceph_daemon': 'osd.2', 'container': 'mgr', 'device': '/dev/dm-4', 'disk': '2', 'endpoint': 'http-metrics', 'host': 'worker-1.ocs-ci-large.test.ocs', 'instance': '10.129.4.30:9283', 'job': 'rook-ceph-mgr', 'managedBy': 'ocs-storagecluster', 'namespace': 'openshift-storage', 'pod': 'rook-ceph-mgr-a-77b6fb49bc-kkzqf', 'service': 'rook-ceph-mgr', 'severity': 'critical'}, 'annotations': {'description': 'Disk device /dev/dm-4 not responding, on host worker-1.ocs-ci-large.test.ocs.', 'message': 'Disk not responding', 'severity_level': 'error', 'storage_type': 'ceph'}, 'state': 'pending', 'activeAt': '2022-12-10T14:35:12.677556721Z', 'value': '2e+00'}, {'labels': {'alertname': 'CephOSDDiskNotResponding', 'ceph_daemon': 'osd.2', 'container': 'mgr', 'device': '/dev/dm-4', 'disk': '2', 'endpoint': 'http-metrics', 'host': 'worker-1.ocs-ci-large.test.ocs', 'instance': '10.129.4.30:9283', 'job': 'rook-ceph-mgr', 'managedBy': 'ocs-storagecluster', 'namespace': 'openshift-storage', 'pod': 'rook-ceph-mgr-a-77b6fb49bc-kkzqf', 'service': 'rook-ceph-mgr', 'severity': 'critical'}, 'annotations': {'description': 'Disk device /dev/dm-4 not responding, on host worker-1.ocs-ci-large.test.ocs.', 'message': 'Disk not responding', 'severity_level': 'error', 'storage_type': 'ceph'}, 'state': 'firing', 'activeAt': '2022-12-10T14:35:12.677556721Z', 'value': '2e+00'}]
Type: None
Text:
measure_stop_ceph_osd = {'first_run': True, 'metadata': None, 'prometheus_alerts': [{'activeAt': '2022-12-10T14:34:11.581193756Z', 'annotation...cf1ff25) pacific (stable)', 'container': 'mgr', ...}, 'state': 'pending', ...}, ...], 'result': 'rook-ceph-osd-2', ...}
    @tier4c
    @pytest.mark.polarion_id("OCS-900")
    @skipif_managed_service
    def test_ceph_osd_stopped(measure_stop_ceph_osd):
        """
        Test that there is appropriate alert related to situation when ceph osd
        is down. Alert is cleared when osd disk is back online.
        """
        api = prometheus.PrometheusAPI()
        # get alerts from time when manager deployment was scaled down
        alerts = measure_stop_ceph_osd.get("prometheus_alerts")
        for target_label, target_msg, target_states, target_severity, ignore in [
            (
                constants.ALERT_OSDDISKNOTRESPONDING,
                "Disk not responding",
                ["pending"],
                "error",
                False,
            ),
            (
                constants.ALERT_DATARECOVERYTAKINGTOOLONG,
                "Data recovery is slow",
                ["pending"],
                "warning",
                True,
            ),
            (
                constants.ALERT_CLUSTERWARNINGSTATE,
                "Storage cluster is in degraded state",
                ["pending"],
                "warning",
                False,
            ),
        ]:
>           prometheus.check_alert_list(
                label=target_label,
                msg=target_msg,
                alerts=alerts,
                states=target_states,
                severity=target_severity,
                ignore_more_occurences=ignore,
            )
tests/manage/monitoring/prometheus/test_deployment_status.py:146:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
label = 'CephOSDDiskNotResponding', msg = 'Disk not responding'
alerts = [{'activeAt': '2022-12-10T14:34:11.581193756Z', 'annotations': {'description': 'Storage cluster is in warning state fo... (68de1f204d3c34ec62bd59fae7a9814accf1ff25) pacific (stable)', 'container': 'mgr', ...}, 'state': 'pending', ...}, ...]
states = ['pending'], severity = 'error', ignore_more_occurences = False
    def check_alert_list(
        label, msg, alerts, states, severity="warning", ignore_more_occurences=True
    ):
        """
        Check a list of alerts to verify that there is an alert with the requested
        label and message for each provided state. If some alert is missing then
        this check fails.
        Args:
            label (str): Alert label
            msg (str): Alert message
            alerts (list): List of alerts to check
            states (list): List of states to check, order is important
            ignore_more_occurences (bool): If true then only the occurrence of an
                alert with the requested label, message and state is checked;
                it is not checked whether the alert occurs more than once.
        """
        target_alerts = [
            alert for alert in alerts if alert.get("labels").get("alertname") == label
        ]
        logger.info(f"Checking properties of found {label} alerts")
        if ignore_more_occurences:
            for state in states:
                delete = False
                for key, alert in reversed(list(enumerate(target_alerts))):
                    if alert.get("state") == state:
                        if delete:
                            d_msg = f"Ignoring {alert} as alert already appeared."
                            logger.debug(d_msg)
                            target_alerts.pop(key)
                        else:
                            delete = True
        assert_msg = (
            f"Incorrect number of {label} alerts ({len(target_alerts)} "
            f"instead of {len(states)} with states: {states})."
            f"\nAlerts: {target_alerts}"
        )
>       assert len(target_alerts) == len(states), assert_msg
E AssertionError: Incorrect number of CephOSDDiskNotResponding alerts (2 instead of 1 with states: ['pending']).
E Alerts: [{'labels': {'alertname': 'CephOSDDiskNotResponding', 'ceph_daemon': 'osd.2', 'container': 'mgr', 'device': '/dev/dm-4', 'disk': '2', 'endpoint': 'http-metrics', 'host': 'worker-1.ocs-ci-large.test.ocs', 'instance': '10.129.4.30:9283', 'job': 'rook-ceph-mgr', 'managedBy': 'ocs-storagecluster', 'namespace': 'openshift-storage', 'pod': 'rook-ceph-mgr-a-77b6fb49bc-kkzqf', 'service': 'rook-ceph-mgr', 'severity': 'critical'}, 'annotations': {'description': 'Disk device /dev/dm-4 not responding, on host worker-1.ocs-ci-large.test.ocs.', 'message': 'Disk not responding', 'severity_level': 'error', 'storage_type': 'ceph'}, 'state': 'pending', 'activeAt': '2022-12-10T14:35:12.677556721Z', 'value': '2e+00'}, {'labels': {'alertname': 'CephOSDDiskNotResponding', 'ceph_daemon': 'osd.2', 'container': 'mgr', 'device': '/dev/dm-4', 'disk': '2', 'endpoint': 'http-metrics', 'host': 'worker-1.ocs-ci-large.test.ocs', 'instance': '10.129.4.30:9283', 'job': 'rook-ceph-mgr', 'managedBy': 'ocs-storagecluster', 'namespace': 'openshift-storage', 'pod': 'rook-ceph-mgr-a-77b6fb49bc-kkzqf', 'service': 'rook-ceph-mgr', 'severity': 'critical'}, 'annotations': {'description': 'Disk device /dev/dm-4 not responding, on host worker-1.ocs-ci-large.test.ocs.', 'message': 'Disk not responding', 'severity_level': 'error', 'storage_type': 'ceph'}, 'state': 'firing', 'activeAt': '2022-12-10T14:35:12.677556721Z', 'value': '2e+00'}]
ocs_ci/utility/prometheus.py:61: AssertionError
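For readers following along, the failure can be reproduced with a few lines of standalone Python. The sketch below is not the ocs-ci implementation; the alert dicts are trimmed copies of the two records in the log (only `alertname` and `state` kept), and the loop mirrors the dedup-and-count logic of `check_alert_list()` quoted above. It shows why two CephOSDDiskNotResponding records, one pending and one firing, fail a check that only requests `['pending']`.

```python
# Standalone illustration only -- not the ocs-ci implementation.
# The two dicts are trimmed versions of the alerts in the log above.
alerts = [
    {"labels": {"alertname": "CephOSDDiskNotResponding"}, "state": "pending"},
    {"labels": {"alertname": "CephOSDDiskNotResponding"}, "state": "firing"},
]
label = "CephOSDDiskNotResponding"
states = ["pending"]  # what the failing check requested for this alert

# Keep only alerts with the requested alertname.
target_alerts = [a for a in alerts if a["labels"]["alertname"] == label]

# Mirror of the dedup loop: for each requested state, keep one matching
# record and drop any further records in that same state.
for state in states:
    seen = False
    for key, alert in reversed(list(enumerate(target_alerts))):
        if alert.get("state") == state:
            if seen:
                target_alerts.pop(key)
            else:
                seen = True

# The "firing" record matches no requested state, so it is never dropped:
# two alerts remain while only one state was requested, hence the
# AssertionError in the traceback.
print(len(target_alerts), len(states))  # -> 2 1
```

In other words, the assertion compares the number of surviving records to the number of requested states; a record in a state that was not requested (here "firing") is never removed, so it trips the check regardless of `ignore_more_occurences`.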
Created attachment 1933907 [details]: Failed test case test_ceph_osd_stopped on ODF 4.12
This looks like an automation issue. According to the logs, there are 2 records for the alert CephOSDDiskNotResponding: one in Pending state and one in Firing state. This is expected. The test correctly looks for both states in the triggered alerts: https://github.com/red-hat-storage/ocs-ci/blob/master/tests/manage/monitoring/prometheus/test_deployment_status.py#L133. @Sujoy, can you please share the test logs to investigate? In any case, I think this can be closed as NOTABUG.
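For reference, a check that accepts the expected pending-to-firing lifecycle passes both states to the helper quoted above. The call below is an illustrative sketch, not a verbatim copy of the linked line; it assumes the same arguments as the failing test, with only the `states` list changed, and `alerts` coming from the `measure_stop_ceph_osd` fixture as in the test shown earlier.

```python
# Illustrative sketch, not a quote of the linked test code.
from ocs_ci.ocs import constants
from ocs_ci.utility import prometheus

# `alerts` is the list returned by the measure_stop_ceph_osd fixture,
# as in the test quoted above.
prometheus.check_alert_list(
    label=constants.ALERT_OSDDISKNOTRESPONDING,
    msg="Disk not responding",
    alerts=alerts,
    states=["pending", "firing"],  # accept the normal alert lifecycle
    severity="error",
    ignore_more_occurences=False,
)
```

With both states requested, the pending record and the firing record each satisfy one expected state, so the count check passes and the transition is treated as the normal alert lifecycle rather than a failure.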
Created attachment 1937823 [details]: Test case test_ceph_osd_stopped
@fbalak, attached are the test logs.
Closing since it's not a bug, as Filip mentioned.