Bug 1532277
Summary: RHEL7.5: VM is stuck in shutting down state

| Field | Value | Field | Value |
|---|---|---|---|
| Product | [oVirt] vdsm | Reporter | Elad <ebenahar> |
| Component | Core | Assignee | Francesco Romani <fromani> |
| Status | CLOSED CURRENTRELEASE | QA Contact | Lilach Zitnitski <lzitnits> |
| Severity | high | Docs Contact | |
| Priority | unspecified | CC | amureini, bugs, derez, ebenahar, fromani, lrotenbe, michal.skrivanek, mprivozn, nsoffer, pkrempa |
| Version | 4.20.11 | Keywords | Automation, TestOnly |
| Target Milestone | ovirt-4.2.2 | Flags | rule-engine: ovirt-4.2+ |
| Target Release | --- | Hardware | x86_64 |
| OS | Unspecified | Whiteboard | |
| Fixed In Version | | Doc Type | If docs needed, set a value |
| Doc Text | | Story Points | --- |
| Clone Of | | Environment | |
| Last Closed | 2018-03-29 11:02:17 UTC | Type | Bug |
| Regression | --- | Mount Type | --- |
| Documentation | --- | CRM | |
| Verified Versions | | Category | --- |
| oVirt Team | Virt | RHEL 7.3 requirements from Atomic Host | |
| Cloudforms Team | --- | Target Upstream Version | |
| Embargoed | | Bug Depends On | 1536461 |
| Bug Blocks | | Attachments | |
Description
Elad
2018-01-08 14:45:13 UTC
VM XML: {'xml': '<?xml version="1.0" encoding="UTF-8"?><domain type="kvm" xmlns:ovirt-tune="http://ovirt.org/vm/tune/1.0" xmlns:ovirt-vm="http://ovirt.org/vm/1.0"><name>vm_TestCase5994_powering_o_0715481633</name><uuid>d9a4f246-5e03-4cc8-8000-b0a664751d49</uuid><memory>1048576</memory><currentMemory>1048576</currentMemory><maxMemory slots="16">4194304</maxMemory><vcpu current="1">16</vcpu><sysinfo type="smbios"><system><entry name="manufacturer">oVirt</entry><entry name="product">OS-NAME:</entry><entry name="version">OS-VERSION:</entry><entry name="serial">HOST-SERIAL:</entry><entry name="uuid">d9a4f246-5e03-4cc8-8000-b0a664751d49</entry></system></sysinfo><clock offset="variable" adjustment="0"><timer name="rtc" tickpolicy="catchup"></timer><timer name="pit" tickpolicy="delay"></timer><timer name="hpet" present="no"></timer></clock><features><acpi></acpi></features><cpu match="exact"><model>Conroe</model><topology cores="1" threads="1" sockets="16"></topology><numa><cell cpus="0" memory="1048576"></cell></numa></cpu><cputune></cputune><devices><input type="mouse" bus="ps2"></input><channel type="unix"><target type="virtio" name="ovirt-guest-agent.0"></target><source mode="bind" path="/var/lib/libvirt/qemu/channels/d9a4f246-5e03-4cc8-8000-b0a664751d49.ovirt-guest-agent.0"></source></channel><channel type="unix"><target type="virtio" name="org.qemu.guest_agent.0"></target><source mode="bind" path="/var/lib/libvirt/qemu/channels/d9a4f246-5e03-4cc8-8000-b0a664751d49.org.qemu.guest_agent.0"></source></channel><controller type="virtio-serial" index="0" ports="16"></controller><rng model="virtio"><backend model="random">/dev/urandom</backend></rng><controller type="usb" model="piix3-uhci" index="0"></controller><sound model="ich6"></sound><graphics type="spice" port="-1" autoport="yes" passwd="*****" passwdValidTo="1970-01-01T00:00:01" tlsPort="-1"><channel name="main" mode="secure"></channel><channel name="inputs" mode="secure"></channel><channel name="cursor" mode="secure"></channel><channel name="playback" mode="secure"></channel><channel name="record" mode="secure"></channel><channel name="display" mode="secure"></channel><channel name="smartcard" mode="secure"></channel><channel name="usbredir" mode="secure"></channel><listen type="network" network="vdsm-ovirtmgmt"></listen></graphics><video><model type="qxl" vram="8192" heads="1" ram="65536" vgamem="16384"></model></video><controller type="scsi" model="virtio-scsi" index="0"></controller><memballoon model="virtio"><stats period="5"></stats></memballoon><channel type="spicevmc"><target type="virtio" name="com.redhat.spice.0"></target></channel><interface type="bridge"><model type="virtio"></model><link state="up"></link><source bridge="ovirtmgmt"></source><mac address="00:1a:4a:16:25:d2"></mac><filterref filter="vdsm-no-mac-spoofing"></filterref><bandwidth></bandwidth></interface><disk type="file" device="cdrom" snapshot="no"><driver name="qemu" type="raw" error_policy="report"></driver><source file="" startupPolicy="optional"></source><target dev="hdc" bus="ide"></target><readonly></readonly></disk><disk snapshot="no" type="block" device="disk"><target dev="vda" bus="virtio"></target><source dev="/rhev/data-center/mnt/blockSD/28f3176f-a7a8-499f-83f7-ee0404977e7b/images/97813ba5-9f6a-48e9-841c-e1307cd630bb/73a55d23-fd42-461a-9b0f-4b7c97bd6a3d"></source><driver name="qemu" io="native" type="qcow2" error_policy="stop" cache="none"></driver><boot 
order="1"></boot><serial>97813ba5-9f6a-48e9-841c-e1307cd630bb</serial></disk></devices><pm><suspend-to-disk enabled="no"></suspend-to-disk><suspend-to-mem enabled="no"></suspend-to-mem></pm><os><type arch="x86_64" machine="pc-i440fx-rhel7.3.0">hvm</type><smbios mode="sysinfo"></smbios></os><metadata><ovirt-tune:qos></ovirt-tune:qos><ovirt-vm:vm><minGuaranteedMemoryMb type="int">1024</minGuaranteedMemoryMb><clusterVersion>4.2</clusterVersion><ovirt-vm:custom></ovirt-vm:custom><ovirt-vm:device mac_address="00:1a:4a:16:25:d2"><ovirt-vm:custom></ovirt-vm:custom></ovirt-vm:device><ovirt-vm:device devtype="disk" name="vda"><ovirt-vm:imageID>97813ba5-9f6a-48e9-841c-e1307cd630bb</ovirt-vm:imageID><ovirt-vm:poolID>6ccfd104-5f57-4a83-a24d-9d7a5a5c92c8</ovirt-vm:poolID><ovirt-vm:volumeID>73a55d23-fd42-461a-9b0f-4b7c97bd6a3d</ovirt-vm:volumeID><ovirt-vm:domainID>28f3176f-a7a8-499f-83f7-ee0404977e7b</ovirt-vm:domainID></ovirt-vm:device><launchPaused>false</launchPaused><resumeBehavior>auto_resume</resumeBehavior></ovirt-vm:vm></metadata></domain>', 'status': 'Powering down', 'vmName': 'vm_TestCase5994_powering_o_0715481633', 'vmId': 'd9a4f246-5e03-4cc8-8000-b0a664751d49', 'devices': [{'index': 2, 'iface': 'ide', 'name': 'hdc', 'vm_custom': {}, 'format': 'raw', 'discard': False, 'vmid': 'd9a4f246-5e03-4cc8-8000-b0a664751d49', 'diskType': 'file', 'specParams': {}, 'readonly': 'True', 'alias': 'ide0-1-0', 'address': {'bus': '1', 'controller': '0', 'type': 'drive', 'target': '0', 'unit': '0'}, 'device': 'cdrom', 'path': '', 'propagateErrors': 'off', 'type': 'disk'}, {'poolID': '6ccfd104-5f57-4a83-a24d-9d7a5a5c92c8', 'reqsize': '0', 'serial': '97813ba5-9f6a-48e9-841c-e1307cd630bb', 'index': 0, 'iface': 'virtio', 'apparentsize': '1073741824', 'specParams': {}, 'cache': 'none', 'imageID': '97813ba5-9f6a-48e9-841c-e1307cd630bb', 'readonly': 'False', 'truesize': '1073741824', 'type': 'disk', 'domainID': '28f3176f-a7a8-499f-83f7-ee0404977e7b', 'volumeInfo': {'path': '/rhev/data-center/mnt/blockSD/28f3176f-a7a8-499f-83f7-ee0404977e7b/images/97813ba5-9f6a-48e9-841c-e1307cd630bb/73a55d23-fd42-461a-9b0f-4b7c97bd6a3d', 'type': 'block'}, 'format': 'cow', 'address': {'slot': '0x07', 'bus': '0x00', 'domain': '0x0000', 'type': 'pci', 'function': '0x0'}, 'device': 'disk', 'path': '/rhev/data-center/mnt/blockSD/28f3176f-a7a8-499f-83f7-ee0404977e7b/images/97813ba5-9f6a-48e9-841c-e1307cd630bb/73a55d23-fd42-461a-9b0f-4b7c97bd6a3d', 'propagateErrors': 'off', 'name': 'vda', 'vm_custom': {}, 'bootOrder': '1', 'vmid': 'd9a4f246-5e03-4cc8-8000-b0a664751d49', 'volumeID': '73a55d23-fd42-461a-9b0f-4b7c97bd6a3d', 'diskType': 'block', 'alias': 'virtio-disk0', 'discard': False, 'volumeChain': [{'domainID': '28f3176f-a7a8-499f-83f7-ee0404977e7b', 'leaseOffset': 112197632, 'volumeID': '73a55d23-fd42-461a-9b0f-4b7c97bd6a3d', 'leasePath': '/dev/28f3176f-a7a8-499f-83f7-ee0404977e7b/leases', 'imageID': '97813ba5-9f6a-48e9-841c-e1307cd630bb', 'path': '/rhev/data-center/mnt/blockSD/28f3176f-a7a8-499f-83f7-ee0404977e7b/images/97813ba5-9f6a-48e9-841c-e1307cd630bb/73a55d23-fd42-461a-9b0f-4b7c97bd6a3d'}, {'domainID': '28f3176f-a7a8-499f-83f7-ee0404977e7b', 'leaseOffset': 109051904, 'volumeID': 'b93fdf52-31eb-4188-89ed-f93f3b6a4c1d', 'leasePath': '/dev/28f3176f-a7a8-499f-83f7-ee0404977e7b/leases', 'imageID': '97813ba5-9f6a-48e9-841c-e1307cd630bb', 'path': '/rhev/data-center/mnt/blockSD/28f3176f-a7a8-499f-83f7-ee0404977e7b/images/97813ba5-9f6a-48e9-841c-e1307cd630bb/b93fdf52-31eb-4188-89ed-f93f3b6a4c1d'}]}, {'device': 'ich6', 'alias': 'sound0', 
'type': 'sound', 'address': {'slot': '0x04', 'bus': '0x00', 'domain': '0x0000', 'type': 'pci', 'function': '0x0'}}, {'device': 'memballoon', 'specParams': {'model': 'virtio'}, 'type': 'memballoon'}, {'tlsPort': '-1', 'vmid': 'd9a4f246-5e03-4cc8-8000-b0a664751d49', 'specParams': {'fileTransferEnable': True, 'copyPasteEnable': True, 'displayIp': '10.35.82.73', 'displayNetwork': 'ovirtmgmt'}, 'device': 'spice', 'type': 'spice', 'port': '-1'}, {'specParams': {'source': 'urandom'}, 'vmid': 'd9a4f246-5e03-4cc8-8000-b0a664751d49', 'alias': 'rng0', 'address': {'slot': '0x09', 'bus': '0x00', 'domain': '0x0000', 'type': 'pci', 'function': '0x0'}, 'device': 'rng', 'model': 'virtio', 'type': 'rng'}, {'device': 'virtio-serial', 'index': '0', 'type': 'virtio-serial', 'ports': '16'}, {'device': 'usb', 'index': '0', 'model': 'piix3-uhci', 'type': 'usb'}, {'device': 'scsi', 'index': '0', 'model': 'virtio-scsi', 'type': 'scsi'}, {'device': 'qxl', 'specParams': {'vram': '8192', 'vgamem': '16384', 'heads': '1', 'ram': '65536'}, 'address': {'slot': '0x02', 'bus': '0x00', 'domain': '0x0000', 'type': 'pci', 'function': '0x0'}, 'type': 'video', 'alias': 'video0'}, {'nicModel': 'pv', 'macAddr': '00:1a:4a:16:25:d2', 'linkActive': True, 'name': 'vnet0', 'alias': 'net0', 'address': {'slot': '0x03', 'bus': '0x00', 'domain': '0x0000', 'type': 'pci', 'function': '0x0'}, 'device': 'bridge', 'type': 'interface', 'network': 'ovirtmgmt'}, {'device': 'virtio-serial', 'alias': 'virtio-serial0', 'type': 'controller', 'address': {'slot': '0x05', 'bus': '0x00', 'domain': '0x0000', 'type': 'pci', 'function': '0x0'}}, {'device': 'usb', 'alias': 'usb', 'type': 'controller', 'address': {'slot': '0x01', 'bus': '0x00', 'domain': '0x0000', 'type': 'pci', 'function': '0x2'}}, {'device': 'scsi', 'alias': 'scsi0', 'type': 'controller', 'address': {'slot': '0x06', 'bus': '0x00', 'domain': '0x0000', 'type': 'pci', 'function': '0x0'}}, {'device': 'ide', 'alias': 'ide', 'type': 'controller', 'address': {'slot': '0x01', 'bus': '0x00', 'domain': '0x0000', 'type': 'pci', 'function': '0x1'}}, {'device': 'unix', 'alias': 'channel0', 'type': 'channel', 'address': {'bus': '0', 'controller': '0', 'type': 'virtio-serial', 'port': '1'}}, {'device': 'unix', 'alias': 'channel1', 'type': 'channel', 'address': {'bus': '0', 'controller': '0', 'type': 'virtio-serial', 'port': '2'}}, {'device': 'spicevmc', 'alias': 'channel2', 'type': 'channel', 'address': {'bus': '0', 'controller': '0', 'type': 'virtio-serial', 'port': '3'}}, {'device': '', 'alias': 'balloon0', 'type': 'memballoon', 'address': {'slot': '0x08', 'bus': '0x00', 'domain': '0x0000', 'type': 'pci', 'function': '0x0'}}], 'guestDiskMapping': {u'97813ba5-9f6a-48e9-8': {u'name': u'/dev/vda'}, u'QEMU_DVD-ROM_QM00003': {u'name': u'/dev/sr0'}}, 'clientIp': '', 'statusTime': '4331654500', 'arch': 'x86_64'} NotResponding is correct from oVirt perspective when it didn't die on destroy call. Further hints are likely in libvirt, so you would have to reproduce that with libvirt debug on so libvirt people can look at that. Not sure if we can do anything if it is not reproducible... Allon, is there any defined behavior for storage live migration when VM shuts down? Can you just doublecheck what actually happened with that part? (In reply to Michal Skrivanek from comment #2) > Allon, is there any defined behavior for storage live migration when VM > shuts down? Can you just doublecheck what actually happened with that part? There's no special treatment from RHV's side AFAIK (Daniel - please keep me honest here). 
Copying the backing chain is an "offline" SPM operation, and once it ends, the engine issues a VM.diskReplicateFinish call, which should just fail since the VM is down. Why do you suspect LSM?

Created attachment 1380767 [details]
logs (with libvirtd.log)

Reproduced again, this time with libvirtd.log.

engine.log:
2018-01-11 19:31:16,542+02 INFO [org.ovirt.engine.core.vdsbroker.vdsbroker.DestroyVDSCommand] (default task-5) [vms_syncAction_b76af43e-b2e2-4290] START, DestroyVDSCommand(HostName = host_mixed_3, DestroyVmVDSCommandParameters:{hostId='5c2ee991-6583-4dc2-a072-db27aa7615d7', vmId='6dd1bd5e-8e57-4c0e-8eeb-45b4f4384a13', secondsToWait='0', gracefully='false', reason='', ignoreNoVm='false'}), log id: 6b0c2475
2018-01-11 19:31:28,396+02 WARN [org.ovirt.engine.core.vdsbroker.vdsbroker.VdsBrokerObjectsBuilder] (EE-ManagedThreadFactory-engineScheduled-Thread-89) [] Failed to convert app: [[Ljava.lang.Object;] is not a string
vdsm.log:
2018-01-11 19:31:00,929+0200 WARN (vdsm.Scheduler) [Executor] Worker blocked: <Worker name=periodic/50 running <Task <Operation action=<vdsm.virt.sampling.HostMonitor object at 0x7f8666bc3910> at 0x7f8666bc3950> timeout=15, duration=30 at 0x7f8666b51a90> task#=137 at 0x7f8666bf2490>, traceback:
File: "/usr/lib64/python2.7/threading.py", line 785, in __bootstrap
self.__bootstrap_inner()
File: "/usr/lib64/python2.7/threading.py", line 812, in __bootstrap_inner
self.run()
File: "/usr/lib64/python2.7/threading.py", line 765, in run
self.__target(*self.__args, **self.__kwargs)
File: "/usr/lib/python2.7/site-packages/vdsm/common/concurrent.py", line 194, in run
ret = func(*args, **kwargs)
File: "/usr/lib/python2.7/site-packages/vdsm/executor.py", line 301, in _run
self._execute_task()
File: "/usr/lib/python2.7/site-packages/vdsm/executor.py", line 315, in _execute_task
task()
File: "/usr/lib/python2.7/site-packages/vdsm/executor.py", line 391, in __call__
self._callable()
File: "/usr/lib/python2.7/site-packages/vdsm/virt/periodic.py", line 213, in __call__
self._func()
File: "/usr/lib/python2.7/site-packages/vdsm/virt/sampling.py", line 578, in __call__
stats = hostapi.get_stats(self._cif, self._samples.stats())
File: "/usr/lib/python2.7/site-packages/vdsm/host/api.py", line 62, in get_stats
avail, commit = _memUsageInfo(cif)
File: "/usr/lib/python2.7/site-packages/vdsm/host/api.py", line 121, in _memUsageInfo
mem_info = v.memory_info()
File: "/usr/lib/python2.7/site-packages/vdsm/virt/vm.py", line 802, in memory_info
dom_stats = self._dom.memoryStats()
File: "/usr/lib/python2.7/site-packages/vdsm/virt/virdomain.py", line 98, in f
ret = attr(*args, **kwargs)
File: "/usr/lib/python2.7/site-packages/vdsm/common/libvirtconnection.py", line 130, in wrapper
ret = f(*args, **kwargs)
File: "/usr/lib/python2.7/site-packages/vdsm/common/function.py", line 92, in wrapper
return func(inst, *args, **kwargs)
File: "/usr/lib64/python2.7/site-packages/libvirt.py", line 1510, in memoryStats
ret = libvirtmod.virDomainMemoryStats(self._o) (executor:363)
libvirtd.log:
2018-01-11 17:31:46.553+0000: 1798: warning : qemuDomainObjBeginJobInternal:4707 : Cannot start job (destroy, none) for domain vm_TestCase5994_powering_o_1119121722; current job is (query, none) owned by (1795 remoteDispatchConnectGetAllDomainStats, 0 <null>) for (932s, 0s)
Raising severity as it reproduces quite frequently in our automation storage migration job. More specifically in https://polarion.engineering.redhat.com/polarion/redirect/project/RHEVM3/workitem?id=RHEVM3-5994

This feels eerily familiar, but I can't place it. Didn't we recently solve some hanging in virDomainMemoryStats?

(In reply to Allon Mureinik from comment #6)
> This feels eerily familiar, but I can't place it. Didn't we recently solve
> some hanging in virDomainMemoryStats?

We fixed a possible hang of the entire process in the libvirt Python bindings. This version of libvirt contains that fix, so it should not be related.

According to the libvirt log, it looks like the Python binding is fine, and libvirt has an internal error destroying the VM. I think this bug should move to libvirt. Since destroying a VM is a virt flow, I think the virt team should handle this.

Elad, for how long is the libvirt call stuck? We see two logs here, one stuck for 15 seconds and the other for 30 seconds. We use a timeout of 30 seconds for libvirt, so calls should not be stuck for more than 30 seconds.

The logs are from different executions.
Are you referring to the HostMonitor sampling?
The following keeps repeating every 15/30 seconds:
2018-01-11 21:55:05,792+0200 WARN (vdsm.Scheduler) [Executor] Worker blocked: <Worker name=periodic/92 running <Task <Operation action=<vdsm.virt.sampling.HostMonitor object at 0x7f8666bc3910> at 0x7f8666bc3950> timeout=15, duration=15 at 0x7f8666d8c350> task#=1092 at 0x7f8666a68850>, traceback:
2018-01-11 21:55:20,798+0200 WARN (vdsm.Scheduler) [Executor] Worker blocked: <Worker name=periodic/92 running <Task <Operation action=<vdsm.virt.sampling.HostMonitor object at 0x7f8666bc3910> at 0x7f8666bc3950> timeout=15, duration=30 at 0x7f8666d8c350> task#=1092 at 0x7f8666a68850>, traceback:
2018-01-11 21:55:50,853+0200 WARN (vdsm.Scheduler) [Executor] Worker blocked: <Worker name=periodic/92 running <Task <Operation action=<vdsm.virt.sampling.HostMonitor object at 0x7f8666bc3910> at 0x7f8666bc3950> timeout=15, duration=15 at 0x7f8666c12cd0> task#=1094 at 0x7f8666a68850>, traceback:
2018-01-11 21:56:05,855+0200 WARN (vdsm.Scheduler) [Executor] Worker blocked: <Worker name=periodic/92 running <Task <Operation action=<vdsm.virt.sampling.HostMonitor object at 0x7f8666bc3910> at 0x7f8666bc3950> timeout=15, duration=30 at 0x7f8666c12cd0> task#=1094 at 0x7f8666a68850>, traceback:
2018-01-11 21:56:35,920+0200 WARN (vdsm.Scheduler) [Executor] Worker blocked: <Worker name=periodic/88 running <Task <Operation action=<vdsm.virt.sampling.HostMonitor object at 0x7f8666bc3910> at 0x7f8666bc3950> timeout=15, duration=15 at 0x7f8666c12c90> task#=1358 at 0x7f86780dd710>, traceback:
This libvirtd.log line is suspicious:

2018-01-11 17:31:46.553+0000: 1798: warning : qemuDomainObjBeginJobInternal:4707 : Cannot start job (destroy, none) for domain vm_TestCase5994_powering_o_1119121722; current job is (query, none) owned by (1795 remoteDispatchConnectGetAllDomainStats, 0 <null>) for (932s, 0s)

Back in time, libvirt had two thread pools, one for high-priority operations and one for everything else, so high-priority tasks could still be served if the "main" thread pool runs out of resources. Now, the monitor lock *could* in theory act as a choke point (disclaimer: I have *not* yet reviewed the libvirt code) and nullify this assumption. The effect could be that a rogue query can indeed prevent the shutdown. But if this is the case, I don't see how we can avoid that at the Vdsm level; this is something that must be addressed at the libvirt level.

Additionally, at the Vdsm level we have logic in the destroy flow to:
1. retry a graceful shutdown of the VM a given number of times (should be set by Engine, AFAIK not yet implemented)
2. try a forceful shutdown if the graceful attempt fails.

Maybe we should switch to forceful if the libvirt call fails with a timeout? Not sure if it makes things better.

(In reply to Francesco Romani from comment #10)
> 2018-01-11 17:31:46.553+0000: 1798: warning : qemuDomainObjBeginJobInternal:4707 :
> Cannot start job (destroy, none) for domain vm_TestCase5994_powering_o_1119121722;
> current job is (query, none) owned by (1795 remoteDispatchConnectGetAllDomainStats,
> 0 <null>) for (932s, 0s)

Hi Peter, what would libvirt do in this case? There is one query job (e.g. virDomainGetAllDomainStats) holding the monitor lock for one domain, and we are attempting to destroy the same domain while the lock is held. Since both actions require access to the monitor, it seems to me that the only option is to wait (and maybe eventually fail with a timeout) in the destroy path for the lock to be released, am I right?

If the above is right, can we ever destroy VMs whose monitor lock is held by a stuck/unresponsive task, or can we only wait?

No, actually destroy does not need the monitor. The destroy API first kills the qemu process and only then initializes the cleanup steps. This usually triggers closing of the monitor socket, which in turn promptly unlocks and fails out of the thread waiting for the monitor.

Note that the above applies to the virDomainDestroyFlags API. The graceful shutdown API requires monitor access or guest agent access, so it may get stuck the same way.

Unless qemu does not die after being killed in the virDomainDestroy API (stuck NFS perhaps?) and does not refuse closing the monitor socket, the destroy API should work.

If the above thing happens again, it would be useful to see a backtrace to determine whether ConnectGetAllDomainStats got stuck in the monitor and also whether the qemu process can actually be killed.

(In reply to Peter Krempa from comment #12)
> No, actually destroy does not need the monitor. The destroy API first kills the
> qemu process and only then initializes the cleanup steps. This usually triggers
> closing of the monitor socket, which in turn promptly unlocks and fails out of
> the thread waiting for the monitor.
>
> Note that the above applies to the virDomainDestroyFlags API. The graceful
> shutdown API requires monitor access or guest agent access, so it may get stuck
> the same way.
>
> Unless qemu does not die after being killed in the virDomainDestroy API (stuck
> NFS perhaps?) and does not refuse closing the monitor socket, the destroy API
> should work.
>
> If the above thing happens again, it would be useful to see a backtrace to
> determine whether ConnectGetAllDomainStats got stuck in the monitor and also
> whether the qemu process can actually be killed.

Thanks Peter. It seems we can handle this in the management layer; libvirt seems OK at this moment - we tried only graceful at this point. So we need to decide how we want to handle this case in Vdsm.
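For illustration only, here is a minimal libvirt-python sketch of the flow discussed above (this is not vdsm's actual code): retry a graceful shutdown, which goes through the monitor or the guest agent and can therefore hang behind a long-running query job, then fall back to virDomainDestroyFlags, which kills the qemu process without waiting on the monitor. The VM name, retry count, and wait time are illustrative placeholders.

```python
import time

import libvirt

# Illustrative placeholders; vdsm's real retry counts and timeouts differ.
VM_NAME = "vm_TestCase5994_powering_o_1119121722"
GRACEFUL_RETRIES = 3
WAIT_SECONDS = 10


def _still_running(dom):
    try:
        return bool(dom.isActive())
    except libvirt.libvirtError:
        return False  # transient domain already gone


def power_off(conn, name):
    dom = conn.lookupByName(name)

    # 1. Retry a graceful shutdown (ACPI power button). This path needs the
    #    monitor or the guest agent, so it can hang behind a stuck query job
    #    such as remoteDispatchConnectGetAllDomainStats.
    for _ in range(GRACEFUL_RETRIES):
        try:
            dom.shutdownFlags(libvirt.VIR_DOMAIN_SHUTDOWN_ACPI_POWER_BTN)
        except libvirt.libvirtError:
            break  # could not start the shutdown job; give up on graceful
        time.sleep(WAIT_SECONDS)
        if not _still_running(dom):
            return

    # 2. Fall back to a forceful destroy. virDomainDestroyFlags first kills
    #    the qemu process and only then runs cleanup, so it does not wait on
    #    the monitor lock held by the stuck query job.
    if _still_running(dom):
        dom.destroyFlags(libvirt.VIR_DOMAIN_DESTROY_DEFAULT)


if __name__ == "__main__":
    connection = libvirt.open("qemu:///system")
    try:
        power_off(connection, VM_NAME)
    finally:
        connection.close()
```

The sketch only shows the graceful-then-forceful ordering and why the forceful path is not blocked by the stats job; where exactly such a decision would live in vdsm is the open question above.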
*** Bug 1539360 has been marked as a duplicate of this bug. ***

Looks like we have a libvirt bug in this area: https://www.redhat.com/archives/libvirt-users/2018-January/msg00039.html

Waiting for the libvirt fix. The depends-on bug is already verified.

*** Bug 1544203 has been marked as a duplicate of this bug. ***

We just need to pull libvirt-3.9.0-9.el7 and test it again. No further changes in Vdsm are needed.

No doc_text needed; we changed nothing in Vdsm.

--------------------------------------
Tested with the following code:
----------------------------------------
ovirt-engine-4.2.2-0.1.el7.noarch
vdsm-4.20.18-1.el7ev.x86_64

Tested with the following scenario:

Steps to Reproduce:
1. Try to live migrate a VM disk while the VM is in the 'Powering off' state (an illustrative sketch of this scenario follows at the end of this report).

Actual results:
VM is powering off successfully

Expected results:

Moving to VERIFIED!

This bugzilla is included in the oVirt 4.2.2 release, published on March 28th 2018. Since the problem described in this bug report should be resolved in oVirt 4.2.2, it has been closed with a resolution of CURRENT RELEASE. If the solution does not work for you, please open a new bug report.
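For reference, the verification scenario above could be scripted roughly as follows with the oVirt Python SDK (ovirtsdk4). This is a hypothetical sketch, not the automation actually used for Polarion case RHEVM3-5994; the engine URL, credentials, VM name, and target storage domain are placeholders, and in the real test the disk move must be issued while the VM is still powering off.

```python
import ovirtsdk4 as sdk
import ovirtsdk4.types as types

# Placeholder connection details.
connection = sdk.Connection(
    url="https://engine.example.com/ovirt-engine/api",
    username="admin@internal",
    password="password",
    insecure=True,
)

try:
    system = connection.system_service()
    vms_service = system.vms_service()
    vm = vms_service.list(search="name=test_vm")[0]
    vm_service = vms_service.vm_service(vm.id)

    # Power the VM off; it transitions through the 'Powering down' state.
    vm_service.stop()

    # While the VM is powering off, ask the engine to move its disk to
    # another storage domain (live storage migration).
    attachment = vm_service.disk_attachments_service().list()[0]
    disk_service = system.disks_service().disk_service(attachment.disk.id)
    disk_service.move(storage_domain=types.StorageDomain(name="target_sd"))

    # Expected outcome with the fixed libvirt: the VM powers off successfully
    # instead of remaining stuck in the shutting-down state.
finally:
    connection.close()
```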