Bug 1647979
| Summary: | memory leak when having multiple dynflow executors and sending more candlepin events to LOCE task | ||
|---|---|---|---|
| Product: | Red Hat Satellite | Reporter: | Pavel Moravec <pmoravec> |
| Component: | Tasks Plugin | Assignee: | satellite6-bugs <satellite6-bugs> |
| Status: | CLOSED DUPLICATE | QA Contact: | |
| Severity: | high | Docs Contact: | |
| Priority: | high | ||
| Version: | 6.4 | CC: | ahumbe, aruzicka, inecas |
| Target Milestone: | Unspecified | Keywords: | PrioBumpGSS, Triaged |
| Target Release: | Unused | ||
| Hardware: | x86_64 | ||
| OS: | Linux | ||
| Whiteboard: | |||
| Fixed In Version: | Doc Type: | If docs needed, set a value | |
| Doc Text: | Story Points: | --- | |
| Clone Of: | Environment: | ||
| Last Closed: | 2018-12-07 14:08:41 UTC | Type: | Bug |
| Regression: | --- | Mount Type: | --- |
| Documentation: | --- | CRM: | |
| Verified Versions: | Category: | --- | |
| oVirt Team: | --- | RHEL 7.3 requirements from Atomic Host: | |
| Cloudforms Team: | --- | Target Upstream Version: | |
| Embargoed: | |||
aruzicka++ fix that seems to work well on my 6.4 system: sed -i 's/world.dynflow_/::Rails./g' /opt/theforeman/tfm/root/usr/share/gems/gems/dynflow-*/lib/dynflow/rails/daemon.rb (In reply to Pavel Moravec from comment #1) > aruzicka++ fix that seems to work well on my 6.4 system: > > sed -i 's/world.dynflow_/::Rails./g' > /opt/theforeman/tfm/root/usr/share/gems/gems/dynflow-*/lib/dynflow/rails/ > daemon.rb Forgot to mention: that was a fix for 6.4, haven't tried on 6.3. BUT Letting the reproducer run with this fix on 6.4, I got: E, [2018-11-08T21:16:17.573255 #28105] ERROR -- : stack level too deep (SystemStackError) /opt/theforeman/tfm-ror51/root/usr/share/gems/gems/concurrent-ruby-1.0.5/lib/concurrent/synchronization/mri_lockable_object.rb:35:in `synchronize' /opt/theforeman/tfm-ror51/root/usr/share/gems/gems/concurrent-ruby-1.0.5/lib/concurrent/atomic/event.rb:49:in `set?' /opt/theforeman/tfm-ror51/root/usr/share/gems/gems/concurrent-ruby-1.0.5/lib/concurrent/executor/ruby_executor_service.rb:67:in `ns_running?' /opt/theforeman/tfm-ror51/root/usr/share/gems/gems/concurrent-ruby-1.0.5/lib/concurrent/executor/abstract_executor_service.rb:42:in `block in running?' /opt/theforeman/tfm-ror51/root/usr/share/gems/gems/concurrent-ruby-1.0.5/lib/concurrent/synchronization/mri_lockable_object.rb:36:in `synchronize' /opt/theforeman/tfm-ror51/root/usr/share/gems/gems/concurrent-ruby-1.0.5/lib/concurrent/executor/abstract_executor_service.rb:42:in `running?' 
/opt/theforeman/tfm-ror51/root/usr/share/gems/gems/concurrent-ruby-1.0.5/lib/concurrent/executor/ruby_executor_service.rb:21:in `block in post' /opt/theforeman/tfm-ror51/root/usr/share/gems/gems/concurrent-ruby-1.0.5/lib/concurrent/synchronization/mri_lockable_object.rb:38:in `block in synchronize' /opt/theforeman/tfm-ror51/root/usr/share/gems/gems/concurrent-ruby-1.0.5/lib/concurrent/synchronization/mri_lockable_object.rb:38:in `synchronize' /opt/theforeman/tfm-ror51/root/usr/share/gems/gems/concurrent-ruby-1.0.5/lib/concurrent/synchronization/mri_lockable_object.rb:38:in `synchronize' /opt/theforeman/tfm-ror51/root/usr/share/gems/gems/concurrent-ruby-1.0.5/lib/concurrent/executor/ruby_executor_service.rb:19:in `post' /opt/theforeman/tfm-ror51/root/usr/share/gems/gems/concurrent-ruby-1.0.5/lib/concurrent/executor/serialized_execution.rb:77:in `call_job' /opt/theforeman/tfm-ror51/root/usr/share/gems/gems/concurrent-ruby-1.0.5/lib/concurrent/executor/serialized_execution.rb:104:in `ensure in work' /opt/theforeman/tfm-ror51/root/usr/share/gems/gems/concurrent-ruby-1.0.5/lib/concurrent/executor/serialized_execution.rb:104:in `work' /opt/theforeman/tfm-ror51/root/usr/share/gems/gems/concurrent-ruby-1.0.5/lib/concurrent/executor/serialized_execution.rb:86:in `call_job' /opt/theforeman/tfm-ror51/root/usr/share/gems/gems/concurrent-ruby-1.0.5/lib/concurrent/executor/serialized_execution.rb:104:in `ensure in work' /opt/theforeman/tfm-ror51/root/usr/share/gems/gems/concurrent-ruby-1.0.5/lib/concurrent/executor/serialized_execution.rb:104:in `work' .. .. 
/opt/theforeman/tfm-ror51/root/usr/share/gems/gems/concurrent-ruby-1.0.5/lib/concurrent/executor/serialized_execution.rb:86:in `call_job' /opt/theforeman/tfm-ror51/root/usr/share/gems/gems/concurrent-ruby-1.0.5/lib/concurrent/executor/serialized_execution.rb:104:in `work' /opt/theforeman/tfm-ror51/root/usr/share/gems/gems/concurrent-ruby-1.0.5/lib/concurrent/executor/serialized_execution.rb:86:in `call_job' /opt/theforeman/tfm-ror51/root/usr/share/gems/gems/concurrent-ruby-1.0.5/lib/concurrent/executor/serialized_execution.rb:104:in `ensure in work' /opt/theforeman/tfm-ror51/root/usr/share/gems/gems/concurrent-ruby-1.0.5/lib/concurrent/executor/serialized_execution.rb:104:in `work' /opt/theforeman/tfm-ror51/root/usr/share/gems/gems/concurrent-ruby-1.0.5/lib/concurrent/executor/serialized_execution.rb:77:in `block in call_job' /opt/theforeman/tfm-ror51/root/usr/share/gems/gems/concurrent-ruby-1.0.5/lib/concurrent/executor/ruby_thread_pool_executor.rb:348:in `run_task' /opt/theforeman/tfm-ror51/root/usr/share/gems/gems/concurrent-ruby-1.0.5/lib/concurrent/executor/ruby_thread_pool_executor.rb:337:in `block (3 levels) in create_worker' /opt/theforeman/tfm-ror51/root/usr/share/gems/gems/concurrent-ruby-1.0.5/lib/concurrent/executor/ruby_thread_pool_executor.rb:320:in `loop' /opt/theforeman/tfm-ror51/root/usr/share/gems/gems/concurrent-ruby-1.0.5/lib/concurrent/executor/ruby_thread_pool_executor.rb:320:in `block (2 levels) in create_worker' /opt/theforeman/tfm-ror51/root/usr/share/gems/gems/concurrent-ruby-1.0.5/lib/concurrent/executor/ruby_thread_pool_executor.rb:319:in `catch' /opt/theforeman/tfm-ror51/root/usr/share/gems/gems/concurrent-ruby-1.0.5/lib/concurrent/executor/ruby_thread_pool_executor.rb:319:in `block in create_worker' /opt/theforeman/tfm/root/usr/share/gems/gems/logging-2.2.2/lib/logging/diagnostic_context.rb:474:in `block in create_with_logging_context' after logging (maybe unrelated): I, [2018-11-08T21:16:17.465628 #28105] INFO -- 
/default_dead_letter_handler: got dead letter #<Concurrent::Actor::Envelope:0x00007fa291a870b0 @message=[:dispatch_response, Dynflow::Dispatcher::Envelope[request_id: 74287, sender_id: 048b98da-64b1-41aa-b00e-2832dee8401c, receiver_id: 048b98da-64b1-41aa-b00e-2832dee8401c, message: Dynflow::Dispatcher::Failed[error: step b20c3591-ac5e-4eae-83d1-3a76d13e711f:2 dropping event Dynflow::Director::Event[execution_plan_id: b20c3591-ac5e-4eae-83d1-3a76d13e711f, step_id: 2, event: Actions::Candlepin::ListenOnCandlepinEvents::Event[message_id: ab9d159f-0516-361a-bdc2-cd86c43b92a6, subject: compliance.created, content: {"id":null,"type":"CREATED","target":"COMPLIANCE","targetName":"rhel7u3-2.gsslab.brq.redhat.com","principalStore":"{\"type\":\"trusteduser\",\"name\":\"foreman_admin\"}","timestamp":1541691652065,"entityId":"38537d9e-6e4f-4c57-a9b2-72a1005644ba","ownerId":"8aac017e66f386d90166f38788860001","consumerUuid":"38537d9e-6e4f-4c57-a9b2-72a1005644ba","referenceId":null,"referenceType":null,"eventData":"{\"reasons\":[{\"productName\":\"Red Hat Enterprise Linux Server\",\"message\":\"Not supported by a valid subscription.\"}],\"status\":\"invalid\"}","messageText":null,"principal":{"type":"trusteduser","name":"foreman_admin"}}], result: <#Concurrent::Edge::CompletableFuture:0x7fa2754b3510 pending>]]]], @future=nil, @sender=#<Concurrent::Actor::Reference:0x74afce8 /connector-database-core (Dynflow::Connectors::Database::Core)>, @address=#<Concurrent::Actor::Reference:0x757ecf0 /client-dispatcher (Dynflow::Dispatcher::ClientDispatcher)>> and LOCE task is "running" but dropped connection to qpidd - VERY dangerous. We're currently tracking the issue in https://bugzilla.redhat.com/show_bug.cgi?id=1654217 and it's already proposed for 6.4.z *** This bug has been marked as a duplicate of bug 1654217 *** |
Description of problem: When having multiple dynflow executors and candlepin is frequently sending events via katello_event_queue to LOCE task, dynflow_executor process grows in memory consumption over time. This process is expected to be restarted after reaching some configurable threshold, but this does not happen. Just OOM killer does the ultimate kill of the process :). Version-Release number of selected component (if applicable): Sat 6.4, seen also in 6.3 How reproducible: 100% Steps to Reproduce: 1. Tune dynflow setting to speedup reproducer: in /etc/sysconfig/dynflowd (or foreman-tasks for 6.3), set: EXECUTORS_COUNT=2 EXECUTOR_MEMORY_LIMIT=300mb EXECUTOR_MEMORY_MONITOR_DELAY=600 EXECUTOR_MEMORY_MONITOR_INTERVAL=60 2. restart foreman-tasks service to apply the tuning 3. generate many candlepin events (register a system, attach subscription, detach it, unregister) - see additional info to speed this up 4. monitor dynflow_executor processes memory utilization over time Actual results: 4. shows memory growth of the still same process Expected results: 4. 
memory growth exceeds the EXECUTOR_MEMORY_LIMIT but only for a few minutes (per delay/interval setting) until new process is respawned Additional info: To speed up reproducer / mimic many candlepin events: - disable SSL to qpidd to simplify further work: in /etc/qpid/qpidd.conf, comment out "require-encryption" and optionally increase "default-queue-limit" value: #require-encryption=yes default-queue-limit=1048576000 - and restart the services: systemctl restart qpidd.service - temporarily make katello_event_queue non-durable (otherwise journals will shout about duplicate messages), and add some more qpid provisioning: qpid-config del queue katello_event_queue --force qpid-config add queue katello_event_queue qpid-config add queue katello_event_queue_2 for key in compliance.created entitlement.created entitlement.deleted pool.created pool.deleted; do qpid-config bind event katello_event_queue $key qpid-config bind event katello_event_queue_2 $key done qpid-config add queue katello_event_queue_3 qpid-config add exchange fanout event_2 qpid-config bind event_2 katello_event_queue_3 - restart foreman-tasks just to be sure it starts with clean table: systemctl restart foreman-tasks.service - have below C++ program "reroute_queue.cpp" to re-route messages from queues to exchanges: #include <qpid/messaging/Connection.h> #include <qpid/messaging/Session.h> #include <qpid/messaging/Sender.h> #include <qpid/messaging/Receiver.h> #include <qpid/messaging/Message.h> #include <qpid/messaging/Address.h> #include <iostream> #include <cstdlib> using namespace std; using namespace qpid::messaging; using namespace qpid::types; int main(int argc, char** argv) { if (argc < 4) { cerr << "Invalid number of parameters, expecting: queue_name, quantity, exchange_name" << endl; return 1; } string queue_name = argv[1]; uint32_t qty = atoi(argv[2]); string exchange_name = argv[3]; Connection connection(argc>4?argv[4]:"localhost:5672"); connection.open(); Session session = 
connection.createSession(); Sender sender = session.createSender("qmf.default.direct/broker"); Address responseQueue("#reply-queue; {create:always, node:{x-declare:{auto-delete:true}}}"); Receiver receiver = session.createReceiver(responseQueue); Message message; Variant::Map content; Variant::Map OID; Variant::Map arguments; string object_name = "org.apache.qpid.broker:queue:" + queue_name; OID["_object_name"] = object_name; arguments["request"] = qty; arguments["useAltExchange"] = false; arguments["exchange"] = exchange_name; content["_object_id"] = OID; content["_method_name"] = "reroute"; content["_arguments"] = arguments; encode(content, message); message.setReplyTo(responseQueue); message.setProperty("x-amqp-0-10.app-id", "qmf2"); message.setProperty("qmf.opcode", "_method_request"); message.setContentType("amqp/map"); Variant::Map map; decode(message, map); sender.send(message, true); Message response; if (receiver.fetch(response,qpid::messaging::Duration(30000)) == true) { qpid::types::Variant::Map recv_props = response.getProperties(); if (recv_props["x-amqp-0-10.app-id"] == "qmf2") if (recv_props["qmf.opcode"] == "_method_response") std::cout << "Response: OK" << std::endl; else if (recv_props["qmf.opcode"] == "_exception") std::cerr << "Error: " << response.getContent() << std::endl; else std::cerr << "Invalid response received!" << std::endl; else std::cerr << "Invalid response not of qmf2 type received!" << std::endl; } else std::cout << "Timeout: No response received within 30 seconds!" 
<< std::endl; receiver.close(); sender.close(); session.close(); connection.close(); return 0; } - Compile it: yum install gcc-c++ -y g++ -Wall -lqpidclient -lqpidcommon -lqpidmessaging -lqpidtypes reroute_queue.cpp -o reroute_queue - now generate some candlepin events - check in "qpid-stat -q | grep katello_event_queue" that katello_event_queue_2 has some messages - re-route the messages to katello_event_queue again: ./reroute_queue katello_event_queue_2 0 event_2 127.0.0.1:5672 ./reroute_queue katello_event_queue_3 0 event 127.0.0.1:5672 - check "qpid-stat -q | grep katello_event_queue" again - stats will be updated on all three queues - repeat the two "reroute_queue" commands in a loop - once the testing finishes, revert the changes (esp. make katello_event_queue queue durable again): service qpidd restart qpid-config add queue katello_event_queue --durable for key in compliance.created entitlement.created entitlement.deleted pool.created pool.deleted; do qpid-config bind event katello_event_queue $key done service foreman-tasks restart