Hide Forgot
Description of problem: Qpid low level replaying_sender hangs at failover. Solaris clients compiled from failover/ examples do not work as expected. Sender (replaying_sender) do not fail over to other known cluster nodes. There are two observations: alpha] client hangs forever (> 30 minutes) Terminal transcript: bash-3.00$ LD_LIBRARY_PATH=${QPID_HOME}/lib:${QPID_HOME}/lib/ossxtra:/usr/local/lib:${HOME}/croot/lib:${LD_LIBRARY_PATH} ./declare_queues 10.34.26.114 5672 2012-02-08 10:49:02 warning Session was not closed cleanly: anonymous.dff85874-2dee-ef61-b8a2-d279bee0bb39 2012-02-08 10:49:02 warning Ignoring frame while closing connection: Frame[BEbe; channel=2; {SessionDetachedBody: name=dff85874-2dee-ef61-b8a2-d279bee0bb39; code=0; }] bash-3.00$ LD_LIBRARY_PATH=${QPID_HOME}/lib:${QPID_HOME}/lib/ossxtra:/usr/local/lib:${HOME}/croot/lib:${LD_LIBRARY_PATH} ./replaying_sender 10.34.26.112 5672 15000 sent 1000 of 15000 sent 2000 of 15000 sent 3000 of 15000 sent 4000 of 15000 sent 5000 of 15000 sent 6000 of 15000 sent 7000 of 15000 sent 8000 of 15000 sent 9000 of 15000 sent 10000 of 15000 sent 11000 of 15000 <hang> At the <hang> 10.34.26.112's qpidd got restarted. Cluster is still functional: [root@dhcp-26-112 ~]# qpid-cluster ; qpid-stat -bc Cluster Name: soltestcluster Cluster Status: ACTIVE Cluster Size: 2 Members: ID=192.168.1.5:27491 URL=amqp:tcp:10.34.26.112:5672,tcp:192.168.1.5:5672 : ID=192.168.1.6:8457 URL=amqp:tcp:10.34.26.114:5672,tcp:192.168.1.6:5672 Connections client-addr cproc cpid auth connected idle msgIn msgOut ======================================================================================== 127.0.0.1:5672-127.0.0.1:45543 anonymous 0s 0s 224 247 127.0.0.1:5672-127.0.0.1:45539 anonymous 31m 24s 0s 1.20k 46 [root@dhcp-26-114 ~]# qpid-cluster ; qpid-stat -bc Cluster Name: soltestcluster Cluster Status: ACTIVE Cluster Size: 2 Members: ID=192.168.1.5:27491 URL=amqp:tcp:10.34.26.112:5672,tcp:192.168.1.5:5672 : ID=192.168.1.6:8457 URL=amqp:tcp:10.34.26.114:5672,tcp:192.168.1.6:5672 Connections client-addr cproc cpid auth connected idle msgIn msgOut ======================================================================================== 127.0.0.1:5672-127.0.0.1:58507 anonymous 32m 25s 0s 1.29k 76 127.0.0.1:5672-127.0.0.1:58516 anonymous 0s 0s 224 247 Hang dump of the replaying_sender process bash-3.00# pstack 1423 1423: ./replaying_sender 10.34.26.112 5672 15000 ----------------- lwp# 1 / thread# 1 -------------------- febc9590 lwp_park (0, 0, 0) febc35d8 cond_wait_queue (28bb0, 28b98, 0, 0, 28bb0, 0) + 4c febc3a1c cond_wait_common (28bb0, 28b98, 0, 0, 0, 0) + 294 febc3b20 cond_wait (28bb0, 28b98, 1, 15555555, 28bb0, 1a2f9c) + 10 febc3b5c pthread_cond_wait (28bb0, 28b98, 1, 0, ffbff428, ffbff42c) + 8 ff2ceb5c _ZN4qpid6client6Bounds6expandEjb (1, 42, 1, 15555555, 28bb0, 1a2f9c) + 80 ff31db14 _ZN4qpid6client11SessionImpl9sendFrameERNS_7framing8AMQFrameEb (704c8, ffbff480, 1, 0, ffbff428, ffbff42c) + 24 ff31eb08 _ZN4qpid6client11SessionImpl11sendContentERKNS_7framing13MethodContentE (704c8, 1a2f08, 1, 15555555, 1a2f98, 1a2f9c) + c0 ff323dd8 _ZN4qpid6client11SessionImpl11sendCommandERKNS_7framing7AMQBodyEPKNS2_13MethodContentE (704c8, ffbff5d4, 1a2f08, 0, 1a2734, 0) + 1a4 ff323ec8 _ZN4qpid6client11SessionImpl4sendERKNS_7framing7AMQBodyERKNS2_13MethodContentE (ffbff5e8, ffbff5d4, 1a2f08, 15555555, 1a2f98, 1a2f9c) + 18 ff2be0a8 _ZN4qpid6client10no_keyword17AsyncSession_0_1015messageTransferERKSshhRKNS0_7MessageEb (ffbff670, 1a2738, 1, 0, 1a2734, 0) + 7c ff31a3ac _ZN4qpid6client20MessageReplayTracker12ReplayRecord4sendERS1_ (1a2730, ffbffb20, ffbff820, 15555555, 1a2f98, 1a2f9c) + 70 ff31a7bc _ZN4qpid6client20MessageReplayTracker4sendERKNS0_7MessageERKSs (ffbffb20, ffbffb40, ffbff820, 0, 0, 0) + b4 00012670 _ZN6Sender7executeERN4qpid6client17AsyncSession_0_10Eb (ffbffb1c, ffbff9c0, 0, 15555555, ffbffa40, 29604) + 354 ff2ff5e4 _ZN4qpid6client15FailoverManager7executeERNS1_7CommandE (ffbffa40, ffbffb1c, 3a98, 0, 0, 0) + 228 00012c24 main (4, ffbffbb4, ffbffbc8, 24d28, fef70300, 0) + 1f0 00011d7c _start (0, 0, 0, 0, 0, 0) + 5c ----------------- lwp# 2 / thread# 2 -------------------- febccdbc pollsys (28608, 2, 0, 0) feb6367c poll (28608, 2, ffffffff, 10, 0, 25ea4) + 7c ff1463a0 _ZN4qpid3sys13PollerPrivate11EventStream8getEventENS0_7AbsTimeE (281e0, fe87bbe0, 1, 2821c, 28210, 28228) + 264 ff146c5c _ZN4qpid3sys13PollerPrivate11EventStream4nextENS0_8DurationE (281e0, fe87bdf0, fe8c0218, 10, 0, fe87bf30) + 170 ff142218 _ZN4qpid3sys6Poller4waitENS0_8DurationE (267e8, fe87be60, 0, fe8c0240, fec303d8, 1) + 74 ff1422cc _ZN4qpid3sys6Poller3runEv (267e8, 0, 0, 0, 0, 0) + 68 ff1395b4 _ZN4qpid3sys12_GLOBAL__N_111runRunnableEPv (0, fe87c000, 0, 0, ff1395a4, 1) + 10 febc94f0 _lwp_start (0, 0, 0, 0, 0, 0) ----------------- lwp# 3 / thread# 3 -------------------- febc9590 lwp_park (0, 0, 0) febc35d8 cond_wait_queue (70068, 70050, 0, 0, ff1395a4, 0) + 4c febc3a1c cond_wait_common (70068, 70050, 0, 0, 0, 0) + 294 febc3b20 cond_wait (70068, 70050, fe3fb738, 0, ff1395a4, 70120) + 10 febc3b5c pthread_cond_wait (70068, 70050, 53c, ff377c98, 4f4, ffffffff) + 8 ff2fc7d0 _ZN4qpid3sys13BlockingQueueIN5boost10shared_ptrINS_7framing8FrameSetEEEE3popERS6_NS0_8DurationE (70050, fe3fbe34, fe3fb738, 0, ff1395a4, 70120) + 290 ff2f87bc _ZN4qpid6client10Dispatcher3runEv (fe3fbe34, cac, 53c, ff377c98, 4f4, fee81034) + f8 ff3300e0 _ZN4qpid6client19SubscriptionManager3runEv (6fc9c, fe3fc000, 0, 0, ff1395a4, 1) + 14 ff302b70 _ZN4qpid6client16FailoverListener3runEv (6fc58, 0, 0, 0, 0, 0) + 4 ff1395b4 _ZN4qpid3sys12_GLOBAL__N_111runRunnableEPv (0, fe3fc000, 0, 0, ff1395a4, 1) + 10 febc94f0 _lwp_start (0, 0, 0, 0, 0, 0) beta] client hangs for few minutes (~4 minutes in below case) Terminal transcript: bash-3.00$ LD_LIBRARY_PATH=${QPID_HOME}/lib:${QPID_HOME}/lib/ossxtra:/usr/local/lib:${HOME}/croot/lib:${LD_LIBRARY_PATH} ./declare_queues 10.34.26.114 5672 2012-02-08 10:43:11 warning Session was not closed cleanly: anonymous.4fa4e043-91e4-c075-f882-d85af4af506a 2012-02-08 10:43:11 warning Ignoring frame while closing connection: Frame[BEbe; channel=2; {SessionDetachedBody: name=4fa4e043-91e4-c075-f882-d85af4af506a; code=0; }] bash-3.00$ LD_LIBRARY_PATH=${QPID_HOME}/lib:${QPID_HOME}/lib/ossxtra:/usr/local/lib:${HOME}/croot/lib:${LD_LIBRARY_PATH} ./replaying_sender 10.34.26.112 5672 15000 sent 1000 of 15000 sent 2000 of 15000 sent 3000 of 15000 sent 4000 of 15000 sent 5000 of 15000 sent 6000 of 15000 sent 7000 of 15000 2012-02-08 10:43:43 warning Connection [10.16.48.204:42367-10.34.26.112:5672] closed 2012-02-08 10:43:43 warning Connect failed: Connection refused 2012-02-08 10:43:43 warning Connection closed 2012-02-08 10:47:28 warning Connect failed: Connection timed out 2012-02-08 10:47:28 warning Connection closed sent 8000 of 15000 sent 9000 of 15000 sent 10000 of 15000 sent 11000 of 15000 sent 12000 of 15000 sent 13000 of 15000 sent 14000 of 15000 sent 15000 of 15000 Sent 15000 messages. bash-3.00$ Note ping statistics Solaris machine to 10.34.26.112/114 are about these: round-trip (ms) min/avg/max/stddev = 129./143./151./9.1 Version-Release number of selected component (if applicable): qpid-soleap0.tar.gz (2011-12-13) How reproducible: 40% (total hang) / 40% (hang for couple of minutes) Steps to Reproduce: 0. cluster with two nodes <host-A> and <host-B> 1. failover/declare_queues <host-A> <port> 2. failover/replaying_sender <host-B> <port> <N> 3. restart broker on <host-B> Actual results: Failover hangs. Expected results: Failover should work. Additional info:
Above configuration was tested in following environment: client machine (Solaris 10u9): qpid-soleap0.tar.gz (2011-12-13) cluster machines (rhel 5.7 i386+x86_64): [root@dhcp-26-114 ~]# rpm -qa | grep qpid | sort python-qpid-0.14-1.el5 python-qpid-qmf-0.14-2.el5 qpid-cpp-client-0.14-5.el5 qpid-cpp-client-devel-0.14-5.el5 qpid-cpp-client-devel-docs-0.14-5.el5 qpid-cpp-client-rdma-0.14-5.el5 qpid-cpp-client-ssl-0.14-5.el5 qpid-cpp-mrg-debuginfo-0.14-5.el5 qpid-cpp-server-0.14-5.el5 qpid-cpp-server-cluster-0.14-5.el5 qpid-cpp-server-devel-0.14-5.el5 qpid-cpp-server-rdma-0.14-5.el5 qpid-cpp-server-ssl-0.14-5.el5 qpid-cpp-server-store-0.14-5.el5 qpid-cpp-server-xml-0.14-5.el5 qpid-java-client-0.14-1.el5 qpid-java-common-0.14-1.el5 qpid-java-example-0.14-1.el5 qpid-qmf-0.14-2.el5 qpid-qmf-debuginfo-0.14-2.el5 qpid-qmf-devel-0.14-2.el5 qpid-tests-0.14-1.el5 qpid-tools-0.14-1.el5 rh-qpid-cpp-tests-0.14-5.el5 ruby-qpid-qmf-0.14-2.el5 When retested in pure linux environment no issue was detected.
The same issue (type alpha], hanged forever - more precisely more than 10 hours) detected when failing over with receiver (resuming_receiver). Hang at: bash-3.00# pstack 1522 1522: ./resuming_receiver 10.34.26.114 5672 ----------------- lwp# 1 / thread# 1 -------------------- febc9590 lwp_park (0, 0, 0) febc35d8 cond_wait_queue (75100, 750e8, 0, 0, 0, 0) + 4c febc3a1c cond_wait_common (75100, 750e8, 0, 0, 0, 0) + 294 febc3b20 cond_wait (75100, 750e8, ffbfee98, ffbff6bc, 0, 751b8) + 10 febc3b5c pthread_cond_wait (75100, 750e8, 53c, ff377c98, 4f4, ffffffff) + 8 ff2fc7d0 _ZN4qpid3sys13BlockingQueueIN5boost10shared_ptrINS_7framing8FrameSetEEEE3popERS6_NS0_8DurationE (750e8, ffbff594, ffbfee98, ffbff6bc, 0, 751b8) + 290 ff2f87bc _ZN4qpid6client10Dispatcher3runEv (ffbff594, cac, 53c, ff377c98, 4f4, fee81034) + f8 ff3300e0 _ZN4qpid6client19SubscriptionManager3runEv (ffbff6a0, ffbff6b0, ffbff6b4, ffbff6bc, 0, 0) + 14 00012ed8 _ZN8Listener7executeERN4qpid6client17AsyncSession_0_10Eb (ffbff9a4, ffbff848, 0, 15555555, ffbff8c8, 2df84) + 2a8 ff2ff5e4 _ZN4qpid6client15FailoverManager7executeERNS1_7CommandE (ffbff8c8, ffbff9a8, ffbff9ac, 0, 0, 0) + 228 000130fc main (3, ffbffa2c, ffbffa3c, 29678, fef70300, 0) + c8 00012384 ???????? (0, 0, 0, 0, 0, 0) ----------------- lwp# 2 / thread# 2 -------------------- febccdbc pollsys (2cf88, 2, 0, 0) feb6367c poll (2cf88, 2, ffffffff, 0, ffffffff, 2a5f4) + 7c ff1463a0 _ZN4qpid3sys13PollerPrivate11EventStream8getEventENS0_7AbsTimeE (2ca80, fe87bbe0, 1, 2cabc, 2cab0, 2cac8) + 264 ff146c5c _ZN4qpid3sys13PollerPrivate11EventStream4nextENS0_8DurationE (2ca80, fe87bdf0, fe8c0218, 10, 0, fe87bf30) + 170 ff142218 _ZN4qpid3sys6Poller4waitENS0_8DurationE (2b168, fe87be60, 0, fe8c0240, fec303d8, 1) + 74 ff1422cc _ZN4qpid3sys6Poller3runEv (2b168, 0, 0, 0, 0, 0) + 68 ff1395b4 _ZN4qpid3sys12_GLOBAL__N_111runRunnableEPv (0, fe87c000, 0, 0, ff1395a4, 1) + 10 febc94f0 _lwp_start (0, 0, 0, 0, 0, 0) ----------------- lwp# 3 / thread# 3 -------------------- febc9590 lwp_park (0, 0, 0) febc35d8 cond_wait_queue (74990, 74978, 0, 0, ff1395a4, 0) + 4c febc3a1c cond_wait_common (74990, 74978, 0, 0, 0, 0) + 294 febc3b20 cond_wait (74990, 74978, fe3fb738, 0, ff1395a4, 74a48) + 10 febc3b5c pthread_cond_wait (74990, 74978, 53c, ff377c98, 4f4, ffffffff) + 8 ff2fc7d0 _ZN4qpid3sys13BlockingQueueIN5boost10shared_ptrINS_7framing8FrameSetEEEE3popERS6_NS0_8DurationE (74978, fe3fbe34, fe3fb738, 0, ff1395a4, 74a48) + 290 ff2f87bc _ZN4qpid6client10Dispatcher3runEv (fe3fbe34, cac, 53c, ff377c98, 4f4, fee81034) + f8 ff3300e0 _ZN4qpid6client19SubscriptionManager3runEv (74614, fe3fc000, 0, 0, ff1395a4, 1) + 14 ff302b70 _ZN4qpid6client16FailoverListener3runEv (745d0, 0, 0, 0, 0, 0) + 4 ff1395b4 _ZN4qpid3sys12_GLOBAL__N_111runRunnableEPv (0, fe3fc000, 0, 0, ff1395a4, 1) + 10 febc94f0 _lwp_start (0, 0, 0, 0, 0, 0)