| Summary: | all but one node in a three node cluster drop out when cumin starts | ||
|---|---|---|---|
| Product: | Red Hat Enterprise MRG | Reporter: | Patrick O'Brien <pobrien> |
| Component: | qpid-cpp | Assignee: | mick <mgoulish> |
| Status: | CLOSED INSUFFICIENT_DATA | QA Contact: | MRG Quality Engineering <mrgqe-bugs> |
| Severity: | high | Docs Contact: | |
| Priority: | high | ||
| Version: | 2.0 | CC: | aconway, freznice, jross, pmoravec, whenry |
| Target Milestone: | --- | ||
| Target Release: | --- | ||
| Hardware: | x86_64 | ||
| OS: | Linux | ||
| Whiteboard: | |||
| Fixed In Version: | Doc Type: | Bug Fix | |
| Doc Text: | Story Points: | --- | |
| Clone Of: | Environment: | ||
| Last Closed: | 2013-02-24 20:57:36 UTC | Type: | --- |
| Regression: | --- | Mount Type: | --- |
| Documentation: | --- | CRM: | |
| Verified Versions: | Category: | --- | |
| oVirt Team: | --- | RHEL 7.3 requirements from Atomic Host: | |
| Cloudforms Team: | --- | Target Upstream Version: | |
|
Description
Patrick O'Brien
2011-11-08 00:46:50 UTC
According to the log above, the broker was only recently started and had just been just caught up with the rest of the cluster when it failed. Do we know what the pattern of starting/stopping brokers was leading up to the failure? Do we have logs for the other brokers? In a cluster failure its useful to get logs from all the brokers, not just the failed ones, to see what was going on at the time of the failure. Can we please add the JVM package/version information? Here is what I did to try to reproduce this bug. I will include a synopsis, and then all the scripts and the C++ program that I used.
synopsis
{
start a 3-cluster
run a script that repeatedly starts cumin, stops cumin, dumps the cumin
database, and restarts cumin.
while that script is running, run a C++ program that randomly declares
new queues, and chooses which queue to send to or receive from.
When the C++ program decides to send to a queue, it chooses a random
number of messages to send in a burst. When it decides to receive, it
randomly chooses which queue it wants, and then drains that queue.
result: Total cumin-cycles count was 2000, during which time 12,000
queues were created and several million message sent & received.
No luck on the repro. All 3 original brokers were still alive
at the end.
}
scripts
{
starting the 3-cluster
{
#! /bin/bash
QPIDD=/usr/sbin/qpidd
rm -rf /tmp/mick
mkdir /tmp/mick
for N in 1 2 3
do
$QPIDD \
--cluster-name cluster_flock \
-p 581$N \
--data-dir /tmp/mick/data_$N \
--auth=no \
--mgmt-enable=yes \
--log-enable=info+ \
--log-enable=debug+:cluster \
--log-to-file /tmp/mick/qpidd_$N.log \
-d
echo "============================================"
echo "started broker $N from $QPIDD"
echo "============================================"
done
}
cycling cumin -- this script uses 3 other scripts, included below.
{
#! /bin/bash
\rm -rf ./cumin_output
mkdir ./cumin_output
let count=1
while [ $count -lt 501 ]
do
echo " "
echo "------------- test $count -------------"
echo "starting cumin..."
sleep 2
./start_cumin $count &
sleep 5
echo "Sleeping 15 seconds..."
sleep 15
echo "stopping cumin and dumping DB..."
sleep 2
./stop_cumin &
sleep 5
echo "Sleeping 15 seconds..."
sleep 15
echo "reinstalling cumin db..."
sleep 2
./cumin_reinstall &
sleep 5
echo "Sleeping 15 seconds..."
sleep 15
let count+=1
echo -n "Check brokers... "
ps -aef | grep qpidd | grep -v grep | wc -l
done
}
start_cumin
{
#! /bin/bash
N=$1
output=cumin_output/$N
echo "starting cumin..."
/home/mick/cumin_trunk/cumin/bin/cumin-data --debug 2> $output << yes_dang_it
yes
yes_dang_it
}
stop_cumin
{
#! /bin/bash
echo "Killing cumin..."
pkill cumin
echo "dumping database..."
/home/mick/cumin_trunk/cumin/bin/cumin-database drop << say_yes
yes
say_yes
echo "cumin stopped and dumped."
}
cumin_reinstall
{
#! /bin/bash
/home/mick/cumin_trunk/cumin/bin/cumin-database install << say_yes
yes
say_yes
}
# end of the scripts
}
the c++ program
{
#include <vector>
#include <qpid/messaging/Connection.h>
#include <qpid/messaging/Message.h>
#include <qpid/messaging/Receiver.h>
#include <qpid/messaging/Sender.h>
#include <qpid/messaging/Session.h>
#include <stdio.h>
#include <unistd.h> // For pause()
#include <sys/time.h>
#include <signal.h> // For signal
#include <unistd.h>
#include <cstdlib>
#include <iostream>
#include <sstream>
using namespace std;
using namespace qpid::messaging;
struct kolchak_queue
{
string queue_name;
string address;
Session & session;
Sender sender;
Receiver receiver;
kolchak_queue ( Session & _session,
char * _queue_name
);
};
kolchak_queue::kolchak_queue ( Session & _session,
char * _queue_name
)
: session ( _session ),
queue_name ( _queue_name )
{
address = queue_name;
address += ";{create:always,node:{type:queue}}";
sender = session.createSender ( address );
receiver = session.createReceiver ( address );
}
struct kolchak
{
kolchak ( string & _broker,
string & _options
);
int seed;
string broker;
Connection cnx;
Session session;
void run ( int n_steps );
int rand ( int max_desired_value );
// Actions -----------------------
void declare_queue ( );
void send ( );
void drain ( );
// End actions -----------------------
void print_report ( );
vector < kolchak_queue * > queues;
int message_count,
message_burst_count,
drain_count,
report_count;
};
kolchak::kolchak ( string & _broker,
string & _options
) :
broker ( _broker ),
cnx ( _broker, _options ),
message_count ( 0 ),
message_burst_count ( 0 ),
drain_count ( 0 ),
report_count ( 0 )
{
seed = getpid();
srand ( seed );
fprintf ( stderr, "MDEBUG kolchak seed is: %d\n", seed );
cnx.open();
session = cnx.createSession();
}
int
kolchak::rand ( int max_plus_one )
{
double zero_to_one = (double) ::rand() / (double) RAND_MAX;
return (int) (zero_to_one * (double) max_plus_one );
}
void
kolchak::run ( int n_steps )
{
for ( int step = 0; step < n_steps; ++ step )
{
if ( step && ( ! (step % 100) ) )
{
print_report ( );
}
int num = rand ( 1000 );
fprintf ( stderr, "step %d choice %d ", step, num );
if ( num < 50 )
{
declare_queue ( );
}
else
if ( num < 500 )
{
send ( );
}
else
{
drain ( );
}
}
}
void
kolchak::declare_queue ( )
{
char name[1000];
sprintf ( name, "q_%d", queues.size() );
kolchak_queue * q = new kolchak_queue ( session, name );
queues.push_back ( q );
fprintf ( stderr, "declared queue |%s|\n", name );
}
void
kolchak::send ( )
{
int n_queues = queues.size();
int queue_index;
if ( n_queues < 1 )
{
fprintf ( stderr, "no queues to send to yet.\n" );
return;
}
if ( n_queues == 1 )
queue_index = 0;
else
{
// Randomly choose which queue to send them to.
queue_index = rand ( queues.size() );
}
// Randomly choose how many messages to send.
int n_messages = rand ( 1000 );
string msg_str("message...");
Message msg ( msg_str );
kolchak_queue * q = queues [ queue_index ];
for ( int i = 0; i < n_messages; ++ i )
{
q->sender.send ( msg );
}
message_count += n_messages;
++ message_burst_count;
fprintf ( stderr,
"sent %d messages to queue |%s|\n",
n_messages,
q->queue_name.c_str()
);
}
void
kolchak::print_report ( )
{
++ report_count;
fprintf ( stderr, "\nreport %d --------------------------\n", report_count );
fprintf ( stderr, "%d queues declared.\n", queues.size() );
fprintf ( stderr,
"%d messages sent in %d bursts.\n",
message_count,
message_burst_count
);
fprintf ( stderr,
"%d queues drained.\n",
drain_count
);
fprintf ( stderr, "--------------------------\n\n" );
}
void
kolchak::drain ( )
{
int n_queues = queues.size();
int queue_index;
if ( n_queues < 1 )
{
fprintf ( stderr, "no queues to drain yet.\n" );
return;
}
if ( n_queues == 1 )
queue_index = 0;
else
{
// Randomly choose which queue to drain.
queue_index = rand ( queues.size() );
}
kolchak_queue * q = queues [ queue_index ];
while ( 1 )
{
try
{
Message message = q->receiver.fetch ( Duration::SECOND * 1 );
}
catch (const NoMessageAvailable& e)
{
break;
}
}
session.acknowledge();
fprintf ( stderr, "drained queue |%s|\n", q->queue_name.c_str() );
++ drain_count;
}
main ( int argc, char** argv )
{
string broker = argc > 1 ? argv[1] : "localhost:5811";
string options("");
kolchak k ( broker, options ) ;
k.run ( 10000 );
}
// end of C++ program
}
build script for the c++ program
{
#! /bin/bash
echo "============================================================="
echo " making kolchak "
echo "============================================================="
g++ -g -O2 \
-I$QPID_ROOT/cpp/include \
-I$QPID_ROOT/cpp/src \
-L$QPID_ROOT/cpp/src/.libs \
-lqpidmessaging \
kolchak.cpp \
-o kolchak
}
We have not yet been able to reproduce this issue, and the window for 0.14-mrg (2.2) is closing. Targeting 2.3. |