[root@dell-pe1955-01 log]# condor_q -better Error: Could not connect to negotiator (dell-pe1955-01.rhts.eng.bos.redhat.com) [root@dell-pe1955-01 log]# condor_q -- Submitter: dell-pe1955-01.rhts.eng.bos.redhat.com : <10.16.65.121:46765> : dell-pe1955-01.rhts.eng.bos.redhat.com ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 92.0 aeolus 4/21 15:08 0+00:00:00 I 0 0.0 job_jprovazn1_43 93.0 aeolus 4/21 15:14 0+00:00:00 I 0 0.0 job_A_westest01_44 log 4/21/11 15:27:34 matchmakingAlgorithm: limit 24.000000 used 0.000000 pieLeft 24.000000 04/21/11 15:27:34 Can't get SlotWeight for 'provider_combination_0'; using 1.0 04/21/11 15:27:34 Failed to evaluate NEGOTIATOR_POST_JOB_RANK expression to a float. Stack dump for process 22681 at timestamp 1303414054 (25 frames) condor_negotiator(dprintf_dump_stack+0x63)[0x5420c3] condor_negotiator[0x53b392] /lib64/libpthread.so.0[0x3cff60f520] /lib64/libc.so.6(gsignal+0x35)[0x3cfee32a45] /lib64/libc.so.6(abort+0x175)[0x3cfee34225] /lib64/libglib-2.0.so.0(g_logv+0x53a)[0x7fbecf5c337a] /lib64/libglib-2.0.so.0(g_log+0x83)[0x7fbecf5c3413] /lib64/libgthread-2.0.so.0(g_thread_init+0x1db)[0x3d05e028ab] /usr/share/aeolus-conductor/classad_plugin/conductor_classad_plugin.so(_Z21conductor_quota_checkPKcRKSt6vectorIP8ExprTreeSaIS3_EER9EvalStateR5Value+0x210)[0x7fbecfccf890] condor_negotiator(_ZNK7classad9Operation9_EvaluateERNS_9EvalStateERNS_5ValueE+0x117)[0x4f3d17] condor_negotiator(_ZNK7classad9Operation9_EvaluateERNS_9EvalStateERNS_5ValueE+0x75)[0x4f3c75] condor_negotiator(_ZNK7classad18AttributeReference9_EvaluateERNS_9EvalStateERNS_5ValueE+0xb3)[0x4fdb23] condor_negotiator(_ZNK7classad9Operation9_EvaluateERNS_9EvalStateERNS_5ValueE+0x117)[0x4f3d17] condor_negotiator(_ZNK7classad7ClassAd12EvaluateExprEPKNS_8ExprTreeERNS_5ValueE+0x42)[0x4dc312] condor_negotiator(_ZN7classad12MatchClassAd13EvalMatchExprEPNS_8ExprTreeE+0x34)[0x4f0a34] condor_negotiator(_Z8IsAMatchPN14compat_classad7ClassAdES1_+0xe)[0x53ea7e] 
condor_negotiator(_ZN10Matchmaker20matchmakingAlgorithmEPKcS1_RN14compat_classad7ClassAdERNS2_27ClassAdListDoesNotDeleteAdsEdddddb+0x3a0)[0x472310] condor_negotiator(_ZN10Matchmaker9negotiateEPKcPKN14compat_classad7ClassAdEdddRNS2_27ClassAdListDoesNotDeleteAdsER9HashTableI8MyStringS9_ERK17CondorVersionInfoblRiRdSG_+0x8c6)[0x477da6] condor_negotiator(_ZN10Matchmaker18negotiateWithGroupEiddRN14compat_classad27ClassAdListDoesNotDeleteAdsER9HashTableI8MyStringS4_ES2_ffPKc+0xe5b)[0x4794bb] condor_negotiator(_ZN10Matchmaker15negotiationTimeEv+0x1060)[0x47adf0] condor_negotiator(_ZN12TimerManager7TimeoutEv+0x129)[0x49bad9] condor_negotiator(_ZN10DaemonCore6DriverEv+0x277)[0x48c447] condor_negotiator(main+0x10db)[0x49a5bb] /lib64/libc.so.6(__libc_start_main+0xfd)[0x3cfee1ec9d] condor_negotiator[0x462639]
Created attachment 493971 [details]: condorLog.txt. The issue was recreated on another machine.
1 jobs; 1 idle, 0 running, 0 held [root@hp-xw8600-01 ~]# condor_q -- Submitter: hp-xw8600-01.rhts.eng.bos.redhat.com : <10.16.65.43:56162> : hp-xw8600-01.rhts.eng.bos.redhat.com ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 3.0 aeolus 4/26 15:24 0+00:00:00 I 0 0.0 job_test2_2 1 jobs; 1 idle, 0 running, 0 held [root@hp-xw8600-01 ~]# /etc/init.d/condor restart Stopping Condor daemons: [ OK ] Starting Condor daemons: [ OK ] [root@hp-xw8600-01 ~]# condor_q -better -- Submitter: hp-xw8600-01.rhts.eng.bos.redhat.com : <10.16.65.43:47121> : hp-xw8600-01.rhts.eng.bos.redhat.com --- 003.000: Request has not yet been considered by the matchmaker. [root@hp-xw8600-01 ~]# hostname hp-xw8600-01.rhts.eng.bos.redhat.com [root@hp-xw8600-01 ~]# cat /var/lib/condor/condor_config.local ALLOW_WRITE = * ALLOW_ADMINISTRATOR = * ALLOW_NEGOTIATOR = * ALLOW_NEGOTIATOR_SCHEDD = * COLLECTOR_HOST = localhost DAEMON_LIST = MASTER, SCHEDD, COLLECTOR, NEGOTIATOR MAX_GRIDMANAGER_LOG = 500000000 GRIDMANAGER_JOB_PROBE_INTERVAL = 30 GRIDMANAGER_DEBUG = D_FULLDEBUG NEGOTIATOR_DEBUG = D_FULLDEBUG COLLECTOR_DEBUG = D_FULLDEBUG DELTACLOUD_GAHP = $(SBIN)/deltacloud_server CLASSAD_LIFETIME = 0 # for the event log parsing (i.e. dbomatic) EVENT_LOG=$(LOG)/EventLog EVENT_LOG_USE_XML=True EVENT_LOG_JOB_AD_INFORMATION_ATTRS=Owner,GlobalJobId,Cmd,JobStartDate,JobCurrentStartDate,JobFinishedHookDone,DeltacloudProviderId,DeltacloudPublicNetworkAddresses,DeltacloudPrivateNetworkAddresses,DeltacloudAvailableActions,JobStatus,DeltacloudUsername CLASSAD_USER_LIBS = /usr/share/aeolus-conductor/classad_plugin/conductor_classad_plugin.so
Adding to the ce-ami tracker per clalance's advice.
It appears that Condor jobs submitted to start or stop instances do not always work when this issue occurs.
004.000: Request is held. Hold reason: Create_Instance_Failure: InvalidAMIID.NotFound: The AMI ID 'ami-51693a14' does not exist --- 005.000: Request has not yet been considered by the matchmaker. [root@hp-xw8600-01 ~]# condor_rm 04.0 Job 4.0 marked for removal [root@hp-xw8600-01 ~]# condor_q -better -- Submitter: hp-xw8600-01.rhts.eng.bos.redhat.com : <10.16.65.43:47121> : hp-xw8600-01.rhts.eng.bos.redhat.com --- 003.000: Request is being serviced --- 004.000: Request is removed. error: bad form error: problem with ExprToProfile --- 005.000: Run analysis summary. Of 4 machines, 4 are rejected by your job's requirements 0 reject your job because of their own requirements 0 match but are serving users with a better priority in the pool 0 match but reject the job for unknown reasons 0 match but will not currently preempt their existing job 0 match but are currently offline 0 are available to run your job No successful match recorded. Last failed match: Tue Apr 26 16:16:45 2011 Reason for last match failure: no match found WARNING: Be advised: No resources matched request's constraints The Requirements expression for your job is: ( target.front_end_hardware_profile_id == "1" && target.image == "3" && target.realm == "2" && conductor_quota_check(4,other.provider_account_id) ) [root@hp-xw8600-01 ~]# condor_q -- Submitter: hp-xw8600-01.rhts.eng.bos.redhat.com : <10.16.65.43:47121> : hp-xw8600-01.rhts.eng.bos.redhat.com ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 3.0 aeolus 4/26 15:24 0+00:42:38 R 0 0.0 job_test2_2 4.0 aeolus 4/26 15:50 0+00:00:00 X 0 0.0 job_test03_3 5.0 aeolus 4/26 16:16 0+00:00:00 I 0 0.0 job_test05_4 2 jobs; 1 idle, 1 running, 0 held [root@hp-xw8600-01 ~]# condor_q -better -- Submitter: hp-xw8600-01.rhts.eng.bos.redhat.com : <10.16.65.43:47121> : hp-xw8600-01.rhts.eng.bos.redhat.com --- 003.000: Request is being serviced --- 004.000: Request is removed. error: bad form error: problem with ExprToProfile --- 005.000: Run analysis summary. 
Of 8 machines, 8 are rejected by your job's requirements 0 reject your job because of their own requirements 0 match but are serving users with a better priority in the pool 0 match but reject the job for unknown reasons 0 match but will not currently preempt their existing job 0 match but are currently offline 0 are available to run your job No successful match recorded. Last failed match: Tue Apr 26 16:16:45 2011 Reason for last match failure: no match found WARNING: Be advised: No resources matched request's constraints The Requirements expression for your job is: ( target.front_end_hardware_profile_id == "1" && target.image == "3" && target.realm == "2" && conductor_quota_check(4,other.provider_account_id) ) 4/26/11 16:17:45 submitterAbsShare = 1.000000 04/26/11 16:17:45 submitterLimit = 8.000000 04/26/11 16:17:45 submitterUsage = 0.000000 04/26/11 16:17:45 Socket to aeolus.eng.bos.redhat.com (<10.16.65.43:47121>) already in cache, reusing 04/26/11 16:17:45 Sending SEND_JOB_INFO/eom 04/26/11 16:17:45 Getting reply from schedd ... 
04/26/11 16:17:45 Got JOB_INFO command; getting classad/eom 04/26/11 16:17:45 Request 00005.00000: 04/26/11 16:17:45 matchmakingAlgorithm: limit 8.000000 used 0.000000 pieLeft 8.000000 Stack dump for process 28352 at timestamp 1303849065 (25 frames) condor_negotiator(dprintf_dump_stack+0x63)[0x5420c3] condor_negotiator[0x53b392] /lib64/libpthread.so.0[0x3069e0f520] /lib64/libc.so.6(gsignal+0x35)[0x3069632a45] /lib64/libc.so.6(abort+0x175)[0x3069634225] /lib64/libglib-2.0.so.0(g_logv+0x53a)[0x7f3ea97e137a] /lib64/libglib-2.0.so.0(g_log+0x83)[0x7f3ea97e1413] /lib64/libgthread-2.0.so.0(g_thread_init+0x1db)[0x30702028ab] /usr/share/aeolus-conductor/classad_plugin/conductor_classad_plugin.so(_Z21conductor_quota_checkPKcRKSt6vectorIP8ExprTreeSaIS3_EER9EvalStateR5Value+0x210)[0x7f3ea9eed890] condor_negotiator(_ZNK7classad9Operation9_EvaluateERNS_9EvalStateERNS_5ValueE+0x117)[0x4f3d17] condor_negotiator(_ZNK7classad9Operation9_EvaluateERNS_9EvalStateERNS_5ValueE+0x75)[0x4f3c75] condor_negotiator(_ZNK7classad18AttributeReference9_EvaluateERNS_9EvalStateERNS_5ValueE+0xb3)[0x4fdb23] condor_negotiator(_ZNK7classad9Operation9_EvaluateERNS_9EvalStateERNS_5ValueE+0x117)[0x4f3d17] condor_negotiator(_ZNK7classad7ClassAd12EvaluateExprEPKNS_8ExprTreeERNS_5ValueE+0x42)[0x4dc312] condor_negotiator(_ZN7classad12MatchClassAd13EvalMatchExprEPNS_8ExprTreeE+0x34)[0x4f0a34] condor_negotiator(_Z8IsAMatchPN14compat_classad7ClassAdES1_+0xe)[0x53ea7e] condor_negotiator(_ZN10Matchmaker20matchmakingAlgorithmEPKcS1_RN14compat_classad7ClassAdERNS2_27ClassAdListDoesNotDeleteAdsEdddddb+0x3a0)[0x472310] condor_negotiator(_ZN10Matchmaker9negotiateEPKcPKN14compat_classad7ClassAdEdddRNS2_27ClassAdListDoesNotDeleteAdsER9HashTableI8MyStringS9_ERK17CondorVersionInfoblRiRdSG_+0x8c6)[0x477da6] condor_negotiator(_ZN10Matchmaker18negotiateWithGroupEiddRN14compat_classad27ClassAdListDoesNotDeleteAdsER9HashTableI8MyStringS4_ES2_ffPKc+0xe5b)[0x4794bb] 
condor_negotiator(_ZN10Matchmaker15negotiationTimeEv+0x1060)[0x47adf0] condor_negotiator(_ZN12TimerManager7TimeoutEv+0x129)[0x49bad9] condor_negotiator(_ZN10DaemonCore6DriverEv+0x277)[0x48c447] condor_negotiator(main+0x10db)[0x49a5bb] /lib64/libc.so.6(__libc_start_main+0xfd)[0x306961ec9d] condor_negotiator[0x462639] (END)
Permanently closing.