Bug 698791 - condor_q -better fails where condor_q works
Summary: condor_q -better fails where condor_q works
Keywords:
Status: CLOSED CURRENTRELEASE
Alias: None
Product: CloudForms Cloud Engine
Classification: Retired
Component: aeolus-conductor
Version: 0.3.1
Hardware: Unspecified
OS: Unspecified
Priority: unspecified
Severity: unspecified
Target Milestone: rc
Assignee: Ian Main
QA Contact: wes hayutin
URL:
Whiteboard:
Depends On:
Blocks: ce-beta ce-ami
TreeView+ depends on / blocked
 
Reported: 2011-04-21 19:43 UTC by wes hayutin
Modified: 2012-01-26 12:17 UTC (History)
8 users (show)

Fixed In Version:
Clone Of:
Environment:
Last Closed: 2011-06-14 16:14:41 UTC
Embargoed:


Attachments (Terms of Use)
condorLog.txt (201.87 KB, text/plain)
2011-04-21 19:49 UTC, wes hayutin
no flags Details

Description wes hayutin 2011-04-21 19:43:39 UTC
[root@dell-pe1955-01 log]# condor_q -better
Error: Could not connect to negotiator (dell-pe1955-01.rhts.eng.bos.redhat.com)

[root@dell-pe1955-01 log]# condor_q 


-- Submitter: dell-pe1955-01.rhts.eng.bos.redhat.com : <10.16.65.121:46765> : dell-pe1955-01.rhts.eng.bos.redhat.com
 ID      OWNER            SUBMITTED     RUN_TIME ST PRI SIZE CMD               
  92.0   aeolus          4/21 15:08   0+00:00:00 I  0   0.0  job_jprovazn1_43  
  93.0   aeolus          4/21 15:14   0+00:00:00 I  0   0.0  job_A_westest01_44


Negotiator log excerpt (condor_negotiator aborts with a stack dump):



04/21/11 15:27:34 matchmakingAlgorithm: limit 24.000000 used 0.000000 pieLeft 24.000000
04/21/11 15:27:34 Can't get SlotWeight for 'provider_combination_0'; using 1.0
04/21/11 15:27:34 Failed to evaluate NEGOTIATOR_POST_JOB_RANK expression to a float.
Stack dump for process 22681 at timestamp 1303414054 (25 frames)
condor_negotiator(dprintf_dump_stack+0x63)[0x5420c3]
condor_negotiator[0x53b392]
/lib64/libpthread.so.0[0x3cff60f520]
/lib64/libc.so.6(gsignal+0x35)[0x3cfee32a45]
/lib64/libc.so.6(abort+0x175)[0x3cfee34225]
/lib64/libglib-2.0.so.0(g_logv+0x53a)[0x7fbecf5c337a]
/lib64/libglib-2.0.so.0(g_log+0x83)[0x7fbecf5c3413]
/lib64/libgthread-2.0.so.0(g_thread_init+0x1db)[0x3d05e028ab]
/usr/share/aeolus-conductor/classad_plugin/conductor_classad_plugin.so(_Z21conductor_quota_checkPKcRKSt6vectorIP8ExprTreeSaIS3_EER9EvalStateR5Value+0x210)[0x7fbecfccf890]
condor_negotiator(_ZNK7classad9Operation9_EvaluateERNS_9EvalStateERNS_5ValueE+0x117)[0x4f3d17]
condor_negotiator(_ZNK7classad9Operation9_EvaluateERNS_9EvalStateERNS_5ValueE+0x75)[0x4f3c75]
condor_negotiator(_ZNK7classad18AttributeReference9_EvaluateERNS_9EvalStateERNS_5ValueE+0xb3)[0x4fdb23]
condor_negotiator(_ZNK7classad9Operation9_EvaluateERNS_9EvalStateERNS_5ValueE+0x117)[0x4f3d17]
condor_negotiator(_ZNK7classad7ClassAd12EvaluateExprEPKNS_8ExprTreeERNS_5ValueE+0x42)[0x4dc312]
condor_negotiator(_ZN7classad12MatchClassAd13EvalMatchExprEPNS_8ExprTreeE+0x34)[0x4f0a34]
condor_negotiator(_Z8IsAMatchPN14compat_classad7ClassAdES1_+0xe)[0x53ea7e]
condor_negotiator(_ZN10Matchmaker20matchmakingAlgorithmEPKcS1_RN14compat_classad7ClassAdERNS2_27ClassAdListDoesNotDeleteAdsEdddddb+0x3a0)[0x472310]
condor_negotiator(_ZN10Matchmaker9negotiateEPKcPKN14compat_classad7ClassAdEdddRNS2_27ClassAdListDoesNotDeleteAdsER9HashTableI8MyStringS9_ERK17CondorVersionInfoblRiRdSG_+0x8c6)[0x477da6]
condor_negotiator(_ZN10Matchmaker18negotiateWithGroupEiddRN14compat_classad27ClassAdListDoesNotDeleteAdsER9HashTableI8MyStringS4_ES2_ffPKc+0xe5b)[0x4794bb]
condor_negotiator(_ZN10Matchmaker15negotiationTimeEv+0x1060)[0x47adf0]
condor_negotiator(_ZN12TimerManager7TimeoutEv+0x129)[0x49bad9]
condor_negotiator(_ZN10DaemonCore6DriverEv+0x277)[0x48c447]
condor_negotiator(main+0x10db)[0x49a5bb]
/lib64/libc.so.6(__libc_start_main+0xfd)[0x3cfee1ec9d]
condor_negotiator[0x462639]

Comment 1 wes hayutin 2011-04-21 19:49:58 UTC
Created attachment 493971 [details]
condorLog.txt

Recreated on another machine.

Comment 2 wes hayutin 2011-04-26 19:33:11 UTC
1 jobs; 1 idle, 0 running, 0 held
[root@hp-xw8600-01 ~]# condor_q


-- Submitter: hp-xw8600-01.rhts.eng.bos.redhat.com : <10.16.65.43:56162> : hp-xw8600-01.rhts.eng.bos.redhat.com
 ID      OWNER            SUBMITTED     RUN_TIME ST PRI SIZE CMD               
   3.0   aeolus          4/26 15:24   0+00:00:00 I  0   0.0  job_test2_2       

1 jobs; 1 idle, 0 running, 0 held
[root@hp-xw8600-01 ~]# /etc/init.d/condor restart
Stopping Condor daemons: [  OK  ]
Starting Condor daemons: [  OK  ]
[root@hp-xw8600-01 ~]# condor_q -better


-- Submitter: hp-xw8600-01.rhts.eng.bos.redhat.com : <10.16.65.43:47121> : hp-xw8600-01.rhts.eng.bos.redhat.com
---
003.000:  Request has not yet been considered by the matchmaker.

[root@hp-xw8600-01 ~]# hostname
hp-xw8600-01.rhts.eng.bos.redhat.com
[root@hp-xw8600-01 ~]# cat /var/lib/condor/condor_config.local 
ALLOW_WRITE = *
ALLOW_ADMINISTRATOR = *
ALLOW_NEGOTIATOR = *
ALLOW_NEGOTIATOR_SCHEDD = *
COLLECTOR_HOST = localhost

DAEMON_LIST = MASTER, SCHEDD, COLLECTOR, NEGOTIATOR

MAX_GRIDMANAGER_LOG = 500000000
GRIDMANAGER_JOB_PROBE_INTERVAL = 30

GRIDMANAGER_DEBUG = D_FULLDEBUG
NEGOTIATOR_DEBUG = D_FULLDEBUG
COLLECTOR_DEBUG = D_FULLDEBUG

DELTACLOUD_GAHP = $(SBIN)/deltacloud_server

CLASSAD_LIFETIME = 0

# for the event log parsing (i.e. dbomatic)
EVENT_LOG=$(LOG)/EventLog
EVENT_LOG_USE_XML=True
EVENT_LOG_JOB_AD_INFORMATION_ATTRS=Owner,GlobalJobId,Cmd,JobStartDate,JobCurrentStartDate,JobFinishedHookDone,DeltacloudProviderId,DeltacloudPublicNetworkAddresses,DeltacloudPrivateNetworkAddresses,DeltacloudAvailableActions,JobStatus,DeltacloudUsername

CLASSAD_USER_LIBS = /usr/share/aeolus-conductor/classad_plugin/conductor_classad_plugin.so

Comment 3 wes hayutin 2011-04-26 19:44:22 UTC
adding to ce-ami tracker per clalance's advice

Comment 4 wes hayutin 2011-04-26 19:48:57 UTC
It does seem that condor jobs that are submitted to start/stop instances do not always work when this issue occurs.

Comment 5 wes hayutin 2011-04-26 20:19:39 UTC
004.000:  Request is held.

Hold reason: Create_Instance_Failure: InvalidAMIID.NotFound: The AMI ID 'ami-51693a14' does not exist

---
005.000:  Request has not yet been considered by the matchmaker.

[root@hp-xw8600-01 ~]# condor_rm 04.0
Job 4.0 marked for removal
[root@hp-xw8600-01 ~]# condor_q -better


-- Submitter: hp-xw8600-01.rhts.eng.bos.redhat.com : <10.16.65.43:47121> : hp-xw8600-01.rhts.eng.bos.redhat.com
---
003.000:  Request is being serviced

---
004.000:  Request is removed.

error: bad form
error: problem with ExprToProfile
---
005.000:  Run analysis summary.  Of 4 machines,
      4 are rejected by your job's requirements 
      0 reject your job because of their own requirements 
      0 match but are serving users with a better priority in the pool 
      0 match but reject the job for unknown reasons 
      0 match but will not currently preempt their existing job 
      0 match but are currently offline 
      0 are available to run your job
	No successful match recorded.
	Last failed match: Tue Apr 26 16:16:45 2011
	Reason for last match failure: no match found

WARNING:  Be advised:
   No resources matched request's constraints

The Requirements expression for your job is:

( target.front_end_hardware_profile_id == "1" && target.image == "3" &&
target.realm == "2" && conductor_quota_check(4,other.provider_account_id) )

[root@hp-xw8600-01 ~]# condor_q


-- Submitter: hp-xw8600-01.rhts.eng.bos.redhat.com : <10.16.65.43:47121> : hp-xw8600-01.rhts.eng.bos.redhat.com
 ID      OWNER            SUBMITTED     RUN_TIME ST PRI SIZE CMD               
   3.0   aeolus          4/26 15:24   0+00:42:38 R  0   0.0  job_test2_2       
   4.0   aeolus          4/26 15:50   0+00:00:00 X  0   0.0  job_test03_3      
   5.0   aeolus          4/26 16:16   0+00:00:00 I  0   0.0  job_test05_4      

2 jobs; 1 idle, 1 running, 0 held
[root@hp-xw8600-01 ~]# condor_q -better


-- Submitter: hp-xw8600-01.rhts.eng.bos.redhat.com : <10.16.65.43:47121> : hp-xw8600-01.rhts.eng.bos.redhat.com
---
003.000:  Request is being serviced

---
004.000:  Request is removed.

error: bad form
error: problem with ExprToProfile
---
005.000:  Run analysis summary.  Of 8 machines,
      8 are rejected by your job's requirements 
      0 reject your job because of their own requirements 
      0 match but are serving users with a better priority in the pool 
      0 match but reject the job for unknown reasons 
      0 match but will not currently preempt their existing job 
      0 match but are currently offline 
      0 are available to run your job
	No successful match recorded.
	Last failed match: Tue Apr 26 16:16:45 2011
	Reason for last match failure: no match found

WARNING:  Be advised:
   No resources matched request's constraints

The Requirements expression for your job is:

( target.front_end_hardware_profile_id == "1" && target.image == "3" &&
target.realm == "2" && conductor_quota_check(4,other.provider_account_id) )





04/26/11 16:17:45     submitterAbsShare   = 1.000000
04/26/11 16:17:45     submitterLimit    = 8.000000
04/26/11 16:17:45     submitterUsage    = 0.000000
04/26/11 16:17:45 Socket to aeolus.eng.bos.redhat.com (<10.16.65.43:47121>) already in cache, reusing
04/26/11 16:17:45     Sending SEND_JOB_INFO/eom
04/26/11 16:17:45     Getting reply from schedd ...
04/26/11 16:17:45     Got JOB_INFO command; getting classad/eom
04/26/11 16:17:45     Request 00005.00000:
04/26/11 16:17:45 matchmakingAlgorithm: limit 8.000000 used 0.000000 pieLeft 8.000000
Stack dump for process 28352 at timestamp 1303849065 (25 frames)
condor_negotiator(dprintf_dump_stack+0x63)[0x5420c3]
condor_negotiator[0x53b392]
/lib64/libpthread.so.0[0x3069e0f520]
/lib64/libc.so.6(gsignal+0x35)[0x3069632a45]
/lib64/libc.so.6(abort+0x175)[0x3069634225]
/lib64/libglib-2.0.so.0(g_logv+0x53a)[0x7f3ea97e137a]
/lib64/libglib-2.0.so.0(g_log+0x83)[0x7f3ea97e1413]
/lib64/libgthread-2.0.so.0(g_thread_init+0x1db)[0x30702028ab]
/usr/share/aeolus-conductor/classad_plugin/conductor_classad_plugin.so(_Z21conductor_quota_checkPKcRKSt6vectorIP8ExprTreeSaIS3_EER9EvalStateR5Value+0x210)[0x7f3ea9eed890]
condor_negotiator(_ZNK7classad9Operation9_EvaluateERNS_9EvalStateERNS_5ValueE+0x117)[0x4f3d17]
condor_negotiator(_ZNK7classad9Operation9_EvaluateERNS_9EvalStateERNS_5ValueE+0x75)[0x4f3c75]
condor_negotiator(_ZNK7classad18AttributeReference9_EvaluateERNS_9EvalStateERNS_5ValueE+0xb3)[0x4fdb23]
condor_negotiator(_ZNK7classad9Operation9_EvaluateERNS_9EvalStateERNS_5ValueE+0x117)[0x4f3d17]
condor_negotiator(_ZNK7classad7ClassAd12EvaluateExprEPKNS_8ExprTreeERNS_5ValueE+0x42)[0x4dc312]
condor_negotiator(_ZN7classad12MatchClassAd13EvalMatchExprEPNS_8ExprTreeE+0x34)[0x4f0a34]
condor_negotiator(_Z8IsAMatchPN14compat_classad7ClassAdES1_+0xe)[0x53ea7e]
condor_negotiator(_ZN10Matchmaker20matchmakingAlgorithmEPKcS1_RN14compat_classad7ClassAdERNS2_27ClassAdListDoesNotDeleteAdsEdddddb+0x3a0)[0x472310]
condor_negotiator(_ZN10Matchmaker9negotiateEPKcPKN14compat_classad7ClassAdEdddRNS2_27ClassAdListDoesNotDeleteAdsER9HashTableI8MyStringS9_ERK17CondorVersionInfoblRiRdSG_+0x8c6)[0x477da6]
condor_negotiator(_ZN10Matchmaker18negotiateWithGroupEiddRN14compat_classad27ClassAdListDoesNotDeleteAdsER9HashTableI8MyStringS4_ES2_ffPKc+0xe5b)[0x4794bb]
condor_negotiator(_ZN10Matchmaker15negotiationTimeEv+0x1060)[0x47adf0]
condor_negotiator(_ZN12TimerManager7TimeoutEv+0x129)[0x49bad9]
condor_negotiator(_ZN10DaemonCore6DriverEv+0x277)[0x48c447]
condor_negotiator(main+0x10db)[0x49a5bb]
/lib64/libc.so.6(__libc_start_main+0xfd)[0x306961ec9d]
condor_negotiator[0x462639]
(END)

Comment 6 wes hayutin 2011-12-08 14:15:31 UTC
Permanently closing this bug.


Note You need to log in before you can comment on or make changes to this bug.