Bug 476984

Summary: update EC2E source job stats from hooks
Product: Red Hat Enterprise MRG Reporter: Matthew Farrellee <matt>
Component: gridAssignee: Robert Rati <rrati>
Status: CLOSED ERRATA QA Contact: Jeff Needle <jneedle>
Severity: medium Docs Contact:
Priority: medium    
Version: 1.0CC: jsarenik
Target Milestone: 1.1.1   
Target Release: ---   
Hardware: All   
OS: Linux   
Whiteboard:
Fixed In Version: Doc Type: Bug Fix
Doc Text:
Story Points: ---
Clone Of: Environment:
Last Closed: 2009-04-21 16:19:05 UTC Type: ---
Regression: --- Mount Type: ---
Documentation: --- CRM:
Verified Versions: Category: ---
oVirt Team: --- RHEL 7.3 requirements from Atomic Host:
Cloudforms Team: --- Target Upstream Version:
Embargoed:

Description Matthew Farrellee 2008-12-18 14:38:07 UTC
In EC2E a source job is routed to an EC2 job, which actually does the source job's work. The stats returned to the source job are actually those of the EC2 job and not the source job run within EC2.

Comment 1 Robert Rati 2009-01-19 15:17:42 UTC
The finalize hook now prints to stdout the exit classad from SQS.  condor (7.2.0-4+) will update the source job, so condor_q and condor_history will show the correct classad information.

Fixed in:
condor-ec2-enhanced-hooks-1.0-10
condor-7.2.0-4

Comment 3 Jan Sarenik 2009-03-11 16:25:25 UTC
Could you please give me a hint on what and how to test in this bug?

Comment 4 Jan Sarenik 2009-03-11 17:51:30 UTC
Yes, it works as expected.

When I start a simple job and it gets into the Running state,
I gather info with 'condor_q -l <src_job>'

When it is done and out of the queue, I do it with
'condor_history -l <src_job>'

When I compare those two in that order (both were sort(1)-ed first),
this is the diff:
--------------------------------------------------------------------------
+AmazonAccessKey = "/tmp/access_key"
+AmazonFullSQSQueueName = "rhel4-test-ha-schedd665601236792169"
 AmazonKeyPairFile = "/tmp/keypair-0"
+AmazonSecretKey = "/tmp/secret_access_key"
 Arguments = ""
-AutoClusterAttrs = "ImageSize,JobUniverse,JobStart,LastPeriodicCheckpoint,LastCheckpointPlatform,NumCkpts,DiskUsage,Requirements,NiceUser,ConcurrencyLimits"
-AutoClusterId = 4
 BufferBlockSize = 32768
 BufferSize = 524288
 ClusterId = 6656
-Cmd = "/mnt/sharedfs/testmonkey/north-14/ec2e/jasan3/jasan.sh"
+Cmd = "/var/lib/condor/execute/dir_1312/jasan.sh"
 CommittedTime = 0
-CompletionDate = 0
+CompletionDate = 1236792644
 CondorPlatform = "$CondorPlatform: X86_64-LINUX_RHEL4 $"
 CondorVersion = "$CondorVersion: 7.2.1 Mar  4 2009 BuildID: RH-7.2.2-0.7.el4 $"
 CoreSize = -1
 CumulativeSuspensionTime = 0
 CurrentHosts = 0
 DiskUsage_RAW = 1
 DiskUsage = 1
 EC2JobSuccessful = TRUE
-EC2RunAttempts = 1
-EnteredCurrentStatus = 1236792311
+EC2RunAttempts = 0
+EnteredCurrentStatus = 1236792169
 Environment = ""
 Err = "_condor_stderr"
 ExecutableSize_RAW = 1
 ExecutableSize = 1
 ExitBySignal = FALSE
+ExitCode = 0
 ExitStatus = 0
 GlobalJobId = "ha-schedd@#6656.0#1236792169"
+HookKeyword = "LOW_LATENCY"
 ImageSize_RAW = 1
-ImageSize = 1
+ImageSize = 4250
 In = "/dev/null"
+IsFeatched = TRUE
 Iwd = "/mnt/sharedfs/testmonkey/north-14/ec2e/jasan3"
+JobDuration = 10.010094
+JobFinishedHookDone = 1236792350
 JobLeaseDuration = 1200
 JobNotification = 2
+JobPid = 1316
 JobPrio = 0
-JobStatus = 2
+JobStartDate = 1236792340
+JobState = "Exited"
+JobStatus = 4
 JobUniverse = 5
 KillSig = "SIGTERM"
 LastSuspensionTime = 0
 LeaveJobInQueue = FALSE
-LocalSysCpu = 0
-LocalUserCpu = 0
-Managed = "External"
-ManagedManager = "north-07.lab.bos.redhat.com"
+LocalSysCpu = 0.000000
+LocalUserCpu = 0.000000
+ManagedManager = ""
+Managed = "ScheddDone"
 MaxHosts = 1
 MinHosts = 1
-MyType = "Job"
+MyType = ""
 NiceUser = FALSE
 NumCkpts_RAW = 0
 NumCkpts = 0
 NumJobStarts = 0
+NumPids = 0
 NumRestarts = 0
 NumSystemHolds = 0
 OnExitHold = FALSE
 OnExitRemove = TRUE
+OrigCmd = "jasan.sh"
+OriginatingCWD = "/var/lib/condor/execute/dir_1312"
 Out = "stdout.0"
 Owner = "testmonkey"
 PeriodicHold = FALSE
 PeriodicRelease = FALSE
 PeriodicRemove = EC2RunAttempts >= 5
 ProcId = 0
 QDate = 1236792169
 Rank = 0.000000
-RemoteSysCpu = 0.000000
-RemoteUserCpu = 0.000000
-RemoteWallClockTime = 0.000000
+RemoteSysCpu = 0
+RemoteUserCpu = 0
+RemoteWallClockTime = 301.000000
 RequestCpus = 1
 RequestDisk = DiskUsage
 RequestMemory = ceiling(ImageSize / 1024.000000)
 Requirements = (Arch == "INTEL") && (OpSys == "LINUX") && (Disk >= DiskUsage) && ((Memory * 1024) >= ImageSize) && (HasFileTransfer)
 RootDir = "/"
 RoutedToJobId = "6657.0"
-ServerTime = 1236792465
-ShadowBday = 1236792291
+ShadowBday = 0
 ShouldTransferFiles = "YES"
+SQSMessageId = "1S5YA5NE8AA85NDRCY6Y|3552MCJT93VB473VJNP1|8H7SWNK3CK2PVVZZPTC1"
 StreamErr = FALSE
 StreamOut = FALSE
-TargetType = "Machine"
+TargetType = ""
 TotalSuspensions = 0
 TransferFiles = "ONEXIT"
 TransferIn = FALSE
 TransferOutputRemaps = "_condor_stderr=/tmp/job.stderr.0"
 UserLog = "/mnt/sharedfs/testmonkey/north-14/ec2e/jasan3/ulog.6656.0"
 User = "testmonkey@NWPool"
 WantArch = "INTEL"
 WantAWS = TRUE
 WantCPUs = 1
 WantCheckpoint = FALSE
 WantRemoteIO = TRUE
 WantRemoteSyscalls = FALSE
+WF_REQ_SLOT = "1"
 WhenToTransferOutput = "ON_EXIT"
--------------------------------------------------------------------------

Comment 6 errata-xmlrpc 2009-04-21 16:19:05 UTC
An advisory has been issued which should help the problem
described in this bug report. This report is therefore being
closed with a resolution of ERRATA. For more information
on the solution and/or where to find the updated files,
please follow the link below. You may reopen this bug report
if the solution does not work for you.

http://rhn.redhat.com/errata/RHEA-2009-0434.html