Bug 476984 - update EC2E source job stats from hooks
Status: CLOSED ERRATA
Product: Red Hat Enterprise MRG
Classification: Red Hat
Component: grid
Version: 1.0
Hardware: All
OS: Linux
Priority: medium
Severity: medium
Target Milestone: 1.1.1
Target Release: ---
Assigned To: Robert Rati
QA Contact: Jeff Needle
Depends On:
Blocks:
Reported: 2008-12-18 09:38 EST by Matthew Farrellee
Modified: 2009-04-21 12:19 EDT
CC List: 1 user

Doc Type: Bug Fix
Last Closed: 2009-04-21 12:19:05 EDT

Attachments: None
Description Matthew Farrellee 2008-12-18 09:38:07 EST
In EC2E, a source job is routed to an EC2 job, which performs the source job's work. The stats returned to the source job, however, are those of the EC2 job itself, not those of the source job actually run within EC2.
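To make the mismatch concrete, below is a hypothetical sketch (not part of EC2E or Condor) that fetches the source job's ClassAd and the ad of the job it was routed to, then prints a few statistics side by side. The job ID and the RoutedToJobId attribute are taken from the diff in comment 4; everything else (the script itself, the chosen attributes) is illustrative only.
--------------------------------------------------------------------------
#!/usr/bin/env python
# Hypothetical sketch only: show which ClassAd the source job's stats come
# from by comparing the source job's ad with the ad of the EC2 job it was
# routed to (RoutedToJobId). Assumes condor_q is in PATH; Python 2 era code.
import subprocess

def job_ad(job_id):
    """Parse 'condor_q -l <job_id>' output into an {attribute: value} dict."""
    proc = subprocess.Popen(["condor_q", "-l", job_id], stdout=subprocess.PIPE)
    ad = {}
    for line in proc.communicate()[0].splitlines():
        if " = " in line:
            attr, value = line.split(" = ", 1)
            ad[attr] = value
    return ad

source = job_ad("6656.0")                            # source job from this report
routed = job_ad(source["RoutedToJobId"].strip('"'))  # EC2 job it was routed to

# Before this fix, stats like these on the source job reflected the EC2 job's
# ad rather than the work actually run inside EC2.
for attr in ("ExitCode", "RemoteWallClockTime", "ImageSize", "CompletionDate"):
    print "%-20s source=%s  routed=%s" % (attr, source.get(attr), routed.get(attr))
--------------------------------------------------------------------------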
Comment 1 Robert Rati 2009-01-19 10:17:42 EST
The finalize hook now prints the exit ClassAd from SQS to stdout.  condor (7.2.0-4+) will then update the source job with it, so condor_q and condor_history will show the correct ClassAd information.  (A rough, hypothetical sketch of such a hook is given after the fixed-in list below.)

Fixed in:
condor-ec2-enhanced-hooks-1.0-10
condor-7.2.0-4
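For illustration only, here is a minimal sketch of a finalize hook that prints an exit ClassAd to stdout. The real hook ships in condor-ec2-enhanced-hooks and retrieves the ad from SQS; this stand-in reads it from a file named on the command line, so the file argument and every other detail here are assumptions rather than the shipped implementation.
--------------------------------------------------------------------------
#!/usr/bin/env python
# Hypothetical finalize-hook sketch: print the exit ClassAd on stdout so that
# condor (7.2.0-4+) can apply it to the source job. The real EC2E hook gets
# the ad from SQS; this stand-in reads it from the file given as argv[1].
import sys

def main(args):
    if len(args) != 1:
        print >> sys.stderr, "usage: finalize_sketch.py <exit_classad_file>"
        return 1
    for line in open(args[0]):
        line = line.strip()
        if line:
            # "Attribute = Value" pairs from the job run in EC2,
            # e.g. ExitCode = 0 or RemoteWallClockTime = 301.000000
            print line
    return 0

if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))
--------------------------------------------------------------------------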
Comment 3 Jan Sarenik 2009-03-11 12:25:25 EDT
Could you please give me a hint on what to test for this bug, and how?
Comment 4 Jan Sarenik 2009-03-11 13:51:30 EDT
Yes, it works as expected.

When I start a simple job and it reaches the Running state,
I gather its info with 'condor_q -l <src_job>'.

When it is done and out of the queue, I do the same with
'condor_history -l <src_job>'.

When I compare the two in that order (both sort(1)-ed first),
this is the diff (a script that automates the same comparison
is sketched after it):
--------------------------------------------------------------------------
+AmazonAccessKey = "/tmp/access_key"
+AmazonFullSQSQueueName = "rhel4-test-ha-schedd665601236792169"
 AmazonKeyPairFile = "/tmp/keypair-0"
+AmazonSecretKey = "/tmp/secret_access_key"
 Arguments = ""
-AutoClusterAttrs = "ImageSize,JobUniverse,JobStart,LastPeriodicCheckpoint,LastCheckpointPlatform,NumCkpts,DiskUsage,Requirements,NiceUser,ConcurrencyLimits"
-AutoClusterId = 4
 BufferBlockSize = 32768
 BufferSize = 524288
 ClusterId = 6656
-Cmd = "/mnt/sharedfs/testmonkey/north-14/ec2e/jasan3/jasan.sh"
+Cmd = "/var/lib/condor/execute/dir_1312/jasan.sh"
 CommittedTime = 0
-CompletionDate = 0
+CompletionDate = 1236792644
 CondorPlatform = "$CondorPlatform: X86_64-LINUX_RHEL4 $"
 CondorVersion = "$CondorVersion: 7.2.1 Mar  4 2009 BuildID: RH-7.2.2-0.7.el4 $"
 CoreSize = -1
 CumulativeSuspensionTime = 0
 CurrentHosts = 0
 DiskUsage_RAW = 1
 DiskUsage = 1
 EC2JobSuccessful = TRUE
-EC2RunAttempts = 1
-EnteredCurrentStatus = 1236792311
+EC2RunAttempts = 0
+EnteredCurrentStatus = 1236792169
 Environment = ""
 Err = "_condor_stderr"
 ExecutableSize_RAW = 1
 ExecutableSize = 1
 ExitBySignal = FALSE
+ExitCode = 0
 ExitStatus = 0
 GlobalJobId = "ha-schedd@#6656.0#1236792169"
+HookKeyword = "LOW_LATENCY"
 ImageSize_RAW = 1
-ImageSize = 1
+ImageSize = 4250
 In = "/dev/null"
+IsFeatched = TRUE
 Iwd = "/mnt/sharedfs/testmonkey/north-14/ec2e/jasan3"
+JobDuration = 10.010094
+JobFinishedHookDone = 1236792350
 JobLeaseDuration = 1200
 JobNotification = 2
+JobPid = 1316
 JobPrio = 0
-JobStatus = 2
+JobStartDate = 1236792340
+JobState = "Exited"
+JobStatus = 4
 JobUniverse = 5
 KillSig = "SIGTERM"
 LastSuspensionTime = 0
 LeaveJobInQueue = FALSE
-LocalSysCpu = 0
-LocalUserCpu = 0
-Managed = "External"
-ManagedManager = "north-07.lab.bos.redhat.com"
+LocalSysCpu = 0.000000
+LocalUserCpu = 0.000000
+ManagedManager = ""
+Managed = "ScheddDone"
 MaxHosts = 1
 MinHosts = 1
-MyType = "Job"
+MyType = ""
 NiceUser = FALSE
 NumCkpts_RAW = 0
 NumCkpts = 0
 NumJobStarts = 0
+NumPids = 0
 NumRestarts = 0
 NumSystemHolds = 0
 OnExitHold = FALSE
 OnExitRemove = TRUE
+OrigCmd = "jasan.sh"
+OriginatingCWD = "/var/lib/condor/execute/dir_1312"
 Out = "stdout.0"
 Owner = "testmonkey"
 PeriodicHold = FALSE
 PeriodicRelease = FALSE
 PeriodicRemove = EC2RunAttempts >= 5
 ProcId = 0
 QDate = 1236792169
 Rank = 0.000000
-RemoteSysCpu = 0.000000
-RemoteUserCpu = 0.000000
-RemoteWallClockTime = 0.000000
+RemoteSysCpu = 0
+RemoteUserCpu = 0
+RemoteWallClockTime = 301.000000
 RequestCpus = 1
 RequestDisk = DiskUsage
 RequestMemory = ceiling(ImageSize / 1024.000000)
 Requirements = (Arch == "INTEL") && (OpSys == "LINUX") && (Disk >= DiskUsage) && ((Memory * 1024) >= ImageSize) && (HasFileTransfer)
 RootDir = "/"
 RoutedToJobId = "6657.0"
-ServerTime = 1236792465
-ShadowBday = 1236792291
+ShadowBday = 0
 ShouldTransferFiles = "YES"
+SQSMessageId = "1S5YA5NE8AA85NDRCY6Y|3552MCJT93VB473VJNP1|8H7SWNK3CK2PVVZZPTC1"
 StreamErr = FALSE
 StreamOut = FALSE
-TargetType = "Machine"
+TargetType = ""
 TotalSuspensions = 0
 TransferFiles = "ONEXIT"
 TransferIn = FALSE
 TransferOutputRemaps = "_condor_stderr=/tmp/job.stderr.0"
 UserLog = "/mnt/sharedfs/testmonkey/north-14/ec2e/jasan3/ulog.6656.0"
 User = "testmonkey@NWPool"
 WantArch = "INTEL"
 WantAWS = TRUE
 WantCPUs = 1
 WantCheckpoint = FALSE
 WantRemoteIO = TRUE
 WantRemoteSyscalls = FALSE
+WF_REQ_SLOT = "1"
 WhenToTransferOutput = "ON_EXIT"
--------------------------------------------------------------------------
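A hypothetical script that automates the same check might look like the one below: save the output of 'condor_q -l <src_job>' while the job is running and of 'condor_history -l <src_job>' after it completes, then diff the sorted attribute lines. It is only a sketch, not part of the shipped tooling.
--------------------------------------------------------------------------
#!/usr/bin/env python
# Hypothetical sketch: diff the sorted ClassAd saved from 'condor_q -l' while
# the job was running against the one saved from 'condor_history -l' after it
# finished, reproducing the comparison shown above.
import difflib
import sys

def sorted_ad(path):
    """Return the non-empty lines of a saved ClassAd dump, sorted."""
    return sorted(line.rstrip("\n") for line in open(path) if line.strip())

if __name__ == "__main__":
    running_file, history_file = sys.argv[1], sys.argv[2]
    for line in difflib.unified_diff(sorted_ad(running_file),
                                     sorted_ad(history_file),
                                     "condor_q -l", "condor_history -l",
                                     lineterm=""):
        print line
--------------------------------------------------------------------------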
Comment 6 errata-xmlrpc 2009-04-21 12:19:05 EDT
An advisory has been issued which should help the problem
described in this bug report. This report is therefore being
closed with a resolution of ERRATA. For more information
on the solution and/or where to find the updated files,
please follow the link below. You may reopen this bug report
if the solution does not work for you.

http://rhn.redhat.com/errata/RHEA-2009-0434.html
