Bug 476984 - update EC2E source job stats from hooks
Summary: update EC2E source job stats from hooks
Keywords:
Status: CLOSED ERRATA
Alias: None
Product: Red Hat Enterprise MRG
Classification: Red Hat
Component: grid
Version: 1.0
Hardware: All
OS: Linux
medium
medium
Target Milestone: 1.1.1
: ---
Assignee: Robert Rati
QA Contact: Jeff Needle
URL:
Whiteboard:
Depends On:
Blocks:
TreeView+ depends on / blocked
 
Reported: 2008-12-18 14:38 UTC by Matthew Farrellee
Modified: 2009-04-21 16:19 UTC (History)
1 user (show)

Fixed In Version:
Doc Type: Bug Fix
Doc Text:
Clone Of:
Environment:
Last Closed: 2009-04-21 16:19:05 UTC
Target Upstream Version:
Embargoed:


Attachments (Terms of Use)


Links
System ID Private Priority Status Summary Last Updated
Red Hat Product Errata RHEA-2009:0434 0 normal SHIPPED_LIVE Red Hat Enterprise MRG Messaging and Grid Version 1.1.1 2009-04-21 16:15:50 UTC

Description Matthew Farrellee 2008-12-18 14:38:07 UTC
In EC2E a source job is routed to an EC2 job, which actually does the source job's work. The stats returned to the source job are actually those of the EC2 job and not the source job run within EC2.

Comment 1 Robert Rati 2009-01-19 15:17:42 UTC
The finalize hook now prints to stdout the exit classad from SQS.  condor (7.2.0-4+) will update the source job, so condor_q and condor_history will show the correct classad information.

Fixed in:
condor-ec2-enhanced-hooks-1.0-10
condor-7.2.0-4

Comment 3 Jan Sarenik 2009-03-11 16:25:25 UTC
Could you please give me a hint on what and how to test in this bug?

Comment 4 Jan Sarenik 2009-03-11 17:51:30 UTC
Yes, it works as expected.

When I start simple job and it gets into Running state,
I gather info with 'condor_q -l <src_job>'

When it is done and out of the queue, I do it with
'condor_history -l <src_job>'

When I compare those two in that order (both were sort(1)-ed first),
this is the diff:
--------------------------------------------------------------------------
+AmazonAccessKey = "/tmp/access_key"
+AmazonFullSQSQueueName = "rhel4-test-ha-schedd665601236792169"
 AmazonKeyPairFile = "/tmp/keypair-0"
+AmazonSecretKey = "/tmp/secret_access_key"
 Arguments = ""
-AutoClusterAttrs = "ImageSize,JobUniverse,JobStart,LastPeriodicCheckpoint,LastCheckpointPlatform,NumCkpts,DiskUsage,Requirements,NiceUser,ConcurrencyLimits"
-AutoClusterId = 4
 BufferBlockSize = 32768
 BufferSize = 524288
 ClusterId = 6656
-Cmd = "/mnt/sharedfs/testmonkey/north-14/ec2e/jasan3/jasan.sh"
+Cmd = "/var/lib/condor/execute/dir_1312/jasan.sh"
 CommittedTime = 0
-CompletionDate = 0
+CompletionDate = 1236792644
 CondorPlatform = "$CondorPlatform: X86_64-LINUX_RHEL4 $"
 CondorVersion = "$CondorVersion: 7.2.1 Mar  4 2009 BuildID: RH-7.2.2-0.7.el4 $"
 CoreSize = -1
 CumulativeSuspensionTime = 0
 CurrentHosts = 0
 DiskUsage_RAW = 1
 DiskUsage = 1
 EC2JobSuccessful = TRUE
-EC2RunAttempts = 1
-EnteredCurrentStatus = 1236792311
+EC2RunAttempts = 0
+EnteredCurrentStatus = 1236792169
 Environment = ""
 Err = "_condor_stderr"
 ExecutableSize_RAW = 1
 ExecutableSize = 1
 ExitBySignal = FALSE
+ExitCode = 0
 ExitStatus = 0
 GlobalJobId = "ha-schedd@#6656.0#1236792169"
+HookKeyword = "LOW_LATENCY"
 ImageSize_RAW = 1
-ImageSize = 1
+ImageSize = 4250
 In = "/dev/null"
+IsFeatched = TRUE
 Iwd = "/mnt/sharedfs/testmonkey/north-14/ec2e/jasan3"
+JobDuration = 10.010094
+JobFinishedHookDone = 1236792350
 JobLeaseDuration = 1200
 JobNotification = 2
+JobPid = 1316
 JobPrio = 0
-JobStatus = 2
+JobStartDate = 1236792340
+JobState = "Exited"
+JobStatus = 4
 JobUniverse = 5
 KillSig = "SIGTERM"
 LastSuspensionTime = 0
 LeaveJobInQueue = FALSE
-LocalSysCpu = 0
-LocalUserCpu = 0
-Managed = "External"
-ManagedManager = "north-07.lab.bos.redhat.com"
+LocalSysCpu = 0.000000
+LocalUserCpu = 0.000000
+ManagedManager = ""
+Managed = "ScheddDone"
 MaxHosts = 1
 MinHosts = 1
-MyType = "Job"
+MyType = ""
 NiceUser = FALSE
 NumCkpts_RAW = 0
 NumCkpts = 0
 NumJobStarts = 0
+NumPids = 0
 NumRestarts = 0
 NumSystemHolds = 0
 OnExitHold = FALSE
 OnExitRemove = TRUE
+OrigCmd = "jasan.sh"
+OriginatingCWD = "/var/lib/condor/execute/dir_1312"
 Out = "stdout.0"
 Owner = "testmonkey"
 PeriodicHold = FALSE
 PeriodicRelease = FALSE
 PeriodicRemove = EC2RunAttempts >= 5
 ProcId = 0
 QDate = 1236792169
 Rank = 0.000000
-RemoteSysCpu = 0.000000
-RemoteUserCpu = 0.000000
-RemoteWallClockTime = 0.000000
+RemoteSysCpu = 0
+RemoteUserCpu = 0
+RemoteWallClockTime = 301.000000
 RequestCpus = 1
 RequestDisk = DiskUsage
 RequestMemory = ceiling(ImageSize / 1024.000000)
 Requirements = (Arch == "INTEL") && (OpSys == "LINUX") && (Disk >= DiskUsage) && ((Memory * 1024) >= ImageSize) && (HasFileTransfer)
 RootDir = "/"
 RoutedToJobId = "6657.0"
-ServerTime = 1236792465
-ShadowBday = 1236792291
+ShadowBday = 0
 ShouldTransferFiles = "YES"
+SQSMessageId = "1S5YA5NE8AA85NDRCY6Y|3552MCJT93VB473VJNP1|8H7SWNK3CK2PVVZZPTC1"
 StreamErr = FALSE
 StreamOut = FALSE
-TargetType = "Machine"
+TargetType = ""
 TotalSuspensions = 0
 TransferFiles = "ONEXIT"
 TransferIn = FALSE
 TransferOutputRemaps = "_condor_stderr=/tmp/job.stderr.0"
 UserLog = "/mnt/sharedfs/testmonkey/north-14/ec2e/jasan3/ulog.6656.0"
 User = "testmonkey@NWPool"
 WantArch = "INTEL"
 WantAWS = TRUE
 WantCPUs = 1
 WantCheckpoint = FALSE
 WantRemoteIO = TRUE
 WantRemoteSyscalls = FALSE
+WF_REQ_SLOT = "1"
 WhenToTransferOutput = "ON_EXIT"
--------------------------------------------------------------------------

Comment 6 errata-xmlrpc 2009-04-21 16:19:05 UTC
An advisory has been issued which should help the problem
described in this bug report. This report is therefore being
closed with a resolution of ERRATA. For more information
on the solution and/or where to find the updated files,
please follow the link below. You may reopen this bug report
if the solution does not work for you.

http://rhn.redhat.com/errata/RHEA-2009-0434.html


Note You need to log in before you can comment on or make changes to this bug.