In EC2E, a source job is routed to an EC2 job, which actually does the source job's work. The stats returned to the source job are those of the routed EC2 job, not those of the source job as it ran within EC2.
The finalize hook now prints the exit classad from SQS to stdout. Condor (7.2.0-4+) will then update the source job, so condor_q and condor_history will show the correct classad information. Fixed in: condor-ec2-enhanced-hooks-1.0-10, condor-7.2.0-4
Could you please give me a hint on what to test for this bug, and how to test it?
Yes, it works as expected. When I start a simple job and it gets into the Running state, I gather info with 'condor_q -l <src_job>'. When it is done and out of the queue, I gather it with 'condor_history -l <src_job>'. When I compare the two outputs in that order (both were sort(1)-ed first), this is the diff: -------------------------------------------------------------------------- +AmazonAccessKey = "/tmp/access_key" +AmazonFullSQSQueueName = "rhel4-test-ha-schedd665601236792169" AmazonKeyPairFile = "/tmp/keypair-0" +AmazonSecretKey = "/tmp/secret_access_key" Arguments = "" -AutoClusterAttrs = "ImageSize,JobUniverse,JobStart,LastPeriodicCheckpoint,LastCheckpointPlatform,NumCkpts,DiskUsage,Requirements,NiceUser,ConcurrencyLimits" -AutoClusterId = 4 BufferBlockSize = 32768 BufferSize = 524288 ClusterId = 6656 -Cmd = "/mnt/sharedfs/testmonkey/north-14/ec2e/jasan3/jasan.sh" +Cmd = "/var/lib/condor/execute/dir_1312/jasan.sh" CommittedTime = 0 -CompletionDate = 0 +CompletionDate = 1236792644 CondorPlatform = "$CondorPlatform: X86_64-LINUX_RHEL4 $" CondorVersion = "$CondorVersion: 7.2.1 Mar 4 2009 BuildID: RH-7.2.2-0.7.el4 $" CoreSize = -1 CumulativeSuspensionTime = 0 CurrentHosts = 0 DiskUsage_RAW = 1 DiskUsage = 1 EC2JobSuccessful = TRUE -EC2RunAttempts = 1 -EnteredCurrentStatus = 1236792311 +EC2RunAttempts = 0 +EnteredCurrentStatus = 1236792169 Environment = "" Err = "_condor_stderr" ExecutableSize_RAW = 1 ExecutableSize = 1 ExitBySignal = FALSE +ExitCode = 0 ExitStatus = 0 GlobalJobId = "ha-schedd@#6656.0#1236792169" +HookKeyword = "LOW_LATENCY" ImageSize_RAW = 1 -ImageSize = 1 +ImageSize = 4250 In = "/dev/null" +IsFeatched = TRUE Iwd = "/mnt/sharedfs/testmonkey/north-14/ec2e/jasan3" +JobDuration = 10.010094 +JobFinishedHookDone = 1236792350 JobLeaseDuration = 1200 JobNotification = 2 +JobPid = 1316 JobPrio = 0 -JobStatus = 2 +JobStartDate = 1236792340 +JobState = "Exited" +JobStatus = 4 JobUniverse = 5 KillSig = "SIGTERM" LastSuspensionTime = 0 LeaveJobInQueue = FALSE -LocalSysCpu 
= 0 -LocalUserCpu = 0 -Managed = "External" -ManagedManager = "north-07.lab.bos.redhat.com" +LocalSysCpu = 0.000000 +LocalUserCpu = 0.000000 +ManagedManager = "" +Managed = "ScheddDone" MaxHosts = 1 MinHosts = 1 -MyType = "Job" +MyType = "" NiceUser = FALSE NumCkpts_RAW = 0 NumCkpts = 0 NumJobStarts = 0 +NumPids = 0 NumRestarts = 0 NumSystemHolds = 0 OnExitHold = FALSE OnExitRemove = TRUE +OrigCmd = "jasan.sh" +OriginatingCWD = "/var/lib/condor/execute/dir_1312" Out = "stdout.0" Owner = "testmonkey" PeriodicHold = FALSE PeriodicRelease = FALSE PeriodicRemove = EC2RunAttempts >= 5 ProcId = 0 QDate = 1236792169 Rank = 0.000000 -RemoteSysCpu = 0.000000 -RemoteUserCpu = 0.000000 -RemoteWallClockTime = 0.000000 +RemoteSysCpu = 0 +RemoteUserCpu = 0 +RemoteWallClockTime = 301.000000 RequestCpus = 1 RequestDisk = DiskUsage RequestMemory = ceiling(ImageSize / 1024.000000) Requirements = (Arch == "INTEL") && (OpSys == "LINUX") && (Disk >= DiskUsage) && ((Memory * 1024) >= ImageSize) && (HasFileTransfer) RootDir = "/" RoutedToJobId = "6657.0" -ServerTime = 1236792465 -ShadowBday = 1236792291 +ShadowBday = 0 ShouldTransferFiles = "YES" +SQSMessageId = "1S5YA5NE8AA85NDRCY6Y|3552MCJT93VB473VJNP1|8H7SWNK3CK2PVVZZPTC1" StreamErr = FALSE StreamOut = FALSE -TargetType = "Machine" +TargetType = "" TotalSuspensions = 0 TransferFiles = "ONEXIT" TransferIn = FALSE TransferOutputRemaps = "_condor_stderr=/tmp/job.stderr.0" UserLog = "/mnt/sharedfs/testmonkey/north-14/ec2e/jasan3/ulog.6656.0" User = "testmonkey@NWPool" WantArch = "INTEL" WantAWS = TRUE WantCPUs = 1 WantCheckpoint = FALSE WantRemoteIO = TRUE WantRemoteSyscalls = FALSE +WF_REQ_SLOT = "1" WhenToTransferOutput = "ON_EXIT" --------------------------------------------------------------------------
An advisory has been issued which should help the problem described in this bug report. This report is therefore being closed with a resolution of ERRATA. For more information on the solution and/or where to find the updated files, please follow the link below. You may reopen this bug report if the solution does not work for you. http://rhn.redhat.com/errata/RHEA-2009-0434.html