Bug 246247 - indexed load/store performance issue on POWER6 for RHEL5
indexed load/store performance issue on POWER6 for RHEL5
Status: CLOSED NEXTRELEASE
Product: Bugzilla
Classification: Community
Component: Test (Show other bugs)
2.8
ppc64 Linux
low Severity urgent (vote)
: ---
: ---
Assigned To: PnT DevOps Devs
ARRAY(0x8db160)
:
Depends On:
Blocks:
  Show dependency treegraph
 
Reported: 2007-06-29 11:45 EDT by IBM Mirproxy
Modified: 2013-06-23 22:48 EDT (History)
0 users

See Also:
Fixed In Version:
Doc Type: Bug Fix
Doc Text:
Story Points: ---
Clone Of:
Environment:
Last Closed: 2007-06-29 12:11:30 EDT
Type: ---
Regression: ---
Mount Type: ---
Documentation: ---
CRM:
Verified Versions:
Category: ---
oVirt Team: ---
RHEL 7.3 requirements from Atomic Host:
Cloudforms Team: ---


Attachments (Terms of Use)
gcc patch (4.32 KB, patch)
2007-06-29 11:56 EDT, IBM Bug Proxy
no flags Details | Diff


External Trackers
Tracker ID Priority Status Summary Last Updated
IBM Linux Technology Center 29862 None None None Never

  None (edit)
Description IBM Mirproxy 2007-06-29 11:45:23 EDT
=Comment: #0=================================================
Salina Chu <salina@us.ibm.com> - 2007-06-29 11:09 EDT
Problem description:


A severe performance problem exists on POWER6 hw with the RHEL5 compiler with
respect to indexed load/store instruction generation.  The problem is that
power6 processors require (for performance, not correctness) the base address of
an indexed load/store instruction to be in the rA position and the offset to be
in the rB position.  This is not currently happening with the RHEL5 compiler. 

This is being tracked in GCC bugzilla:

    http://gcc.gnu.org/bugzilla/show_bug.cgi?id=28690


Contact Information = Peter Bergner <bergner@vnet.ibm.com>, Steve Munroe
<sjmunroe@us.ibm.com>
 
---uname output---
Linux devl4e-woody-lp1 2.6.16.37-0.6-ppc64 #1 SMP Fri Jan 12 20:19:44 UTC 2007
ppc64 ppc64 ppc64 GNU/Linux
 
Machine Type = POWER6
 
---Debugger---
A debugger is not configured
 
---Steps to Reproduce---
cat <<EOF > indexedload.c
int indexedload(int ***base, int idx0, int idx1, int idx2)
{
  return base[idx0][idx1][idx2];
}
EOF

gcc -O1 -S indexedload.c

 
---Compiler and Library Component Data---
Userspace tool common name: GCC

The userspace tool has the following bit modes: 32-bit and 64-bit code
generation is affected.

Userspace rpm: gcc-4.1.1-47.el5
 
*Additional Instructions for Peter Bergner <bergner@vnet.ibm.com>, Steve Munroe
<sjmunroe@us.ibm.com>:
-Attach ltrace and strace of userspace application.
Comment 1 IBM Bug Proxy 2007-06-29 11:49:41 EDT
The following seems not to have made its way from RHIT to IBMBZ; it made its
way from RHBZ to RHIT, and is visible there, but not to IBMBZ...

----------------------------------------------------------------------------------
These changes made by jakub@redhat.com.
Bugzilla comment added:
Created an attachment (id=157830)
gcc41-rh225425.patch

I meant something like:
--- gcc/defaults.h.jj 2007-02-20 22:39:12.000000000 +0100
+++ gcc/defaults.h 2007-06-26 00:32:16.000000000 +0200
@@ -785,6 +785,10 @@ Software Foundation, 51 Franklin Street,
#define TARGET_C99_FUNCTIONS 0
#endif

+#ifndef TARGET_INDEX_OPERAND_FIRST
+#define TARGET_INDEX_OPERAND_FIRST 0
+#endif
+
/* Indicate that CLZ and CTZ are undefined at zero.  */
#ifndef CLZ_DEFINED_VALUE_AT_ZERO
#define CLZ_DEFINED_VALUE_AT_ZERO(MODE, VALUE)  0
--- gcc/config/rs6000/rs6000.h.jj 2007-02-20 22:39:00.000000000 +0100
+++ gcc/config/rs6000/rs6000.h 2007-06-26 00:33:32.000000000 +0200
@@ -57,6 +57,8 @@
#define PPC405_ERRATUM77 0
#endif

+#define TARGET_INDEX_OPERAND_FIRST (rs6000_cpu == PROCESSOR_POWER6)
+
/* Common ASM definitions used by ASM_SPEC among the various targets
   for handling -mcpu=xxx switches.  */
#define ASM_CPU_SPEC \
--- gcc/optabs.c.jj 2007-02-20 22:39:12.000000000 +0100
+++ gcc/optabs.c 2007-06-26 00:30:27.000000000 +0200
@@ -3673,12 +3673,16 @@ emit_cmp_and_jump_insns (rtx x, rtx y, e
  /* Swap operands and condition to ensure canonical RTL.  */
  if (swap_commutative_operands_p (x, y))
    {
-      /* If we're not emitting a branch, this means some caller
-  is out of sync.  */
-      gcc_assert (label);
+      enum rtx_code swapped_comparison = swap_condition (comparison);
+
+      /* If we're not emitting a branch, callers are required to pass
+  operands in an order conforming to canonical RTL.  We relax this
+ for commutative comparsions so callers using EQ don't need to do
+ swapping by hand.  */
+      gcc_assert (label || swapped_comparison == comparison);

      op0 = y, op1 = x;
-      comparison = swap_condition (comparison);
+      comparison = swapped_comparison;
    }

#ifdef HAVE_cc0
--- gcc/rtlanal.c.jj 2007-02-20 22:39:12.000000000 +0100
+++ gcc/rtlanal.c 2007-06-26 00:28:56.000000000 +0200
@@ -2890,9 +2890,9 @@ commutative_operand_precedence (rtx op)

  /* Constants always come the second operand.  Prefer "nice" constants.  */
  if (code == CONST_INT)
-    return -7;
+    return -10;
  if (code == CONST_DOUBLE)
-    return -6;
+    return -9;
  op = avoid_constant_pool_reference (op);
  code = GET_CODE (op);

@@ -2900,26 +2900,31 @@ commutative_operand_precedence (rtx op)
    {
    case RTX_CONST_OBJ:
      if (code == CONST_INT)
- return -5;
+ return -8;
      if (code == CONST_DOUBLE)
- return -4;
-      return -3;
+ return -7;
+      return -6;

    case RTX_EXTRA:
      /* SUBREGs of objects should come second.  */
      if (code == SUBREG && OBJECT_P (SUBREG_REG (op)))
- return -2;
+ return -5;

      if (!CONSTANT_P (op))
return 0;
      else
/* As for RTX_CONST_OBJ.  */
- return -3;
+ return -6;

    case RTX_OBJ:
      /* Complex expressions should be the first, so decrease priority
 of objects.  */
-      return -1;
+      if (!TARGET_INDEX_OPERAND_FIRST)
+ return -1;
+      if (REG_P (op))
+ return (REG_POINTER (op)) ? -1 : -3;
+      else
+ return (MEM_P (op) && MEM_POINTER (op)) ? -2 : -4;

    case RTX_COMM_ARITH:
      /* Prefer operands that are themselves commutative to be first.
@@ -2949,8 +2954,16 @@ commutative_operand_precedence (rtx op)
int
swap_commutative_operands_p (rtx x, rtx y)
{
-  return (commutative_operand_precedence (x)
-  < commutative_operand_precedence (y));
+  int result = (commutative_operand_precedence (x)
+ - commutative_operand_precedence (y));
+  if (!TARGET_INDEX_OPERAND_FIRST || result)
+    return result < 0;
+
+  /* Group together equal REGs to do more simplification.  */
+  if (REG_P (x) && REG_P (y))
+    return REGNO (x) > REGNO (y);
+
+  return 0;
}

/* Return 1 if X is an autoincrement side effect and the register is
--- gcc/tree-ssa-address.c.jj 2007-02-20 22:39:12.000000000 +0100
+++ gcc/tree-ssa-address.c 2007-06-26 00:29:49.000000000 +0200
@@ -124,7 +124,9 @@ gen_addr_rtx (rtx symbol, rtx base, rtx
  if (base)
    {
      if (*addr)
- *addr = gen_rtx_PLUS (Pmode, *addr, base);
+ *addr = (TARGET_INDEX_OPERAND_FIRST
+ ? simplify_gen_binary (PLUS, Pmode, base, *addr)
+ : gen_rtx_PLUS (Pmode, *addr, base));
      else
*addr = base;
    }


Untested yet.
Comment 2 IBM Bug Proxy 2007-06-29 11:56:55 EDT
Created attachment 158220 [details]
gcc patch

try sending patch attachment

Note You need to log in before you can comment on or make changes to this bug.