Answered (Assumed Answered)

How to avoid unnecessary register loads using powerpc-eabi-gcc

Question asked by jlambe@quin.co.uk on Oct 4, 2016

I'm finding that compiling with gcc (without optimisation) produces assembly code containing a number of unnecessary register loads, which increase the code size and slow it down. I'm reluctant to turn on optimisation as this makes the code difficult to debug. Are there any gcc switches I could use to overcome this?

For example this fragment of C:

  for (i = 0; i < MAX_CHANNEL+1; i++)
  {
    na_entry = &NodeArray[i];

    na_entry->NodeNumber = i;
    na_entry->gen_io_offset = 0;

    /* clear all digital output records to 'off' state */
    for (j = 0; j < MAX_OPGROUPS; j++)
    {
      na_entry->ogroup[j] = 0;
    }
  }

 

compiles to this:

stw 0,8(31)  # i, tmp125

b .L2  #

.L5:

.loc 1 131 0

lwz 0,8(31)  # i, i.135

mulli 9,0,20  # D.6999, i.135,

lis 0,NodeArray@ha  # tmp127,

addic 0,0,NodeArray@l  # tmp126, tmp127,

add 0,9,0  # tmp128, D.6999, tmp126

stw 0,16(31)  # na_entry, tmp128

.loc 1 133 0

lwz 0,8(31)  # i, tmp129

extsh 9,0  # D.7000, tmp129

lwz 0,16(31)  # na_entry, tmp130

mr 11,0  #, tmp130

sth 9,0(11)  # na_entry_6->NodeNumber, D.7000

.loc 1 134 0

lwz 0,16(31)  # na_entry, tmp131

li 9,0  # tmp132,

mr 11,0  #, tmp131

sth 9,2(11)  # na_entry_6->gen_io_offset, tmp132

.loc 1 137 0

li 0,0  # tmp133,

stw 0,12(31)  # j, tmp133

b .L3  #

.L4:

.loc 1 139 0 discriminator 2

lwz 9,16(31)  # na_entry, tmp134

lwz 0,12(31)  # j, tmp136

add 0,9,0  # tmp135, tmp134, tmp136

li 9,0  # tmp137,

mr 11,0  #, tmp135

stb 9,15(11)  # na_entry_6->ogroup, tmp137

.loc 1 137 0 discriminator 2

lwz 0,12(31)  # j, tmp138

addic 0,0,1  # tmp139, tmp138,

stw 0,12(31)  # j, tmp139

.L3:

.loc 1 137 0 is_stmt 0 discriminator 1

lwz 0,12(31)  # j, tmp140

cmpwi 7,0,1  #, tmp141, tmp140

ble 7,.L4  #

.loc 1 129 0 is_stmt 1

lwz 0,8(31)  # i, tmp142

addic 0,0,1  # tmp143, tmp142,

stw 0,8(31)  # i, tmp143

.L2:

.loc 1 129 0 is_stmt 0 discriminator 1

lwz 0,8(31)  # i, tmp144

cmpwi 7,0,60  #, tmp145, tmp144

ble 7,.L5  #

 

There are 4 places where r0 is loaded from the stack just after being stored, so presumably those loads could be eliminated. Also, the 3 main variables in this loop (i, j and na_entry) are always being loaded into r0. If 3 separate registers were used, many of the loads could be avoided. Does anyone have any thoughts on this?

Outcomes