0 Replies Latest reply on Oct 4, 2016 9:56 AM by jlambe@quin.co.uk

    How to avoid unnecessary register loads using powerpc-eabi-gcc

    jlambe@quin.co.uk

      I'm finding that compiling with gcc (without optimisation) produces assembly code containing a number of unnecessary register loads, which increase the code size and slow it down. I'm reluctant to turn on optimisation as this makes the code difficult to debug. Are there any gcc switches I could use to overcome this?

      For example this fragment of C:

        for (i = 0; i < MAX_CHANNEL+1; i++)
        {
          na_entry = &NodeArray[i];

          na_entry->NodeNumber = i;
          na_entry->gen_io_offset = 0;

          /* clear all digital output records to 'off' state */
          for (j = 0; j < MAX_OPGROUPS; j++)
          {
            na_entry->ogroup[j] = 0;
          }
        }

       

      compiles to this:

      stw 0,8(31)  # i, tmp125

      b .L2  #

      .L5:

      .loc 1 131 0

      lwz 0,8(31)  # i, i.135

      mulli 9,0,20  # D.6999, i.135,

      lis 0,NodeArray@ha  # tmp127,

      addic 0,0,NodeArray@l  # tmp126, tmp127,

      add 0,9,0  # tmp128, D.6999, tmp126

      stw 0,16(31)  # na_entry, tmp128

      .loc 1 133 0

      lwz 0,8(31)  # i, tmp129

      extsh 9,0  # D.7000, tmp129

      lwz 0,16(31)  # na_entry, tmp130

      mr 11,0  #, tmp130

      sth 9,0(11)  # na_entry_6->NodeNumber, D.7000

      .loc 1 134 0

      lwz 0,16(31)  # na_entry, tmp131

      li 9,0  # tmp132,

      mr 11,0  #, tmp131

      sth 9,2(11)  # na_entry_6->gen_io_offset, tmp132

      .loc 1 137 0

      li 0,0  # tmp133,

      stw 0,12(31)  # j, tmp133

      b .L3  #

      .L4:

      .loc 1 139 0 discriminator 2

      lwz 9,16(31)  # na_entry, tmp134

      lwz 0,12(31)  # j, tmp136

      add 0,9,0  # tmp135, tmp134, tmp136

      li 9,0  # tmp137,

      mr 11,0  #, tmp135

      stb 9,15(11)  # na_entry_6->ogroup, tmp137

      .loc 1 137 0 discriminator 2

      lwz 0,12(31)  # j, tmp138

      addic 0,0,1  # tmp139, tmp138,

      stw 0,12(31)  # j, tmp139

      .L3:

      .loc 1 137 0 is_stmt 0 discriminator 1

      lwz 0,12(31)  # j, tmp140

      cmpwi 7,0,1  #, tmp141, tmp140

      ble 7,.L4  #

      .loc 1 129 0 is_stmt 1

      lwz 0,8(31)  # i, tmp142

      addic 0,0,1  # tmp143, tmp142,

      stw 0,8(31)  # i, tmp143

      .L2:

      .loc 1 129 0 is_stmt 0 discriminator 1

      lwz 0,8(31)  # i, tmp144

      cmpwi 7,0,60  #, tmp145, tmp144

      ble 7,.L5  #

       

      There are 4 places where r0 is loaded from the stack just after being stored, so presumably those loads could be eliminated. Also, the 3 main variables in this loop (i, j and na_entry) are repeatedly reloaded from the stack, mostly into r0. If 3 separate registers were used, many of the loads could be avoided. Does anyone have any thoughts on this?