Electronic – AVR: How optimize cycle-counted ISR to portable code, using inline asm

assemblyavrcinterruptsmicrocontroller

I'm trying to optimize my RX and TX interrupts to meet maximum execution time of 25 cycles while interrupts are disabled.

So far, I have found that code is optimized enough, but pushing and popping series of registers between loading and unloading __SREG__ exceeds the time limit.

 272:   80 91 24 01     lds r24, 0x0124
 276:   8f 5f           subi    r24, 0xFF   ; 255
 278:   8f 71           andi    r24, 0x1F   ; 31
 27a:   90 91 c6 00     lds r25, 0x00C6
 27e:   20 91 25 01     lds r18, 0x0125
 282:   28 17           cp  r18, r24
 284:   39 f0           breq    .+14        ; 0x294 <__vector_18+0x30>
 286:   e8 2f           mov r30, r24
 288:   f0 e0           ldi r31, 0x00   ; 0
 28a:   ea 5d           subi    r30, 0xDA   ; 218
 28c:   fe 4f           sbci    r31, 0xFE   ; 254
 28e:   90 83           st  Z, r25
 290:   80 93 24 01     sts 0x0124, r24

The only way to be able to place __SREG__ in safest location (include as much pushes as possible in unconscious area) was inline asm.

Here is my current code:

ISR(RX0_INTERRUPT, ISR_NAKED)
{
    //push
    asm volatile("push r31" ::); // table pointer
    asm volatile("push r30" ::); // table pointer
    asm volatile("push r25" ::); // received character
    asm volatile("push r18" ::); // once compared to r24 -> rx0_first_byte

    asm volatile("push r24" ::); // most stuff is executed in r24
    asm volatile("in r24,__SREG__" ::); // - 
    asm volatile("push r24" ::); // but one byte more on stack

    register uint8_t tmp_rx_last_byte = (rx0_last_byte + 1) & RX0_BUFFER_MASK;
    register uint8_t tmp = UDR0_REGISTER;

    if(rx0_first_byte != tmp_rx_last_byte)
    {
        rx0_buffer[tmp_rx_last_byte] = tmp;
        rx0_last_byte = tmp_rx_last_byte;
    }

    //pop
    asm volatile("pop r24" ::);
    asm volatile("out __SREG__,r24" ::);
    asm volatile("pop r24" ::);

    asm volatile("pop r18" ::);
    asm volatile("pop r25" ::);
    asm volatile("pop r30" ::);
    asm volatile("pop r31" ::);

    reti();
}

As you can see, there are a hardcoded register pushes that my compiler used, btw it's working at all, but I'm not sure how portable is it.

The only register which I can get by "=r" specificator is r24 and it's happening even for mask and rx0_first_byte.

So how can I tell the compiler to push/pop these 5 registers, even if they will be placed elsewhere?

What is possibility that the compiler will use r19 and r26 instead of r18 and r25?

I don't want to rewrite the whole ISR into assembler.

EDIT: thanks for all suggestions, finally I rewrote ISR into asm

    ISR(RX0_INTERRUPT, ISR_NAKED)
{
    asm volatile("\n\t"                      /* 5 ISR entry */
    "push  r31 \n\t"                         /* 2 */
    "push  r30 \n\t"                         /* 2 */
    "push  r25 \n\t"                         /* 2 */
    "push  r24 \n\t"                         /* 2 */
    "push  r18 \n\t"                         /* 2 */
    "in    r18, __SREG__ \n\t"               /* 1 */
    "push  r18 \n\t"                         /* 2 */

    /* read byte from UDR register */
    "lds   r25, %M[uart_data] \n\t"          /* 2 */

    /* load globals */
    "lds   r24, (rx0_last_byte) \n\t"        /* 2 */
    "lds   r18, (rx0_first_byte) \n\t"       /* 2 */

    /* add 1 & mask */
    "subi  r24, 0xFF \n\t" //???                  /* 1 */
    "andi  r24, %M[mask] \n\t"            /* 1 */

    /* if head == tail */
    "cp    r18, r24 \n\t"                    /* 1 */
    "breq  L_%= \n\t"                        /* 1/2 */

    "mov   r30, r24 \n\t"                    /* 1 */
    "ldi   r31, 0x00 \n\t"                   /* 1 */
    "subi  r30, lo8(-(rx0_buffer))\n\t"      /* 1 */
    "sbci  r31, hi8(-(rx0_buffer))\n\t"      /* 1 */
    "st    Z, r25 \n\t"                      /* 2 */
    "sts   (rx0_last_byte), r24 \n\t"        /* 2 */

"L_%=:\t"
    "pop   r18 \n\t"                         /* 2 */
    "out   __SREG__ , r18 \n\t"              /* 1 */
    "pop   r18 \n\t"                         /* 2 */
    "pop   r24 \n\t"                         /* 2 */
    "pop   r25 \n\t"                         /* 2 */
    "pop   r30 \n\t"                         /* 2 */
    "pop   r31 \n\t"                         /* 2 */
    "reti \n\t"                              /* 5 ISR return */

    : /* output operands */

    : /* input operands */
    [uart_data] "M"    (_SFR_MEM_ADDR(UDR0_REGISTER)),
    [mask]      "M"    (RX0_BUFFER_MASK)

    /* no clobbers */
    );

}

UPDATE:

After some tests I have found that the interrupts are disabled before entering ISR handler, not after unloading __SREG__ as I have been suggested before.

The only way is to globalize registers as ndim suggested, or use the following code:

ISR(RX0_INTERRUPT, ISR_NAKED)
    {
        asm volatile("\n\t"                      /* 4 ISR entry */

        "push  r0 \n\t"                          /* 2 */
        "in    r0, __SREG__ \n\t"                /* 1 */

        "push  r31 \n\t"                         /* 2 */
        "push  r30 \n\t"                         /* 2 */
        "push  r25 \n\t"                         /* 2 */
        "push  r24 \n\t"                         /* 2 */
        "push  r18 \n\t"                         /* 2 */

        /* read byte from UDR register */
        "lds   r25, %M[uart_data] \n\t"          /* 2 */

#ifdef USART_UNSAFE_RX_INTERRUPT // enable interrupt after satisfying UDR register
        "sei \n\t"                               /* 1 */
#endif
        /* load globals */
        "lds   r24, (rx0_last_byte) \n\t"        /* 2 */
        "lds   r18, (rx0_first_byte) \n\t"       /* 2 */

        /* tmp_rx_last_byte = (rx0_last_byte + 1) & RX0_BUFFER_MASK */
        "subi  r24, 0xFF \n\t"                   /* 1 */
        "andi  r24, %M[mask] \n\t"               /* 1 */

        /* if(rx0_first_byte != tmp_rx_last_byte) */
        "cp    r18, r24 \n\t"                    /* 1 */
        "breq  .+14 \n\t"                        /* 1/2 */

        /* rx0_buffer[tmp_rx_last_byte] = tmp */
        "mov   r30, r24 \n\t"                    /* 1 */
        "ldi   r31, 0x00 \n\t"                   /* 1 */
        "subi  r30, lo8(-(rx0_buffer))\n\t"      /* 1 */
        "sbci  r31, hi8(-(rx0_buffer))\n\t"      /* 1 */
        "st    Z, r25 \n\t"                      /* 2 */

        /* rx0_last_byte = tmp_rx_last_byte */
        "sts   (rx0_last_byte), r24 \n\t"        /* 2 */

#ifdef USART_UNSAFE_RX_INTERRUPT
        "cli \n\t"                               /* 1 */
#endif

        "pop   r18 \n\t"                         /* 2 */
        "pop   r24 \n\t"                         /* 2 */
        "pop   r25 \n\t"                         /* 2 */
        "pop   r30 \n\t"                         /* 2 */
        "pop   r31 \n\t"                         /* 2 */

        "out   __SREG__ , r0 \n\t"               /* 1 */
        "pop   r0 \n\t"                          /* 2 */

        "reti \n\t"                              /* 4 ISR return */

        : /* output operands */

        : /* input operands */
        [uart_data] "M"    (_SFR_MEM_ADDR(UDR0_REGISTER)),
        [mask]      "M"    (RX0_BUFFER_MASK)

        /* no clobbers */
        );

    }

Best Answer

Saving a few pushes/pops by using global register variables, putting all instructions into one asm() statement, I would arrive at something like

#define RB_WIDTH 5
#define RB_SIZE (1<<(RB_WIDTH))
#define RB_MASK ((RB_SIZE)-1)

register uint8_t rb_head      asm("r13");
register uint8_t rb_tail      asm("r14");
register uint8_t rb_sreg_save asm("r15");

volatile uint8_t rb_buf[RB_SIZE];

ISR(USART0_RX_vect, ISR_NAKED)                 /* CLOCK CYCLES */
{
  asm("\n\t"                                   /* 5 ISR entry */
      "push  r24\n\t"                          /* 2 */
      "push  r25\n\t"                          /* 2 */
      "push  r30\n\t"                          /* 2 */
      "push  r31\n\t"                          /* 2 */
      "in    %r[sreg_save], __SREG__\n\t"      /* 1 */
      "\n\t"

      /* read byte from UART */
      "lds   r25, %M[uart_data]\n\t"           /* 2 */

      /* next_tail := (cur_tail + 1) & MASK; */
      "ldi   r24, 1\n\t"                       /* 1 */
      "add   r24, %r[tail]\n\t"                /* 1 */
      "andi  r24, %a[mask]\n\t"                /* 1 */

      /* if next_tail == cur_head */
      "cp    r24, %r[head]\n\t"                /* 1 */
      "breq  L_%=\n\t"                         /* 1/2 */

      /* rb_buf[next_tail] := byte */
      "mov   r30, r24\n\t"                     /* 1 */
      "ldi   r31, 0\n\t"                       /* 1 */
      "subi  r30, lo8(-(rb_buf))\n\t"          /* 1 */
      "sbci  r31, hi8(-(rb_buf))\n\t"          /* 1 */
      "st    Z, r25\n\t"                       /* 2 */

      /* rb_tail := next_tail */
      "mov   %r[tail], r24\n\t"                /* 1 */

      "\n"
"L_%=:\t"
      "out   __SREG__, %r[sreg_save]\n\t"      /* 1 */
      "pop   r31\n\t"                          /* 2 */
      "pop   r30\n\t"                          /* 2 */
      "pop   r25\n\t"                          /* 2 */
      "pop   r24\n\t"                          /* 2 */
      "reti\n\t"                               /* 5 ISR return */
      : /* output operands */
        [tail]      "+r"   (rb_tail)    /* both input+output */
      : /* input operands */
        [uart_data] "M"    (_SFR_MEM_ADDR(UDR0)),
        [mask]      "M"    (RB_MASK),
        [head]      "r"    (rb_head),
        [sreg_save] "r"    (rb_sreg_save)
        /* no clobbers */
      );
}

Reorganizing the ring buffer code a bit might reduce the amount of code in the ISR writing to the ring buffer even simpler (at the cost of making the function reading from the buffer more complex).

I have put up a complete example with build system and support structures at https://github.com/ndim/avr-uart-example/