I'm trying to optimize my RX and TX interrupts to meet maximum execution time of 25 cycles while interrupts are disabled.
So far, I have found that code is optimized enough, but pushing and popping series of registers between loading and unloading __SREG__
exceeds the time limit.
272: 80 91 24 01 lds r24, 0x0124
276: 8f 5f subi r24, 0xFF ; 255
278: 8f 71 andi r24, 0x1F ; 31
27a: 90 91 c6 00 lds r25, 0x00C6
27e: 20 91 25 01 lds r18, 0x0125
282: 28 17 cp r18, r24
284: 39 f0 breq .+14 ; 0x294 <__vector_18+0x30>
286: e8 2f mov r30, r24
288: f0 e0 ldi r31, 0x00 ; 0
28a: ea 5d subi r30, 0xDA ; 218
28c: fe 4f sbci r31, 0xFE ; 254
28e: 90 83 st Z, r25
290: 80 93 24 01 sts 0x0124, r24
The only way to be able to place __SREG__
in safest location (include as much pushes as possible in unconscious area) was inline asm.
Here is my current code:
ISR(RX0_INTERRUPT, ISR_NAKED)
{
//push
asm volatile("push r31" ::); // table pointer
asm volatile("push r30" ::); // table pointer
asm volatile("push r25" ::); // received character
asm volatile("push r18" ::); // once compared to r24 -> rx0_first_byte
asm volatile("push r24" ::); // most stuff is executed in r24
asm volatile("in r24,__SREG__" ::); // -
asm volatile("push r24" ::); // but one byte more on stack
register uint8_t tmp_rx_last_byte = (rx0_last_byte + 1) & RX0_BUFFER_MASK;
register uint8_t tmp = UDR0_REGISTER;
if(rx0_first_byte != tmp_rx_last_byte)
{
rx0_buffer[tmp_rx_last_byte] = tmp;
rx0_last_byte = tmp_rx_last_byte;
}
//pop
asm volatile("pop r24" ::);
asm volatile("out __SREG__,r24" ::);
asm volatile("pop r24" ::);
asm volatile("pop r18" ::);
asm volatile("pop r25" ::);
asm volatile("pop r30" ::);
asm volatile("pop r31" ::);
reti();
}
As you can see, there are a hardcoded register pushes that my compiler used, btw it's working at all, but I'm not sure how portable is it.
The only register which I can get by "=r" specificator is r24
and it's happening even for mask and rx0_first_byte
.
So how can I tell the compiler to push/pop these 5 registers, even if they will be placed elsewhere?
What is possibility that the compiler will use r19
and r26
instead of r18
and r25
?
I don't want to rewrite the whole ISR into assembler.
EDIT: thanks for all suggestions, finally I rewrote ISR into asm
ISR(RX0_INTERRUPT, ISR_NAKED)
{
asm volatile("\n\t" /* 5 ISR entry */
"push r31 \n\t" /* 2 */
"push r30 \n\t" /* 2 */
"push r25 \n\t" /* 2 */
"push r24 \n\t" /* 2 */
"push r18 \n\t" /* 2 */
"in r18, __SREG__ \n\t" /* 1 */
"push r18 \n\t" /* 2 */
/* read byte from UDR register */
"lds r25, %M[uart_data] \n\t" /* 2 */
/* load globals */
"lds r24, (rx0_last_byte) \n\t" /* 2 */
"lds r18, (rx0_first_byte) \n\t" /* 2 */
/* add 1 & mask */
"subi r24, 0xFF \n\t" //??? /* 1 */
"andi r24, %M[mask] \n\t" /* 1 */
/* if head == tail */
"cp r18, r24 \n\t" /* 1 */
"breq L_%= \n\t" /* 1/2 */
"mov r30, r24 \n\t" /* 1 */
"ldi r31, 0x00 \n\t" /* 1 */
"subi r30, lo8(-(rx0_buffer))\n\t" /* 1 */
"sbci r31, hi8(-(rx0_buffer))\n\t" /* 1 */
"st Z, r25 \n\t" /* 2 */
"sts (rx0_last_byte), r24 \n\t" /* 2 */
"L_%=:\t"
"pop r18 \n\t" /* 2 */
"out __SREG__ , r18 \n\t" /* 1 */
"pop r18 \n\t" /* 2 */
"pop r24 \n\t" /* 2 */
"pop r25 \n\t" /* 2 */
"pop r30 \n\t" /* 2 */
"pop r31 \n\t" /* 2 */
"reti \n\t" /* 5 ISR return */
: /* output operands */
: /* input operands */
[uart_data] "M" (_SFR_MEM_ADDR(UDR0_REGISTER)),
[mask] "M" (RX0_BUFFER_MASK)
/* no clobbers */
);
}
UPDATE:
After some tests I have found that the interrupts are disabled before entering ISR handler, not after unloading __SREG__
as I have been suggested before.
The only way is to globalize registers as ndim suggested, or use the following code:
ISR(RX0_INTERRUPT, ISR_NAKED)
{
asm volatile("\n\t" /* 4 ISR entry */
"push r0 \n\t" /* 2 */
"in r0, __SREG__ \n\t" /* 1 */
"push r31 \n\t" /* 2 */
"push r30 \n\t" /* 2 */
"push r25 \n\t" /* 2 */
"push r24 \n\t" /* 2 */
"push r18 \n\t" /* 2 */
/* read byte from UDR register */
"lds r25, %M[uart_data] \n\t" /* 2 */
#ifdef USART_UNSAFE_RX_INTERRUPT // enable interrupt after satisfying UDR register
"sei \n\t" /* 1 */
#endif
/* load globals */
"lds r24, (rx0_last_byte) \n\t" /* 2 */
"lds r18, (rx0_first_byte) \n\t" /* 2 */
/* tmp_rx_last_byte = (rx0_last_byte + 1) & RX0_BUFFER_MASK */
"subi r24, 0xFF \n\t" /* 1 */
"andi r24, %M[mask] \n\t" /* 1 */
/* if(rx0_first_byte != tmp_rx_last_byte) */
"cp r18, r24 \n\t" /* 1 */
"breq .+14 \n\t" /* 1/2 */
/* rx0_buffer[tmp_rx_last_byte] = tmp */
"mov r30, r24 \n\t" /* 1 */
"ldi r31, 0x00 \n\t" /* 1 */
"subi r30, lo8(-(rx0_buffer))\n\t" /* 1 */
"sbci r31, hi8(-(rx0_buffer))\n\t" /* 1 */
"st Z, r25 \n\t" /* 2 */
/* rx0_last_byte = tmp_rx_last_byte */
"sts (rx0_last_byte), r24 \n\t" /* 2 */
#ifdef USART_UNSAFE_RX_INTERRUPT
"cli \n\t" /* 1 */
#endif
"pop r18 \n\t" /* 2 */
"pop r24 \n\t" /* 2 */
"pop r25 \n\t" /* 2 */
"pop r30 \n\t" /* 2 */
"pop r31 \n\t" /* 2 */
"out __SREG__ , r0 \n\t" /* 1 */
"pop r0 \n\t" /* 2 */
"reti \n\t" /* 4 ISR return */
: /* output operands */
: /* input operands */
[uart_data] "M" (_SFR_MEM_ADDR(UDR0_REGISTER)),
[mask] "M" (RX0_BUFFER_MASK)
/* no clobbers */
);
}
Best Answer
Saving a few pushes/pops by using global register variables, putting all instructions into one
asm()
statement, I would arrive at something likeReorganizing the ring buffer code a bit might reduce the amount of code in the ISR writing to the ring buffer even simpler (at the cost of making the function reading from the buffer more complex).
I have put up a complete example with build system and support structures at https://github.com/ndim/avr-uart-example/