/*
 * Operands and result slots for the arithmetic timing experiment.
 * All five objects are initialised, file-scope variables with external
 * linkage; every initial value is non-zero.
 */
unsigned int lt1 = 0x12345678;
unsigned int lt2 = 0xaabbccdd;
unsigned int lt3 = 0xabcdef12;
unsigned short t1 = 0xabcd;
unsigned short t2 = 0x6789;

/*
 * Performs one multiplication and two unsigned divisions on the globals
 * above, then spins forever so the result can be inspected.
 *
 * This function never returns.
 */
void my_test(void)
{
    /*
     * Both operands are unsigned short, so without the cast they would be
     * promoted to (signed) int and a large product such as 0xffff * 0xffff
     * would overflow int, which is undefined behaviour. Casting one operand
     * makes the multiplication unsigned, where wrap-around is well defined.
     */
    lt1 = (unsigned int)t1 * t2;

    /*
     * NOTE(review): the divisors t1 and lt2 are non-zero with the initial
     * values above; no zero check is done here, so callers that modify the
     * globals must keep them non-zero.
     */
    lt1 = lt2 / t1;
    lt3 = lt1 / lt2;

    /* Park the CPU here; execution never leaves this loop. */
    while (1) {
    }
}
2. Variable memory map. It is helpful for understanding the behavior of the assembly code below.
.data.lt1 0x20000c00 0x4 ./src/main.o
0x20000c00 lt1
.data.lt2 0x20000c04 0x4 ./src/main.o
0x20000c04 lt2
.data.lt3 0x20000c08 0x4 ./src/main.o
0x20000c08 lt3
.data.t1 0x20000c0c 0x2 ./src/main.o
0x20000c0c t1
.data.t2 0x20000c0e 0x2 ./src/main.o
0x20000c0e t2
3. The assembly code is listed here.
my_test:
20000590: my_test+0 push {r7}
20000592: my_test+2 add r7, sp, #0
204 lt1 = t1 * t2;
# load t1 value from 0x20000c0c address into r3 core register.
20000594: my_test+4 movw r3, #3084 ; 0xc0c
20000598: my_test+8 movt r3, #8192 ; 0x2000
2000059c: my_test+12 ldrh r3, [r3, #0]
# move r3 contents to r2
2000059e: my_test+14 mov r2, r3
# load t2 value from 0x20000c0e address into r3 core register.
200005a0: my_test+16 movw r3, #3086 ; 0xc0e
200005a4: my_test+20 movt r3, #8192 ; 0x2000
200005a8: my_test+24 ldrh r3, [r3, #0]
# Multiply t1 and t2 (result in r3), then copy the result into r2
200005aa: my_test+26 mul.w r3, r3, r2
200005ae: my_test+30 mov r2, r3
# save r2 value into lt1 0x20000c00 address
200005b0: my_test+32 movw r3, #3072 ; 0xc00
200005b4: my_test+36 movt r3, #8192 ; 0x2000
200005b8: my_test+40 str r2, [r3, #0]
205 lt1 = lt2 / t1;
# load lt2 value from 0x20000c04 address into r2
200005ba: my_test+42 movw r3, #3076 ; 0xc04
200005be: my_test+46 movt r3, #8192 ; 0x2000
200005c2: my_test+50 ldr r2, [r3, #0]
# load t1 value from 0x20000c0c address into r3
200005c4: my_test+52 movw r3, #3084 ; 0xc0c
200005c8: my_test+56 movt r3, #8192 ; 0x2000
200005cc: my_test+60 ldrh r3, [r3, #0]
# Divide lt2 by t1 and store the result in lt1 (0x20000c00)
200005ce: my_test+62 udiv r2, r2, r3
200005d2: my_test+66 movw r3, #3072 ; 0xc00
200005d6: my_test+70 movt r3, #8192 ; 0x2000
200005da: my_test+74 str r2, [r3, #0]
206 lt3 = lt1 / lt2;
# load lt1 value from 0x20000c00 address into r2
200005dc: my_test+76 movw r3, #3072 ; 0xc00
200005e0: my_test+80 movt r3, #8192 ; 0x2000
200005e4: my_test+84 ldr r2, [r3, #0]
# load lt2 value from 0x20000c04 address into r3
200005e6: my_test+86 movw r3, #3076 ; 0xc04
200005ea: my_test+90 movt r3, #8192 ; 0x2000
200005ee: my_test+94 ldr r3, [r3, #0]
# Divide lt1 by lt2 and store the result in lt3 (0x20000c08)
200005f0: my_test+96 udiv r2, r2, r3
200005f4: my_test+100 movw r3, #3080 ; 0xc08
200005f8: my_test+104 movt r3, #8192 ; 0x2000
200005fc: my_test+108 str r2, [r3, #0]
207 while(1);
200005fe: my_test+110 b.n 0x200005fe
4. Instruction timing. The cycle counts are taken from "DDI0337E_cortex_m3_r1p1_trm.pdf", Chapter 18 (instruction timing).
20000594: my_test+4 movw r3, #3084 ; 0xc0c
# movw 1 cycle
20000598: my_test+8 movt r3, #8192 ; 0x2000
#movt 1 cycle
2000059c: my_test+12 ldrh r3, [r3, #0]
#ldrh 2 cycles
2000059e: my_test+14 mov r2, r3
#mov 1 cycle
200005a0: my_test+16 movw r3, #3086 ; 0xc0e
# movw 1 cycle
200005a4: my_test+20 movt r3, #8192 ; 0x2000
# movt 1 cycle
200005a8: my_test+24 ldrh r3, [r3, #0]
# ldrh 2 cycles
200005aa: my_test+26 mul.w r3, r3, r2
# mul 1 cycle
200005ae: my_test+30 mov r2, r3
# mov 1 cycle
200005b0: my_test+32 movw r3, #3072 ; 0xc00
# movw 1 cycle
200005b4: my_test+36 movt r3, #8192 ; 0x2000
#movt 1 cycle
200005b8: my_test+40 str r2, [r3, #0]
# str 1 cycle
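So the total for "lt1 = t1 * t2;" is 1+1+2+1+1+1+2+1+1+1+1+1 = 14 cycles.
The cycle counts for the other two statements are obtained in the same way, counting udiv at its minimum of 2 cycles (the TRM gives 2 to 12 cycles for a divide, depending on the operands):
"lt1 = lt2 / t1;" takes 13 cycles in total.
"lt3 = lt1 / lt2;" takes 13 cycles in total.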
5. Conclusion:
The results below do not exactly match a real system, where pipeline flushes, interrupts, etc. also cost time. However, they give a rough estimate.
• At 20 MHz, one cycle is 1/(20 MHz) = 50 ns:
long = int * int: 14 * 50 = 700 ns
long = long / int: 13 * 50 = 650 ns
long = long / long: 13 * 50 = 650 ns
• At 80 MHz, one cycle is 1/(80 MHz) = 12.5 ns:
long = int * int: 14 * 12.5 = 175 ns
long = long / int: 13 * 12.5 = 162.5 ns
long = long / long: 13 * 12.5 = 162.5 ns