#申请原创#
CH32V307芯片的内核是青稞(QingKe) V4F, 带有单精度硬件FPU.
下面写一段代码, 对比一下硬件FPU指令与软件浮点代码的运行效率.
测试代码:
#include <stdint.h>
#define TEST_CNT 1000000
/*
 * Single-precision benchmark kernel.
 *
 * Starts a float accumulator at 0.1, adds 0.625 to it TEST_CNT times and
 * returns the accumulated value.
 *
 * NOTE(review): 0.1 and 0.625 are unsuffixed floating constants, so their
 * type is double; the addition is formally performed in double and the
 * result is converted back to float on assignment. Writing 0.1f / 0.625f
 * would request single-precision arithmetic explicitly.
 */
float test_float(void) {
float sum = 0.1;
/* i is only a trip counter; the loop body does not read it. */
for(uint32_t i=0; i < TEST_CNT; ++i) {
sum += 0.625;
}
return sum;
}
/*
 * Double-precision benchmark kernel.
 *
 * Starts a double accumulator at 0.1, adds 0.625 to it TEST_CNT times and
 * returns the accumulated value. Same loop shape as test_float(), differing
 * only in the accumulator type.
 */
double test_double(void) {
double sum = 0.1;
/* i is only a trip counter; the loop body does not read it. */
for(uint32_t i=0; i < TEST_CNT; ++i) {
sum += 0.625;
}
return sum;
}
{
    /*
     * Time one run of test_float() by sampling g_ticks immediately before
     * and after the call.
     * NOTE(review): the unit and update source of g_ticks are not visible
     * here; presumably a periodic tick counter - confirm against its
     * definition.
     */
    uint32_t s_ticks = g_ticks;
    float result = test_float();
    uint32_t e_ticks = g_ticks;

    /*
     * uint32_t is not guaranteed to be unsigned int, so passing it to %u is
     * a format/argument mismatch on targets where it is unsigned long.
     * Cast to unsigned long (at least 32 bits wide) and print with %lu;
     * the emitted text is unchanged.
     */
    printf("%lu - %lu\r\n", (unsigned long)s_ticks, (unsigned long)e_ticks);

    /* result is promoted to double by the variadic call, matching %f. */
    printf("%f, test duration %lu\r\n", result,
           (unsigned long)(e_ticks - s_ticks));
}
{
    /*
     * Time one run of test_double() by sampling g_ticks immediately before
     * and after the call.
     * NOTE(review): the unit and update source of g_ticks are not visible
     * here; presumably a periodic tick counter - confirm against its
     * definition.
     */
    uint32_t s_ticks = g_ticks;
    double result = test_double();
    uint32_t e_ticks = g_ticks;

    /*
     * uint32_t is not guaranteed to be unsigned int, so passing it to %u is
     * a format/argument mismatch on targets where it is unsigned long.
     * Cast to unsigned long (at least 32 bits wide) and print with %lu;
     * the emitted text is unchanged.
     */
    printf("%lu - %lu\r\n", (unsigned long)s_ticks, (unsigned long)e_ticks);
    printf("%f, double duration %lu\r\n", result,
           (unsigned long)(e_ticks - s_ticks));
}
通过配置是否使用FPU来对比运行效率,主要是这两个配置选项:
运行结果:
使用了FPU指令:
625000.125000, test duration 41
625000.100000, double duration 1186
不使用FPU指令,软件模拟浮点运算:
625000.125000, test duration 763
625000.100000, double duration 1031
结论:
单精度情况下, 硬件FPU指令比软件浮点快近19倍.
763/41 = 18.609756097560975
对于双精度情况, 由于都是软件模拟, 所以性能近似.
那么为什么在不使用FPU(全部软件模拟)时, 双精度的性能反而稍好一点? (按上面的实测值计算, 1186/1031 ≈ 1.150, 约15%的提升; 原文此处写作 1186/1131 = 1.0486295313881522, 约5个百分点的提升, 其中的1131与上面运行结果中的1031不一致, 两者必有一处笔误.)
本人认为是单精度与双精度都使用了软件模拟运算, 某些共通部分有一些编译器与连接器带来的优化效果.
最后看看两种情况下生成的代码差异.
1. 硬件FPU:
4 .Ltext0:
5 .cfi_sections .debug_frame
6 .section .text.test_float,"ax",@progbits
7 .align 1
8 .globl test_float
10 test_float:
11 .LFB0:
12 .file 1 "../User/test_float_double.c"
1:../User/test_float_double.c **** /*
2:../User/test_float_double.c **** * test_float_double.c
3:../User/test_float_double.c **** *
4:../User/test_float_double.c **** * Created on: Sep 16, 2023
5:../User/test_float_double.c **** * Author: Administrator
6:../User/test_float_double.c **** */
7:../User/test_float_double.c **** #include <stdint.h>
8:../User/test_float_double.c ****
9:../User/test_float_double.c **** #define TEST_CNT 1000000
10:../User/test_float_double.c ****
11:../User/test_float_double.c **** float test_float(void) {
13 .loc 1 11 24
14 .cfi_startproc
12:../User/test_float_double.c **** float sum = 0.1;
15 .loc 1 12 5
16 .LVL0:
13:../User/test_float_double.c ****
14:../User/test_float_double.c **** for(uint32_t i=0; i < TEST_CNT; ++i) {
17 .loc 1 14 5
18 .LBB2:
19 .loc 1 14 9
20 .LBE2:
12:../User/test_float_double.c **** float sum = 0.1;
21 .loc 1 12 11 is_stmt 0
22 0000 B7070000 lui a5,%hi(.LC0)
23 0004 07A50700 flw fa0,%lo(.LC0)(a5)
24 .LBB3:
15:../User/test_float_double.c **** sum += 0.625;
25 .loc 1 15 13
26 0008 B7070000 lui a5,%hi(.LC1)
27 000c 87A70700 flw fa5,%lo(.LC1)(a5)
28 .LBE3:
11:../User/test_float_double.c **** float sum = 0.1;
29 .loc 1 11 24
30 0010 B7470F00 li a5,999424
31 0014 93870724 addi a5,a5,576
32 .LVL1:
33 .L2:
34 .LBB4:
35 .loc 1 15 9 is_stmt 1 discriminator 3
36 0018 FD17 addi a5,a5,-1
37 .loc 1 15 13 is_stmt 0 discriminator 3
38 001a 5375F500 fadd.s fa0,fa0,fa5
39 .LVL2:
14:../User/test_float_double.c **** sum += 0.625;
40 .loc 1 14 5 discriminator 3
41 001e EDFF bnez a5,.L2
42 .LBE4:
16:../User/test_float_double.c **** }
17:../User/test_float_double.c ****
18:../User/test_float_double.c **** return sum;
19:../User/test_float_double.c **** }
43 .loc 1 19 1
44 0020 8280 ret
45 .cfi_endproc
2.软件模拟FPU:
4 .Ltext0:
5 .cfi_sections .debug_frame
6 .globl __addsf3
7 .section .text.test_float,"ax",@progbits
8 .align 1
9 .globl test_float
11 test_float:
12 .LFB0:
13 .file 1 "../User/test_float_double.c"
1:../User/test_float_double.c **** /*
2:../User/test_float_double.c **** * test_float_double.c
3:../User/test_float_double.c **** *
4:../User/test_float_double.c **** * Created on: Sep 16, 2023
5:../User/test_float_double.c **** * Author: Administrator
6:../User/test_float_double.c **** */
7:../User/test_float_double.c **** #include <stdint.h>
8:../User/test_float_double.c ****
9:../User/test_float_double.c **** #define TEST_CNT 1000000
10:../User/test_float_double.c ****
11:../User/test_float_double.c **** float test_float(void) {
14 .loc 1 11 24
15 .cfi_startproc
16 0000 17030000 call t0,__riscv_save_2
16 E7020300
17 .cfi_offset 9, -12
18 .cfi_offset 8, -8
19 .cfi_offset 1, -4
20 .cfi_def_cfa_offset 16
12:../User/test_float_double.c **** float sum = 0.1;
21 .loc 1 12 5
22 .LVL0:
13:../User/test_float_double.c ****
14:../User/test_float_double.c **** for(uint32_t i=0; i < TEST_CNT; ++i) {
23 .loc 1 14 5
24 .LBB2:
25 .loc 1 14 9
26 .LBE2:
12:../User/test_float_double.c **** float sum = 0.1;
27 .loc 1 12 11 is_stmt 0
28 0008 B7070000 lui a5,%hi(.LC0)
29 .LBB3:
15:../User/test_float_double.c **** sum += 0.625;
30 .loc 1 15 13
31 000c 37070000 lui a4,%hi(.LC1)
32 .LBE3:
12:../User/test_float_double.c **** float sum = 0.1;
33 .loc 1 12 11
34 0010 83A70700 lw a5,%lo(.LC0)(a5)
35 .LBB4:
36 .loc 1 15 13
37 0014 83240700 lw s1,%lo(.LC1)(a4)
38 .LBE4:
11:../User/test_float_double.c **** float sum = 0.1;
39 .loc 1 11 24
40 0018 37440F00 li s0,999424
41 001c 13040424 addi s0,s0,576
42 .LVL1:
43 .L2:
44 .LBB5:
45 .loc 1 15 9 is_stmt 1 discriminator 3
46 .loc 1 15 13 is_stmt 0 discriminator 3
47 0020 3E85 mv a0,a5
48 0022 A685 mv a1,s1
49 0024 97000000 call __addsf3
49 E7800000
50 .LVL2:
51 002c 7D14 addi s0,s0,-1
52 002e AA87 mv a5,a0
53 .LVL3:
14:../User/test_float_double.c **** sum += 0.625;
54 .loc 1 14 5 discriminator 3
55 0030 65F8 bnez s0,.L2
56 .LBE5:
16:../User/test_float_double.c **** }
17:../User/test_float_double.c ****
18:../User/test_float_double.c **** return sum;
19:../User/test_float_double.c **** }
57 .loc 1 19 1
58 0032 17030000 tail __riscv_restore_2
58 67000300
59 .cfi_restore 9
60 .cfi_restore 8
61 .cfi_restore 1
62 .cfi_def_cfa_offset 0
63 .cfi_endproc
可以看出, 使用硬件FPU时生成了明显的浮点指令 (flw, fadd.s); 而软件模拟时, 加法是通过调用库函数 __addsf3 完成的.
谢谢阅读.
|
|