- {
- uint32_t s_ticks = g_ticks;
- float result = test_float();
- uint32_t e_ticks = g_ticks;
- printf( "%u - %u\r\n", s_ticks, e_ticks);
- printf( "%f, test duration %u\r\n", result, e_ticks - s_ticks);
- }
- {
- uint32_t s_ticks = g_ticks;
- double result = test_double();
- uint32_t e_ticks = g_ticks;
- printf( "%u - %u\r\n", s_ticks, e_ticks);
- printf( "%f, double duration %u\r\n", result, e_ticks - s_ticks);
- }
通过配置是否使用FPU来对比运行效率,主要是这两个配置选项:
运行结果:
使用了FPU指令:
- 625000.125000, test duration 41
- 625000.100000, double duration 1186
不使用FPU指令,软件模拟浮点运算:
- 625000.125000, test duration 763
- 625000.100000, double duration 1031
结论:
单精度情况下, 硬件FPU指令比软件浮点快近19倍.
763/41 = 18.609756097560975
对于双精度情况, 由于都是软件模拟, 所以性能近似.
那么为什么在使用软件模拟浮点运算时, 双精度的性能反而稍有提升呢? (按上面的实测数据: 1186/1031 ≈ 1.1503, 约15%的提升.)
本人认为, 此时单精度与双精度都使用软件模拟运算, 某些共通部分得到了编译器与链接器带来的优化效果.
最后看看两种情况下生成的代码差异.
1. 硬件FPU:
- 4 .Ltext0:
- 5 .cfi_sections .debug_frame
- 6 .section .text.test_float,"ax",@progbits
- 7 .align 1
- 8 .globl test_float
- 10 test_float:
- 11 .LFB0:
- 12 .file 1 "../User/test_float_double.c"
- 1:../User/test_float_double.c **** /*
- 2:../User/test_float_double.c **** * test_float_double.c
- 3:../User/test_float_double.c **** *
- 4:../User/test_float_double.c **** * Created on: Sep 16, 2023
- 5:../User/test_float_double.c **** * Author: Administrator
- 6:../User/test_float_double.c **** */
- 7:../User/test_float_double.c **** #include <stdint.h>
- 8:../User/test_float_double.c ****
- 9:../User/test_float_double.c **** #define TEST_CNT 1000000
- 10:../User/test_float_double.c ****
- 11:../User/test_float_double.c **** float test_float(void) {
- 13 .loc 1 11 24
- 14 .cfi_startproc
- 12:../User/test_float_double.c **** float sum = 0.1;
- 15 .loc 1 12 5
- 16 .LVL0:
- 13:../User/test_float_double.c ****
- 14:../User/test_float_double.c **** for(uint32_t i=0; i < TEST_CNT; ++i) {
- 17 .loc 1 14 5
- 18 .LBB2:
- 19 .loc 1 14 9
- 20 .LBE2:
- 12:../User/test_float_double.c **** float sum = 0.1;
- 21 .loc 1 12 11 is_stmt 0
- 22 0000 B7070000 lui a5,%hi(.LC0)
- 23 0004 07A50700 flw fa0,%lo(.LC0)(a5)
- 24 .LBB3:
- 15:../User/test_float_double.c **** sum += 0.625;
- 25 .loc 1 15 13
- 26 0008 B7070000 lui a5,%hi(.LC1)
- 27 000c 87A70700 flw fa5,%lo(.LC1)(a5)
- 28 .LBE3:
- 11:../User/test_float_double.c **** float sum = 0.1;
- 29 .loc 1 11 24
- 30 0010 B7470F00 li a5,999424
- 31 0014 93870724 addi a5,a5,576
- 32 .LVL1:
- 33 .L2:
- 34 .LBB4:
- 35 .loc 1 15 9 is_stmt 1 discriminator 3
- 36 0018 FD17 addi a5,a5,-1
- 37 .loc 1 15 13 is_stmt 0 discriminator 3
- 38 001a 5375F500 fadd.s fa0,fa0,fa5
- 39 .LVL2:
- 14:../User/test_float_double.c **** sum += 0.625;
- 40 .loc 1 14 5 discriminator 3
- 41 001e EDFF bnez a5,.L2
- 42 .LBE4:
- 16:../User/test_float_double.c **** }
- 17:../User/test_float_double.c ****
- 18:../User/test_float_double.c **** return sum;
- 19:../User/test_float_double.c **** }
- 43 .loc 1 19 1
- 44 0020 8280 ret
- 45 .cfi_endproc
2. 软件模拟FPU:
- 4 .Ltext0:
- 5 .cfi_sections .debug_frame
- 6 .globl __addsf3
- 7 .section .text.test_float,"ax",@progbits
- 8 .align 1
- 9 .globl test_float
- 11 test_float:
- 12 .LFB0:
- 13 .file 1 "../User/test_float_double.c"
- 1:../User/test_float_double.c **** /*
- 2:../User/test_float_double.c **** * test_float_double.c
- 3:../User/test_float_double.c **** *
- 4:../User/test_float_double.c **** * Created on: Sep 16, 2023
- 5:../User/test_float_double.c **** * Author: Administrator
- 6:../User/test_float_double.c **** */
- 7:../User/test_float_double.c **** #include <stdint.h>
- 8:../User/test_float_double.c ****
- 9:../User/test_float_double.c **** #define TEST_CNT 1000000
- 10:../User/test_float_double.c ****
- 11:../User/test_float_double.c **** float test_float(void) {
- 14 .loc 1 11 24
- 15 .cfi_startproc
- 16 0000 17030000 call t0,__riscv_save_2
- 16 E7020300
- 17 .cfi_offset 9, -12
- 18 .cfi_offset 8, -8
- 19 .cfi_offset 1, -4
- 20 .cfi_def_cfa_offset 16
- 12:../User/test_float_double.c **** float sum = 0.1;
- 21 .loc 1 12 5
- 22 .LVL0:
- 13:../User/test_float_double.c ****
- 14:../User/test_float_double.c **** for(uint32_t i=0; i < TEST_CNT; ++i) {
- 23 .loc 1 14 5
- 24 .LBB2:
- 25 .loc 1 14 9
- 26 .LBE2:
- 12:../User/test_float_double.c **** float sum = 0.1;
- 27 .loc 1 12 11 is_stmt 0
- 28 0008 B7070000 lui a5,%hi(.LC0)
- 29 .LBB3:
- 15:../User/test_float_double.c **** sum += 0.625;
- 30 .loc 1 15 13
- 31 000c 37070000 lui a4,%hi(.LC1)
- 32 .LBE3:
- 12:../User/test_float_double.c **** float sum = 0.1;
- 33 .loc 1 12 11
- 34 0010 83A70700 lw a5,%lo(.LC0)(a5)
- 35 .LBB4:
- 36 .loc 1 15 13
- 37 0014 83240700 lw s1,%lo(.LC1)(a4)
- 38 .LBE4:
- 11:../User/test_float_double.c **** float sum = 0.1;
- 39 .loc 1 11 24
- 40 0018 37440F00 li s0,999424
- 41 001c 13040424 addi s0,s0,576
- 42 .LVL1:
- 43 .L2:
- 44 .LBB5:
- 45 .loc 1 15 9 is_stmt 1 discriminator 3
- 46 .loc 1 15 13 is_stmt 0 discriminator 3
- 47 0020 3E85 mv a0,a5
- 48 0022 A685 mv a1,s1
- 49 0024 97000000 call __addsf3
- 49 E7800000
- 50 .LVL2:
- 51 002c 7D14 addi s0,s0,-1
- 52 002e AA87 mv a5,a0
- 53 .LVL3:
- 14:../User/test_float_double.c **** sum += 0.625;
- 54 .loc 1 14 5 discriminator 3
- 55 0030 65F8 bnez s0,.L2
- 56 .LBE5:
- 16:../User/test_float_double.c **** }
- 17:../User/test_float_double.c ****
- 18:../User/test_float_double.c **** return sum;
- 19:../User/test_float_double.c **** }
- 57 .loc 1 19 1
- 58 0032 17030000 tail __riscv_restore_2
- 58 67000300
- 59 .cfi_restore 9
- 60 .cfi_restore 8
- 61 .cfi_restore 1
- 62 .cfi_def_cfa_offset 0
- 63 .cfi_endproc
可以看出硬件FPU计算时, 有明显的浮点指令: