[经验分享] 浮点数性能和一致性

[复制链接]
4348|67
 楼主| mickit 发表于 2025-4-11 11:49 | 显示全部楼层 |阅读模式


  1. // Copyright 2022 Tencent

  2. #include "test_fpu.h"

  3. #include <assert.h>

  4. #if defined(_WIN32)
  5. #  include <float.h>
  6. #else
  7. #  if defined(__aarch64__) || defined(__arm__)
  8. #    include <fenv.h>
  9. #  else  // defined(__i386__) || defined(__x86_64__)
  10. #    include <fpu_control.h>
  11. #  endif
  12. #endif
  13. #include <atomic>
  14. #include <chrono>
  15. #include <cmath>
  16. #include <memory>
  17. #include <random>
  18. #include <sstream>
  19. #include <string>
  20. #include <thread>
  21. #include <type_traits>

  // Puts the calling thread's floating-point environment into the configuration
  // used by this test: round-to-nearest everywhere, plus single (24-bit)
  // precision where the platform exposes a precision control.
  //
  // Returns true when every requested setting was accepted, false as soon as
  // one of the platform calls reports an error.
  bool init_fpu() {
  #if defined(_MSC_VER)
    unsigned int control_word;
    // A zero mask only reads the current control word; it is used here as a
    // probe that the control interface works at all.
    if (_controlfp_s(&control_word, 0, 0)) {
      return false;
    }

  #  if !defined(_M_X64)
    // Precision control is skipped on x64 builds.
    if (_controlfp_s(&control_word, PC_24, MCW_PC)) {
      return false;
    }
  #  endif
    if (_controlfp_s(&control_word, RC_NEAR, MCW_RC)) {
      return false;
    }
    return true;
  #else
  #  if defined(__aarch64__) || defined(__arm__)
    // fesetround() returns zero only when the rounding mode was actually
    // changed, so propagate a failure instead of reporting success blindly.
    return 0 == fesetround(FE_TONEAREST);
  #  else
    // Start from the default x87 control word, clear the extended-precision
    // bits and select single precision with round-to-nearest.
    fpu_control_t cw = (_FPU_DEFAULT & ~_FPU_EXTENDED) | _FPU_RC_NEAREST | _FPU_SINGLE;
    _FPU_SETCW(cw);
    return true;
  #  endif
  #endif
  }

  // Builds a short human-readable report of the current floating-point control
  // state followed by a sample single-precision multiplication (0.1f * 0.1f).
  //
  // On MSVC a failing _controlfp_s() short-circuits the report: only the error
  // code is returned and the sample multiplication is omitted.
  std::string dump_current_controlfp() {
    std::stringstream report;
    float factor = 0.1f;

  #if defined(_MSC_VER)
    unsigned int control_word;
    int status = _controlfp_s(&control_word, 0, 0);
    if (status) {
      report << "Got error code: " << status;
      return report.str();
    }
    report << "Control word: " << std::hex << control_word << std::endl;
  #elif defined(__aarch64__) || defined(__arm__)
    report << "Rounding word: " << std::hex << fegetround() << std::endl;
  #else
    fpu_control_t control_word;
    _FPU_GETCW(control_word);
    report << "Control word: " << std::hex << control_word << std::endl;
  #endif

    // The sample product is printed on every platform after the control state.
    float product = factor * factor;
    report << factor << "*" << factor << "=" << product << std::endl;

    return report.str();
  }

  // State owned by one benchmark worker.
  struct benchmark_thread_data {
    // The worker thread; created and joined by start_benchmark_controller().
    std::unique_ptr<std::thread> thread;

    // Costs and final values written by start_benchmark_worker() for this worker.
    benchmark_result result;
  };

  81. struct benchmark_handle {
  82.   size_t max_round;
  83.   std::atomic<size_t> running_thread;
  84.   std::atomic<size_t> progress_total;
  85.   std::atomic<size_t> progress_done;
  86.   std::vector<benchmark_thread_data> datas;
  87.   std::unique_ptr<std::thread> controller_thread;

  88.   ~benchmark_handle() {
  89.     if (controller_thread && controller_thread->joinable()) {
  90.       controller_thread->join();
  91.     }
  92.   }
  93. };

  94. namespace {
  // Size of the window the benchmark rounds start in, and the size of the
  // backing tables (twice as large so that reads past a start index stay in range).
  static constexpr size_t kMaxParameterCount = 1 << 20;
  static constexpr size_t kMaxParameterArraySize = kMaxParameterCount * 2;

  // Pseudo-random operand tables. The "odd" tables hold values with bit 0 set,
  // the "even" tables hold values with bit 0 cleared; the float tables mirror
  // the integer ones element by element.
  static uint32_t g_integer_parameters_odd[kMaxParameterArraySize] = {0};
  static uint32_t g_integer_parameters_even[kMaxParameterArraySize] = {0};
  static float g_float_parameters_odd[kMaxParameterArraySize] = {0};
  static float g_float_parameters_even[kMaxParameterArraySize] = {0};

  // Fills the four operand tables from a fixed-seed Mersenne Twister so that
  // every run sees the same inputs.
  //
  // progress_total is raised by the number of progress ticks this function will
  // emit; progress_done receives one tick per 1024 generated values. A non-zero
  // last entry of the even integer table is treated as "already filled" and
  // makes the call a no-op.
  static void initialize_parameters(std::atomic<size_t> &progress_total, std::atomic<size_t> &progress_done) {
    constexpr size_t last_slot = kMaxParameterArraySize - 1;
    if (0 != g_integer_parameters_even[last_slot]) {
      return;
    }

    progress_total.fetch_add(kMaxParameterArraySize >> 9);

    constexpr size_t value_count = kMaxParameterArraySize * 2;
    constexpr size_t tick_mask = (1 << 10) - 1;

    std::mt19937 generator{9999991};
    size_t produced = 0;
    while (produced < value_count) {
      uint32_t candidate = generator();
      // Small raw values are rejected and regenerated.
      if (candidate >= 9999991) {
        // Shift left and keep bits 1..30: a positive 31-bit value with bit 0 clear.
        candidate = (candidate << 1) & 0x7ffffffe;

        // Values alternate between the even and the odd tables; two consecutive
        // values share one slot index.
        const size_t slot = produced >> 1;
        if (produced & 0x1) {
          g_integer_parameters_odd[slot] = candidate | 0x1;
          g_float_parameters_odd[slot] = static_cast<float>(candidate | 0x1);
        } else {
          g_integer_parameters_even[slot] = candidate;
          g_float_parameters_even[slot] = static_cast<float>(candidate);
        }

        ++produced;
        if (0 == (produced & tick_mask)) {
          progress_done.fetch_add(1);
        }
      }
    }
  }

  // Adds 16 consecutive entries of each table, starting at start_parameter_idx,
  // to final_result: first the sum of the odd window, then the sum of the even
  // window. Within a window the entries are summed in the order 1, 2, ..., 15, 0;
  // the order is kept fixed because it affects floating-point results.
  template <class TDATA>
  static inline void benchmark_add(TDATA odd[], TDATA even[], TDATA &final_result, size_t start_parameter_idx) {
    TDATA *const o = odd + start_parameter_idx;
    TDATA *const e = even + start_parameter_idx;

    final_result += o[1] + o[2] + o[3] + o[4] + o[5] + o[6] + o[7] + o[8] + o[9] + o[10] + o[11] + o[12] + o[13] +
                    o[14] + o[15] + o[0];
    final_result += e[1] + e[2] + e[3] + e[4] + e[5] + e[6] + e[7] + e[8] + e[9] + e[10] + e[11] + e[12] + e[13] +
                    e[14] + e[15] + e[0];
  }

  // Subtraction counterpart of the add kernel: for each table, takes entry 1 of
  // the 16-entry window starting at start_parameter_idx, subtracts entries
  // 2, 3, ..., 15 and finally entry 0, and adds that difference to final_result
  // (odd window first, then even window). The order is kept fixed because it
  // affects floating-point results.
  template <class TDATA>
  static inline void benchmark_sub(TDATA odd[], TDATA even[], TDATA &final_result, size_t start_parameter_idx) {
    TDATA *const o = odd + start_parameter_idx;
    TDATA *const e = even + start_parameter_idx;

    final_result += o[1] - o[2] - o[3] - o[4] - o[5] - o[6] - o[7] - o[8] - o[9] - o[10] - o[11] - o[12] - o[13] -
                    o[14] - o[15] - o[0];
    final_result += e[1] - e[2] - e[3] - e[4] - e[5] - e[6] - e[7] - e[8] - e[9] - e[10] - e[11] - e[12] - e[13] -
                    e[14] - e[15] - e[0];
  }

  // Per-type multiplication step used by benchmark_mul().
  template <class>
  struct benchmark_mul_helper;

  // Integer step: multiplies final_result twice by the selected odd entry.
  // Unsigned arithmetic wraps, so no overflow handling is needed.
  template <>
  struct benchmark_mul_helper<uint32_t> {
    static inline void do_operator(uint32_t odd[], uint32_t &final_result, size_t start_parameter_idx) {
      final_result = final_result * odd[start_parameter_idx];
      final_result = final_result * odd[start_parameter_idx];
    }
  };

  // Float step: multiplies final_result once by the selected odd entry. When
  // that product would be infinite, final_result is first replaced by its
  // frexp() fraction (the exponent is discarded) so the chain keeps producing
  // finite values.
  template <>
  struct benchmark_mul_helper<float> {
    static inline void do_operator(float odd[], float &final_result, size_t start_parameter_idx) {
      const bool overflows = std::isinf(final_result * odd[start_parameter_idx]);
      if (overflows) {
        int discarded_exponent;
        final_result = std::frexp(final_result, &discarded_exponent);
      }
      final_result = final_result * odd[start_parameter_idx];
    }
  };

  // Applies the multiplication step to the 16 consecutive odd entries starting
  // at start_parameter_idx, in ascending index order. `even` is accepted only
  // to keep the kernel signatures uniform; it is not read.
  template <class TDATA>
  static inline void benchmark_mul(TDATA odd[], TDATA even[], TDATA &final_result, size_t start_parameter_idx) {
    for (size_t offset = 0; offset < 16; ++offset) {
      benchmark_mul_helper<TDATA>::do_operator(odd, final_result, start_parameter_idx + offset);
    }
  }

  // Per-type division step used by benchmark_div().
  template <class>
  struct benchmark_div_helper;

  // Integer step: scales final_result by the even entry (wrapping), then
  // divides by the low byte of the odd entry when final_result is larger than
  // it, or takes the remainder otherwise.
  //
  // The low byte of the odd entry must be non-zero; the tables generated by
  // initialize_parameters() always have bit 0 set in the odd entries.
  template <>
  struct benchmark_div_helper<uint32_t> {
    static inline void do_operator(uint32_t odd[], uint32_t even[], uint32_t &final_result,
                                   size_t start_parameter_idx) {
      final_result *= even[start_parameter_idx];
      const uint32_t divisor = odd[start_parameter_idx] & 0xff;
      final_result = (final_result > divisor) ? (final_result / divisor) : (final_result % divisor);
    }
  };

  // Float step: scales final_result by the even entry unless that would yield
  // infinity, then divides it by the odd entry when final_result is larger than
  // it, or by the odd entry's frexp() fraction otherwise.
  template <>
  struct benchmark_div_helper<float> {
    static inline void do_operator(float odd[], float even[], float &final_result, size_t start_parameter_idx) {
      const float scaled = final_result * even[start_parameter_idx];
      if (!std::isinf(scaled)) {
        final_result = scaled;
      }

      float divisor = odd[start_parameter_idx];
      if (!(final_result > divisor)) {
        int discarded_exponent;
        divisor = std::frexp(odd[start_parameter_idx], &discarded_exponent);
      }
      final_result /= divisor;
    }
  };

  // Applies the division step to the 16 consecutive table positions starting at
  // start_parameter_idx, in ascending index order.
  template <class TDATA>
  static inline void benchmark_div(TDATA odd[], TDATA even[], TDATA &final_result, size_t start_parameter_idx) {
    for (size_t offset = 0; offset < 16; ++offset) {
      benchmark_div_helper<TDATA>::do_operator(odd, even, final_result, start_parameter_idx + offset);
    }
  }

  // Per-type square-root step used by benchmark_sqrt(); only float is provided.
  template <class>
  struct benchmark_sqrt_helper;

  // Replaces final_result by the square root of an expression built from the
  // previous final_result and the selected odd entry v. The expression depends
  // on the index bits:
  //   any of bits 2..3 set      -> final_result * v + v * v
  //   else any of bits 0..1 set -> final_result * v * v
  //   else                      -> final_result * final_result * v
  template <>
  struct benchmark_sqrt_helper<float> {
    static inline void do_operator(float odd[], float &final_result, size_t start_parameter_idx) {
      const float v = odd[start_parameter_idx];

      float radicand;
      if (0 != (start_parameter_idx & 0xc)) {
        radicand = final_result * v + v * v;
      } else if (0 != (start_parameter_idx & 0x3)) {
        radicand = final_result * v * v;
      } else {
        radicand = final_result * final_result * v;
      }
      final_result = std::sqrt(radicand);
    }
  };

  // Applies the square-root step to the 16 consecutive odd entries starting at
  // start_parameter_idx, in ascending index order. `even` is accepted only to
  // keep the kernel signatures uniform; it is not read.
  template <class TDATA>
  static inline void benchmark_sqrt(TDATA odd[], TDATA even[], TDATA &final_result, size_t start_parameter_idx) {
    for (size_t offset = 0; offset < 16; ++offset) {
      benchmark_sqrt_helper<TDATA>::do_operator(odd, final_result, start_parameter_idx + offset);
    }
  }

  272. static void start_benchmark_worker(size_t idx, size_t max_round, benchmark_thread_data &data,
  273.                                    std::atomic<size_t> &progress_total, std::atomic<size_t> &progress_done) {
  274.   constexpr const size_t step = 1 << 4;
  275.   constexpr const size_t iterator_count = kMaxParameterCount >> 4;
  276.   progress_total += max_round * 9;
  277.   progress_total += 2;  // sin + cos

  278.   // integer add
  279.   {
  280.     data.result.integer_add_final_result.resize(max_round);
  281.     auto begin = std::chrono::system_clock::now();
  282.     for (size_t round = 0; round < max_round; ++round) {
  283.       size_t start_index = kMaxParameterCount / max_round * round;
  284.       size_t iterator_end = start_index + iterator_count;
  285.       uint32_t result = g_integer_parameters_odd[start_index];
  286.       for (size_t i = start_index; i < iterator_end; i += step) {
  287.         benchmark_add(g_integer_parameters_odd, g_integer_parameters_even, result, i);
  288.       }
  289.       ++progress_done;

  290.       data.result.integer_add_final_result[round] = result;
  291.     }
  292.     auto end = std::chrono::system_clock::now();
  293.     data.result.integer_add_cost = end - begin;
  294.   }

  295.   // integer sub
  296.   {
  297.     data.result.integer_sub_final_result.resize(max_round);
  298.     auto begin = std::chrono::system_clock::now();
  299.     for (size_t round = 0; round < max_round; ++round) {
  300.       size_t start_index = kMaxParameterCount / max_round * round;
  301.       size_t iterator_end = start_index + iterator_count;
  302.       uint32_t result = g_integer_parameters_odd[start_index];
  303.       for (size_t i = start_index; i < iterator_end; i += step) {
  304.         benchmark_sub(g_integer_parameters_odd, g_integer_parameters_even, result, i);
  305.       }
  306.       ++progress_done;

  307.       data.result.integer_sub_final_result[round] = result;
  308.     }
  309.     auto end = std::chrono::system_clock::now();
  310.     data.result.integer_sub_cost = end - begin;
  311.   }

  312.   // integer mul
  313.   {
  314.     data.result.integer_mul_final_result.resize(max_round);
  315.     auto begin = std::chrono::system_clock::now();
  316.     for (size_t round = 0; round < max_round; ++round) {
  317.       size_t start_index = kMaxParameterCount / max_round * round;
  318.       size_t iterator_end = start_index + iterator_count;
  319.       uint32_t result = g_integer_parameters_odd[start_index];
  320.       for (size_t i = start_index; i < iterator_end; i += step) {
  321.         benchmark_mul(g_integer_parameters_odd, g_integer_parameters_even, result, i);
  322.       }
  323.       ++progress_done;

  324.       data.result.integer_mul_final_result[round] = result;
  325.     }
  326.     auto end = std::chrono::system_clock::now();
  327.     data.result.integer_mul_cost = end - begin;
  328.   }

  329.   // integer div
  330.   {
  331.     data.result.integer_div_final_result.resize(max_round);
  332.     auto begin = std::chrono::system_clock::now();
  333.     for (size_t round = 0; round < max_round; ++round) {
  334.       size_t start_index = kMaxParameterCount / max_round * round;
  335.       size_t iterator_end = start_index + iterator_count;
  336.       uint32_t result = g_integer_parameters_odd[start_index];
  337.       for (size_t i = start_index; i < iterator_end; i += step) {
  338.         benchmark_div(g_integer_parameters_odd, g_integer_parameters_even, result, i);
  339.       }
  340.       ++progress_done;

  341.       data.result.integer_div_final_result[round] = result;
  342.     }
  343.     auto end = std::chrono::system_clock::now();
  344.     data.result.integer_div_cost = end - begin;
  345.   }

  346.   // float add
  347.   {
  348.     data.result.float_add_final_result.resize(max_round);
  349.     auto begin = std::chrono::system_clock::now();
  350.     for (size_t round = 0; round < max_round; ++round) {
  351.       size_t start_index = kMaxParameterCount / max_round * round;
  352.       size_t iterator_end = start_index + iterator_count;
  353.       float result = g_float_parameters_odd[start_index];
  354.       for (size_t i = start_index; i < iterator_end; i += step) {
  355.         benchmark_add(g_float_parameters_odd, g_float_parameters_even, result, i);
  356.       }
  357.       ++progress_done;

  358.       data.result.float_add_final_result[round] = result;
  359.     }
  360.     auto end = std::chrono::system_clock::now();
  361.     data.result.float_add_cost = end - begin;
  362.   }

  363.   // float sub
  364.   {
  365.     data.result.float_sub_final_result.resize(max_round);
  366.     auto begin = std::chrono::system_clock::now();
  367.     for (size_t round = 0; round < max_round; ++round) {
  368.       size_t start_index = kMaxParameterCount / max_round * round;
  369.       size_t iterator_end = start_index + iterator_count;
  370.       float result = g_float_parameters_odd[start_index];
  371.       for (size_t i = start_index; i < iterator_end; i += step) {
  372.         benchmark_sub(g_float_parameters_odd, g_float_parameters_even, result, i);
  373.       }
  374.       ++progress_done;

  375.       data.result.float_sub_final_result[round] = result;
  376.     }
  377.     auto end = std::chrono::system_clock::now();
  378.     data.result.float_sub_cost = end - begin;
  379.   }

  380.   // float mul
  381.   {
  382.     data.result.float_mul_final_result.resize(max_round);
  383.     auto begin = std::chrono::system_clock::now();
  384.     for (size_t round = 0; round < max_round; ++round) {
  385.       size_t start_index = kMaxParameterCount / max_round * round;
  386.       size_t iterator_end = start_index + iterator_count;
  387.       float result = g_float_parameters_odd[start_index];
  388.       for (size_t i = start_index; i < iterator_end; i += step) {
  389.         benchmark_mul(g_float_parameters_odd, g_float_parameters_even, result, i);
  390.       }
  391.       ++progress_done;

  392.       data.result.float_mul_final_result[round] = result;
  393.     }
  394.     auto end = std::chrono::system_clock::now();
  395.     data.result.float_mul_cost = end - begin;
  396.   }

  397.   // float div
  398.   {
  399.     data.result.float_div_final_result.resize(max_round);
  400.     auto begin = std::chrono::system_clock::now();
  401.     for (size_t round = 0; round < max_round; ++round) {
  402.       size_t start_index = kMaxParameterCount / max_round * round;
  403.       size_t iterator_end = start_index + iterator_count;
  404.       float result = g_float_parameters_odd[start_index];
  405.       for (size_t i = start_index; i < iterator_end; i += step) {
  406.         benchmark_div(g_float_parameters_odd, g_float_parameters_even, result, i);
  407.       }
  408.       ++progress_done;
  409.       data.result.float_div_final_result[round] = result;
  410.     }
  411.     auto end = std::chrono::system_clock::now();
  412.     data.result.float_div_cost = end - begin;
  413.   }

  414.   // float sqrt
  415.   {
  416.     data.result.float_sqrt_final_result.resize(max_round);
  417.     auto begin = std::chrono::system_clock::now();
  418.     for (size_t round = 0; round < max_round; ++round) {
  419.       size_t start_index = kMaxParameterCount / max_round * round;
  420.       size_t iterator_end = start_index + iterator_count;
  421.       float result = g_float_parameters_odd[start_index];
  422.       for (size_t i = start_index; i < iterator_end; i += step) {
  423.         benchmark_sqrt(g_float_parameters_odd, g_float_parameters_even, result, i);
  424.       }
  425.       ++progress_done;
  426.       data.result.float_sqrt_final_result[round] = result;
  427.     }
  428.     auto end = std::chrono::system_clock::now();
  429.     data.result.float_sqrt_cost = end - begin;
  430.   }

  431.   // float sin
  432.   {
  433.     for (int i = 0; i < 16; ++i) {
  434.       data.result.float_sin_final_result.push_back(std::sin(3.14159f / 34 * i));
  435.     }
  436.     ++progress_done;
  437.   }

  438.   // float cos
  439.   {
  440.     for (int i = 0; i < 16; ++i) {
  441.       data.result.float_cos_final_result.push_back(std::cos(3.14159f / 34 * i));
  442.     }
  443.     ++progress_done;
  444.   }
  445. }

  446. static void start_benchmark_controller(std::shared_ptr<benchmark_handle> handle) {
  447.   initialize_parameters(handle->progress_total, handle->progress_done);

  448.   size_t idx = 0;
  449.   for (auto &data : handle->datas) {
  450.     data.result.float_add_cost = std::chrono::system_clock::duration::zero();
  451.     data.result.float_sub_cost = std::chrono::system_clock::duration::zero();
  452.     data.result.float_mul_cost = std::chrono::system_clock::duration::zero();
  453.     data.result.float_div_cost = std::chrono::system_clock::duration::zero();
  454.     data.result.float_sqrt_cost = std::chrono::system_clock::duration::zero();
  455.     data.result.integer_add_cost = std::chrono::system_clock::duration::zero();
  456.     data.result.integer_sub_cost = std::chrono::system_clock::duration::zero();
  457.     data.result.integer_mul_cost = std::chrono::system_clock::duration::zero();
  458.     data.result.integer_div_cost = std::chrono::system_clock::duration::zero();
  459.     data.thread = std::unique_ptr<std::thread>(new std::thread([idx, &data, &handle]() {
  460.       ++handle->running_thread;
  461.       start_benchmark_worker(idx, handle->max_round, data, handle->progress_total, handle->progress_done);
  462.       --handle->running_thread;
  463.     }));
  464.     ++idx;
  465.   }

  466.   for (auto &data : handle->datas) {
  467.     if (data.thread && data.thread->joinable()) {
  468.       data.thread->join();
  469.     }
  470.   }
  471. }
  472. }  // namespace

  473. std::shared_ptr<benchmark_handle> start_benchmark(size_t thread_count, size_t round) {
  474.   if (thread_count > 32) {
  475.     thread_count = 32;
  476.   }

  477.   std::shared_ptr<benchmark_handle> ret = std::make_shared<benchmark_handle>();
  478.   if (!ret) {
  479.     return ret;
  480.   }

  481.   ret->max_round = round;
  482.   ret->running_thread.store(0);
  483.   ret->progress_total.store(1);
  484.   ret->progress_done.store(0);
  485.   ret->datas.resize(thread_count);
  486.   ret->controller_thread = std::unique_ptr<std::thread>(new std::thread([ret]() {
  487.     start_benchmark_controller(ret);
  488.     ++ret->progress_done;
  489.   }));
  490.   return ret;
  491. }

  492. bool is_benchmark_running(const std::shared_ptr<benchmark_handle> &handle) {
  493.   if (!handle) {
  494.     return false;
  495.   }

  496.   if (!handle->controller_thread) {
  497.     return false;
  498.   }

  499.   return handle->progress_done.load() < handle->progress_total.load();
  500. }

  501. std::pair<size_t, size_t> get_benchmark_progress(const std::shared_ptr<benchmark_handle> &handle) {
  502.   if (!handle) {
  503.     return std::pair<size_t, size_t>{0, 0};
  504.   }

  505.   return std::pair<size_t, size_t>{handle->progress_done, handle->progress_total};
  506. }

  507. size_t get_benchmark_running_thread(const std::shared_ptr<benchmark_handle> &handle) {
  508.   if (!handle) {
  509.     return 0;
  510.   }

  511.   return handle->running_thread.load();
  512. }

  513. size_t get_benchmark_thread_count(const std::shared_ptr<benchmark_handle> &handle) {
  514.   if (!handle) {
  515.     return 0;
  516.   }

  517.   return handle->datas.size();
  518. }

  519. void pick_benchmark_result(const std::shared_ptr<benchmark_handle> &handle, std::vector<benchmark_result> &result) {
  520.   if (!handle) {
  521.     return;
  522.   }

  523.   result.reserve(handle->datas.size());
  524.   for (auto &data : handle->datas) {
  525.     result.push_back(data.result);
  526.   }
  527. }


burgessmaggie 发表于 2025-4-17 21:32 | 显示全部楼层
C 语言中的浮点数运算遵循一定的舍入模式,常见的舍入模式有就近舍入(中间值向偶数舍入,默认模式)、向零舍入、向上舍入、向下舍入等。
hudi008 发表于 2025-4-17 21:44 | 显示全部楼层
使用定点数或查表法替代复杂浮点运算。
averyleigh 发表于 2025-4-17 23:15 | 显示全部楼层
在需要严格一致性的场合,可以禁用编译器的浮点优化选项。
 楼主| mickit 发表于 2025-4-18 00:00 | 显示全部楼层
可以通过设置浮点运算单元的控制寄存器来控制舍入模式和精度,以确保一致性。
robertesth 发表于 2025-4-18 00:12 | 显示全部楼层
使用更高精度类型(如double)或调整循环结构。
gygp 发表于 2025-4-18 00:25 | 显示全部楼层
避免不必要的类型转换和复杂的浮点数运算。
mollylawrence 发表于 2025-4-18 00:38 | 显示全部楼层
编译器可以对浮点数运算进行多种优化。
vivilyly 发表于 2025-4-18 00:51 | 显示全部楼层
使用适合当前处理器架构的编译器和优化选项。
zerorobert 发表于 2025-4-18 01:04 | 显示全部楼层
尽量使用整数运算替代浮点运算,例如将浮点数乘以一个常数后转换为整数。
abotomson 发表于 2025-4-18 01:17 | 显示全部楼层
在C语言中,浮点数的性能和一致性是需要特别关注的两个方面,尤其是在高性能计算、跨平台开发或对数值精度敏感的场景中。
51xlf 发表于 2025-4-18 01:30 | 显示全部楼层
编译器可以自动向量化循环,但要获得最佳性能,有时仍需要手动编写优化代码。
1988020566 发表于 2025-4-18 01:43 | 显示全部楼层
比较两个浮点数是否相等时,不能直接使用==运算符,而是要判断它们的差值是否在一个很小的误差范围内。
sdlls 发表于 2025-4-18 01:56 | 显示全部楼层
浮点数运算(如加减乘除)通常比整数运算更慢,尤其是复杂数学函数(如sin、cos)的计算,可能需要更多CPU周期。
macpherson 发表于 2025-4-18 02:08 | 显示全部楼层
避免在浮点密集型循环中使用条件分支。
jackcat 发表于 2025-4-18 02:21 | 显示全部楼层
不同的编译器在实现浮点数运算时也可能会有一些细微的差别。
sesefadou 发表于 2025-4-18 02:33 | 显示全部楼层
复杂的浮点数运算,如乘法和除法,通常比加法和减法慢。
uytyu 发表于 2025-4-18 02:45 | 显示全部楼层
不同平台可能使用不同的浮点指令集。例如,x86平台上的x87和SSE指令集在精度和性能上有所不同。
10299823 发表于 2025-4-18 02:57 | 显示全部楼层
由于浮点数的精度问题,直接比较两个浮点数是否相等可能会得到不准确的结果。
pl202 发表于 2025-4-18 03:09 | 显示全部楼层
编译器可以选择不同的指令来执行浮点运算。例如,GCC和Clang提供了-Ofast选项,允许编译器为了性能而放弃某些一致性保证。
您需要登录后才可以回帖 登录 | 注册

本版积分规则

89

主题

1560

帖子

1

粉丝
快速回复 在线客服 返回列表 返回顶部