- // Copyright 2022 Tencent
- #include "test_fpu.h"
- #include <assert.h>
- #if defined(_WIN32)
- # include <float.h>
- #else
- # if defined(__aarch64__) || defined(__arm__)
- # include <fenv.h>
- # else // defined(__i386__) || defined(__x86_64__)
- # include <fpu_control.h>
- # endif
- #endif
- #include <atomic>
- #include <chrono>
- #include <cmath>
- #include <memory>
- #include <random>
- #include <sstream>
- #include <string>
- #include <thread>
- #include <type_traits>
/**
 * @brief Put the floating-point unit into the configuration the benchmarks expect.
 *
 * Every platform is switched to round-to-nearest. Where a precision-control field is
 * programmed (32-bit x86 under MSVC, the x87 control word under glibc) it is set to
 * single precision as well.
 *
 * @return true when every control-word update succeeded, false as soon as one fails.
 */
bool init_fpu() {
#if defined(_MSC_VER)
  unsigned int control_word = 0;
  // A zero mask changes nothing; this call only verifies that the control word is readable.
  if (_controlfp_s(&control_word, 0, 0) != 0) {
    return false;
  }
#  if defined(_M_IX86)
  // Microsoft documents precision control as supported on x86 only, so the request is
  // limited to that target instead of being attempted on every non-x64 build.
  if (_controlfp_s(&control_word, _PC_24, _MCW_PC) != 0) {
    return false;
  }
#  endif
  // The underscore-prefixed names are the documented ones; the legacy aliases without the
  // underscore are only declared when non-standard names are enabled.
  return _controlfp_s(&control_word, _RC_NEAR, _MCW_RC) == 0;
#else
#  if defined(__aarch64__) || defined(__arm__)
  // fesetround() returns non-zero when the mode could not be applied; report that
  // instead of unconditionally claiming success.
  return fesetround(FE_TONEAREST) == 0;
#  else
  // Clear the extended-precision bits, then select round-to-nearest and single precision.
  fpu_control_t cw = (_FPU_DEFAULT & ~_FPU_EXTENDED) | _FPU_RC_NEAREST | _FPU_SINGLE;
  _FPU_SETCW(cw);
  return true;
#  endif
#endif
}
/**
 * @brief Describe the current floating-point control state as text.
 *
 * The first line holds the control word (or, on ARM, the value of fegetround()) printed in
 * hexadecimal. The second line shows the result of squaring 0.1f. On MSVC, a failing
 * _controlfp_s() call short-circuits and only "Got error code: <n>" is returned.
 *
 * @return The formatted report.
 */
std::string dump_current_controlfp() {
  std::ostringstream report;

#if defined(_MSC_VER)
  unsigned int control_word;
  const int error_code = _controlfp_s(&control_word, 0, 0);
  if (error_code) {
    report << "Got error code: " << error_code;
    return report.str();
  }
  report << "Control word: " << std::hex << control_word << std::endl;
#elif defined(__aarch64__) || defined(__arm__)
  report << "Rounding word: " << std::hex << fegetround() << std::endl;
#else
  fpu_control_t control_word;
  _FPU_GETCW(control_word);
  report << "Control word: " << std::hex << control_word << std::endl;
#endif

  // Shared by every platform: a single-precision multiplication whose printed result
  // reflects the active FPU configuration.
  const float sample = 0.1f;
  const float squared = sample * sample;
  report << sample << "*" << sample << "=" << squared << std::endl;
  return report.str();
}
- struct benchmark_thread_data {
- std::unique_ptr<std::thread> thread;
- benchmark_result result;
- };
- struct benchmark_handle {
- size_t max_round;
- std::atomic<size_t> running_thread;
- std::atomic<size_t> progress_total;
- std::atomic<size_t> progress_done;
- std::vector<benchmark_thread_data> datas;
- std::unique_ptr<std::thread> controller_thread;
- ~benchmark_handle() {
- if (controller_thread && controller_thread->joinable()) {
- controller_thread->join();
- }
- }
- };
- namespace {
// Number of operands a benchmark pass may start from.
constexpr size_t kMaxParameterCount = static_cast<size_t>(1) << 20;
// The tables hold twice that amount so that a pass starting near the end stays in bounds.
constexpr size_t kMaxParameterArraySize = 2 * kMaxParameterCount;

// Operand tables, zero-initialised until initialize_parameters() fills them.
// The "odd" tables receive values with the lowest bit set, the "even" tables values with it cleared.
uint32_t g_integer_parameters_odd[kMaxParameterArraySize] = {};
uint32_t g_integer_parameters_even[kMaxParameterArraySize] = {};
float g_float_parameters_odd[kMaxParameterArraySize] = {};
float g_float_parameters_even[kMaxParameterArraySize] = {};
- static void initialize_parameters(std::atomic<size_t> &progress_total, std::atomic<size_t> &progress_done) {
- if (g_integer_parameters_even[std::extent<decltype(g_integer_parameters_even)>::value - 1] != 0) {
- return;
- }
- progress_total += kMaxParameterArraySize >> 9;
- std::mt19937 rnd{9999991};
- size_t index = 0;
- while (index < kMaxParameterArraySize * 2) {
- uint32_t r = rnd();
- if (r < 9999991) {
- continue;
- }
- r = (r << 1) & 0x7ffffffe;
- if (index & 0x1) {
- g_integer_parameters_odd[index >> 1] = r | 0x1;
- g_float_parameters_odd[index >> 1] = static_cast<float>(r | 0x1);
- } else {
- g_integer_parameters_even[index >> 1] = r;
- g_float_parameters_even[index >> 1] = static_cast<float>(r);
- }
- ++index;
- if (0 == (index & ((1 << 10) - 1))) {
- ++progress_done;
- }
- }
- }
/**
 * @brief Add 16 consecutive operands of each table to the accumulator.
 *
 * The operands at offsets 1..15 are summed left to right and offset 0 is added last; the
 * odd-table sum is accumulated before the even-table sum. The order is kept fixed because
 * it affects floating-point results.
 *
 * @param odd                 Table read at [start_parameter_idx, start_parameter_idx + 15].
 * @param even                Table read over the same index range.
 * @param final_result        Accumulator updated in place.
 * @param start_parameter_idx Index of the first operand.
 */
template <class TDATA>
static inline void benchmark_add(TDATA odd[], TDATA even[], TDATA &final_result, size_t start_parameter_idx) {
  const TDATA *const o = odd + start_parameter_idx;
  const TDATA *const e = even + start_parameter_idx;

  final_result += o[1] + o[2] + o[3] + o[4] + o[5] + o[6] + o[7] + o[8] + o[9] + o[10] + o[11] + o[12] + o[13] +
                  o[14] + o[15] + o[0];
  final_result += e[1] + e[2] + e[3] + e[4] + e[5] + e[6] + e[7] + e[8] + e[9] + e[10] + e[11] + e[12] + e[13] +
                  e[14] + e[15] + e[0];
}
/**
 * @brief Fold 16 consecutive operands of each table into the accumulator by subtraction.
 *
 * For each table the operand at offset 1 is taken, offsets 2..15 are subtracted from it left
 * to right and offset 0 is subtracted last; the outcome is then added to the accumulator,
 * odd table first. The order is kept fixed because it affects floating-point results.
 *
 * @param odd                 Table read at [start_parameter_idx, start_parameter_idx + 15].
 * @param even                Table read over the same index range.
 * @param final_result        Accumulator updated in place.
 * @param start_parameter_idx Index of the first operand.
 */
template <class TDATA>
static inline void benchmark_sub(TDATA odd[], TDATA even[], TDATA &final_result, size_t start_parameter_idx) {
  const TDATA *const o = odd + start_parameter_idx;
  const TDATA *const e = even + start_parameter_idx;

  final_result += o[1] - o[2] - o[3] - o[4] - o[5] - o[6] - o[7] - o[8] - o[9] - o[10] - o[11] - o[12] - o[13] -
                  o[14] - o[15] - o[0];
  final_result += e[1] - e[2] - e[3] - e[4] - e[5] - e[6] - e[7] - e[8] - e[9] - e[10] - e[11] - e[12] - e[13] -
                  e[14] - e[15] - e[0];
}
/// Per-type multiplication step used by benchmark_mul(); only the specializations below exist.
template <class>
struct benchmark_mul_helper;

/// Integer step: the accumulator is multiplied twice by the selected operand (wrapping arithmetic).
template <>
struct benchmark_mul_helper<uint32_t> {
  static inline void do_operator(uint32_t odd[], uint32_t &final_result, size_t start_parameter_idx) {
    const uint32_t &factor = odd[start_parameter_idx];
    final_result *= factor;
    final_result *= factor;
  }
};

/// Float step: multiply once, first replacing the accumulator by its frexp() fraction when the
/// product would be infinite.
template <>
struct benchmark_mul_helper<float> {
  static inline void do_operator(float odd[], float &final_result, size_t start_parameter_idx) {
    const float &factor = odd[start_parameter_idx];
    if (std::isinf(final_result * factor)) {
      int exponent;
      final_result = std::frexp(final_result, &exponent);
    }
    final_result *= factor;
  }
};

/**
 * @brief Apply the multiplication step to 16 consecutive operands of the odd table.
 *
 * @param odd                 Table read at [start_parameter_idx, start_parameter_idx + 15].
 * @param even                Not used by this benchmark.
 * @param final_result        Accumulator updated in place.
 * @param start_parameter_idx Index of the first operand.
 */
template <class TDATA>
static inline void benchmark_mul(TDATA odd[], TDATA even[], TDATA &final_result, size_t start_parameter_idx) {
  for (int lane = 0; lane < 16; ++lane, ++start_parameter_idx) {
    benchmark_mul_helper<TDATA>::do_operator(odd, final_result, start_parameter_idx);
  }
}
/// Per-type division step used by benchmark_div(); only the specializations below exist.
template <class>
struct benchmark_div_helper;

/// Integer step: scale by the even operand, then divide by the low byte of the odd operand,
/// or take the remainder when the accumulator is not larger than that divisor.
template <>
struct benchmark_div_helper<uint32_t> {
  static inline void do_operator(uint32_t odd[], uint32_t even[], uint32_t &final_result, size_t start_parameter_idx) {
    final_result *= even[start_parameter_idx];
    const uint32_t divisor = odd[start_parameter_idx] & 0xff;
    final_result = (final_result > divisor) ? (final_result / divisor) : (final_result % divisor);
  }
};

/// Float step: scale by the even operand unless that would be infinite, then divide by the odd
/// operand, or by its frexp() fraction when the accumulator is not larger than the operand.
template <>
struct benchmark_div_helper<float> {
  static inline void do_operator(float odd[], float even[], float &final_result, size_t start_parameter_idx) {
    float scaled = final_result * even[start_parameter_idx];
    if (!std::isinf(scaled)) {
      final_result = scaled;
    }

    if (final_result > odd[start_parameter_idx]) {
      final_result /= odd[start_parameter_idx];
      return;
    }

    int exponent;
    float fraction = std::frexp(odd[start_parameter_idx], &exponent);
    final_result /= fraction;
  }
};

/**
 * @brief Apply the division step to 16 consecutive operand pairs.
 *
 * @param odd                 Divisor table read at [start_parameter_idx, start_parameter_idx + 15].
 * @param even                Scale table read over the same index range.
 * @param final_result        Accumulator updated in place.
 * @param start_parameter_idx Index of the first operand pair.
 */
template <class TDATA>
static inline void benchmark_div(TDATA odd[], TDATA even[], TDATA &final_result, size_t start_parameter_idx) {
  for (int lane = 0; lane < 16; ++lane, ++start_parameter_idx) {
    benchmark_div_helper<TDATA>::do_operator(odd, even, final_result, start_parameter_idx);
  }
}
/// Per-type square-root step used by benchmark_sqrt(); only a float specialization exists.
template <class>
struct benchmark_sqrt_helper;

/// Float step: the formula under the root is selected by bits 2-3, then bits 0-1, of the index.
template <>
struct benchmark_sqrt_helper<float> {
  static inline void do_operator(float odd[], float &final_result, size_t start_parameter_idx) {
    float v = odd[start_parameter_idx];

    if (start_parameter_idx & 0xc) {
      final_result = std::sqrt(final_result * v + v * v);
      return;
    }
    if (start_parameter_idx & 0x3) {
      final_result = std::sqrt(final_result * v * v);
      return;
    }
    // Reached only when the low four bits of the index are all zero.
    final_result = std::sqrt(final_result * final_result * v);
  }
};

/**
 * @brief Apply the square-root step to 16 consecutive operands of the odd table.
 *
 * @param odd                 Table read at [start_parameter_idx, start_parameter_idx + 15].
 * @param even                Not used by this benchmark.
 * @param final_result        Accumulator updated in place.
 * @param start_parameter_idx Index of the first operand.
 */
template <class TDATA>
static inline void benchmark_sqrt(TDATA odd[], TDATA even[], TDATA &final_result, size_t start_parameter_idx) {
  for (int lane = 0; lane < 16; ++lane, ++start_parameter_idx) {
    benchmark_sqrt_helper<TDATA>::do_operator(odd, final_result, start_parameter_idx);
  }
}
- static void start_benchmark_worker(size_t idx, size_t max_round, benchmark_thread_data &data,
- std::atomic<size_t> &progress_total, std::atomic<size_t> &progress_done) {
- constexpr const size_t step = 1 << 4;
- constexpr const size_t iterator_count = kMaxParameterCount >> 4;
- progress_total += max_round * 9;
- progress_total += 2; // sin + cos
- // integer add
- {
- data.result.integer_add_final_result.resize(max_round);
- auto begin = std::chrono::system_clock::now();
- for (size_t round = 0; round < max_round; ++round) {
- size_t start_index = kMaxParameterCount / max_round * round;
- size_t iterator_end = start_index + iterator_count;
- uint32_t result = g_integer_parameters_odd[start_index];
- for (size_t i = start_index; i < iterator_end; i += step) {
- benchmark_add(g_integer_parameters_odd, g_integer_parameters_even, result, i);
- }
- ++progress_done;
- data.result.integer_add_final_result[round] = result;
- }
- auto end = std::chrono::system_clock::now();
- data.result.integer_add_cost = end - begin;
- }
- // integer sub
- {
- data.result.integer_sub_final_result.resize(max_round);
- auto begin = std::chrono::system_clock::now();
- for (size_t round = 0; round < max_round; ++round) {
- size_t start_index = kMaxParameterCount / max_round * round;
- size_t iterator_end = start_index + iterator_count;
- uint32_t result = g_integer_parameters_odd[start_index];
- for (size_t i = start_index; i < iterator_end; i += step) {
- benchmark_sub(g_integer_parameters_odd, g_integer_parameters_even, result, i);
- }
- ++progress_done;
- data.result.integer_sub_final_result[round] = result;
- }
- auto end = std::chrono::system_clock::now();
- data.result.integer_sub_cost = end - begin;
- }
- // integer mul
- {
- data.result.integer_mul_final_result.resize(max_round);
- auto begin = std::chrono::system_clock::now();
- for (size_t round = 0; round < max_round; ++round) {
- size_t start_index = kMaxParameterCount / max_round * round;
- size_t iterator_end = start_index + iterator_count;
- uint32_t result = g_integer_parameters_odd[start_index];
- for (size_t i = start_index; i < iterator_end; i += step) {
- benchmark_mul(g_integer_parameters_odd, g_integer_parameters_even, result, i);
- }
- ++progress_done;
- data.result.integer_mul_final_result[round] = result;
- }
- auto end = std::chrono::system_clock::now();
- data.result.integer_mul_cost = end - begin;
- }
- // integer div
- {
- data.result.integer_div_final_result.resize(max_round);
- auto begin = std::chrono::system_clock::now();
- for (size_t round = 0; round < max_round; ++round) {
- size_t start_index = kMaxParameterCount / max_round * round;
- size_t iterator_end = start_index + iterator_count;
- uint32_t result = g_integer_parameters_odd[start_index];
- for (size_t i = start_index; i < iterator_end; i += step) {
- benchmark_div(g_integer_parameters_odd, g_integer_parameters_even, result, i);
- }
- ++progress_done;
- data.result.integer_div_final_result[round] = result;
- }
- auto end = std::chrono::system_clock::now();
- data.result.integer_div_cost = end - begin;
- }
- // float add
- {
- data.result.float_add_final_result.resize(max_round);
- auto begin = std::chrono::system_clock::now();
- for (size_t round = 0; round < max_round; ++round) {
- size_t start_index = kMaxParameterCount / max_round * round;
- size_t iterator_end = start_index + iterator_count;
- float result = g_float_parameters_odd[start_index];
- for (size_t i = start_index; i < iterator_end; i += step) {
- benchmark_add(g_float_parameters_odd, g_float_parameters_even, result, i);
- }
- ++progress_done;
- data.result.float_add_final_result[round] = result;
- }
- auto end = std::chrono::system_clock::now();
- data.result.float_add_cost = end - begin;
- }
- // float sub
- {
- data.result.float_sub_final_result.resize(max_round);
- auto begin = std::chrono::system_clock::now();
- for (size_t round = 0; round < max_round; ++round) {
- size_t start_index = kMaxParameterCount / max_round * round;
- size_t iterator_end = start_index + iterator_count;
- float result = g_float_parameters_odd[start_index];
- for (size_t i = start_index; i < iterator_end; i += step) {
- benchmark_sub(g_float_parameters_odd, g_float_parameters_even, result, i);
- }
- ++progress_done;
- data.result.float_sub_final_result[round] = result;
- }
- auto end = std::chrono::system_clock::now();
- data.result.float_sub_cost = end - begin;
- }
- // float mul
- {
- data.result.float_mul_final_result.resize(max_round);
- auto begin = std::chrono::system_clock::now();
- for (size_t round = 0; round < max_round; ++round) {
- size_t start_index = kMaxParameterCount / max_round * round;
- size_t iterator_end = start_index + iterator_count;
- float result = g_float_parameters_odd[start_index];
- for (size_t i = start_index; i < iterator_end; i += step) {
- benchmark_mul(g_float_parameters_odd, g_float_parameters_even, result, i);
- }
- ++progress_done;
- data.result.float_mul_final_result[round] = result;
- }
- auto end = std::chrono::system_clock::now();
- data.result.float_mul_cost = end - begin;
- }
- // float div
- {
- data.result.float_div_final_result.resize(max_round);
- auto begin = std::chrono::system_clock::now();
- for (size_t round = 0; round < max_round; ++round) {
- size_t start_index = kMaxParameterCount / max_round * round;
- size_t iterator_end = start_index + iterator_count;
- float result = g_float_parameters_odd[start_index];
- for (size_t i = start_index; i < iterator_end; i += step) {
- benchmark_div(g_float_parameters_odd, g_float_parameters_even, result, i);
- }
- ++progress_done;
- data.result.float_div_final_result[round] = result;
- }
- auto end = std::chrono::system_clock::now();
- data.result.float_div_cost = end - begin;
- }
- // float sqrt
- {
- data.result.float_sqrt_final_result.resize(max_round);
- auto begin = std::chrono::system_clock::now();
- for (size_t round = 0; round < max_round; ++round) {
- size_t start_index = kMaxParameterCount / max_round * round;
- size_t iterator_end = start_index + iterator_count;
- float result = g_float_parameters_odd[start_index];
- for (size_t i = start_index; i < iterator_end; i += step) {
- benchmark_sqrt(g_float_parameters_odd, g_float_parameters_even, result, i);
- }
- ++progress_done;
- data.result.float_sqrt_final_result[round] = result;
- }
- auto end = std::chrono::system_clock::now();
- data.result.float_sqrt_cost = end - begin;
- }
- // float sin
- {
- for (int i = 0; i < 16; ++i) {
- data.result.float_sin_final_result.push_back(std::sin(3.14159f / 34 * i));
- }
- ++progress_done;
- }
- // float cos
- {
- for (int i = 0; i < 16; ++i) {
- data.result.float_cos_final_result.push_back(std::cos(3.14159f / 34 * i));
- }
- ++progress_done;
- }
- }
- static void start_benchmark_controller(std::shared_ptr<benchmark_handle> handle) {
- initialize_parameters(handle->progress_total, handle->progress_done);
- size_t idx = 0;
- for (auto &data : handle->datas) {
- data.result.float_add_cost = std::chrono::system_clock::duration::zero();
- data.result.float_sub_cost = std::chrono::system_clock::duration::zero();
- data.result.float_mul_cost = std::chrono::system_clock::duration::zero();
- data.result.float_div_cost = std::chrono::system_clock::duration::zero();
- data.result.float_sqrt_cost = std::chrono::system_clock::duration::zero();
- data.result.integer_add_cost = std::chrono::system_clock::duration::zero();
- data.result.integer_sub_cost = std::chrono::system_clock::duration::zero();
- data.result.integer_mul_cost = std::chrono::system_clock::duration::zero();
- data.result.integer_div_cost = std::chrono::system_clock::duration::zero();
- data.thread = std::unique_ptr<std::thread>(new std::thread([idx, &data, &handle]() {
- ++handle->running_thread;
- start_benchmark_worker(idx, handle->max_round, data, handle->progress_total, handle->progress_done);
- --handle->running_thread;
- }));
- ++idx;
- }
- for (auto &data : handle->datas) {
- if (data.thread && data.thread->joinable()) {
- data.thread->join();
- }
- }
- }
- } // namespace
- std::shared_ptr<benchmark_handle> start_benchmark(size_t thread_count, size_t round) {
- if (thread_count > 32) {
- thread_count = 32;
- }
- std::shared_ptr<benchmark_handle> ret = std::make_shared<benchmark_handle>();
- if (!ret) {
- return ret;
- }
- ret->max_round = round;
- ret->running_thread.store(0);
- ret->progress_total.store(1);
- ret->progress_done.store(0);
- ret->datas.resize(thread_count);
- ret->controller_thread = std::unique_ptr<std::thread>(new std::thread([ret]() {
- start_benchmark_controller(ret);
- ++ret->progress_done;
- }));
- return ret;
- }
- bool is_benchmark_running(const std::shared_ptr<benchmark_handle> &handle) {
- if (!handle) {
- return false;
- }
- if (!handle->controller_thread) {
- return false;
- }
- return handle->progress_done.load() < handle->progress_total.load();
- }
- std::pair<size_t, size_t> get_benchmark_progress(const std::shared_ptr<benchmark_handle> &handle) {
- if (!handle) {
- return std::pair<size_t, size_t>{0, 0};
- }
- return std::pair<size_t, size_t>{handle->progress_done, handle->progress_total};
- }
- size_t get_benchmark_running_thread(const std::shared_ptr<benchmark_handle> &handle) {
- if (!handle) {
- return 0;
- }
- return handle->running_thread.load();
- }
- size_t get_benchmark_thread_count(const std::shared_ptr<benchmark_handle> &handle) {
- if (!handle) {
- return 0;
- }
- return handle->datas.size();
- }
- void pick_benchmark_result(const std::shared_ptr<benchmark_handle> &handle, std::vector<benchmark_result> &result) {
- if (!handle) {
- return;
- }
- result.reserve(handle->datas.size());
- for (auto &data : handle->datas) {
- result.push_back(data.result);
- }
- }