C++模板元编程（C++ template metaprogramming）

只看该作者 · 2015-3-24 21:16

5. 循环展开

文献[11]展示了一个循环展开（loop unrolling）的例子 — 冒泡排序：

#include <utility>  // std::swap

 

// dynamic code, 普通函数版本

void bubbleSort(int* data, int n)

{

    for(int i=n-1; i>0; --i) {

        for(int j=0; j<i; ++j)

            if (data[j]>data[j+1]) std::swap(data[j], data[j+1]);

    }

}

// 数据长度为 4 时，手动循环展开

inline void bubbleSort4(int* data)

{

#define COMP_SWAP(i, j) if(data[i]>data[j]) std::swap(data[i], data[j])

    COMP_SWAP(0, 1); COMP_SWAP(1, 2); COMP_SWAP(2, 3);

    COMP_SWAP(0, 1); COMP_SWAP(1, 2);

    COMP_SWAP(0, 1);

}

 

// 递归函数版本，指导模板思路，最后一个参数是哑参数（dummy parameter），仅为分辨重载函数

class recursion { };

void bubbleSort(int* data, int n, recursion)

{

    if(n<=1) return;

    for(int j=0; j<n-1; ++j) if(data[j]>data[j+1]) std::swap(data[j], data[j+1]);

    bubbleSort(data, n-1, recursion());

}

 

// static code, 模板元编程版本

template<int i, int j>

inline void IntSwap(int* data) { // 比较和交换两个相邻元素

    if(data[i]>data[j]) std::swap(data[i], data[j]);

}

 

template<int i, int j>

inline void IntBubbleSortLoop(int* data) { // 一次冒泡，将前 i 个元素中最大的置换到最后

    IntSwap<j, j+1>(data);

    IntBubbleSortLoop<j<i-1?i:0, j<i-1?(j+1):0>(data);

}

template<>

inline void IntBubbleSortLoop<0, 0>(int*) { }

 

template<int n>

inline void IntBubbleSort(int* data) { // 模板冒泡排序循环展开

    IntBubbleSortLoop<n-1, 0>(data);

    IntBubbleSort<n-1>(data);

}

template<>

inline void IntBubbleSort<1>(int* data) { }

只看该作者 · 2015-3-24 21:17

对循环次数固定且比较小的循环语句，对其进行展开并内联可以避免函数调用以及执行循环语句中的分支，从而可以提高性能，对上述代码做如下测试，代码在 VS2013 的 Release 下编译运行：

复制

#include <iostream>

#include <omp.h>

#include <string.h> // memcpy

 

int main() {

    double t1, t2, t3; const int num=100000000;

    int data[4]; int inidata[4]={3,4,2,1};

    t1 = omp_get_wtime();

    for(int i=0; i<num; ++i) { memcpy(data, inidata, 4); bubbleSort(data, 4); }

    t1 = omp_get_wtime()-t1;

    t2 = omp_get_wtime();

    for(int i=0; i<num; ++i) { memcpy(data, inidata, 4); bubbleSort4(data); }

    t2 = omp_get_wtime()-t2;

    t3 = omp_get_wtime();

    for(int i=0; i<num; ++i) { memcpy(data, inidata, 4); IntBubbleSort<4>(data); }

    t3 = omp_get_wtime()-t3;

    std::cout << t1/t3 << '\t' << t2/t3 << '\n';

    std::cin.get(); return 0;

}

复制

2.38643 0.926521

只看该作者 · 2015-3-24 21:17

上述结果表明，模板元编程实现的循环展开能够达到和手动循环展开相近的性能（90% 以上），并且性能是循环版本的 2 倍多（如果扣除 memcpy 函数占据的部分加速比将更高，根据 Amdahl 定律）。这里可能有人会想，既然循环次数固定，为什么不直接手动循环展开呢，难道就为了使用模板吗？当然不是，有时候循环次数确实是编译期固定值，但对用户并不是固定的，比如要实现数学上向量计算的类，因为可能是 2、3、4 维，所以写成模板，把维度作为 int 型模板参数，这时因为不知道具体是几维的也就不得不用循环，不过因为维度信息在模板实例化时是编译期常量且较小，所以编译器很可能在代码优化时进行循环展开，但我们想让这一切发生的更可控一些。

只看该作者 · 2015-3-24 21:18

上面用三个函数模板 IntSwap<>()、 IntBubbleSortLoop<>()、 IntBubbleSort<>() 来实现一个排序功能，不但显得分散（和封装原理不符），还暴露了实现细节，我们可以仿照上一节的代码，将 IntBubbleSortLoop<>()、 IntBubbleSort<>() 嵌入其他模板内部，因为函数不允许嵌套，我们只能用类模板：

复制

// 整合成一个类模板实现，看着好，但引入了 代码膨胀

template<int n>

class IntBubbleSortC {

    template<int i, int j>

    static inline void IntSwap(int* data) { // 比较和交换两个相邻元素

        if(data[i]>data[j]) std::swap(data[i], data[j]);

    }

    template<int i, int j>

    static inline void IntBubbleSortLoop(int* data) { // 一次冒泡

        IntSwap<j, j+1>(data);

        IntBubbleSortLoop<j<i-1?i:0, j<i-1?(j+1):0>(data);

    }

    template<>

    static inline void IntBubbleSortLoop<0, 0>(int*) { }

public:

    static inline void sort(int* data) {

        IntBubbleSortLoop<n-1, 0>(data);

        IntBubbleSortC<n-1>::sort(data);

    }

};

template<>

class IntBubbleSortC<0> {

public:

    static inline void sort(int* data) { }

};

 

int main() {

    int data[4] = {3,4,2,1};

    IntBubbleSortC<4>::sort(data); // 如此调用

    std::cin.get(); return 0;

}

只看该作者 · 2015-3-24 21:18

上面代码看似很好，不仅整合了代码，借助类成员的访问控制，还隐藏了实现细节。不过它存在着很大问题，如果实例化 IntBubbleSortC<4>、 IntBubbleSortC<3>、 IntBubbleSortC<2>，将实例化成员函数 IntBubbleSortC<4>::IntSwap<0, 1>()、 IntBubbleSortC<4>::IntSwap<1, 2>()、 IntBubbleSortC<4>::IntSwap<2, 3>()、 IntBubbleSortC<3>::IntSwap<0, 1>()、 IntBubbleSortC<3>::IntSwap<1, 2>()、 IntBubbleSortC<2>::IntSwap<0, 1>()，而在原来的看着分散的代码中 IntSwap<0, 1>() 只有一个。这将导致代码膨胀（code bloat），即生成的可执行文件体积变大（代码膨胀另一含义是源代码增大，见文献[1]第11章）。不过这里使用了内联（inline），如果编译器确实内联展开代码则不会导致代码膨胀（除了循环展开本身会带来的代码膨胀），但因为重复编译原本可以复用的模板实例，会增加编译时间。在上一节的例子中，因为只涉及编译期常量计算，并不涉及函数（函数模板，或类模板的成员函数，函数被编译成具体的机器二进制代码），并不会出现代码膨胀。

只看该作者 · 2015-3-24 21:18

为了清晰证明上面的论述，我们去掉所有 inline 并将函数实现放到类外面（类里面实现的成员函数都是内联的，因为函数实现可能被包含多次，见文献[2] 10.2.9，不过现在的编译器优化能力很强，很多时候加不加 inline 并不影响编译器自己对内联的选择…），分别编译分散版本和类模板封装版本的冒泡排序代码编译生成的目标文件（VS2013 下是 .obj 文件）的大小，代码均在 VS2013 Debug 模式下编译（防止编译器优化），比较 main.obj （源文件是 main.cpp）大小。

只看该作者 · 2015-3-24 21:19

类模板封装版本代码如下，注意将成员函数在外面定义的写法：

复制

#include <iostream>

#include <utility>  // std::swap

 

// 整合成一个类模板实现，看着好，但引入了 代码膨胀

template<int n>

class IntBubbleSortC {

    template<int i, int j> static void IntSwap(int* data);

    template<int i, int j> static void IntBubbleSortLoop(int* data);

    template<> static void IntBubbleSortLoop<0, 0>(int*) { }

public:

    static void sort(int* data);

};

template<>

class IntBubbleSortC<0> {

public:

    static void sort(int* data) { }

};

 

template<int n> template<int i, int j>

void IntBubbleSortC<n>::IntSwap(int* data) {

    if(data[i]>data[j]) std::swap(data[i], data[j]);

}

template<int n> template<int i, int j>

void IntBubbleSortC<n>::IntBubbleSortLoop(int* data) {

    IntSwap<j, j+1>(data);

    IntBubbleSortLoop<j<i-1?i:0, j<i-1?(j+1):0>(data);

}

template<int n>

void IntBubbleSortC<n>::sort(int* data) {

    IntBubbleSortLoop<n-1, 0>(data);

    IntBubbleSortC<n-1>::sort(data);

}

 

int main() {

    int data[40] = {3,4,2,1};

    IntBubbleSortC<2>::sort(data);  IntBubbleSortC<3>::sort(data);

    IntBubbleSortC<4>::sort(data);  IntBubbleSortC<5>::sort(data);

    IntBubbleSortC<6>::sort(data);  IntBubbleSortC<7>::sort(data);

    IntBubbleSortC<8>::sort(data);  IntBubbleSortC<9>::sort(data);

    IntBubbleSortC<10>::sort(data); IntBubbleSortC<11>::sort(data);

#if 0

    IntBubbleSortC<12>::sort(data); IntBubbleSortC<13>::sort(data);

    IntBubbleSortC<14>::sort(data); IntBubbleSortC<15>::sort(data);

    IntBubbleSortC<16>::sort(data); IntBubbleSortC<17>::sort(data);

    IntBubbleSortC<18>::sort(data); IntBubbleSortC<19>::sort(data);

    IntBubbleSortC<20>::sort(data); IntBubbleSortC<21>::sort(data);

 

    IntBubbleSortC<22>::sort(data); IntBubbleSortC<23>::sort(data);

    IntBubbleSortC<24>::sort(data); IntBubbleSortC<25>::sort(data);

    IntBubbleSortC<26>::sort(data); IntBubbleSortC<27>::sort(data);

    IntBubbleSortC<28>::sort(data); IntBubbleSortC<29>::sort(data);

    IntBubbleSortC<30>::sort(data); IntBubbleSortC<31>::sort(data);

#endif

    std::cin.get(); return 0;

}

只看该作者 · 2015-3-24 21:19

分散定义函数模板版本代码如下，为了更具可比性，也将函数放在类里面作为成员函数：

复制

#include <iostream>

#include <utility>  // std::swap

 

// static code, 模板元编程版本

template<int i, int j>

class IntSwap {

public: static void swap(int* data);

};

 

template<int i, int j>

class IntBubbleSortLoop {

public: static void loop(int* data);

};

template<>

class IntBubbleSortLoop<0, 0> {

public: static void loop(int* data) { }

};

 

template<int n>

class IntBubbleSort {

public: static void sort(int* data);

};

template<>

class IntBubbleSort<0> {

public: static void sort(int* data) { }

};

 

template<int i, int j>

void IntSwap<i, j>::swap(int* data) {

    if(data[i]>data[j]) std::swap(data[i], data[j]);

}

template<int i, int j>

void IntBubbleSortLoop<i, j>::loop(int* data) {

    IntSwap<j, j+1>::swap(data);

    IntBubbleSortLoop<j<i-1?i:0, j<i-1?(j+1):0>::loop(data);

}

template<int n>

void IntBubbleSort<n>::sort(int* data) {

    IntBubbleSortLoop<n-1, 0>::loop(data);

    IntBubbleSort<n-1>::sort(data);

}

 

int main() {

    int data[40] = {3,4,2,1};

    IntBubbleSort<2>::sort(data);  IntBubbleSort<3>::sort(data);

    IntBubbleSort<4>::sort(data);  IntBubbleSort<5>::sort(data);

    IntBubbleSort<6>::sort(data);  IntBubbleSort<7>::sort(data);

    IntBubbleSort<8>::sort(data);  IntBubbleSort<9>::sort(data);

    IntBubbleSort<10>::sort(data); IntBubbleSort<11>::sort(data);

#if 0

    IntBubbleSort<12>::sort(data); IntBubbleSort<13>::sort(data);

    IntBubbleSort<14>::sort(data); IntBubbleSort<15>::sort(data);

    IntBubbleSort<16>::sort(data); IntBubbleSort<17>::sort(data);

    IntBubbleSort<18>::sort(data); IntBubbleSort<19>::sort(data);

    IntBubbleSort<20>::sort(data); IntBubbleSort<21>::sort(data);

 

    IntBubbleSort<22>::sort(data); IntBubbleSort<23>::sort(data);

    IntBubbleSort<24>::sort(data); IntBubbleSort<25>::sort(data);

    IntBubbleSort<26>::sort(data); IntBubbleSort<27>::sort(data);

    IntBubbleSort<28>::sort(data); IntBubbleSort<29>::sort(data);

    IntBubbleSort<30>::sort(data); IntBubbleSort<31>::sort(data);

#endif

    std::cin.get(); return 0;

}

只看该作者 · 2015-3-24 21:20

程序中条件编译都未打开时（#if 0），main.obj 大小分别为 264 KB 和 211 KB，条件编译打开时（#if 1），main.obj 大小分别为 1073 KB 和 620 KB。可以看到，类模板封装版的对象文件不但绝对大小更大，而且增长更快，这和之前分析是一致的。

只看该作者 · 2015-3-24 21:20

6. 表达式模板，向量运算

文献[12]展示了一个表达式模板（Expression Templates）的例子：

复制

#include <iostream> // std::cout

#include <cmath>    // std::sqrt()

 

// 表达式类型

class DExprLiteral {                    // 文字量

    double a_;

public:

    DExprLiteral(double a) : a_(a) { }

    double operator()(double x) const { return a_; }

};

class DExprIdentity {                   // 自变量

public:

    double operator()(double x) const { return x; }

};

template<class A, class B, class Op>    // 双目操作

class DBinExprOp {

    A a_; B b_;

public:

    DBinExprOp(const A& a, const B& b) : a_(a), b_(b) { }

    double operator()(double x) const { return Op::apply(a_(x), b_(x)); }

};

template<class A, class Op>             // 单目操作

class DUnaryExprOp {

    A a_;

public:

    DUnaryExprOp(const A& a) : a_(a) { }

    double operator()(double x) const { return Op::apply(a_(x)); }

};

// 表达式

template<class A>

class DExpr {

    A a_;

public:

    DExpr() { }

    DExpr(const A& a) : a_(a) { }

    double operator()(double x) const { return a_(x); }

};

 

// 运算符，模板参数 A、B 为参与运算的表达式类型

// operator /, division

class DApDiv { public: static double apply(double a, double b) { return a / b; } };

template<class A, class B> DExpr<DBinExprOp<DExpr<A>, DExpr<B>, DApDiv> >

operator/(const DExpr<A>& a, const DExpr<B>& b) {

    typedef DBinExprOp<DExpr<A>, DExpr<B>, DApDiv> ExprT;

    return DExpr<ExprT>(ExprT(a, b));

}

// operator +, addition

class DApAdd { public: static double apply(double a, double b) { return a + b; } };

template<class A, class B> DExpr<DBinExprOp<DExpr<A>, DExpr<B>, DApAdd> >

operator+(const DExpr<A>& a, const DExpr<B>& b) {

    typedef DBinExprOp<DExpr<A>, DExpr<B>, DApAdd> ExprT;

    return DExpr<ExprT>(ExprT(a, b));

}

// sqrt(), square rooting

class DApSqrt { public: static double apply(double a) { return std::sqrt(a); } };

template<class A> DExpr<DUnaryExprOp<DExpr<A>, DApSqrt> >

sqrt(const DExpr<A>& a) {

    typedef DUnaryExprOp<DExpr<A>, DApSqrt> ExprT;

    return DExpr<ExprT>(ExprT(a));

}

// operator-, negative sign

class DApNeg { public: static double apply(double a) { return -a; } };

template<class A> DExpr<DUnaryExprOp<DExpr<A>, DApNeg> >

operator-(const DExpr<A>& a) {

    typedef DUnaryExprOp<DExpr<A>, DApNeg> ExprT;

    return DExpr<ExprT>(ExprT(a));

}

 

// evaluate()

template<class Expr>

void evaluate(const DExpr<Expr>& expr, double start, double end, double step) {

    for(double i=start; i<end; i+=step) std::cout << expr(i) << ' ';

}

 

int main() {

    DExpr<DExprIdentity> x;

    evaluate( -x / sqrt( DExpr<DExprLiteral>(1.0) + x ) , 0.0, 10.0, 1.0);

    std::cin.get(); return 0;

}

复制

-0 -0.707107 -1.1547 -1.5 -1.78885 -2.04124 -2.26779 -2.47487 -2.66667 -2.84605

只看该作者 · 2015-3-24 21:21

代码有点长（我已经尽量压缩行数），请先看最下面的 main() 函数，表达式模板允许我们以 “-x / sqrt( 1.0 + x )” 这种类似数学表达式的方式传参数，在 evaluate() 内部，将 0-10 的数依次赋给自变量 x 对表达式进行求值，这是通过在 template<> DExpr 类模板内部重载 operator() 实现的。我们来看看这一切是如何发生的。

只看该作者 · 2015-3-24 21:21

在 main() 中调用 evaluate() 时，编译器根据全局重载的加号、sqrt、除号、负号推断“-x / sqrt( 1.0 + x )” 的类型是 Dexpr<DBinExprOp<Dexpr<DUnaryExprOp<Dexpr<DExprIdentity>, DApNeg>>, Dexpr<DUnaryExprOp<Dexpr<DBinExprOp<Dexpr<DExprLiteral>, Dexpr<DExprIdentity>, DApAdd>>, DApSqrt>>, DApDiv>>（即将每个表达式编码到一种类型，设这个类型为 ultimateExprType），并用此类型实例化函数模板 evaluate()，类型的推导见下图。在 evaluate() 中，对表达式进行求值 expr(i)，调用 ultimateExprType 的 operator()，这引起一系列的 operator() 和 Op::apply() 的调用，最终遇到基础类型 “表达式类型” DExprLiteral 和 DExprIdentity，这个过程见下图。总结就是，请看下图，从下到上类型推断，从上到下 operator() 表达式求值。

只看该作者 · 2015-3-24 21:21

只看该作者 · 2015-3-24 21:22

上面代码函数实现写在类的内部，即内联，如果编译器对内联支持的好的话，上面代码几乎等价于如下代码：

复制

#include <iostream> // std::cout

#include <cmath>    // std::sqrt()

 

void evaluate(double start, double end, double step) {

    double _temp = 1.0;

    for(double i=start; i<end; i+=step)

        std::cout << -i / std::sqrt(_temp + i) << ' ';

}

 

int main() {

    evaluate(0.0, 10.0, 1.0);

    std::cin.get(); return 0;

}

复制

-0 -0.707107 -1.1547 -1.5 -1.78885 -2.04124 -2.26779 -2.47487 -2.66667 -2.84605

只看该作者 · 2015-3-24 21:23

和表达式模板类似的技术还可以用到向量计算中，以避免产生临时向量变量，见文献[4] Expression templates 和文献[12]的后面。传统向量计算如下：

复制

class DoubleVec; // DoubleVec 重载了 + - * / 等向量元素之间的计算

DoubleVec y(1000), a(1000), b(1000), c(1000), d(1000); // 向量长度 1000

// 向量计算

y = (a + b) / (c - d);

// 等价于

DoubleVec __t1 = a + b;

DoubleVec __t2 = c - d;

DoubleVec __t3 = __t1 / __t2;

y = __t3;

只看该作者 · 2015-3-24 21:23

模板代码实现向量计算如下：

复制

template<class A> DVExpr;

class DVec{

    // ...

    template<class A>

    DVec& operator=(const DVExpr<A>&); // 由 = 引起向量逐个元素的表达式值计算并赋值

};

DVec y(1000), a(1000), b(1000), c(1000), d(1000); // 向量长度 1000

// 向量计算

y = (a + b) / (c - d);

// 等价于

for(int i=0; i<1000; ++i) {

    y[i] = (a[i] + b[i]) / (c[i] + d[i]);

}

只看该作者 · 2015-3-24 21:23

不过值得一提的是，传统代码可以用 C++11 的右值引用提升性能，C++11 新特性我们以后再详细讨论。

只看该作者 · 2015-3-24 21:24

我们这里看下文献[4] Expression templates 实现的版本，它用到了编译期多态，编译期多态示意代码如下（关于这种代码形式有个名字叫 curiously recurring template pattern， CRTP，见文献[4]）：

复制

// 模板基类，定义接口，具体实现由模板参数，即子类实现

template <typename D>

class base {

public:

    void f1() { static_cast<E&>(*this).f1(); } // 直接调用子类实现

    int f2() const { static_cast<const E&>(*this).f1(); }

};

// 子类

class dirived1 : public base<dirived1> {

public:

    void f1() { /* ... */ }

    int f2() const { /* ... */ }

};

template<typename T>

class dirived2 : public base<dirived2<T>> {

public:

    void f1() { /* ... */ }

    int f2() const { /* ... */ }

};

只看该作者 · 2015-3-24 21:24

简化后（向量长度固定为1000，元素类型为 double）的向量计算代码如下：

复制

#include <iostream> // std::cout

 

// A CRTP base class for Vecs with a size and indexing:

template <typename E>

class VecExpr {

public:

    double operator[](int i) const { return static_cast<E const&>(*this)[i]; }

    operator E const&() const { return static_cast<const E&>(*this); } // 向下类型转换

};

// The actual Vec class:

class Vec : public VecExpr<Vec> {

    double _data[1000];

public:

    double&  operator[](int i) { return _data[i]; }

    double operator[](int i) const { return _data[i]; }

    template <typename E>

    Vec const& operator=(VecExpr<E> const& vec) {

        E const& v = vec;

        for (int i = 0; i<1000; ++i) _data[i] = v[i];

        return *this;

    }

    // Constructors

    Vec() { }

    Vec(double v) { for(int i=0; i<1000; ++i) _data[i] = v; }

};

 

template <typename E1, typename E2>

class VecDifference : public VecExpr<VecDifference<E1, E2> > {

    E1 const& _u; E2 const& _v;

public:

    VecDifference(VecExpr<E1> const& u, VecExpr<E2> const& v) : _u(u), _v(v) { }

    double operator[](int i) const { return _u[i] - _v[i]; }

};

template <typename E>

class VecScaled : public VecExpr<VecScaled<E> > {

    double _alpha; E const& _v;

public:

    VecScaled(double alpha, VecExpr<E> const& v) : _alpha(alpha), _v(v) { }

    double operator[](int i) const { return _alpha * _v[i]; }

};

 

// Now we can overload operators:

template <typename E1, typename E2> VecDifference<E1, E2> const

operator-(VecExpr<E1> const& u, VecExpr<E2> const& v) {

    return VecDifference<E1, E2>(u, v);

}

template <typename E> VecScaled<E> const

operator*(double alpha, VecExpr<E> const& v) {

    return VecScaled<E>(alpha, v);

}

 

int main() {

    Vec u(3), v(1); double alpha=9; Vec y;

    y = alpha*(u - v);

    std::cout << y[999] << '\n';

    std::cin.get(); return 0;

}

只看该作者 · 2015-3-24 21:25

alpha*(u – v)” 的类型推断过程如下图所示，其中有子类到基类的隐式类型转换：

C++模板元编程（C++ template metaprogramming）

甘甜之泉水

技术高手奖章