C++11
A subset of the features explored
What is happening?
• We want
• the performance of carefully optimized code
• the convenience of a high level language
• to use all our cores
Example: Laplacian Smoothing
Vertex moves to center of neighbors
Before
After 1k iterations
void laplacian_smooth0(Manifold& m, float weight, int max_iter)
{
    for(int iter = 0; iter < max_iter; ++iter) {
        VertexAttributeVector<Vec3d> L_attr(m.no_vertices());
        for(VertexIDIterator v = m.vertices_begin(); v != m.vertices_end(); ++v)
            if(!boundary(m, *v))
                L_attr[*v] = laplacian(m, *v);
        for(VertexIDIterator v = m.vertices_begin(); v != m.vertices_end(); ++v)
            if(!boundary(m, *v))
                m.pos(*v) += weight * L_attr[*v];
    }
}
Original: 14.6 s
It is so C++98
void laplacian_smooth1(Manifold& m, float weight, int max_iter)
{
    for(int iter = 0; iter < max_iter; ++iter) {
        VertexAttributeVector<Vec3d> L_attr(m.no_vertices());
        for(VertexID v : m.vertices())
            if(!boundary(m, v))
                L_attr[v] = laplacian(m, v);
        for(VertexID v : m.vertices()) {
            if(!boundary(m, v))
                m.pos(v) += weight * L_attr[v];
        }
    }
}
Range for: 14.2 s
Much easier to read. Not only is the for loop clearer, we did away with `*`.
vertices() returns a class that just provides begin and end functions.
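For reference, a minimal sketch of a range wrapper along those lines (VertexIDRange is a made-up name; only VertexIDIterator comes from GEL, and the real class may look different):

// Illustrative sketch: a tiny class whose only job is to make range-for work.
class VertexIDRange {
    VertexIDIterator first, last;
public:
    VertexIDRange(VertexIDIterator b, VertexIDIterator e): first(b), last(e) {}
    VertexIDIterator begin() const { return first; }
    VertexIDIterator end()   const { return last; }
};
// vertices() could then simply return
// VertexIDRange(m.vertices_begin(), m.vertices_end());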
void laplacian_smooth2(Manifold& m, float weight, int max_iter)
{
    auto new_pos = m.positions_attribute_vector();
    for(int iter = 0; iter < max_iter; ++iter) {
        for(auto v : m.vertices())
            if(!boundary(m, v))
                new_pos[v] = weight * laplacian(m, v) + m.pos(v);
        m.positions_attribute_vector() = new_pos;
    }
}
Optimized: 12.4 s
And we only need one loop. We can memory-move the vertex positions.
void laplacian_smooth3(Manifold& m, float weight, int max_iter)
{
    for(int iter = 0; iter < max_iter; ++iter) {
        auto new_pos = m.positions_attribute_vector();
        for(auto v : m.vertices())
            if(!boundary(m, v))
                new_pos[v] = weight * laplacian(m, v) + m.pos(v);
        m.positions_attribute_vector() = move(new_pos);
    }
}
move: 12.6 s
Actually, we should move, but ... oh ...
now I copy somewhere else
void laplacian_smooth4(Manifold& m, float weight, int max_iter)
{
    auto new_pos = m.positions_attribute_vector();
    for(int iter = 0; iter < max_iter; ++iter) {
        for(auto v : m.vertices())
            if(!boundary(m, v))
                new_pos[v] = weight * laplacian(m, v) + m.pos(v);
        swap(m.positions_attribute_vector(), new_pos);
    }
}
swap: 12.1 s
Now we only have two buffers for vertex positions and always read from one and write to the other. Then swap!
I think this version is the sweet spot for single-threaded code.
void laplacian_smooth4_5(Manifold& m, float weight, int max_iter)
{
    auto new_pos = m.positions_attribute_vector();
    for(int iter = 0; iter < max_iter; ++iter) {
        for_each_vertex(m, [&](VertexID v) { new_pos[v] = weight * laplacian(m, v) + m.pos(v); });
        swap(m.positions_attribute_vector(), new_pos);
    }
}
Lambda variation
Not much clearer. Should be about the same performance...
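For reference, a minimal sketch of what for_each_vertex could look like (an assumption; the real GEL helper may also skip boundary vertices or take a std::function):

// Sketch only: apply f to every vertex of the mesh.
template<typename F>
void for_each_vertex(Manifold& m, F f)
{
    for(auto v : m.vertices())
        f(v);
}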
void laplacian_smooth5(Manifold& m, float weight, int max_iter)
{
    for(int iter = 0; iter < max_iter; ++iter) {
        auto new_pos = m.positions_attribute_vector();
        vector<thread> t_vec;
        for(auto v : m.vertices())
            if(!boundary(m, v))
                t_vec.push_back(thread([&](VertexID vid) {
                    if(!boundary(m, vid))
                        new_pos[vid] = weight * laplacian(m, vid) + m.pos(vid); }, v));
        for(int i = 0; i < t_vec.size(); ++i)
            t_vec[i].join();
        m.positions_attribute_vector() = move(new_pos);
    }
}
Threads done wrong: ∞
For a brief moment I must have thought I was coding for a GPU. The first time I timed it, I got a 666 times longer run time.
inline void laplacian_smooth_vertex(Manifold& m, vector<VertexID>& vids,
                                    VertexAttributeVector<Vec3d>& new_pos,
                                    float weight)
{
    for(auto v : vids)
        new_pos[v] = m.pos(v) + weight * laplacian(m, v);
}

void laplacian_smooth6(Manifold& m, float weight, int max_iter)
{
    vector<vector<VertexID>> vertex_ids(CORES);
    auto batch_size = m.no_vertices() / CORES;
    int cnt = 0;
    for_each_vertex(m, [&](VertexID v) {
        if(!boundary(m, v))
            vertex_ids[(cnt++ / batch_size) % CORES].push_back(v);
    });
    vector<thread> t_vec(CORES);
    VertexAttributeVector<Vec3d> new_pos = m.positions_attribute_vector();
    for(int iter = 0; iter < max_iter; ++iter) {
        for(int thread_no = 0; thread_no < CORES; ++thread_no)
            t_vec[thread_no] = thread(laplacian_smooth_vertex,
                                      ref(m), ref(vertex_ids[thread_no]),
                                      ref(new_pos), weight);
        for(int thread_no = 0; thread_no < CORES; ++thread_no)
            t_vec[thread_no].join();
        swap(m.positions_attribute_vector(), new_pos);
    }
}
2.5 s
Almost a five-fold performance improvement with four physical cores. Hyperthreading works!!
CORES = 8
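CORES is a hard-coded constant here. If one wanted to pick it up at run time instead, C++11 offers std::thread::hardware_concurrency() (a sketch, not how the slides define CORES):

#include <thread>

// Sketch: query the number of hardware threads at run time.
// hardware_concurrency() may return 0 if the value is unknown,
// so fall back to a sensible default.
unsigned num_cores()
{
    unsigned n = std::thread::hardware_concurrency();
    return n != 0 ? n : 4;
}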
Statistics (run times in seconds; rightmost column is the median of the five runs)

            Run 1  Run 2  Run 3  Run 4  Run 5  Median
Baseline     14.6   14.6   14.5   14.6   14.6    14.6
Range for    14.4   14.2   14.2   14.2   14.2    14.2
Copy back    12.4   12.4   12.4   12.4   12.4    12.4
Move back    12.5   12.5   12.9   12.9   12.6    12.6
Swap         12.1   12.1   12.1   12.1   12.2    12.1
2 threads     6.8    6.7    6.8    6.7    6.7     6.7
4 threads     4.1    4.1    4.1    4.1    4.1     4.1
8 threads     2.5    2.5    2.5    2.5    2.5     2.5
Now make it generic!
typedef vector<vector<VertexID>> VertexIDBatches;

VertexIDBatches batch_vertices(Manifold& m) {
    VertexIDBatches vertex_ids(CORES);
    auto batch_size = m.no_vertices() / CORES;
    int cnt = 0;
    for_each_vertex(m, [&](VertexID v) {
        if(!boundary(m, v))
            vertex_ids[(cnt++ / batch_size) % CORES].push_back(v);
    });
    return vertex_ids;
}

template<typename T>
void for_each_vertex_parallel(int no_threads, const VertexIDBatches& batches, T& f) {
    vector<thread> t_vec(no_threads);
    for(auto t : range(0, no_threads))
        t_vec[t] = thread(f, ref(batches[t]));
    for(auto t : range(0, no_threads))
        t_vec[t].join();
}
#1 produces a vector of vectors of vertex IDs.
#2 actually spawns off the worker threads.
void laplacian_smooth7(Manifold& m, float weight, int max_iter)
{
    auto vertex_ids = batch_vertices(m);
    auto new_pos = m.positions_attribute_vector();
    auto f = [&](const vector<VertexID>& vids) {
        for(VertexID v : vids)
            new_pos[v] = m.pos(v) + weight * laplacian(m, v);
    };
    for(auto _ : range(0, max_iter)) {
        for_each_vertex_parallel(CORES, vertex_ids, f);
        swap(m.positions_attribute_vector(), new_pos);
    }
}
Slightly faster, much simpler. Note that I threw in a range class to get rid of old-school for loops.
2.4 s
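range is not a standard C++11 facility; a minimal integer-range class along these lines would be enough to support the loops above (illustrative only, the real helper may differ):

// Sketch: just enough of an integer range to support range-for.
class range {
    int first, last;
public:
    class iterator {
        int i;
    public:
        explicit iterator(int i_): i(i_) {}
        int operator*() const { return i; }
        iterator& operator++() { ++i; return *this; }
        bool operator!=(const iterator& o) const { return i != o.i; }
    };
    range(int b, int e): first(b), last(e) {}
    iterator begin() const { return iterator(first); }
    iterator end()   const { return iterator(last); }
};
// usage: for(auto t : range(0, no_threads)) ...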
template<typename T>
void for_each_vertex_parallel(int no_threads, const VertexIDBatches& batches, T& f) {
    vector<future<void>> f_vec(no_threads);
    for(auto t : range(0, no_threads))
        f_vec[t] = async(launch::async, f, ref(batches[t]));
}

template<typename T>
void for_each_vertex_parallel(int no_threads, const VertexIDBatches& batches, T& f) {
    vector<thread> t_vec(no_threads);
    for(auto t : range(0, no_threads))
        t_vec[t] = thread(f, ref(batches[t]));
    for(auto t : range(0, no_threads))
        t_vec[t].join();
}
See, the code above is simpler, and the destructor joins! But what happens if we ignore the future?!
The async code takes 50% more time than the old code where I join threads explicitly. Not sure why?!
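One variation worth timing (a sketch, not from the slides): keep the futures and call get() explicitly, which makes the synchronization point visible instead of relying on the blocking destructors of the futures returned by async.

template<typename T>
void for_each_vertex_parallel(int no_threads, const VertexIDBatches& batches, T& f) {
    vector<future<void>> f_vec(no_threads);
    for(auto t : range(0, no_threads))
        f_vec[t] = async(launch::async, f, ref(batches[t]));
    // Explicit synchronization: wait for each task before returning.
    for(auto t : range(0, no_threads))
        f_vec[t].get();
}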
More C++11 examples
Polymorphism with std::function
class MyClass {
    int c;
public:
    MyClass(int _c): c(_c) {}
    function<int(int)> fun;
    void set_fun(function<int(int,int)> f) {
        fun = bind1st(f, c);
    }
};

int fun1(int c, int x) { return c*x; }
int fun2(int c, int x) { return x/c; }

int main(int argc, const char* argv[]) {
    MyClass m1{1}, m2{2};
    m1.set_fun(fun1);
    m2.set_fun(fun2);
    cout << m1.fun(42) << " " << m2.fun(42) << endl;
}
Maybe more exotic than actually useful, but it is instructive that polymorphism can be achieved so differently from the virtual-function approach.
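For what it is worth, bind1st is on its way out; the same binding can be done with std::bind or a lambda. A sketch of an alternative (MyClass2 is just an illustrative name, not from the slides):

#include <functional>
using namespace std;

class MyClass2 {
    int c;
public:
    MyClass2(int _c): c(_c) {}
    function<int(int)> fun;
    void set_fun(function<int(int,int)> f) {
        // Bind the first argument to c; placeholders::_1 forwards x.
        fun = bind(f, c, placeholders::_1);
        // Or, equivalently, capture c in a lambda:
        // int cc = c; fun = [f, cc](int x) { return f(cc, x); };
    }
};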
Kinder, gentler member init
class VisObj
{
    std::string file;
    GLGraphics::GLViewController view_ctrl;
    bool create_display_list;
    HMesh::Manifold mani;
    HMesh::Manifold old_mani;

    Harmonics* harmonics;
    GLGraphics::ManifoldRenderer* renderer;
    CGLA::Vec3d bsphere_center;
    float bsphere_radius;
public:
    VisObj(): file(""), view_ctrl(WINX, WINY, CGLA::Vec3f(0), 1.0),
              create_display_list(true), harmonics(0) {}
    // ... and so on
We never really liked these long initialization lists and always wondered why we could not just initialize when we declare.
Kinder, gentler member init
class VisObj
{
    std::string file = "";
    GLGraphics::GLViewController view_ctrl =
        GLGraphics::GLViewController(WINX, WINY, CGLA::Vec3f(0), 1.0);
    bool create_display_list = true;
    HMesh::Manifold mani;
    HMesh::Manifold old_mani;

    Harmonics* harmonics = nullptr;
    GLGraphics::ManifoldRenderer* renderer = nullptr;
    CGLA::Vec3d bsphere_center;
    float bsphere_radius;
public:
    VisObj() {}
    // ... and so on
Now, we can! What
is up with nullptr?!
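nullptr has its own type (std::nullptr_t) and converts only to pointer types, so it can never be mistaken for an integer the way 0 or NULL can. A small self-contained example of the surprise it prevents:

#include <iostream>

void f(int)   { std::cout << "f(int)\n"; }
void f(char*) { std::cout << "f(char*)\n"; }

int main() {
    f(0);        // calls f(int): 0 is an int first
    f(nullptr);  // calls f(char*): nullptr can only be a pointer
}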
ArithVec changes
template <class T, class V, unsigned int N>
class ArithVec
{
protected:
    /// The actual contents of the vector.
    std::array<T,N> data;
    // .........

Look, I did away with C-style arrays
ArithVec::ArithVec(T _a, T _b, T _c, T _d)
{
    assert(N==4);
    data[0] = _a;
    data[1] = _b;
    data[2] = _c;
    data[3] = _d;
}

ArithVec::ArithVec(T _a, T _b, T _c, T _d):
    data({_a,_b,_c,_d}) { assert(N==4); }
Look! An initializer list ... hmmm, MSVC does not like it
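If the MSVC complaint is about aggregate initialization of the std::array member, the double-brace form is a workaround that sometimes helps (a guess at the cause, not verified against that compiler):

// Sketch: the extra braces initialize the inner C array of std::array
// explicitly, which older compilers tend to accept more readily.
ArithVec::ArithVec(T _a, T _b, T _c, T _d):
    data{{_a, _b, _c, _d}} { assert(N==4); }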
/// Assignment multiplication with scalar.
const V& ArithVec::operator*=(T k)
{
    std::transform(data, &data[N], data,
                   std::bind2nd(std::multiplies<T>(), k));
    return static_cast<const V&>(*this);
}

/// Assignment multiplication with scalar.
const V& ArithVec::operator*=(T k)
{
    std::for_each(begin(), end(), [k](T& x){ x *= k; });
    return static_cast<const V&>(*this);
}
Note: begin() and end()
make the code nicer
than before
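Presumably begin() and end() just forward to the std::array member. A toy self-contained sketch of that pattern (ToyVec is illustrative, not the real ArithVec):

#include <algorithm>
#include <array>
#include <cassert>

// Sketch (assumption): forward iteration to the underlying std::array,
// which is what lets the lambda version and range-for work directly.
template <class T, unsigned int N>
struct ToyVec {
    std::array<T, N> data;
    typename std::array<T, N>::iterator begin() { return data.begin(); }
    typename std::array<T, N>::iterator end()   { return data.end(); }
    ToyVec& operator*=(T k) {
        std::for_each(begin(), end(), [k](T& x) { x *= k; });
        return *this;
    }
};

int main() {
    ToyVec<double, 4> v = {{{1, 2, 3, 4}}};
    v *= 2.0;
    assert(v.data[3] == 8.0);
}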
/// Assignment multiplication with scalar.
const V& ArithVec::operator*=(T k)
{
    std::transform(data, &data[N], data,
                   std::bind2nd(std::multiplies<T>(), k));
    return static_cast<const V&>(*this);
}

/// Assignment multiplication with scalar.
const V& ArithVec::operator*=(T k)
{
    for(auto& x : data) { x *= k; }
    return static_cast<const V&>(*this);
}
Morten: this is
actually simpler!
bool ArithVec::operator==(const V& v) const
{
    return std::equal(begin(), end(), v.begin());
}

bool ArithVec::operator==(const V& v) const
{
    return std::inner_product(data, &data[N], v.get(), true,
                              std::logical_and<bool>(), std::equal_to<T>());
}
Just to use the obvious. This was possible before C++11.
circulate with functors
inline int circulate_vertex_ccw(const Manifold& m, VertexID v,
                                std::function<void(Walker&)> f)
{
    Walker w = m.walker(v);
    for(; !w.full_circle(); w = w.circulate_vertex_ccw()) f(w);
    return w.no_steps();
}

inline int circulate_vertex_ccw(const Manifold& m, VertexID v,
                                std::function<void(VertexID)> f)
{
    return circulate_vertex_ccw(m, v, [&](Walker& w){ f(w.vertex()); });
}
Five slides that show what we can do by having circulator functions that accept functors.
int valency(const Manifold& m, VertexID v)
{
    // perform full circulation to get valency
    Walker vj = m.walker(v);
    while(!vj.full_circle())
        vj = vj.circulate_vertex_cw();
    return vj.no_steps();
}

int valency(const Manifold& m, VertexID v)
{
    return circulate_vertex_ccw(m, v, [](Walker){});
}
bool connected(const Manifold& m, VertexID v0, VertexID v1)
{
    for(Walker vj = m.walker(v0); !vj.full_circle();
        vj = vj.circulate_vertex_cw()) {
        if(vj.vertex() == v1)
            return true;
    }
    return false;
}

bool connected(const Manifold& m, VertexID v0, VertexID v1)
{
    bool c = false;
    circulate_vertex_ccw(m, v0, [&](VertexID v){ c |= (v == v1); });
    return c;
}
inline Vec3d laplacian(const Manifold& m, VertexID v)
{
    Vec3d p(0);
    int n = circulate_vertex_ccw(m, v, [&](VertexID v){ p += m.pos(v); });
    return p / n - m.pos(v);
}

Vec3d laplacian(const Manifold& m, VertexID v)
{
    Vec3d avg_pos(0);
    int n = 0;
    for(Walker w = m.walker(v); !w.full_circle(); w = w.circulate_vertex_cw()) {
        avg_pos += m.pos(w.vertex());
        ++n;
    }
    return avg_pos / n - m.pos(v);
}
int no_edges(const Manifold& m, FaceID f)
{
    return circulate_face_ccw(m, f, [](Walker w){});
}

int no_edges(const Manifold& m, FaceID f)
{
    // perform full circulation to get valency
    Walker w = m.walker(f);
    for(; !w.full_circle(); w = w.circulate_face_cw());
    return w.no_steps();
}
Conclusions
• Multicore is very important, and the C++11 thread library makes concurrency easy. We will rely on the compiler for SIMD optimization!
• range for is great. It makes code far clearer, and we get rid of iterators in many cases
• move semantics & RVO make clear code faster
• lambda functions improve locality ... awesome with the STL algorithms and std::function
• auto helps us avoid obfuscation with ugly type names
• uniform initialization and initializer lists also make code concise
Discussion
• A C++11 developer version of GEL has branched off: should we go for built-in parallelism?
• Hmm - just so you know - there is much more in the C++11 standard. This is just the part I understand so far...
• Herb Sutter: “We broke all the books!”
• Yet the learning curve is less daunting than when we first had to do templates.