C++11
A subset of the features explored
What is happening?
• We want
• the performance of carefully optimized code
• the convenience of a high level language
• to use all our cores
Example: Laplacian Smoothing
Vertex moves to center of neighbors
Before
After 1k iterations
void laplacian_smooth0(Manifold& m, float weight, int max_iter)
{
    for(int iter = 0; iter < max_iter; ++iter) {
        VertexAttributeVector<Vec3d> L_attr(m.no_vertices());
        for(VertexIDIterator v = m.vertices_begin(); v != m.vertices_end(); ++v)
            if(!boundary(m, *v))
                L_attr[*v] = laplacian(m, *v);
        for(VertexIDIterator v = m.vertices_begin(); v != m.vertices_end(); ++v)
            if(!boundary(m, *v))
                m.pos(*v) += weight * L_attr[*v];
    }
}
Original: 14.6 s
It is so C++98
void laplacian_smooth1(Manifold& m, float weight, int max_iter)
{
    for(int iter = 0; iter < max_iter; ++iter) {
        VertexAttributeVector<Vec3d> L_attr(m.no_vertices());
        for(VertexID v : m.vertices())
            if(!boundary(m, v))
                L_attr[v] = laplacian(m, v);
        for(VertexID v : m.vertices()) {
            if(!boundary(m, v))
                m.pos(v) += weight * L_attr[v];
        }
    }
}
Range for: 14.2 s
Much easier to read. Not only is the for loop clearer, we did away with `*`.
vertices() returns a class that just provides begin and end functions.
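For reference, a minimal sketch of a range wrapper along those lines (VertexIDRange is a made-up name; only VertexIDIterator comes from GEL, and the real class may look different):

// Illustrative sketch: a tiny class whose only job is to make range-for work.
class VertexIDRange {
    VertexIDIterator first, last;
public:
    VertexIDRange(VertexIDIterator b, VertexIDIterator e): first(b), last(e) {}
    VertexIDIterator begin() const { return first; }
    VertexIDIterator end()   const { return last; }
};
// vertices() could then simply return
// VertexIDRange(m.vertices_begin(), m.vertices_end());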
void laplacian_smooth2(Manifold& m, float weight, int max_iter)
{
    auto new_pos = m.positions_attribute_vector();
    for(int iter = 0; iter < max_iter; ++iter) {
        for(auto v : m.vertices())
            if(!boundary(m, v))
                new_pos[v] = weight * laplacian(m, v) + m.pos(v);
        m.positions_attribute_vector() = new_pos;
    }
}
Optimized: 12.4 s
And we only need one loop. We can memory-move the vertex positions.
void laplacian_smooth3(Manifold& m, float weight, int max_iter)
{
    for(int iter = 0; iter < max_iter; ++iter) {
        auto new_pos = m.positions_attribute_vector();
        for(auto v : m.vertices())
            if(!boundary(m, v))
                new_pos[v] = weight * laplacian(m, v) + m.pos(v);
        m.positions_attribute_vector() = move(new_pos);
    }
}
move: 12.6 s
Actually, we should move, but ... oh ...
now I copy somewhere else
void laplacian_smooth4(Manifold& m, float weight, int max_iter)
{
    auto new_pos = m.positions_attribute_vector();
    for(int iter = 0; iter < max_iter; ++iter) {
        for(auto v : m.vertices())
            if(!boundary(m, v))
                new_pos[v] = weight * laplacian(m, v) + m.pos(v);
        swap(m.positions_attribute_vector(), new_pos);
    }
}
swap: 12.1 s
Now we only have two buffers for vertex positions and always read from one and write to the other. Then swap!
I think this version is the sweet spot for single-threaded code.
void laplacian_smooth4_5(Manifold& m, float weight, int max_iter)
{
    auto new_pos = m.positions_attribute_vector();
    for(int iter = 0; iter < max_iter; ++iter) {
        for_each_vertex(m, [&](VertexID v) { new_pos[v] = weight * laplacian(m, v) + m.pos(v); });
        swap(m.positions_attribute_vector(), new_pos);
    }
}
Lambda variation
Not much clearer. Should be about the same performance...
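For reference, a minimal sketch of what for_each_vertex could look like (an assumption; the real GEL helper may also skip boundary vertices or take a std::function):

// Sketch only: apply f to every vertex of the mesh.
template<typename F>
void for_each_vertex(Manifold& m, F f)
{
    for(auto v : m.vertices())
        f(v);
}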
void laplacian_smooth5(Manifold& m, float weight, int max_iter)
{
    for(int iter = 0; iter < max_iter; ++iter) {
        auto new_pos = m.positions_attribute_vector();
        vector<thread> t_vec;
        for(auto v : m.vertices())
            if(!boundary(m, v))
                t_vec.push_back(thread([&](VertexID vid) {
                    if(!boundary(m, vid))
                        new_pos[vid] = weight * laplacian(m, vid) + m.pos(vid); }, v));
        for(int i = 0; i < t_vec.size(); ++i)
            t_vec[i].join();
        m.positions_attribute_vector() = move(new_pos);
    }
}
Threads done wrong: ∞
For a brief moment I must have thought I was coding for a GPU. The first time I timed it, I got a 666 times longer run time.
inline void laplacian_smooth_vertex(Manifold& m, vector<VertexID>& vids,
                                    VertexAttributeVector<Vec3d>& new_pos,
                                    float weight)
{
    for(auto v : vids)
        new_pos[v] = m.pos(v) + weight * laplacian(m, v);
}

void laplacian_smooth6(Manifold& m, float weight, int max_iter)
{
    vector<vector<VertexID>> vertex_ids(CORES);
    auto batch_size = m.no_vertices() / CORES;
    int cnt = 0;
    for_each_vertex(m, [&](VertexID v) {
        if(!boundary(m, v))
            vertex_ids[(cnt++ / batch_size) % CORES].push_back(v);
    });
    vector<thread> t_vec(CORES);
    VertexAttributeVector<Vec3d> new_pos = m.positions_attribute_vector();
    for(int iter = 0; iter < max_iter; ++iter) {
        for(int thread_no = 0; thread_no < CORES; ++thread_no)
            t_vec[thread_no] = thread(laplacian_smooth_vertex,
                                      ref(m), ref(vertex_ids[thread_no]),
                                      ref(new_pos), weight);
        for(int thread_no = 0; thread_no < CORES; ++thread_no)
            t_vec[thread_no].join();
        swap(m.positions_attribute_vector(), new_pos);
    }
}
2.5 s
Almost a five-fold performance improvement with four physical cores. Hyperthreading works!!
CORES = 8
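CORES is a hard-coded constant here. If one wanted to pick it up at run time instead, C++11 offers std::thread::hardware_concurrency() (a sketch, not how the slides define CORES):

#include <thread>

// Sketch: query the number of hardware threads at run time.
// hardware_concurrency() may return 0 if the value is unknown,
// so fall back to a sensible default.
unsigned num_cores()
{
    unsigned n = std::thread::hardware_concurrency();
    return n != 0 ? n : 4;
}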
Statistics (run times in seconds; rightmost column is the median of the five runs)

            Run 1  Run 2  Run 3  Run 4  Run 5  Median
Baseline     14.6   14.6   14.5   14.6   14.6    14.6
Range for    14.4   14.2   14.2   14.2   14.2    14.2
Copy back    12.4   12.4   12.4   12.4   12.4    12.4
Move back    12.5   12.5   12.9   12.9   12.6    12.6
Swap         12.1   12.1   12.1   12.1   12.2    12.1
2 threads     6.8    6.7    6.8    6.7    6.7     6.7
4 threads     4.1    4.1    4.1    4.1    4.1     4.1
8 threads     2.5    2.5    2.5    2.5    2.5     2.5
Now make it generic!
typedef vector<vector<VertexID>> VertexIDBatches;

VertexIDBatches batch_vertices(Manifold& m) {
    VertexIDBatches vertex_ids(CORES);
    auto batch_size = m.no_vertices() / CORES;
    int cnt = 0;
    for_each_vertex(m, [&](VertexID v) {
        if(!boundary(m, v))
            vertex_ids[(cnt++ / batch_size) % CORES].push_back(v);
    });
    return vertex_ids;
}

template<typename T>
void for_each_vertex_parallel(int no_threads, const VertexIDBatches& batches, T& f) {
    vector<thread> t_vec(no_threads);
    for(auto t : range(0, no_threads))
        t_vec[t] = thread(f, ref(batches[t]));
    for(auto t : range(0, no_threads))
        t_vec[t].join();
}
#1 produces a vector of vectors of vertex IDs.
#2 actually spawns off the worker threads.
void laplacian_smooth7(Manifold& m, float weight, int max_iter)
{
    auto vertex_ids = batch_vertices(m);
    auto new_pos = m.positions_attribute_vector();
    auto f = [&](const vector<VertexID>& vids) {
        for(VertexID v : vids)
            new_pos[v] = m.pos(v) + weight * laplacian(m, v);
    };
    for(auto _ : range(0, max_iter)) {
        for_each_vertex_parallel(CORES, vertex_ids, f);
        swap(m.positions_attribute_vector(), new_pos);
    }
}
Slightly faster, much simpler. Note that I threw in a range class to get rid of old-school for loops.
2.4 s
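range is not a standard C++11 facility; a minimal integer-range class along these lines would be enough to support the loops above (illustrative only, the real helper may differ):

// Sketch: just enough of an integer range to support range-for.
class range {
    int first, last;
public:
    class iterator {
        int i;
    public:
        explicit iterator(int i_): i(i_) {}
        int operator*() const { return i; }
        iterator& operator++() { ++i; return *this; }
        bool operator!=(const iterator& o) const { return i != o.i; }
    };
    range(int b, int e): first(b), last(e) {}
    iterator begin() const { return iterator(first); }
    iterator end()   const { return iterator(last); }
};
// usage: for(auto t : range(0, no_threads)) ...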
template<typename T>
void for_each_vertex_parallel(int no_threads, const VertexIDBatches& batches, T& f) {
    vector<future<void>> f_vec(no_threads);
    for(auto t : range(0, no_threads))
        f_vec[t] = async(launch::async, f, ref(batches[t]));
}

template<typename T>
void for_each_vertex_parallel(int no_threads, const VertexIDBatches& batches, T& f) {
    vector<thread> t_vec(no_threads);
    for(auto t : range(0, no_threads))
        t_vec[t] = thread(f, ref(batches[t]));
    for(auto t : range(0, no_threads))
        t_vec[t].join();
}
See, the code above is simpler, and the destructor joins! But what happens if we ignore the future?!
The async code takes 50% more time than the old code where I join threads explicitly. Not sure why?!
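One variation worth timing (a sketch, not from the slides): keep the futures and call get() explicitly, which makes the synchronization point visible instead of relying on the blocking destructors of the futures returned by async.

template<typename T>
void for_each_vertex_parallel(int no_threads, const VertexIDBatches& batches, T& f) {
    vector<future<void>> f_vec(no_threads);
    for(auto t : range(0, no_threads))
        f_vec[t] = async(launch::async, f, ref(batches[t]));
    // Explicit synchronization: wait for each task before returning.
    for(auto t : range(0, no_threads))
        f_vec[t].get();
}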
More C++11 examples
Polymorphism with std::function
class MyClass {
    int c;
public:
    MyClass(int _c): c(_c) {}
    function<int(int)> fun;
    void set_fun(function<int(int,int)> f) {
        fun = bind1st(f, c);
    }
};

int fun1(int c, int x) { return c*x; }
int fun2(int c, int x) { return x/c; }

int main(int argc, const char* argv[]) {
    MyClass m1{1}, m2{2};
    m1.set_fun(fun1);
    m2.set_fun(fun2);
    cout << m1.fun(42) << " " << m2.fun(42) << endl;
}
Maybe more exotic than actually useful, but it is instructive that polymorphism can be achieved so differently from the virtual-function approach.
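For what it is worth, bind1st is on its way out; the same binding can be done with std::bind or a lambda. A sketch of an alternative (MyClass2 is just an illustrative name, not from the slides):

#include <functional>
using namespace std;

class MyClass2 {
    int c;
public:
    MyClass2(int _c): c(_c) {}
    function<int(int)> fun;
    void set_fun(function<int(int,int)> f) {
        // Bind the first argument to c; placeholders::_1 forwards x.
        fun = bind(f, c, placeholders::_1);
        // Or, equivalently, capture c in a lambda:
        // int cc = c; fun = [f, cc](int x) { return f(cc, x); };
    }
};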
Kinder, gentler member init
class VisObj
{
    std::string file;
    GLGraphics::GLViewController view_ctrl;
    bool create_display_list;
    HMesh::Manifold mani;
    HMesh::Manifold old_mani;

    Harmonics* harmonics;
    GLGraphics::ManifoldRenderer* renderer;
    CGLA::Vec3d bsphere_center;
    float bsphere_radius;
public:
    VisObj(): file(""), view_ctrl(WINX, WINY, CGLA::Vec3f(0), 1.0),
              create_display_list(true), harmonics(0) {}
    // ... and so on
We never really liked these long initialization lists and always wondered why we could not just initialize when we declare.
Kinder, gentler member init
class VisObj
{
    std::string file = "";
    GLGraphics::GLViewController view_ctrl =
        GLGraphics::GLViewController(WINX, WINY, CGLA::Vec3f(0), 1.0);
    bool create_display_list = true;
    HMesh::Manifold mani;
    HMesh::Manifold old_mani;

    Harmonics* harmonics = nullptr;
    GLGraphics::ManifoldRenderer* renderer = nullptr;
    CGLA::Vec3d bsphere_center;
    float bsphere_radius;
public:
    VisObj() {}
    // ... and so on
Now, we can! What
is up with nullptr?!
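nullptr has its own type (std::nullptr_t) and converts only to pointer types, so it can never be mistaken for an integer the way 0 or NULL can. A small self-contained example of the surprise it prevents:

#include <iostream>

void f(int)   { std::cout << "f(int)\n"; }
void f(char*) { std::cout << "f(char*)\n"; }

int main() {
    f(0);        // calls f(int): 0 is an int first
    f(nullptr);  // calls f(char*): nullptr can only be a pointer
}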
ArithVec changes
template <class T, class V, unsigned int N>
class ArithVec
{
protected:
    /// The actual contents of the vector.
    std::array<T,N> data;
    // .........

Look, I did away with C-style arrays
ArithVec::ArithVec(T _a, T _b, T _c, T _d)
{
    assert(N==4);
    data[0] = _a;
    data[1] = _b;
    data[2] = _c;
    data[3] = _d;
}

ArithVec::ArithVec(T _a, T _b, T _c, T _d):
    data({_a,_b,_c,_d}) { assert(N==4); }
Look! An initializer list ... hmmm, MSVC does not like it
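If the MSVC complaint is about aggregate initialization of the std::array member, the double-brace form is a workaround that sometimes helps (a guess at the cause, not verified against that compiler):

// Sketch: the extra braces initialize the inner C array of std::array
// explicitly, which older compilers tend to accept more readily.
ArithVec::ArithVec(T _a, T _b, T _c, T _d):
    data{{_a, _b, _c, _d}} { assert(N==4); }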
/// Assignment multiplication with scalar.
const V& ArithVec::operator*=(T k)
{
    std::transform(data, &data[N], data,
                   std::bind2nd(std::multiplies<T>(), k));
    return static_cast<const V&>(*this);
}

/// Assignment multiplication with scalar.
const V& ArithVec::operator*=(T k)
{
    std::for_each(begin(), end(), [k](T& x){ x *= k; });
    return static_cast<const V&>(*this);
}
Note: begin() and end()
make the code nicer
than before
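Presumably begin() and end() just forward to the std::array member. A toy self-contained sketch of that pattern (ToyVec is illustrative, not the real ArithVec):

#include <algorithm>
#include <array>
#include <cassert>

// Sketch (assumption): forward iteration to the underlying std::array,
// which is what lets the lambda version and range-for work directly.
template <class T, unsigned int N>
struct ToyVec {
    std::array<T, N> data;
    typename std::array<T, N>::iterator begin() { return data.begin(); }
    typename std::array<T, N>::iterator end()   { return data.end(); }
    ToyVec& operator*=(T k) {
        std::for_each(begin(), end(), [k](T& x) { x *= k; });
        return *this;
    }
};

int main() {
    ToyVec<double, 4> v = {{{1, 2, 3, 4}}};
    v *= 2.0;
    assert(v.data[3] == 8.0);
}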
/// Assignment multiplication with scalar.
const V& ArithVec::operator*=(T k)
{
    std::transform(data, &data[N], data,
                   std::bind2nd(std::multiplies<T>(), k));
    return static_cast<const V&>(*this);
}

/// Assignment multiplication with scalar.
const V& ArithVec::operator*=(T k)
{
    for(auto& x : data) { x *= k; }
    return static_cast<const V&>(*this);
}
Morten: this is
actually simpler!
bool ArithVec::operator==(const V& v) const
{
    return std::equal(begin(), end(), v.begin());
}

bool ArithVec::operator==(const V& v) const
{
    return std::inner_product(data, &data[N], v.get(), true,
                              std::logical_and<bool>(), std::equal_to<T>());
}
Just to use the obvious. This was possible before C++11.
circulate with functors
inline int circulate_vertex_ccw(const Manifold& m, VertexID v,
                                std::function<void(Walker&)> f)
{
    Walker w = m.walker(v);
    for(; !w.full_circle(); w = w.circulate_vertex_ccw()) f(w);
    return w.no_steps();
}

inline int circulate_vertex_ccw(const Manifold& m, VertexID v,
                                std::function<void(VertexID)> f)
{
    return circulate_vertex_ccw(m, v, [&](Walker& w){ f(w.vertex()); });
}
Five slides that show what we can do by having circulator functions that accept functors.
int valency(const Manifold& m, VertexID v)
{
    // perform full circulation to get valency
    Walker vj = m.walker(v);
    while(!vj.full_circle())
        vj = vj.circulate_vertex_cw();
    return vj.no_steps();
}

int valency(const Manifold& m, VertexID v)
{
    return circulate_vertex_ccw(m, v, [](Walker){});
}
bool connected(const Manifold& m, VertexID v0, VertexID v1)
{
    for(Walker vj = m.walker(v0); !vj.full_circle();
        vj = vj.circulate_vertex_cw()) {
        if(vj.vertex() == v1)
            return true;
    }
    return false;
}

bool connected(const Manifold& m, VertexID v0, VertexID v1)
{
    bool c = false;
    circulate_vertex_ccw(m, v0, [&](VertexID v){ c |= (v == v1); });
    return c;
}
inline Vec3d laplacian(const Manifold& m, VertexID v)
{
    Vec3d p(0);
    int n = circulate_vertex_ccw(m, v, [&](VertexID v){ p += m.pos(v); });
    return p / n - m.pos(v);
}

Vec3d laplacian(const Manifold& m, VertexID v)
{
    Vec3d avg_pos(0);
    int n = 0;
    for(Walker w = m.walker(v); !w.full_circle(); w = w.circulate_vertex_cw()) {
        avg_pos += m.pos(w.vertex());
        ++n;
    }
    return avg_pos / n - m.pos(v);
}
int no_edges(const Manifold& m, FaceID f)
{
    return circulate_face_ccw(m, f, [](Walker w){});
}

int no_edges(const Manifold& m, FaceID f)
{
    // perform full circulation to get valency
    Walker w = m.walker(f);
    for(; !w.full_circle(); w = w.circulate_face_cw());
    return w.no_steps();
}
Conclusions
• Multicore is very important, and the C++11 thread library makes concurrency easy. We will rely on the compiler for SIMD optimization!
• range for is great. It makes code far clearer, and we get rid of iterators in many cases
• move semantics & RVO make clear code faster
• lambda functions improve locality ... awesome with the STL algorithms and std::function
• auto helps us avoid obfuscation with ugly type names
• uniform initialization and initializer lists also make code concise
Discussion
• A C++11 developer version of GEL has branched off: should we go for built-in parallelism?
• Hmm - just so you know - there is much more in the C++11 standard. This is just the part I understand so far...
• Herb Sutter: “We broke all the books!”
• Yet the learning curve is less daunting than when we first had to do templates.