SlideShare a Scribd company logo
GPU
Programming
on CPUs
Using C++AMP
Miller Lee
Outline
1. Introduction to C++AMP
2. Introduction to Tiling
3. tile_static
4. barrier.wait and solutions
a. C++11 thread
b. setjmp/longjmp
c. ucontext
2
(Homogeneous coordinates)
(0, 0) (0, 1) (0, 2) (0, 3)
(1, 0) (1, 1) (1, 2) (1, 3)
(2, 0) (2, 1) (2, 2) (2, 3)
(3, 0) (3, 1) (3, 2) (3, 3)
X
0
1
2
3
Matrix A b
=
0
1
2
3
result
Computing example
● Simple matrix multiplication
3
C++ Version
1. int A[4][4];
2. int b[4];
3. int result[4];
4. for (int i = 0; i < 4; i++) {
5. result[i] = 0;
6. for (int j = 0; j < 4; j++)
7. result[i] += A[i][j] * b[j];
8. } 4
C++AMP Version
1. array_view<float, 2> A(4, 4);
2. array_view<float, 1> b(4);
3. array_view<float, 1> result(4);
4. extent<1> ext(4);
5. parallel_for_each(ext, [&](index<1> idx) restrict(amp)
6. {
7. result[idx[0]] = 0;
8. for (int i = 0; i < 4; i++)
9. result[idx[0]] += A(idx[0], i) * b(i);
10. });
5
memory access
0 1 2 3
P0 P1 P2 P3
global memory
b
100t
Total access time = 400t 6
shared memory
0 1 2 3
shared memory
10t
100t
Total access time = 130t
b
7
1. array_view<float, 2> A(4, 4);
2. array_view<float, 1> b(4);
3. array_view<float, 1> result(4);
4. extent<1> ext(4);
5. parallel_for_each(ext.tile<4>(), [&](tiled_index<4> tidx)
restrict(amp)
6. {
7. int local = tidx.local[0];
8. int global = tidx.global[0];
9. tile_static int buf[4];
10. buf[local] = b[global];
11. tidx.barrier.wait();
12. result[global] = 0;
13. for (int i = 0; i < 4; i++)
14. result[global] += A[global][i] * buf[i];
15. }); 8
barrier
9
Architecture
source: NVIDIA TESLA: A UNIFIED GRAPHICS AND COMPUTING ARCHITECTURE
shared memory
accessible to all SPs
10
Goal
● Implement all the C++AMP functions on CPU
instead of GPU without any compiler
modification.
11
tile_static
● The limitation of C++ syntax leads to the
following choices
○ const, volatile
○ __attribute__(...)
○ static
● Choose static
○ static memory can be shared among all the threads
○ side effect: At most one thread group can be
executed at the same time.
#define tile_static static
12
Barrier.wait
● Threads in the same thread group will
wait at the point where “wait” is called.
● Program can
a. perform real barrier action
b. jump out of current execution context
13
● True threading
○ C++11 thread
● Fake threading(Coroutines)
○ setjmp/longjmp
○ makecontext/getcontext/swapcontext/setcontext
Approaches
14
C++11 thread
● launch hundreds of threads at a time.
● implemented my own barrier by using C++11
mutex library.
→ extremely slow.
→ The data on static memory will be corrupted
15
setjmp/longjmp
● int setjmp(jmp_buf env)
○ setjmp() saves the stack context/environment in env
for later use by longjmp.
○ The stack context will be invalidated if the function
which called setjmp() returns.
● void longjmp(jmp_buf env, int val);
○ longjmp() restores the environment saved by the last
call of setjmp.
16
1. #include <stdio.h>
2. #include <setjmp.h>
3. jmp_buf buf;
4. void wait(void) {
5. printf("wait\n"); // prints
6. longjmp(buf,1);
7. }
8. void first(void) {
9. wait();
10. printf("first\n"); // does not print
11. }
12. int main() {
13. if (!setjmp(buf))
14. first(); // when executed, setjmp returns 0
15. else // when longjmp jumps back, setjmp returns 1
16. printf("main\n"); // prints
17. return 0;
18. }
17
Pseudo code (1)
void entry()
{
while(!finish)
for(t : tasks)
run(t)
}
void fun()
{
…
wait();
...
}
void fun()
{
…
wait();
...
}
void entry()
{
while(!finish)
for(t : tasks)
run(t)
}
void fun()
{
…
wait();
...
}
void fun()
{
…
wait();
...
}
18
Pseudo code (2)
void entry()
{
while(!finish)
for(t : tasks)
run(t)
}
void fun()
{
…
wait();
...
}
void fun()
{
…
wait();
...
}
void entry()
{
while(!finish)
for(t : tasks)
run(t)
}
void fun()
{
…
wait();
...
}
void fun()
{
…
wait();
...
}
19
1. #include <stdio.h>
2. #include <setjmp.h>
3. jmp_buf buf, b;
4. void wait(void) {
5. printf("wait\n");
6. if (setjmp(b) == 0)
7. longjmp(buf,1);
8. }
9. void first(void) {
10. wait();
11. }
12. int main() {
13. if (!setjmp(buf) )
14. first();
15. else {
16. printf("main\n");
17. longjmp(b, 10);
18. }
19. return 0;
20. } 20
1. #include <stdio.h>
2. #include <setjmp.h>
3. jmp_buf buf, b;
4. void wait(void) {
5. printf("wait\n");
6. if (setjmp(b) == 0)
7. longjmp(buf,1);
8. }
9. void first(void) {
10. wait();
11. }
12. int main() {
13. if (!setjmp(buf) )
14. first();
15. else {
16. printf("main\n");
17. longjmp(b, 10);
18. }
19. return 0;
20. }
buf
21
1. #include <stdio.h>
2. #include <setjmp.h>
3. jmp_buf buf, b;
4. void wait(void) {
5. printf("wait\n");
6. if (setjmp(b) == 0)
7. longjmp(buf,1);
8. }
9. void first(void) {
10. wait();
11. }
12. int main() {
13. if (!setjmp(buf) )
14. first();
15. else {
16. printf("main\n");
17. longjmp(b, 10);
18. }
19. return 0;
20. }
ret address
buf
b
22
1. #include <stdio.h>
2. #include <setjmp.h>
3. jmp_buf buf, b;
4. void wait(void) {
5. printf("wait\n");
6. if (setjmp(b) == 0)
7. longjmp(buf,1);
8. }
9. void first(void) {
10. wait();
11. }
12. int main() {
13. if (!setjmp(buf) )
14. first();
15. else {
16. printf("main\n");
17. longjmp(b, 10);
18. }
19. return 0;
20. }
buf
b
23
1. #include <stdio.h>
2. #include <setjmp.h>
3. jmp_buf buf, b;
4. void wait(void) {
5. printf("wait\n");
6. if (setjmp(b) == 0)
7. longjmp(buf,1);
8. }
9. void first(void) {
10. wait();
11. }
12. int main() {
13. if (!setjmp(buf) )
14. first();
15. else {
16. printf("main\n");
17. longjmp(b, 10);
18. }
19. return 0;
20. }
Cannot return
???
???
???
buf
b
24
Problems
● Cannot return
○ return address in the stack is destroyed
● Cannot use too many static variables
○ will lose spilled registers
→ can be solved by using “alloca”
http://www.codemud.net/~thinker/GinGin_CGI.py/show_id_doc/489
25
ucontext.h
● ucontext_t
● getcontext
● makecontext
● swapcontext
● setcontext
26
ucontext_t
typedef struct ucontext {
struct ucontext *uc_link;
sigset_t uc_sigmask;
stack_t uc_stack;
mcontext_t uc_mcontext;
...
} ucontext_t;
● uc_link
○ points to the context that will be resumed when the current context
terminates
● uc_stack
○ the stack used by this context
● uc_mcontext
○ machine-specific representation of the saved context, that includes the
calling thread's machine registers
27
Functions
● int getcontext(ucontext_t *ucp);
○ initializes the structure pointed at by ucp.
● int setcontext(const ucontext_t *ucp);
○ restores the user context pointed at by ucp
● int swapcontext(ucontext_t *oucp, const
ucontext_t *ucp);
○ saves the current context in the structure pointed to
by oucp, and then activates the context pointed to by
ucp.
28
makecontext
● void makecontext(ucontext_t *ucp, void
(*func)(), int argc, ...);
○ glibc (x86_64) saves the arguments to registers
instead of pushing them on the stack, as the AMD64 ABI
specifies
○ The size of the arguments that are passed to
makecontext should be no less than sizeof(register)
29
1. #include <stdio.h>
2. #include <ucontext.h>
3. static ucontext_t ctx[2];
4. static void f1 (void) {
5. puts("start f1");
6. swapcontext(&ctx[1], &ctx[0]);
7. puts("finish f1");
8. }
9. int main (void)
10. {
11. char st1[8192];
12. getcontext(&ctx[1]);
13. ctx[1].uc_stack.ss_sp = st1;
14. ctx[1].uc_stack.ss_size = sizeof st1;
15. ctx[1].uc_link = &ctx[0];
16. makecontext(&ctx[1], f1, 0);
17. swapcontext(&ctx[0], &ctx[1]);
18. swapcontext(&ctx[0], &ctx[1]);
19. return 0;
20. } 30
1. #include <stdio.h>
2. #include <ucontext.h>
3. static ucontext_t ctx[3];
4. static void f1 (void) {
5. puts("start f1");
6. swapcontext(&ctx[1], &ctx
[0]);
7. puts("finish f1");
8. }
9. static void f2 (void)
10. {
11. puts("start f2");
12. swapcontext(&ctx[2], &ctx
[1]);
13. puts("finish f2");
14. }
1. int main (void)
2. {
3. char st1[8192], st2[8192];
4. getcontext(&ctx[1]);
5. ctx[1].uc_stack.ss_sp = st1;
6. ctx[1].uc_stack.ss_size = sizeof
st1;
7. ctx[1].uc_link = &ctx[0];
8. makecontext(&ctx[1], f1, 0);
9.
10. getcontext(&ctx[2]);
11. ctx[2].uc_stack.ss_sp = st2;
12. ctx[2].uc_stack.ss_size = sizeof
st2;
13. ctx[2].uc_link = &ctx[1];
14. makecontext(&ctx[2], f2, 0);
15. swapcontext(&ctx[0], &ctx[2]);
16. swapcontext(&ctx[0], &ctx[2]);
17. return 0;
18. }
31
Fake threading (yield)
void entry()
{
setup(fun, 2);
while(!finish)
switch_to();
}
void fun()
{
…
wait();
...
}
void fun()
{
…
wait();
...
}
32
void entry()
{
setup(fun, 2);
while(!finish)
switch_to();
}
void fun()
{
…
wait();
...
}
void fun()
{
…
wait();
...
}
Problems
1. How to pass a lambda?
○ makecontext(&ctx,
(void (*)(void))&Kernel::operator(), …);
2. How to pass non-int arguments?
○ What if sizeof(Type) > sizeof(int)
○ How about complex structure and class
33
Pass lambda
1. Use a wrapper function!!
template <typename Ker, typename Arg>
void fun(Ker k, Arg arg)
{
k(arg);
}
template <typename Ker, typename Arg>
void makectx(Ker k, Arg arg)
{
makecontext(&ctx, (void (*)(void))fun<Ker, Arg>, 2, k, arg);
}
34
Pass non-int arguments
2. Pass pointer instead!!
template <typename Ker, typename Arg>
void fun(Ker *k, Arg *arg)
{
(*k)(*arg);
}
template <typename Ker, typename Arg>
void makectx(Ker k, Arg arg)
{
makecontext(&ctx, (void (*)(void))fun<Ker, Arg>, 2, &k, &arg);
}
35
Additional
● Use a counter so that we can spawn
coroutines dynamically
● Can it be multithreaded? Yes
36
true threading
barrier
There are 12 threads in one thread group
37
one thread
barrier
38
multithreading
barrier
Hardware Core = 4
39
barrier
struct bar_t {
unsigned const count;
std::atomic<unsigned> spaces;
std::atomic<unsigned> generation;
bar_t(unsigned count_) :
count(count_), spaces(count_), generation(0)
{}
void wait() noexcept {
unsigned const my_generation = generation;
if (!--spaces) {
spaces = count;
++generation;
} else {
while(generation == my_generation);
}
}
}; source: C++ Concurrency in Action: Practical Multithreading
40
Summary
● It works fine on AMP right now
● The importance of low level knowledge
41
42

More Related Content

PDF
C++ amp on linux
Miller Lee
 
PPTX
ISCA Final Presentaiton - Compilations
HSA Foundation
 
PDF
C++ How I learned to stop worrying and love metaprogramming
cppfrug
 
PPT
Intro2 Cuda Moayad
Moayadhn
 
PDF
Tiramisu をちょっと、味見してみました。
Mr. Vengineer
 
PDF
Joel Falcou, Boost.SIMD
Sergey Platonov
 
PDF
TVM VTA (TSIM)
Mr. Vengineer
 
PDF
Vc4c development of opencl compiler for videocore4
nomaddo
 
C++ amp on linux
Miller Lee
 
ISCA Final Presentaiton - Compilations
HSA Foundation
 
C++ How I learned to stop worrying and love metaprogramming
cppfrug
 
Intro2 Cuda Moayad
Moayadhn
 
Tiramisu をちょっと、味見してみました。
Mr. Vengineer
 
Joel Falcou, Boost.SIMD
Sergey Platonov
 
TVM VTA (TSIM)
Mr. Vengineer
 
Vc4c development of opencl compiler for videocore4
nomaddo
 

What's hot (20)

PPTX
Story of static code analyzer development
Andrey Karpov
 
DOCX
Histogram dan Segmentasi 2
Lusiana Diyan
 
PDF
개발 과정 최적화 하기 내부툴로 더욱 강력한 개발하기 Stephen kennedy _(11시40분_103호)
changehee lee
 
PDF
OpenGL SC 2.0 Quick Reference
The Khronos Group Inc.
 
PDF
Vulkan 1.1 Reference Guide
The Khronos Group Inc.
 
PDF
Powered by Python - PyCon Germany 2016
Steffen Wenz
 
PDF
4Developers 2018: Evolution of C++ Class Design (Mariusz Łapiński)
PROIDEA
 
PDF
Dafunctor
Buganini Chiu
 
PDF
Facebook Glow Compiler のソースコードをグダグダ語る会
Mr. Vengineer
 
PDF
Kirk Shoop, Reactive programming in C++
Sergey Platonov
 
PDF
HKG15-207: Advanced Toolchain Usage Part 3
Linaro
 
PDF
Global Interpreter Lock: Episode I - Break the Seal
Tzung-Bi Shih
 
PDF
Cluj.py Meetup: Extending Python in C
Steffen Wenz
 
PDF
深入淺出C語言
Simen Li
 
PDF
Google Edge TPUで TensorFlow Liteを使った時に 何をやっているのかを妄想してみる 2 「エッジAIモダン計測制御の世界」オ...
Mr. Vengineer
 
PDF
Open CL For Speedup Workshop
Ofer Rosenberg
 
PDF
Interpreter, Compiler, JIT from scratch
National Cheng Kung University
 
PDF
Windbg랑 친해지기
Ji Hun Kim
 
PDF
Cluj Big Data Meetup - Big Data in Practice
Steffen Wenz
 
PDF
Pythonによるカスタム可能な高位設計技術 (Design Solution Forum 2016@新横浜)
Shinya Takamaeda-Y
 
Story of static code analyzer development
Andrey Karpov
 
Histogram dan Segmentasi 2
Lusiana Diyan
 
개발 과정 최적화 하기 내부툴로 더욱 강력한 개발하기 Stephen kennedy _(11시40분_103호)
changehee lee
 
OpenGL SC 2.0 Quick Reference
The Khronos Group Inc.
 
Vulkan 1.1 Reference Guide
The Khronos Group Inc.
 
Powered by Python - PyCon Germany 2016
Steffen Wenz
 
4Developers 2018: Evolution of C++ Class Design (Mariusz Łapiński)
PROIDEA
 
Dafunctor
Buganini Chiu
 
Facebook Glow Compiler のソースコードをグダグダ語る会
Mr. Vengineer
 
Kirk Shoop, Reactive programming in C++
Sergey Platonov
 
HKG15-207: Advanced Toolchain Usage Part 3
Linaro
 
Global Interpreter Lock: Episode I - Break the Seal
Tzung-Bi Shih
 
Cluj.py Meetup: Extending Python in C
Steffen Wenz
 
深入淺出C語言
Simen Li
 
Google Edge TPUで TensorFlow Liteを使った時に 何をやっているのかを妄想してみる 2 「エッジAIモダン計測制御の世界」オ...
Mr. Vengineer
 
Open CL For Speedup Workshop
Ofer Rosenberg
 
Interpreter, Compiler, JIT from scratch
National Cheng Kung University
 
Windbg랑 친해지기
Ji Hun Kim
 
Cluj Big Data Meetup - Big Data in Practice
Steffen Wenz
 
Pythonによるカスタム可能な高位設計技術 (Design Solution Forum 2016@新横浜)
Shinya Takamaeda-Y
 
Ad

Similar to GPU Programming on CPU - Using C++AMP (20)

PDF
C++ CoreHard Autumn 2018. Concurrency and Parallelism in C++17 and C++20/23 -...
corehard_by
 
PDF
TLPI - 6 Process
Shu-Yu Fu
 
PPTX
Blazing Fast Windows 8 Apps using Visual C++
Microsoft Developer Network (MSDN) - Belgium and Luxembourg
 
PPT
Shared Memory Programming with Pthreads (1).ppt
MALARMANNANA1
 
PDF
HIS 2017 Mark Batty-Industrial concurrency specification for C/C++
jamieayre
 
PDF
Processes and Threads
Emery Berger
 
PDF
posix.pdf
ssuserf39414
 
PDF
OSTEP Chapter2 Introduction
Shuya Osaki
 
PPTX
Putting a Fork in Fork (Linux Process and Memory Management)
David Evans
 
PDF
Parallel Programming
Roman Okolovich
 
PDF
Unix Programs
Griffinder VinHai
 
PPTX
Threads and multi threading
Antonio Cesarano
 
PPT
CS4961-L9.ppt
MarlonMagtibay2
 
PPTX
CS345 09 - Ch04 Threads operating system1.pptx
RichaAgnihotri13
 
PPTX
24-sync-basic.pptx
ntploc22
 
PDF
Giorgio zoppi cpp11concurrency
Giorgio Zoppi
 
PDF
Appsec obfuscator reloaded
Cyber Security Alliance
 
PDF
Multithreaded Programming Part- II.pdf
Harika Pudugosula
 
PDF
Consider the fork_examplec code under Example code for pr.pdf
abinayamobiles
 
PDF
Unmanaged Parallelization via P/Invoke
Dmitri Nesteruk
 
C++ CoreHard Autumn 2018. Concurrency and Parallelism in C++17 and C++20/23 -...
corehard_by
 
TLPI - 6 Process
Shu-Yu Fu
 
Blazing Fast Windows 8 Apps using Visual C++
Microsoft Developer Network (MSDN) - Belgium and Luxembourg
 
Shared Memory Programming with Pthreads (1).ppt
MALARMANNANA1
 
HIS 2017 Mark Batty-Industrial concurrency specification for C/C++
jamieayre
 
Processes and Threads
Emery Berger
 
posix.pdf
ssuserf39414
 
OSTEP Chapter2 Introduction
Shuya Osaki
 
Putting a Fork in Fork (Linux Process and Memory Management)
David Evans
 
Parallel Programming
Roman Okolovich
 
Unix Programs
Griffinder VinHai
 
Threads and multi threading
Antonio Cesarano
 
CS4961-L9.ppt
MarlonMagtibay2
 
CS345 09 - Ch04 Threads operating system1.pptx
RichaAgnihotri13
 
24-sync-basic.pptx
ntploc22
 
Giorgio zoppi cpp11concurrency
Giorgio Zoppi
 
Appsec obfuscator reloaded
Cyber Security Alliance
 
Multithreaded Programming Part- II.pdf
Harika Pudugosula
 
Consider the fork_examplec code under Example code for pr.pdf
abinayamobiles
 
Unmanaged Parallelization via P/Invoke
Dmitri Nesteruk
 
Ad

Recently uploaded (20)

PPTX
TRAVEL APIs | WHITE LABEL TRAVEL API | TOP TRAVEL APIs
philipnathen82
 
PDF
What to consider before purchasing Microsoft 365 Business Premium_PDF.pdf
Q-Advise
 
DOCX
Can You Build Dashboards Using Open Source Visualization Tool.docx
Varsha Nayak
 
PDF
Summary Of Odoo 18.1 to 18.4 : The Way For Odoo 19
CandidRoot Solutions Private Limited
 
PDF
Bandai Playdia The Book - David Glotz
BluePanther6
 
PDF
Key Features to Look for in Arizona App Development Services
Net-Craft.com
 
PDF
Generating Union types w/ Static Analysis
K. Matthew Dupree
 
PDF
ChatPharo: an Open Architecture for Understanding How to Talk Live to LLMs
ESUG
 
PPTX
The-Dawn-of-AI-Reshaping-Our-World.pptxx
parthbhanushali307
 
PPTX
Odoo Integration Services by Candidroot Solutions
CandidRoot Solutions Private Limited
 
PDF
Adobe Illustrator Crack Full Download (Latest Version 2025) Pre-Activated
imang66g
 
PDF
Salesforce Implementation Services Provider.pdf
VALiNTRY360
 
PDF
Exploring AI Agents in Process Industries
amoreira6
 
PDF
MiniTool Power Data Recovery Crack New Pre Activated Version Latest 2025
imang66g
 
PDF
On Software Engineers' Productivity - Beyond Misleading Metrics
Romén Rodríguez-Gil
 
PDF
10 posting ideas for community engagement with AI prompts
Pankaj Taneja
 
PDF
Applitools Platform Pulse: What's New and What's Coming - July 2025
Applitools
 
PDF
Download iTop VPN Free 6.1.0.5882 Crack Full Activated Pre Latest 2025
imang66g
 
PPTX
Web Testing.pptx528278vshbuqffqhhqiwnwuq
studylike474
 
PDF
New Download MiniTool Partition Wizard Crack Latest Version 2025
imang66g
 
TRAVEL APIs | WHITE LABEL TRAVEL API | TOP TRAVEL APIs
philipnathen82
 
What to consider before purchasing Microsoft 365 Business Premium_PDF.pdf
Q-Advise
 
Can You Build Dashboards Using Open Source Visualization Tool.docx
Varsha Nayak
 
Summary Of Odoo 18.1 to 18.4 : The Way For Odoo 19
CandidRoot Solutions Private Limited
 
Bandai Playdia The Book - David Glotz
BluePanther6
 
Key Features to Look for in Arizona App Development Services
Net-Craft.com
 
Generating Union types w/ Static Analysis
K. Matthew Dupree
 
ChatPharo: an Open Architecture for Understanding How to Talk Live to LLMs
ESUG
 
The-Dawn-of-AI-Reshaping-Our-World.pptxx
parthbhanushali307
 
Odoo Integration Services by Candidroot Solutions
CandidRoot Solutions Private Limited
 
Adobe Illustrator Crack Full Download (Latest Version 2025) Pre-Activated
imang66g
 
Salesforce Implementation Services Provider.pdf
VALiNTRY360
 
Exploring AI Agents in Process Industries
amoreira6
 
MiniTool Power Data Recovery Crack New Pre Activated Version Latest 2025
imang66g
 
On Software Engineers' Productivity - Beyond Misleading Metrics
Romén Rodríguez-Gil
 
10 posting ideas for community engagement with AI prompts
Pankaj Taneja
 
Applitools Platform Pulse: What's New and What's Coming - July 2025
Applitools
 
Download iTop VPN Free 6.1.0.5882 Crack Full Activated Pre Latest 2025
imang66g
 
Web Testing.pptx528278vshbuqffqhhqiwnwuq
studylike474
 
New Download MiniTool Partition Wizard Crack Latest Version 2025
imang66g
 

GPU Programming on CPU - Using C++AMP

  • 2. Outline 1. Introduction to C++AMP 2. Introduction to Tiling 3. tile_static 4. barrier.wait and solutions a. C++11 thread b. setjmp/longjmp c. ucontext 2
  • 3. (Homogeneous coordinates) (0, 0) (0, 1) (0, 2) (0, 3) (1, 0) (1, 1) (1, 2) (1, 3) (2, 0) (2, 1) (2, 2) (2, 3) (3, 0) (3, 1) (3, 2) (3, 3) X 0 1 2 3 Matrix A b = 0 1 2 3 result Computing example ● Simple matrix multiplication 3
  • 4. C++ Version 1. int A[4][4]; 2. int b[4]; 3. int result[4]; 4. for (int i = 0; i < 4; i++) { 5. result[i] = 0; 6. for (int j = 0; j < 4; j++) 7. result[i] += A[i][j] * b[j]; 8. } 4
  • 5. C++AMP Version 1. array_view<float, 2> A(4, 4); 2. array_view<float, 1> b(4); 3. array_view<float, 1> result(4); 4. extent<1> ext(4); 5. parallel_for_each(ext, [&](index<1> idx) restrict(amp) 6. { 7. result[idx[0]] = 0; 8. for (int i = 0; i < 4; i++) 9. result[idx[0]] += A(idx[0], i) * b(i); 10. }); 5
  • 6. memory access 0 1 2 3 P0 P1 P2 P3 global memory b 100t Total access time = 400t 6
  • 7. shared memory 0 1 2 3 shared memory 10t 100t Total access time = 130t b 7
  • 8. 1. array_view<float, 2> A(4, 4); 2. array_view<float, 1> b(4); 3. array_view<float, 1> result(4); 4. extent<1> ext(4); 5. parallel_for_each(ext.tile<4>(), [&](tiled_index<4> tidx) restrict(amp) 6. { 7. int local = tidx.local[0]; 8. int global = tidx.global[0]; 9. tile_statc int buf[4]; 10. buf[local] = b[global]; 11. tidx.barrier.wait(); 12. result[idx[0]] = 0; 13. for (int i = 0; i < 4; i++) 14. result[idx[0]] += A[idx[0]][i] * buf[i]; 15. }); 8
  • 10. Architecture source: NVIDIA TESLA:AUNIFIED GRAPHICS AND COMPUTING ARCHITECTURE shared memory accessible to all SPs 10
  • 11. Goal ● Implement all the C++AMP function on CPU instead of GPU without any compiler modification. 11
  • 12. tiled_static ● The limitation of C++ syntax leads to the following choices ○ const, volatile ○ __attribute__(...) ○ static ● Choose static ○ static memory can be shared among all the threads ○ side effect: At most one thread group can be executed at the same time. #define tile_static static 12
  • 13. Barrier.wait ● Threads in the same thread group will be waited at the point where “wait” is called. ● Program can a. perform real barrier action b. jump out of current execution context 13
  • 14. ● True threading ○ C++11 thread ● Fake threading(Coroutines) ○ setjmp/longjmp ○ makecontext/getcontext/swapcontext/setcontext Approaches 14
  • 15. C++11 thread ● launch hundreds of threads at a time. ● implemente my own barrier by using C++11 mutex library. → extremely slow. → The data on static memory will be corrupted 15
  • 16. setjmp/longjmp ● int setjmp(jmp_buf env) ○ setjmp() saves the stack context/environment in env for later use by longjmp. ○ The stack context will be invalidated if the function which called setjmp() returns. ● void longjmp(jmp_buf env, int val); ○ longjmp() restores the environment saved by the last call of setjmp. 16
  • 17. 1. #include <stdio.h> 2. #include <setjmp.h> 3. jmp_buf buf; 4. void wait(void) { 5. printf("waitn"); // prints 6. longjmp(buf,1); 7. } 8. void first(void) { 9. wait(); 10. printf("firstn"); // does not print 11. } 12. int main() { 13. if (!setjmp(buf)) 14. first(); // when executed, setjmp returns 0 15. else // when longjmp jumps back, setjmp returns 1 16. printf("mainn"); // prints 17. return 0; 18. } 17
  • 18. Pseudo code (1) void entry() { while(!finish) for(t : tasks) run(t) } void fun() { … wait(); ... } void fun() { … wait(); ... } void entry() { while(!finish) for(t : tasks) run(t) } void fun() { … wait(); ... } void fun() { … wait(); ... } 18
  • 19. Pseudo code (2) void entry() { while(!finish) for(t : tasks) run(t) } void fun() { … wait(); ... } void fun() { … wait(); ... } void entry() { while(!finish) for(t : tasks) run(t) } void fun() { … wait(); ... } void fun() { … wait(); ... } 19
  • 20. 1. #include <stdio.h> 2. #include <setjmp.h> 3. jmp_buf buf, b; 4. void wait(void) { 5. printf("waitn"); 6. if (setjmp(b) == 0) 7. longjmp(buf,1); 8. } 9. void first(void) { 10. wait(); 11. } 12. int main() { 13. if (!setjmp(buf) ) 14. first(); 15. else { 16. printf("mainn"); 17. longjmp(b, 10); 18. } 19. return 0; 20. } 20
  • 21. 1. #include <stdio.h> 2. #include <setjmp.h> 3. jmp_buf buf, b; 4. void wait(void) { 5. printf("waitn"); 6. if (setjmp(b) == 0) 7. longjmp(buf,1); 8. } 9. void first(void) { 10. wait(); 11. } 12. int main() { 13. if (!setjmp(buf) ) 14. first(); 15. else { 16. printf("mainn"); 17. longjmp(b, 10); 18. } 19. return 0; 20. } buf 21
  • 22. 1. #include <stdio.h> 2. #include <setjmp.h> 3. jmp_buf buf, b; 4. void wait(void) { 5. printf("waitn"); 6. if (setjmp(b) == 0) 7. longjmp(buf,1); 8. } 9. void first(void) { 10. wait(); 11. } 12. int main() { 13. if (!setjmp(buf) ) 14. first(); 15. else { 16. printf("mainn"); 17. longjmp(b, 10); 18. } 19. return 0; 20. } ret address buf b 22
  • 23. 1. #include <stdio.h> 2. #include <setjmp.h> 3. jmp_buf buf, b; 4. void wait(void) { 5. printf("waitn"); 6. if (setjmp(b) == 0) 7. longjmp(buf,1); 8. } 9. void first(void) { 10. wait(); 11. } 12. int main() { 13. if (!setjmp(buf) ) 14. first(); 15. else { 16. printf("mainn"); 17. longjmp(b, 10); 18. } 19. return 0; 20. } buf b 23
  • 24. 1. #include <stdio.h> 2. #include <setjmp.h> 3. jmp_buf buf, b; 4. void wait(void) { 5. printf("waitn"); 6. if (setjmp(b) == 0) 7. longjmp(buf,1); 8. } 9. void first(void) { 10. wait(); 11. } 12. int main() { 13. if (!setjmp(buf) ) 14. first(); 15. else { 16. printf("mainn"); 17. longjmp(b, 10); 18. } 19. return 0; 20. } Cannot return ??? ??? ??? buf b 24
  • 25. Problems ● Cannot return ○ return address in the stack is destroyed ● Cannot use too many static variables ○ will lost spilled registers → can be solved by using “alloca” http://www.codemud.net/~thinker/GinGin_CGI. py/show_id_doc/489 25
  • 26. ucontext.h ● ucontext_t ● getcontext ● makecontest ● swapcontext ● setcontext 26
  • 27. ucontext_t typedef struct ucontext { struct ucontext *uc_link; sigset_t uc_sigmask; stack_t uc_stack; mcontext_t uc_mcontext; ... } ucontext_t; ● uc_link ○ points to the context that will be resumed when the current context terminates ● uc_stack ○ the stack used by this context ● uc_mcontext ○ machine-specific representation of the saved context, that includes the calling thread's machine registers 27
  • 28. Functions ● int getcontext(ucontext_t *ucp); ○ initializes the structure pointed at by ucp. ● int setcontext(const ucontext_t *ucp); ○ restores the user context pointed at by ucp ● int swapcontext(ucontext_t *oucp, const ucontext_t *ucp); ○ saves the current context in the structure pointed to by oucp, and then activates the context pointed to by ucp. 28
  • 29. makecontext ● void makecontext(ucontext_t *ucp, void (*func)(), int argc, ...); ○ glibc(x86_64) saves the arguments to registers instead of pushing them on stack as AMD64 ABI said ○ The size of the arguments that passed to makecontext should be no less than sizeof(register) 29
  • 30. 1. #include <stdio.h> 2. #include <ucontext.h> 3. static ucontext_t ctx[2]; 4. static void f1 (void) { 5. puts("start f1"); 6. swapcontext(&ctx[1], &ctx[0]); 7. puts("finish f1"); 8. } 9. int main (void) 10. { 11. char st1[8192]; 12. getcontext(&ctx[1]); 13. ctx[1].uc_stack.ss_sp = st1; 14. ctx[1].uc_stack.ss_size = sizeof st1; 15. ctx[1].uc_link = &ctx[0]; 16. makecontext(&ctx[1], f1, 0); 17. swapcontext(&ctx[0], &ctx[1]); 18. swapcontext(&ctx[0], &ctx[1]); 19. return 0; 20. } 30
  • 31. 1. #include <stdio.h> 2. #include <ucontext.h> 3. static ucontext_t ctx[3]; 4. static void f1 (void) { 5. puts("start f1"); 6. swapcontext(&ctx[1], &ctx [0]); 7. puts("finish f1"); 8. } 9. static void f2 (void) 10. { 11. puts("start f2"); 12. swapcontext(&ctx[2], &ctx [1]); 13. puts("finish f2"); 14. } 1. int main (void) 2. { 3. char st1[8192], st2[8192]; 4. getcontext(&ctx[1]); 5. ctx[1].uc_stack.ss_sp = st1; 6. ctx[1].uc_stack.ss_size = sizeof st1; 7. ctx[1].uc_link = &ctx[0]; 8. makecontext(&ctx[1], f1, 0); 9. 10. getcontext(&ctx[2]); 11. ctx[2].uc_stack.ss_sp = st2; 12. ctx[2].uc_stack.ss_size = sizeof st2; 13. ctx[2].uc_link = &ctx[1]; 14. makecontext(&ctx[2], f2, 0); 15. swapcontext(&ctx[0], &ctx[2]); 16. swapcontext(&ctx[0], &ctx[2]); 17. return 0; 18. } 31
  • 32. Fake threading (yield) void entry() { setup(fun, 2); while(!finish) switch_to(); } void fun() { … wait(); ... } void fun() { … wait(); ... } 32 void entry() { setup(fun, 2); while(!finish) switch_to(); } void fun() { … wait(); ... } void fun() { … wait(); ... }
  • 33. Problems 1. How to pass a lambda? ○ makecontext(&ctx, (void (*)(void))&Kernel::operator(), …); 2. How to pass non-int arguments? ○ What if sizeof(Type) > sizeof(int) ○ How about complex structure and class 33
  • 34. Pass lambda 1. Use a wrapper function!! template <typename Ker, typename Arg> void fun(Ker k, Arg arg) { k(arg); } template <typename Ker, typename Arg> void makectx(Ker k, Arg arg) { makecontext(&ctx, (void (*)(void))fun<ker, Arg>, 2, k, arg); } 34
  • 35. Pass non-int arguments 2. Pass pointer instead!! template <typename Ker, typename Arg> void fun(Ker *k, Arg *arg) { (*k)(*arg); } template <typename Ker, typename Arg> void makectx(Ker k, Arg arg) { makecontext(&ctx, (void (*)(void))fun<ker, Arg>, 2, &k, &arg); } 35
  • 36. Additional ● Use a counter so that we can spawn coroutines dynamically ● Can it be multithreaded? Yes 36
  • 37. true threading barrier There are 12 threads in one thread group 37
  • 40. barrier struct bar_t { unsigned const count; std::atomic<unsigned> spaces; std::atomic<unsigned> generation; bar_t(unsigned count_) : count(count_), spaces(count_), generation(0) {} void wait() noexcept { unsigned const my_generation = generation; if (!--spaces) { spaces = count; ++generation; } else { while(generation == my_generation); } } }; source: C++ Concurrency in Action: Practical Multithreading 40
  • 41. Summary ● It works fine on AMP right now ● The importance of low level knowledge 41
  • 42. 42