SlideShare a Scribd company logo
1 of 34
Better performance
through Superscalarity
Mårten Rånge
How many GigaFlops?
i5 6600K 3.5 GHz
(4x cores)
~224 GigaFlops
64 Flops/cycle
$Z_{n+1} = Z_n^2 + C$ (1)
$Z_0 = C$ (2)
(x,y)
(x,y) + (c,d)
(x+c,y+d)
$(x,y)^2$
$(x^2 - y^2,\ 2xy)$
r
aZk
Z0
2
2a
r2
$Z_1 = Z_0^2 + C$
C
|R| = 2
Zl
Zm
Z0
$Z_{n+1} = Z_n^2 + C$
// Iteration budget per point.
constexpr auto max_iter = 50U;

// Escape-time iteration Z(n+1) = Z(n)^2 + C with Z(0) = C = (cx, cy).
// Returns how many iterations of the budget were still left when
// x^2 + y^2 first exceeded 4; returns 0 if that never happened.
auto mandelbrot (double cx, double cy) {
  auto x = cx;
  auto y = cy;
  auto remaining = max_iter;
  while (remaining > 0) {
    const auto x_sq = x*x;
    const auto y_sq = y*y;
    if (x_sq + y_sq > 4) break;   // left the radius-2 disc
    y = 2*x*y + cy;               // imaginary part of Z^2 + C (still uses the old x)
    x = x_sq - y_sq + cx;         // real part of Z^2 + C
    --remaining;
  }
  return remaining;
}
$r^2 = x^2 + y^2$
y
x
r
$(x,y)^2 = (x^2 - y^2,\ 2xy)$
$Z_{n+1} = Z_n^2 + C$
SIMD
a = b+c
(a0,a1)=(b0,b1)+(c0,c1)
0 1 2 3
4 5 6 7
4 6 8 10
+
AVX
8 flops/instruction
// Iteration budget per point.
constexpr auto max_iter = 50U;

// Escape-time iteration Z(n+1) = Z(n)^2 + C with Z(0) = C = (cx, cy).
// Returns how many iterations of the budget were still left when
// x^2 + y^2 first exceeded 4; returns 0 if that never happened.
auto mandelbrot (double cx, double cy) {
  auto x = cx;
  auto y = cy;
  auto remaining = max_iter;
  while (remaining > 0) {
    const auto x_sq = x*x;
    const auto y_sq = y*y;
    if (x_sq + y_sq > 4) break;   // left the radius-2 disc
    y = 2*x*y + cy;               // imaginary part of Z^2 + C (still uses the old x)
    x = x_sq - y_sq + cx;         // real part of Z^2 + C
    --remaining;
  }
  return remaining;
}
// Vector overload of mandelbrot taking __m256 operands for cx and cy.
// Runs the same update as the scalar version (x, y) <- (x^2 - y^2 + cx, 2xy + cy)
// for at most max_iter iterations.
// Returns 0 as soon as the comparison result `r2 <= 4` is zero; otherwise
// returns the comparison result of the last iteration.
// NOTE(review): the arithmetic/comparison operators on __m256 and the `float8`
// helper are not defined in this excerpt — presumably `<=` yields a per-lane
// bitmask (compare + movemask) and `float8` broadcasts a scalar; confirm.
auto mandelbrot (__m256 cx, __m256 cy) {
auto x = cx;
auto y = cy;
int cmp_mask = 0 ;
for (auto iter = max_iter; iter > 0; --iter) {
auto x2 = x*x;
auto y2 = y*y;
auto r2 = x2 + y2;
auto _4 = float8 (4.0F);
// Non-zero while at least one comparison bit is set.
cmp_mask = r2 <= _4;
// Nothing compares <= 4 any more: stop early.
if (!cmp_mask) return 0;
// 2*x*y written as xy + xy; must be computed before x is overwritten below.
auto xy = x*y;
y = xy + xy + cy;
x = x2 - y2 + cx;
}
return cmp_mask;
}
Minimize CPU stalls
opcode Latency Throughput
vmulps 5 1
vaddps 3 1
vsubps 3 1
vcmpps 3 1
vmovmskps 1 1
Task<float>
// Vector overload of mandelbrot taking __m256 operands for cx and cy.
// Runs the same update as the scalar version (x, y) <- (x^2 - y^2 + cx, 2xy + cy)
// for at most max_iter iterations.
// Returns 0 as soon as the comparison result `r2 <= 4` is zero; otherwise
// returns the comparison result of the last iteration.
// NOTE(review): the arithmetic/comparison operators on __m256 and the `float8`
// helper are not defined in this excerpt — presumably `<=` yields a per-lane
// bitmask (compare + movemask) and `float8` broadcasts a scalar; confirm.
auto mandelbrot (__m256 cx, __m256 cy) {
auto x = cx;
auto y = cy;
int cmp_mask = 0 ;
for (auto iter = max_iter; iter > 0; --iter) {
// x2 and y2 do not depend on each other; r2 needs both of them.
auto x2 = x*x;
auto y2 = y*y;
auto r2 = x2 + y2;
auto _4 = float8 (4.0F);
// Non-zero while at least one comparison bit is set.
cmp_mask = r2 <= _4;
// Nothing compares <= 4 any more: stop early.
if (!cmp_mask) return 0;
// 2*x*y written as xy + xy; must be computed before x is overwritten below.
auto xy = x*y;
y = xy + xy + cy;
x = x2 - y2 + cx;
}
return cmp_mask;
}
// Loop-body excerpt processing two independent operand sets (index 0 and index 1).
// The index-0 statements read only index-0 values and the index-1 statements
// only index-1 values, so the two chains have no data dependency on each other.
// NOTE(review): declarations of x, y, x2, y2, r2 and cmp_mask are outside this excerpt.
x2[0] = x[0]*x[0];
y2[0] = y[0]*y[0];
r2[0] = x2[0] + y2[0];
x2[1] = x[1]*x[1];
y2[1] = y[1]*y[1];
r2[1] = x2[1] + y2[1];
auto _4 = float8 (4.0);
// Merge both comparison results into one int: index 0 in the low bits,
// index 1 shifted up by 8 bits.
cmp_mask = r2[0] <= _4 | ((r2[1] <= _4) << 8);
// Loop-body excerpt processing two independent operand sets (index 0 and index 1).
// NOTE(review): declarations of x, y, x2, y2, r2 and cmp_mask are outside this excerpt.
x2[0] = x[0]*x[0];
y2[0] = y[0]*y[0];
r2[0] = x2[0] + y2[0];
x2[1] = x[1]*x[1];
y2[1] = y[1]*y[1];
r2[1] = x2[1] + y2[1];
// NOTE(review): this repeats the r2[0] assignment from three lines up with the
// same operands; it may be a slide-animation artifact showing that statement
// being moved after the index-1 multiplies — confirm against the original deck.
r2[0] = x2[0] + y2[0];
auto _4 = float8 (4.0);
// Merge both comparison results into one int: index 0 in the low bits,
// index 1 shifted up by 8 bits.
cmp_mask = r2[0] <= _4 | ((r2[1] <= _4) << 8);
x2[0] = x[0]*x[0]
y2[0] = y[0]*y[0]
r2[0] = x2[0]+y2[0]
x2[1] = x[1]*x[1]
y2[1] = y[1]*y[1]
r2[1] = x2[1]+y2[1]
Instruction queue
FU
x2[0]
y2[0]
r2[0]
x2[1]
y2[1]
r2[1]
Result queue
Shouldn’t compilers
do this for us?
// Iteration budget per point.
constexpr auto max_iter = 50U;

// Escape-time iteration Z(n+1) = Z(n)^2 + C with Z(0) = C = (cx, cy).
// Returns how many iterations of the budget were still left when
// x^2 + y^2 first exceeded 4; returns 0 if that never happened.
auto mandelbrot (double cx, double cy) {
  auto x = cx;
  auto y = cy;
  auto remaining = max_iter;
  while (remaining > 0) {
    const auto x_sq = x*x;
    const auto y_sq = y*y;
    if (x_sq + y_sq > 4) break;   // left the radius-2 disc
    y = 2*x*y + cy;               // imaginary part of Z^2 + C (still uses the old x)
    x = x_sq - y_sq + cx;         // real part of Z^2 + C
    --remaining;
  }
  return remaining;
}
// Vector overload of mandelbrot taking __m256 operands for cx and cy.
// Runs the same update as the scalar version (x, y) <- (x^2 - y^2 + cx, 2xy + cy)
// for at most max_iter iterations.
// Returns 0 as soon as the comparison result `r2 <= 4` is zero; otherwise
// returns the comparison result of the last iteration.
// NOTE(review): the arithmetic/comparison operators on __m256 and the `float8`
// helper are not defined in this excerpt — presumably `<=` yields a per-lane
// bitmask (compare + movemask) and `float8` broadcasts a scalar; confirm.
auto mandelbrot (__m256 cx, __m256 cy) {
auto x = cx;
auto y = cy;
int cmp_mask = 0 ;
for (auto iter = max_iter; iter > 0; --iter) {
auto x2 = x*x;
auto y2 = y*y;
auto r2 = x2 + y2;
auto _4 = float8 (4.0F);
// The iteration keeps running on every operand until the whole mask is zero;
// the accompanying slide text states this relies on inf and NaN comparing
// `<= 4` as false.
cmp_mask = r2 <= _4;
// Nothing compares <= 4 any more: stop early.
if (!cmp_mask) return 0;
// 2*x*y written as xy + xy; must be computed before x is overwritten below.
auto xy = x*y;
y = xy + xy + cy;
x = x2 - y2 + cx;
}
return cmp_mask;
}
Uses the mathematical properties of mandelbrot
Uses knowledge that inf and NaN <= 4 is false
AVX512
&
Hyper-threading
// Iteration budget per point.
constexpr auto max_iter = 50U;

// Escape-time iteration Z(n+1) = Z(n)^2 + C with Z(0) = C = (cx, cy).
// Returns how many iterations of the budget were still left when
// x^2 + y^2 first exceeded 4; returns 0 if that never happened.
auto mandelbrot (double cx, double cy) {
  auto x = cx;
  auto y = cy;
  auto remaining = max_iter;
  while (remaining > 0) {
    const auto x_sq = x*x;
    const auto y_sq = y*y;
    if (x_sq + y_sq > 4) break;   // left the radius-2 disc
    y = 2*x*y + cy;               // imaginary part of Z^2 + C (still uses the old x)
    x = x_sq - y_sq + cx;         // real part of Z^2 + C
    --remaining;
  }
  return remaining;
}
Questions?

More Related Content

What's hot

ECMAScript 6 major changes
ECMAScript 6 major changesECMAScript 6 major changes
ECMAScript 6 major changes
hayato
 
ARM 7 LPC 2148 lecture
ARM 7 LPC 2148 lectureARM 7 LPC 2148 lecture
ARM 7 LPC 2148 lecture
anishgoel
 

What's hot (17)

JavaScript - Agora nervoso
JavaScript - Agora nervosoJavaScript - Agora nervoso
JavaScript - Agora nervoso
 
Vcs23
Vcs23Vcs23
Vcs23
 
ECMAScript 6 major changes
ECMAScript 6 major changesECMAScript 6 major changes
ECMAScript 6 major changes
 
ARM 7 LPC 2148 lecture
ARM 7 LPC 2148 lectureARM 7 LPC 2148 lecture
ARM 7 LPC 2148 lecture
 
Wap in c to draw a line using DDA algorithm
Wap in c to draw a line using DDA algorithmWap in c to draw a line using DDA algorithm
Wap in c to draw a line using DDA algorithm
 
El
ElEl
El
 
Gaztea Tech Robotica 2016
Gaztea Tech Robotica 2016Gaztea Tech Robotica 2016
Gaztea Tech Robotica 2016
 
Computer graphics programs in c++
Computer graphics programs in c++Computer graphics programs in c++
Computer graphics programs in c++
 
10CSL67 CG LAB PROGRAM 10
10CSL67 CG LAB PROGRAM 1010CSL67 CG LAB PROGRAM 10
10CSL67 CG LAB PROGRAM 10
 
Senior design project code for PPG
Senior design project code for PPGSenior design project code for PPG
Senior design project code for PPG
 
Ssaw08 0624
Ssaw08 0624Ssaw08 0624
Ssaw08 0624
 
Numerical Method Assignment
Numerical Method AssignmentNumerical Method Assignment
Numerical Method Assignment
 
OOXX
OOXXOOXX
OOXX
 
Vcs9
Vcs9Vcs9
Vcs9
 
Snake.c
Snake.cSnake.c
Snake.c
 
When RV Meets CEP (RV 2016 Tutorial)
When RV Meets CEP (RV 2016 Tutorial)When RV Meets CEP (RV 2016 Tutorial)
When RV Meets CEP (RV 2016 Tutorial)
 
Oprerator overloading
Oprerator overloadingOprerator overloading
Oprerator overloading
 

Similar to Better performance through Superscalarity

COMPAPPABCA49085rFunrAP__Practical Number 9 & 10.docx
COMPAPPABCA49085rFunrAP__Practical Number 9 & 10.docxCOMPAPPABCA49085rFunrAP__Practical Number 9 & 10.docx
COMPAPPABCA49085rFunrAP__Practical Number 9 & 10.docx
TashiBhutia12
 
include ltiostreamgt include ltstringgt include .pdf
include ltiostreamgt include ltstringgt include .pdfinclude ltiostreamgt include ltstringgt include .pdf
include ltiostreamgt include ltstringgt include .pdf
contact32
 
Ejerciciosderivadasresueltos
EjerciciosderivadasresueltosEjerciciosderivadasresueltos
Ejerciciosderivadasresueltos
bellidomates
 
All VLSI programs
All VLSI programsAll VLSI programs
All VLSI programs
Gouthaman V
 
Writing MySQL User-defined Functions in JavaScript
Writing MySQL User-defined Functions in JavaScriptWriting MySQL User-defined Functions in JavaScript
Writing MySQL User-defined Functions in JavaScript
Roland Bouman
 

Similar to Better performance through Superscalarity (20)

Write Python for Speed
Write Python for SpeedWrite Python for Speed
Write Python for Speed
 
Boosting Developer Productivity with Clang
Boosting Developer Productivity with ClangBoosting Developer Productivity with Clang
Boosting Developer Productivity with Clang
 
Vcs16
Vcs16Vcs16
Vcs16
 
Cocos2d Performance Tips
Cocos2d Performance TipsCocos2d Performance Tips
Cocos2d Performance Tips
 
COMPAPPABCA49085rFunrAP__Practical Number 9 & 10.docx
COMPAPPABCA49085rFunrAP__Practical Number 9 & 10.docxCOMPAPPABCA49085rFunrAP__Practical Number 9 & 10.docx
COMPAPPABCA49085rFunrAP__Practical Number 9 & 10.docx
 
SCIPY-SYMPY.pdf
SCIPY-SYMPY.pdfSCIPY-SYMPY.pdf
SCIPY-SYMPY.pdf
 
PBL1-v1-002j.pptx
PBL1-v1-002j.pptxPBL1-v1-002j.pptx
PBL1-v1-002j.pptx
 
Coscup2021 - useful abstractions at rust and it's practical usage
Coscup2021 - useful abstractions at rust and it's practical usageCoscup2021 - useful abstractions at rust and it's practical usage
Coscup2021 - useful abstractions at rust and it's practical usage
 
include ltiostreamgt include ltstringgt include .pdf
include ltiostreamgt include ltstringgt include .pdfinclude ltiostreamgt include ltstringgt include .pdf
include ltiostreamgt include ltstringgt include .pdf
 
PRACTICAL COMPUTING
PRACTICAL COMPUTINGPRACTICAL COMPUTING
PRACTICAL COMPUTING
 
Ocr code
Ocr codeOcr code
Ocr code
 
C# Assignmet Help
C# Assignmet HelpC# Assignmet Help
C# Assignmet Help
 
06 Recursion in C.pptx
06 Recursion in C.pptx06 Recursion in C.pptx
06 Recursion in C.pptx
 
Ejerciciosderivadasresueltos
EjerciciosderivadasresueltosEjerciciosderivadasresueltos
Ejerciciosderivadasresueltos
 
C c++-meetup-1nov2017-autofdo
C c++-meetup-1nov2017-autofdoC c++-meetup-1nov2017-autofdo
C c++-meetup-1nov2017-autofdo
 
Computer graphics lab manual
Computer graphics lab manualComputer graphics lab manual
Computer graphics lab manual
 
All VLSI programs
All VLSI programsAll VLSI programs
All VLSI programs
 
Guia edo todas
Guia edo todasGuia edo todas
Guia edo todas
 
Integral table
Integral tableIntegral table
Integral table
 
Writing MySQL User-defined Functions in JavaScript
Writing MySQL User-defined Functions in JavaScriptWriting MySQL User-defined Functions in JavaScript
Writing MySQL User-defined Functions in JavaScript
 

More from Mårten Rånge

More from Mårten Rånge (10)

Know your FOSS obligations
Know your FOSS obligationsKnow your FOSS obligations
Know your FOSS obligations
 
Ray Marching Explained
Ray Marching ExplainedRay Marching Explained
Ray Marching Explained
 
Property Based Tesing
Property Based TesingProperty Based Tesing
Property Based Tesing
 
Monad - a functional design pattern
Monad - a functional design patternMonad - a functional design pattern
Monad - a functional design pattern
 
Formlets
FormletsFormlets
Formlets
 
Pragmatic metaprogramming
Pragmatic metaprogrammingPragmatic metaprogramming
Pragmatic metaprogramming
 
Concurrency - responsiveness in .NET
Concurrency - responsiveness in .NETConcurrency - responsiveness in .NET
Concurrency - responsiveness in .NET
 
Meta Programming
Meta ProgrammingMeta Programming
Meta Programming
 
Concurrency scalability
Concurrency scalabilityConcurrency scalability
Concurrency scalability
 
Concurrency
ConcurrencyConcurrency
Concurrency
 

Recently uploaded

Cloud Frontiers: A Deep Dive into Serverless Spatial Data and FME
Cloud Frontiers:  A Deep Dive into Serverless Spatial Data and FMECloud Frontiers:  A Deep Dive into Serverless Spatial Data and FME
Cloud Frontiers: A Deep Dive into Serverless Spatial Data and FME
Safe Software
 
Cloud Frontiers: A Deep Dive into Serverless Spatial Data and FME
Cloud Frontiers:  A Deep Dive into Serverless Spatial Data and FMECloud Frontiers:  A Deep Dive into Serverless Spatial Data and FME
Cloud Frontiers: A Deep Dive into Serverless Spatial Data and FME
Safe Software
 

Recently uploaded (20)

Apidays New York 2024 - APIs in 2030: The Risk of Technological Sleepwalk by ...
Apidays New York 2024 - APIs in 2030: The Risk of Technological Sleepwalk by ...Apidays New York 2024 - APIs in 2030: The Risk of Technological Sleepwalk by ...
Apidays New York 2024 - APIs in 2030: The Risk of Technological Sleepwalk by ...
 
Six Myths about Ontologies: The Basics of Formal Ontology
Six Myths about Ontologies: The Basics of Formal OntologySix Myths about Ontologies: The Basics of Formal Ontology
Six Myths about Ontologies: The Basics of Formal Ontology
 
Boost Fertility New Invention Ups Success Rates.pdf
Boost Fertility New Invention Ups Success Rates.pdfBoost Fertility New Invention Ups Success Rates.pdf
Boost Fertility New Invention Ups Success Rates.pdf
 
Platformless Horizons for Digital Adaptability
Platformless Horizons for Digital AdaptabilityPlatformless Horizons for Digital Adaptability
Platformless Horizons for Digital Adaptability
 
[BuildWithAI] Introduction to Gemini.pdf
[BuildWithAI] Introduction to Gemini.pdf[BuildWithAI] Introduction to Gemini.pdf
[BuildWithAI] Introduction to Gemini.pdf
 
Cloud Frontiers: A Deep Dive into Serverless Spatial Data and FME
Cloud Frontiers:  A Deep Dive into Serverless Spatial Data and FMECloud Frontiers:  A Deep Dive into Serverless Spatial Data and FME
Cloud Frontiers: A Deep Dive into Serverless Spatial Data and FME
 
Repurposing LNG terminals for Hydrogen Ammonia: Feasibility and Cost Saving
Repurposing LNG terminals for Hydrogen Ammonia: Feasibility and Cost SavingRepurposing LNG terminals for Hydrogen Ammonia: Feasibility and Cost Saving
Repurposing LNG terminals for Hydrogen Ammonia: Feasibility and Cost Saving
 
Apidays New York 2024 - Accelerating FinTech Innovation by Vasa Krishnan, Fin...
Apidays New York 2024 - Accelerating FinTech Innovation by Vasa Krishnan, Fin...Apidays New York 2024 - Accelerating FinTech Innovation by Vasa Krishnan, Fin...
Apidays New York 2024 - Accelerating FinTech Innovation by Vasa Krishnan, Fin...
 
Cloud Frontiers: A Deep Dive into Serverless Spatial Data and FME
Cloud Frontiers:  A Deep Dive into Serverless Spatial Data and FMECloud Frontiers:  A Deep Dive into Serverless Spatial Data and FME
Cloud Frontiers: A Deep Dive into Serverless Spatial Data and FME
 
Introduction to Multilingual Retrieval Augmented Generation (RAG)
Introduction to Multilingual Retrieval Augmented Generation (RAG)Introduction to Multilingual Retrieval Augmented Generation (RAG)
Introduction to Multilingual Retrieval Augmented Generation (RAG)
 
Elevate Developer Efficiency & build GenAI Application with Amazon Q​
Elevate Developer Efficiency & build GenAI Application with Amazon Q​Elevate Developer Efficiency & build GenAI Application with Amazon Q​
Elevate Developer Efficiency & build GenAI Application with Amazon Q​
 
Rising Above_ Dubai Floods and the Fortitude of Dubai International Airport.pdf
Rising Above_ Dubai Floods and the Fortitude of Dubai International Airport.pdfRising Above_ Dubai Floods and the Fortitude of Dubai International Airport.pdf
Rising Above_ Dubai Floods and the Fortitude of Dubai International Airport.pdf
 
Corporate and higher education May webinar.pptx
Corporate and higher education May webinar.pptxCorporate and higher education May webinar.pptx
Corporate and higher education May webinar.pptx
 
FWD Group - Insurer Innovation Award 2024
FWD Group - Insurer Innovation Award 2024FWD Group - Insurer Innovation Award 2024
FWD Group - Insurer Innovation Award 2024
 
EMPOWERMENT TECHNOLOGY GRADE 11 QUARTER 2 REVIEWER
EMPOWERMENT TECHNOLOGY GRADE 11 QUARTER 2 REVIEWEREMPOWERMENT TECHNOLOGY GRADE 11 QUARTER 2 REVIEWER
EMPOWERMENT TECHNOLOGY GRADE 11 QUARTER 2 REVIEWER
 
presentation ICT roal in 21st century education
presentation ICT roal in 21st century educationpresentation ICT roal in 21st century education
presentation ICT roal in 21st century education
 
CNIC Information System with Pakdata Cf In Pakistan
CNIC Information System with Pakdata Cf In PakistanCNIC Information System with Pakdata Cf In Pakistan
CNIC Information System with Pakdata Cf In Pakistan
 
Apidays New York 2024 - The value of a flexible API Management solution for O...
Apidays New York 2024 - The value of a flexible API Management solution for O...Apidays New York 2024 - The value of a flexible API Management solution for O...
Apidays New York 2024 - The value of a flexible API Management solution for O...
 
Strategize a Smooth Tenant-to-tenant Migration and Copilot Takeoff
Strategize a Smooth Tenant-to-tenant Migration and Copilot TakeoffStrategize a Smooth Tenant-to-tenant Migration and Copilot Takeoff
Strategize a Smooth Tenant-to-tenant Migration and Copilot Takeoff
 
"I see eyes in my soup": How Delivery Hero implemented the safety system for ...
"I see eyes in my soup": How Delivery Hero implemented the safety system for ..."I see eyes in my soup": How Delivery Hero implemented the safety system for ...
"I see eyes in my soup": How Delivery Hero implemented the safety system for ...
 

Better performance through Superscalarity

  • 2. How many GigaFlops? i5 6600K 3.5 GHz (4x cores)
  • 5.
  • 6. $Z_{n+1} = Z_n^2 + C$ (1), $Z_0 = C$ (2)
  • 12. r aZk Z0 2 2a r2 Z1 = Z0 2 + C C |R| = 2 Zl Zm Z0 Zn+1 = Zn 2 + C
  • 13.
  • 14. constexpr auto max_iter = 50U; auto mandelbrot (double cx, double cy) { auto x = cx ; auto y = cy ; auto iter = max_iter; for (; iter > 0; --iter) { auto x2 = x*x; auto y2 = y*y; if (x2 + y2 > 4) return iter; y = 2*x*y + cy ; x = x2 - y2 + cx ; } return iter; } r2 = x2 + y2 y x r (x,y)2 = (x2 - y2,2xy) Zn+1 = Zn 2 + C
  • 15. SIMD
  • 18. 0 1 2 3 4 5 6 7 4 6 8 10 +
  • 20. constexpr auto max_iter = 50U; auto mandelbrot (double cx, double cy) { auto x = cx ; auto y = cy ; auto iter = max_iter; for (; iter > 0; --iter) { auto x2 = x*x; auto y2 = y*y; if (x2 + y2 > 4) return iter; y = 2*x*y + cy ; x = x2 - y2 + cx ; } return iter; }
  • 21. auto mandelbrot (__m256 cx, __m256 cy) { auto x = cx; auto y = cy; int cmp_mask = 0 ; for (auto iter = max_iter; iter > 0; --iter) { auto x2 = x*x; auto y2 = y*y; auto r2 = x2 + y2; auto _4 = float8 (4.0F); cmp_mask = r2 <= _4; if (!cmp_mask) return 0; auto xy = x*y; y = xy + xy + cy; x = x2 - y2 + cx; } return cmp_mask; }
  • 23. opcode Latency Throughput vmulps 5 1 vaddps 3 1 vsubps 3 1 vcmpps 3 1 vmovmskps 1 1
  • 25. auto mandelbrot (__m256 cx, __m256 cy) { auto x = cx; auto y = cy; int cmp_mask = 0 ; for (auto iter = max_iter; iter > 0; --iter) { auto x2 = x*x; auto y2 = y*y; auto r2 = x2 + y2; auto _4 = float8 (4.0F); cmp_mask = r2 <= _4; if (!cmp_mask) return 0; auto xy = x*y; y = xy + xy + cy; x = x2 - y2 + cx; } return cmp_mask; }
  • 26. x2[0] = x[0]*x[0]; y2[0] = y[0]*y[0]; r2[0] = x2[0] + y2[0]; x2[1] = x[1]*x[1]; y2[1] = y[1]*y[1]; r2[1] = x2[1] + y2[1]; auto _4 = float8 (4.0); cmp_mask = r2[0] <= _4 | ((r2[1] <= _4) << 8);
  • 27. x2[0] = x[0]*x[0]; y2[0] = y[0]*y[0]; r2[0] = x2[0] + y2[0]; x2[1] = x[1]*x[1]; y2[1] = y[1]*y[1]; r2[1] = x2[1] + y2[1]; r2[0] = x2[0] + y2[0]; auto _4 = float8 (4.0); cmp_mask = r2[0] <= _4 | ((r2[1] <= _4) << 8);
  • 28. x2[0] = x[0]*x[0] y2[0] = y[0]*y[0] r2[0] = x2[0]+y2[0] x2[1] = x[1]*x[1] y2[1] = y[1]*y[1] r2[1] = x2[1]+y2[1] Instructionqueue FU x2[0] y2[0] r2[0] x2[1] y2[1] r2[1] Resultqueue
  • 30. constexpr auto max_iter = 50U; auto mandelbrot (double cx, double cy) { auto x = cx ; auto y = cy ; auto iter = max_iter; for (; iter > 0; --iter) { auto x2 = x*x; auto y2 = y*y; if (x2 + y2 > 4) return iter; y = 2*x*y + cy ; x = x2 - y2 + cx ; } return iter; }
  • 31. auto mandelbrot (__m256 cx, __m256 cy) { auto x = cx; auto y = cy; int cmp_mask = 0 ; for (auto iter = max_iter; iter > 0; --iter) { auto x2 = x*x; auto y2 = y*y; auto r2 = x2 + y2; auto _4 = float8 (4.0F); cmp_mask = r2 <= _4; if (!cmp_mask) return 0; auto xy = x*y; y = xy + xy + cy; x = x2 - y2 + cx; } return cmp_mask; } Uses the mathematical properties of mandelbrot Uses knowledge that inf and NaN <= 4 is false
  • 33. constexpr auto max_iter = 50U; auto mandelbrot (double cx, double cy) { auto x = cx ; auto y = cy ; auto iter = max_iter; for (; iter > 0; --iter) { auto x2 = x*x; auto y2 = y*y; if (x2 + y2 > 4) return iter; y = 2*x*y + cy ; x = x2 - y2 + cx ; } return iter; }