<div class="section" id="limits-of-parallelism-and-scaling">
<h1>9.4. Limits of Parallelism and Scaling</h1>
<p>While hardware support is required to achieve parallel computation, it is not
sufficient on its own. Many problems and algorithms simply do not lend themselves to
parallel computation. For example, consider merge sort as shown in <a class="reference external" href="Scaling.html#cl9-2">Code Listing
9.2</a>. Although the left and right halves of the array
could be sorted in parallel with two threads, the <code class="docutils literal notranslate"><span class="pre">merge()</span></code> routine cannot be
parallelized; a single thread must traverse both the left and right
halves to assemble the result. Consequently, there are limits to how much merge
sort can be improved with parallel execution.</p>
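<p>To make this concrete, the following is a minimal sketch (ours, not the book's
Code Listing 9.2) of the two-thread approach using POSIX threads: a helper thread
sorts the left half while the main thread sorts the right half, but the final
<code class="docutils literal notranslate"><span class="pre">merge()</span></code> still runs in a single thread. Error handling is omitted for brevity.</p>
<div class="highlight"><pre>
/* A sketch of two-thread merge sort; not the book's Code Listing 9.2. */
#include &lt;pthread.h&gt;
#include &lt;stdlib.h&gt;
#include &lt;string.h&gt;

struct span { int *data; size_t len; };

/* Merge the sorted halves [0, mid) and [mid, len); a single thread
   must traverse both halves to assemble the result. */
static void merge(int *data, size_t len, size_t mid) {
    int *tmp = malloc(len * sizeof(int));
    size_t i = 0, j = mid, k = 0;
    while (i &lt; mid &amp;&amp; j &lt; len)
        tmp[k++] = (data[i] &lt;= data[j]) ? data[i++] : data[j++];
    while (i &lt; mid) tmp[k++] = data[i++];
    while (j &lt; len) tmp[k++] = data[j++];
    memcpy(data, tmp, len * sizeof(int));
    free(tmp);
}

/* Ordinary sequential merge sort. */
static void msort(int *data, size_t len) {
    if (len &lt; 2) return;
    msort(data, len / 2);
    msort(data + len / 2, len - len / 2);
    merge(data, len, len / 2);
}

static void *msort_thread(void *arg) {
    struct span *s = arg;
    msort(s-&gt;data, s-&gt;len);
    return NULL;
}

/* Sort the halves in parallel, then merge sequentially. */
void parallel_msort(int *data, size_t len) {
    pthread_t helper;
    struct span left = { data, len / 2 };
    pthread_create(&amp;helper, NULL, msort_thread, &amp;left);
    msort(data + len / 2, len - len / 2);   /* right half in this thread */
    pthread_join(&amp;helper, NULL);
    merge(data, len, len / 2);              /* sequential bottleneck */
}
</pre></div>
<p>Even with both halves sorted concurrently, the sequential <code class="docutils literal notranslate"><span class="pre">merge()</span></code>
bounds the achievable speedup; that bound is exactly what Amdahl's law quantifies.</p>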
<div class="section" id="amdahl-s-law-and-strong-scaling">
<h2>9.4.1. Amdahl’s Law and Strong Scaling</h2>
<p><a class="reference internal" href="Glossary.html#term-amdahl-s-law"><span class="xref std std-term">Amdahl’s law</span></a> provides a way to quantify the theoretical maximum
<a class="reference internal" href="Glossary.html#term-speedup-in-latency"><span class="xref std std-term">speedup in latency</span></a> (also called the <a class="reference internal" href="Glossary.html#term-speedup-factor"><span class="xref std std-term">speedup factor</span></a> or just
<a class="reference internal" href="Glossary.html#term-speedup-factor"><span class="xref std std-term">speedup</span></a>) that can occur with parallel execution. Specifically, Amdahl’s
law describes the ratio of the original execution time to the improved
execution time, assuming perfect parallelism and no overhead penalty. That is,
Amdahl’s law provides a theoretical limit on how much faster a program can run
if it is parallelized. If $p$ denotes the fraction of a program that can be
executed in parallel and $N$ denotes the number of parallel execution units,
Amdahl’s law states that the theoretical maximum speedup would be:</p>
<center>
<span class="math inline">$\large S = \displaystyle\frac{1}{(1 - p) + \frac{p}{N}}$</span>
</center>
<br /><p>This formula can be derived naturally by taking the ratio of the original
execution time to the improved execution time of the parallelized version. If
we use $T_{orig}$ to denote the original execution time, then $(1 - p) T_{orig}$
denotes the portion of the execution time that must run sequentially.
Assuming perfect parallelism as Amdahl’s law does, the remainder of the time
would be divided across the $N$ processors. This gives us the derivation of Amdahl’s law:</p>
<center>
<span class="math inline">$\large S = \displaystyle\frac{T_{orig}}{T_{parallel}} =
\frac{T_{orig}}{(1 - p)T_{orig} + \frac{p}{N}T_{orig}} =
\frac{T_{orig}}{((1 - p) + \frac{p}{N}) T_{orig}}$</span>
</center>
<br /><p>By cancelling the $T_{orig}$ terms in the numerator and denominator, we
are left with the formulation of Amdahl’s law above. A variant of Amdahl’s law uses
$f$ to denote the portion that must run sequentially; that is, $f = 1 - p$.
This leads to another derivation of Amdahl’s law. The result is identical
to that of the original formulation, but the calculation might be easier.</p>
<center>
<span class="math inline">$\large S = \displaystyle\frac{1}{(1 - p) + \frac{p}{N}} =
\frac{N}{N(1 - p) + p} = \frac{N}{Nf + 1 - f} = \frac{N}{1 + (N - 1)f}$</span>
</center>
<br /><div class="topic border border-dark rounded-lg bg-light px-2 mb-3">
<div class="figure align-left">
<a class="reference internal image-reference" href="_images/CSF-Images-Example.png"><img alt="Decorative example icon" src="_images/CSF-Images-Example.png" style="width: 100%;" /></a>
</div>
<p class="topic-title first pt-2 mb-1">Example 9.4.1 </p><hr class="mt-1" />
<p>As an example, consider a program that runs in 20 ms. The way the program is
written, 20% of it must run sequentially; the remaining 80% will run in
parallel on a quad-core processor. Per Amdahl’s law, the maximum theoretical speedup of
this program would be:</p>
<center>
<span class="math inline">$\large S = \displaystyle\frac{1}{0.2 + \frac{0.8}{4}} =
\frac{1}{0.2 + 0.2} = 2.5$</span>
</center>
<br /><p>Using the alternative derivation, we get the same result:</p>
<center>
<span class="math inline">$\large S = \displaystyle\frac{4}{1 + (4 - 1)(0.2)} =
\frac{4}{1 + 0.6} = \frac{4}{1.6} = 2.5$</span>
</center>
<br /></div>
<p>In this case, we could also determine that the parallelized version would spend
4 ms in the sequential portion of the program. The remaining portion (16 ms in
the original) would be divided across the 4 cores, so the parallel portion would
take 4 ms. Consequently, parallelizing the program would improve the run time
from 20 ms to 8 ms, which is a speedup factor of 2.5. The advantage of Amdahl’s
law, however, is that <strong>we did not need to know the original run time</strong>. As long
as we know what portion can be parallelized and how many processing units we
have, we can determine the speedup factor. Amdahl’s law also emphasizes a key
point about parallelism: <strong>improving the portion of a program that can be
parallelized has more impact than increasing the amount of parallelism</strong>.</p>
<div class="topic border border-dark rounded-lg bg-light px-2 mb-3" id="amdahl">
<div class="figure align-left">
<a class="reference internal image-reference" href="_images/CSF-Images-Example.png"><img alt="Decorative example icon" src="_images/CSF-Images-Example.png" style="width: 100%;" /></a>
</div>
<p class="topic-title first pt-2 mb-1">Example 9.4.2 </p><hr class="mt-1" />
<p>To illustrate this point, let us consider two variants of the previous scenario.
In the first variant, the program has been restructured so that 90% can be
parallelized rather than 80%. This leads to a speedup factor of:</p>
<center>
<span class="math inline">$\large S = \displaystyle\frac{1}{0.1 + \frac{0.9}{4}} =
\frac{1}{0.1 + 0.225} \approx 3.08$</span>
</center>
<br /><p>In the second variant, we can still only parallelize 80% of the program, but we
have increased the number of cores from four to six. This variant produces a speedup of:</p>
<center>
<span class="math inline">$\large S = \displaystyle\frac{1}{0.2 + \frac{0.8}{6}} =
\frac{1}{0.2 + 0.133} \approx 3.00$</span>
</center>
<br /><p>In other words, increasing the percent of parallelized code by 12.5% yielded a
bigger improvement than increasing the number of cores by 50%.</p>
</div>
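<p>Since Amdahl’s law is just arithmetic, these examples are easy to check
programmatically. The following short C program (ours, not from the text)
computes the speedup for Example 9.4.1 and both variants from Example 9.4.2:</p>
<div class="highlight"><pre>
#include &lt;stdio.h&gt;

/* Theoretical maximum speedup per Amdahl's law for parallel
   fraction p and N processing units. */
static double amdahl(double p, int n) {
    return 1.0 / ((1.0 - p) + p / n);
}

int main(void) {
    double s = amdahl(0.8, 4);
    printf("p=0.8, N=4: S = %.2f\n", s);                    /* 2.50 */
    printf("20 ms program now takes %.1f ms\n", 20.0 / s);  /* 8.0 */
    printf("p=0.9, N=4: S = %.2f\n", amdahl(0.9, 4));       /* ~3.08 */
    printf("p=0.8, N=6: S = %.2f\n", amdahl(0.8, 6));       /* 3.00 */
    return 0;
}
</pre></div>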
<p>As the number of processing units continues to increase, the precise calculation
of Amdahl’s law becomes less important. Specifically, we can determine a quick
approximation of the speedup limit by considering the impact of an arbitrarily
large number of processing units; that is, we can derive a simplified estimate
by calculating the limit of $S$ as $N$ goes to infinity:</p>
<center>
<span class="math inline">$\large \displaystyle\lim_{N \to \infty} \frac{1}{(1 - p) + \frac{p}{N}}
= \frac{1}{(1 - p) + 0} = \frac{1}{1 - p} = \frac{1}{f}$</span>
</center>
<br /><p>Using this simplified estimate, we can determine that the upper bound on the
speedup for the program in <a href="Scaling.html#amdahl">Example 9.4.2</a> would be 5 (i.e., 1 / 0.2).</p>
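<p>A short loop (again ours, not from the text) makes the diminishing returns
visible: with $p = 0.8$ fixed, the speedup creeps toward, but never reaches,
the $\frac{1}{1 - p} = 5$ bound no matter how many processing units we add:</p>
<div class="highlight"><pre>
#include &lt;stdio.h&gt;

int main(void) {
    double p = 0.8;
    for (int n = 4; n &lt;= 4096; n *= 4)
        printf("N = %4d: S = %.3f\n", n, 1.0 / ((1.0 - p) + p / n));
    /* Prints 2.500, 4.000, 4.706, 4.923, 4.981, 4.995, converging
       toward the limit of 5.000 as N grows. */
    return 0;
}
</pre></div>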
</div>
<div class="section" id="gustafson-s-law-and-weak-scaling">
<h2>9.4.2. Gustafson’s Law and Weak Scaling</h2>
<p>Although Amdahl’s law provides an initial estimate to quantify the speedup from
parallel execution, it is important to note that it rests on unrealistic
assumptions. Amdahl’s law assumes that the problem solved by the program
exhibits <a class="reference internal" href="Glossary.html#term-strong-scaling"><span class="xref std std-term">strong scaling</span></a>, meaning that the size of the problem is
unaffected by the number of processors involved. Under perfectly strong scaling,
there is no overhead penalty for creating more threads or using more processors;
a program run on a system with 100 processors will run in 1/100<sup>th</sup>
of the time that it would take on a single-processor system. In contrast, a more
realistic and common property is <a class="reference internal" href="Glossary.html#term-weak-scaling"><span class="xref std std-term">weak scaling</span></a>, which emphasizes
accomplishing more work rather than running in less time. In weak scaling,
additional processors are used to tackle bigger and more complex problems, while
holding the expected run time constant.</p>
<div class="figure mb-2 align-right" id="id2" style="width: 30%">
<span id="gustafson"></span><a class="reference internal image-reference" href="_images/CSF-Images.9.6.png"><img class="p-3 mb-2 align-center border border-dark rounded-lg" alt="Fence painting appears to show strong scaling initially, but only for a few painters" src="_images/CSF-Images.9.6.png" style="width: 90%;" /></a>
<p class="caption align-center px-3"><span class="caption-text"> Figure 9.4.3: Fence painting appears to show strong scaling initially, but only for a few
painters</span></p>
</div>
<p>To illustrate the difference between strong and weak scaling, consider a
painting business. <a href="Scaling.html#gustafson">Figure 9.4.3</a> illustrates the scenario
where the company has been hired to paint a fence that is 20 feet in length.
This job initially seems to exhibit strong scaling. If one painter could finish
painting the fence in one hour, then four painters could probably finish the job
in 15 minutes. However, as more painters are added, the scaling becomes weak. If
the company tries to send 20 painters for the same fence, they are unlikely to
finish the job in only three minutes. Rather, the fence would become too crowded
and painters would have to wait on each other. A better choice for the company
would be to send the additional 16 painters to paint other fences. If they work
in groups of four to paint multiple 20-foot fences, the company could paint five
fences in a 15-minute time period. Alternatively, the company could choose to
send just one painter per fence, completing 20 fences in a single hour. The
fences are not necessarily painted any faster than before, but the company is
getting more work accomplished in the same amount of time.</p>
<p>The usefulness of Amdahl’s law is limited by its reliance on strong scaling and
its unrealistic assumptions about parallel execution. Specifically, Amdahl’s law
deliberately ignores any performance cost associated with creating and managing
threads, as well as system-specific factors such as non-uniform memory access (NUMA)
or processor workload.
Amdahl’s law is also limited by its exclusive focus on parallelism; it
cannot be used, for instance, to predict the impact of changing the layout of data
within a NUMA architecture.
<a class="reference internal" href="Glossary.html#term-gustafson-s-law"><span class="xref std std-term">Gustafson’s law</span></a> provides an alternative formulation of speedup that
addresses these limitations.</p>
<p>Like Amdahl’s law, Gustafson’s law uses $p$ to denote the fraction of the
work that can benefit from an improvement of some sort. Unlike Amdahl’s law,
this improvement is not tied solely to parallelism; it could result
from a change in how the OS manages threads, moving data around within a
NUMA architecture, increasing the parallelism, or any other such change. The
amount of the improvement <a class="footnote-reference" href="Scaling.html#f49" id="id1">[1]</a> is denoted as $s$. Gustafson’s law then states
that the maximum theoretical speedup of deploying the improvement is:</p>
<center>
<span class="math inline">$\large S = 1 - p + sp$</span>
</center>
<br /><div class="topic border border-dark rounded-lg bg-light px-2 mb-3">
<div class="figure align-left">
<a class="reference internal image-reference" href="_images/CSF-Images-Example.png"><img alt="Decorative example icon" src="_images/CSF-Images-Example.png" style="width: 100%;" /></a>
</div>
<p class="topic-title first pt-2 mb-1">Example 9.4.3 </p><hr class="mt-1" />
<p>As an example, consider a program that can be partially improved with parallel
execution. Let us assume that 20% of the program cannot be improved, and some
initial empirical results suggest that the parallelizable portion runs in
1/5<sup>th</sup> of the time that it takes sequentially (i.e., an
improvement factor of 5). Note that this assumes nothing about how
many processors are used, so it can be based on realistic measurements from
running some initial tests. In this case, the speedup would be:</p>
<center>
<span class="math inline">$\large S = 0.2 + 5 \cdot 0.8 = 0.2 + 4.0 = 4.2$</span>
</center>
<br /></div>
<p>It is important to note that this speedup has a different meaning than the speedup
described by Amdahl’s law. This speedup factor does not mean that the program
runs 4.2 times as fast as the original; that assertion rests on strong
scaling. Instead, the proper interpretation of speedup under Gustafson’s law
is that the program can achieve 4.2 times as much work in the same
amount of time, which is based on weak scaling. If the original program could
process 10 MB of data in a minute, then the improved version could process 42 MB
<strong>in the same amount of time</strong>. With Gustafson’s law, the emphasis is on
<a class="reference internal" href="Glossary.html#term-throughput"><span class="xref std std-term">throughput</span></a> (the amount of work done) rather than a faster time.</p>
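<p>As before, a small program (ours, not from the text) can capture this
calculation. Note that the result is a throughput multiplier, not a latency
improvement:</p>
<div class="highlight"><pre>
#include &lt;stdio.h&gt;

/* Maximum theoretical speedup per Gustafson's law for improvable
   fraction p and improvement factor s. */
static double gustafson(double p, double s) {
    return 1.0 - p + s * p;
}

int main(void) {
    double S = gustafson(0.8, 5.0);    /* Example 9.4.3 */
    printf("S = %.1f\n", S);           /* 4.2 */
    /* Weak scaling: 4.2x the work in the same time, so a program
       that processed 10 MB per minute could handle 42 MB. */
    printf("throughput: %.0f MB/min\n", 10.0 * S);
    return 0;
}
</pre></div>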
<table class="docutils footnote" frame="void" id="f49" rules="none">
<colgroup><col class="label" /><col /></colgroup>
<tbody valign="top">
<tr><td class="label"><a class="fn-backref" href="Scaling.html#id1">[1]</a></td><td>To be fair, Gustafson’s law can also be criticized for ignoring
complicating factors such as synchronization or communication overhead.
However, this objection is not as strong as it is for Amdahl’s law, as the
improvement factor $s$ can be determined more empirically.</td></tr>
</tbody>
</table>
</div>
</div>