<!DOCTYPE html>
|
|||
|
|
|||
|
|
|||
|
|
|||
|
|
|||
|
<html lang="en">
|
|||
|
<head>
|
|||
|
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
|
|||
|
|
|||
|
<title>9.3. Parallel Design Patterns — Computer Systems Fundamentals</title>
|
|||
|
|
|||
|
<link rel="stylesheet" href="_static/css/bootstrap.min.css" integrity="sha384-ggOyR0iXCbMQv3Xipma34MD+dH/1fQ784/j6cY/iJTQUOhcWr7x9JvoRxT2MZw1T" crossorigin="anonymous" />
|
|||
|
<link rel="stylesheet" href="_static/css/pygments.css" type="text/css" />
|
|||
|
<link rel="stylesheet" href="_static/css/normalize.css" type="text/css" />
|
|||
|
<link rel="stylesheet" href="../../../JSAV/css/JSAV.css" type="text/css" />
|
|||
|
<link rel="stylesheet" href="../../../lib/odsaMOD-min.css" type="text/css" />
|
|||
|
<link rel="stylesheet" href="_static/css/jquery-1.11.4-smoothness-ui.css" type="text/css" />
|
|||
|
<link rel="stylesheet" href="../../../lib/odsaStyle-min.css" type="text/css" />
|
|||
|
<link rel="stylesheet" href="_static/css/csf.css" type="text/css" />
|
|||
|
|
|||
|
<style>
  .underline { text-decoration: underline; }
</style>
|
|||
|
|
|||
|
<script type="text/javascript">
  // Sphinx client-side configuration consumed by doctools/searchtools.
  // (Removed stray "|" artifact lines that made this script a syntax error.)
  var DOCUMENTATION_OPTIONS = {
    URL_ROOT: './',
    VERSION: '0.4.1',
    COLLAPSE_INDEX: false,
    FILE_SUFFIX: '.html',
    HAS_SOURCE: true
  };
</script>
|
|||
|
|
|||
|
<script type="text/x-mathjax-config">
  // MathJax 2.x configuration: $...$ and \(...\) delimit inline math,
  // $$...$$ and \[...\] delimit display math; processEscapes lets \$ pass
  // through as a literal dollar sign. Output is scaled to 80% of surrounding
  // text. (Removed stray "|" artifact lines that corrupted this block;
  // "scale" is documented as a number, not a string.)
  MathJax.Hub.Config({
    tex2jax: {
      inlineMath: [['$','$'], ['\\(','\\)']],
      displayMath: [['$$','$$'], ["\\[","\\]"]],
      processEscapes: true
    },
    "HTML-CSS": {
      scale: 80
    }
  });
</script>
|
|||
|
<link rel="shortcut icon" href="_static/favicon.ico"/>
|
|||
|
<link rel="index" title="Index" href="genindex.html" />
|
|||
|
<link rel="search" title="Search" href="search.html" />
|
|||
|
<link rel="index" title="Computer Systems Fundamentals" href="index.html" />
|
|||
|
<link rel="next" title="4. Limits of Parallelism and Scaling" href="Scaling.html" />
|
|||
|
<link rel="prev" title="2. Parallelism vs. Concurrency" href="ParVConc.html" />
|
|||
|
|
|||
|
</head><body>
|
|||
|
|
|||
|
<nav class="navbar navbar-expand-md navbar-dark navbar-custom fixed-top">
|
|||
|
|
|||
|
<a class="navbar-brand py-0" href="index.html"><img src="_static/CSF-Logo-Square-Text.png" alt="OpenCSF Logo" height="40" class="py-1 px-2 mb-0 align-center rounded-lg bg-white" /></a>
|
|||
|
<!-- Show a navbar toggler on mobile -->
|
|||
|
<button class="navbar-toggler" type="button" data-toggle="collapse" data-target="#defaultNavbars" aria-controls="defaultNavbars" aria-expanded="false" aria-label="Toggle navigation">
|
|||
|
<span class="navbar-toggler-icon"></span>
|
|||
|
</button>
|
|||
|
<div class="collapse navbar-collapse" id="defaultNavbars">
|
|||
|
<ul class="navbar-nav mr-auto">
|
|||
|
<li class="nav-item dropdown">
|
|||
|
<a class="nav-link dropdown-toggle jmu-gold rounded" href="ParallelDesign.html#" id="navbarDropdownChapters" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">Contents</a>
|
|||
|
<div class="dropdown-menu scrollable-menu" role="menu" aria-labelledby="navbarDropdownChapters">
|
|||
|
<a class="dropdown-item" tabindex="-1" href="ParallelDesign.html#"><b>Chapter 1</b></a>
|
|||
|
<a class="dropdown-item" href="IntroConcSysOverview.html"> 1.1. Introduction to Concurrent Systems</a>
|
|||
|
<a class="dropdown-item" href="SysAndModels.html"> 1.2. Systems and Models</a>
|
|||
|
<a class="dropdown-item" href="Themes.html"> 1.3. Themes and Guiding Principles</a>
|
|||
|
<a class="dropdown-item" href="Architectures.html"> 1.4. System Architectures</a>
|
|||
|
<a class="dropdown-item" href="StateModels.html"> 1.5. State Models in UML</a>
|
|||
|
<a class="dropdown-item" href="SequenceModels.html"> 1.6. Sequence Models in UML</a>
|
|||
|
<a class="dropdown-item" href="StateModelImplementation.html"> 1.7. Extended Example: State Model Implementation</a>
|
|||
|
<div class="dropdown-divider"></div>
|
|||
|
<a class="dropdown-item disabled"><b>Chapter 2</b></a>
|
|||
|
<a class="dropdown-item" href="ProcessesOverview.html"> 2.1. Processes and OS Basics</a>
|
|||
|
<a class="dropdown-item" href="Multiprogramming.html"> 2.2. Processes and Multiprogramming</a>
|
|||
|
<a class="dropdown-item" href="KernelMechanics.html"> 2.3. Kernel Mechanics</a>
|
|||
|
<a class="dropdown-item" href="Syscall.html"> 2.4. System Call Interface</a>
|
|||
|
<a class="dropdown-item" href="ProcessCycle.html"> 2.5. Process Life Cycle</a>
|
|||
|
<a class="dropdown-item" href="UnixFile.html"> 2.6. The UNIX File Abstraction</a>
|
|||
|
<a class="dropdown-item" href="EventsSignals.html"> 2.7. Events and Signals</a>
|
|||
|
<a class="dropdown-item" href="Extended2Processes.html"> 2.8. Extended Example: Listing Files with Processes</a>
|
|||
|
<div class="dropdown-divider"></div>
|
|||
|
<a class="dropdown-item disabled"><b>Chapter 3</b></a>
|
|||
|
<a class="dropdown-item" href="IPCOverview.html"> 3.1. Concurrency with IPC</a>
|
|||
|
<a class="dropdown-item" href="IPCModels.html"> 3.2. IPC Models</a>
|
|||
|
<a class="dropdown-item" href="Pipes.html"> 3.3. Pipes and FIFOs</a>
|
|||
|
<a class="dropdown-item" href="MMap.html"> 3.4. Shared Memory With Memory-mapped Files</a>
|
|||
|
<a class="dropdown-item" href="POSIXvSysV.html"> 3.5. POSIX vs. System V IPC</a>
|
|||
|
<a class="dropdown-item" href="MQueues.html"> 3.6. Message Passing With Message Queues</a>
|
|||
|
<a class="dropdown-item" href="ShMem.html"> 3.7. Shared Memory</a>
|
|||
|
<a class="dropdown-item" href="IPCSems.html"> 3.8. Semaphores</a>
|
|||
|
<a class="dropdown-item" href="Extended3Bash.html"> 3.9. Extended Example: Bash-lite: A Simple Command-line Shell</a>
|
|||
|
<div class="dropdown-divider"></div>
|
|||
|
<a class="dropdown-item disabled"><b>Chapter 4</b></a>
|
|||
|
<a class="dropdown-item" href="SocketsOverview.html"> 4.1. Networked Concurrency</a>
|
|||
|
<a class="dropdown-item" href="FiveLayer.html"> 4.2. The TCP/IP Internet Model</a>
|
|||
|
<a class="dropdown-item" href="NetApps.html"> 4.3. Network Applications and Protocols</a>
|
|||
|
<a class="dropdown-item" href="Sockets.html"> 4.4. The Socket Interface</a>
|
|||
|
<a class="dropdown-item" href="TCPSockets.html"> 4.5. TCP Socket Programming: HTTP</a>
|
|||
|
<a class="dropdown-item" href="UDPSockets.html"> 4.6. UDP Socket Programming: DNS</a>
|
|||
|
<a class="dropdown-item" href="AppBroadcast.html"> 4.7. Application-Layer Broadcasting: DHCP</a>
|
|||
|
<a class="dropdown-item" href="Extended4CGI.html"> 4.8. Extended Example: CGI Web Server</a>
|
|||
|
<div class="dropdown-divider"></div>
|
|||
|
<a class="dropdown-item disabled"><b>Chapter 5</b></a>
|
|||
|
<a class="dropdown-item" href="InternetOverview.html"> 5.1. The Internet and Connectivity</a>
|
|||
|
<a class="dropdown-item" href="AppLayer.html"> 5.2. Application Layer: Overlay Networks</a>
|
|||
|
<a class="dropdown-item" href="TransLayer.html"> 5.3. Transport Layer</a>
|
|||
|
<a class="dropdown-item" href="NetSec.html"> 5.4. Network Security Fundamentals</a>
|
|||
|
<a class="dropdown-item" href="NetLayer.html"> 5.5. Network Layer: IP</a>
|
|||
|
<a class="dropdown-item" href="LinkLayer.html"> 5.6. Link Layer</a>
|
|||
|
<a class="dropdown-item" href="Wireless.html"> 5.7. Wireless Connectivity: Wi-Fi, Bluetooth, and Zigbee</a>
|
|||
|
<a class="dropdown-item" href="Extended5DNS.html"> 5.8. Extended Example: DNS client</a>
|
|||
|
<div class="dropdown-divider"></div>
|
|||
|
<a class="dropdown-item disabled"><b>Chapter 6</b></a>
|
|||
|
<a class="dropdown-item" href="ThreadsOverview.html"> 6.1. Concurrency with Multithreading</a>
|
|||
|
<a class="dropdown-item" href="ProcVThreads.html"> 6.2. Processes vs. Threads</a>
|
|||
|
<a class="dropdown-item" href="RaceConditions.html"> 6.3. Race Conditions and Critical Sections</a>
|
|||
|
<a class="dropdown-item" href="POSIXThreads.html"> 6.4. POSIX Thread Library</a>
|
|||
|
<a class="dropdown-item" href="ThreadArgs.html"> 6.5. Thread Arguments and Return Values</a>
|
|||
|
<a class="dropdown-item" href="ImplicitThreads.html"> 6.6. Implicit Threading and Language-based Threads</a>
|
|||
|
<a class="dropdown-item" href="Extended6Input.html"> 6.7. Extended Example: Keyboard Input Listener</a>
|
|||
|
<a class="dropdown-item" href="Extended6Primes.html"> 6.8. Extended Example: Concurrent Prime Number Search</a>
|
|||
|
<div class="dropdown-divider"></div>
|
|||
|
<a class="dropdown-item disabled"><b>Chapter 7</b></a>
|
|||
|
<a class="dropdown-item" href="SynchOverview.html"> 7.1. Synchronization Primitives</a>
|
|||
|
<a class="dropdown-item" href="CritSect.html"> 7.2. Critical Sections and Peterson's Solution</a>
|
|||
|
<a class="dropdown-item" href="Locks.html"> 7.3. Locks</a>
|
|||
|
<a class="dropdown-item" href="Semaphores.html"> 7.4. Semaphores</a>
|
|||
|
<a class="dropdown-item" href="Barriers.html"> 7.5. Barriers</a>
|
|||
|
<a class="dropdown-item" href="Condvars.html"> 7.6. Condition Variables</a>
|
|||
|
<a class="dropdown-item" href="Deadlock.html"> 7.7. Deadlock</a>
|
|||
|
<a class="dropdown-item" href="Extended7Events.html"> 7.8. Extended Example: Event Log File</a>
|
|||
|
<div class="dropdown-divider"></div>
|
|||
|
<a class="dropdown-item disabled"><b>Chapter 8</b></a>
|
|||
|
<a class="dropdown-item" href="SynchProblemsOverview.html"> 8.1. Synchronization Patterns and Problems</a>
|
|||
|
<a class="dropdown-item" href="SynchDesign.html"> 8.2. Basic Synchronization Design Patterns</a>
|
|||
|
<a class="dropdown-item" href="ProdCons.html"> 8.3. Producer-Consumer Problem</a>
|
|||
|
<a class="dropdown-item" href="ReadWrite.html"> 8.4. Readers-Writers Problem</a>
|
|||
|
<a class="dropdown-item" href="DiningPhil.html"> 8.5. Dining Philosophers Problem and Deadlock</a>
|
|||
|
<a class="dropdown-item" href="CigSmokers.html"> 8.6. Cigarette Smokers Problem and the Limits of Semaphores and Locks</a>
|
|||
|
<a class="dropdown-item" href="Extended8ModExp.html"> 8.7. Extended Example: Parallel Modular Exponentiation</a>
|
|||
|
<div class="dropdown-divider"></div>
|
|||
|
<a class="dropdown-item disabled"><b>Chapter 9</b></a>
|
|||
|
<a class="dropdown-item" href="ParallelDistributedOverview.html"> 9.1. Parallel and Distributed Systems</a>
|
|||
|
<a class="dropdown-item" href="ParVConc.html"> 9.2. Parallelism vs. Concurrency</a>
|
|||
|
<a class="dropdown-item" href="ParallelDesign.html"> 9.3. Parallel Design Patterns</a>
|
|||
|
<a class="dropdown-item" href="Scaling.html"> 9.4. Limits of Parallelism and Scaling</a>
|
|||
|
<a class="dropdown-item" href="DistTiming.html"> 9.5. Timing in Distributed Environments</a>
|
|||
|
<a class="dropdown-item" href="DistDataStorage.html"> 9.6. Reliable Data Storage and Location</a>
|
|||
|
<a class="dropdown-item" href="DistConsensus.html"> 9.7. Consensus in Distributed Systems</a>
|
|||
|
<a class="dropdown-item" href="Extended9Blockchain.html"> 9.8. Extended Example: Blockchain Proof-of-Work</a>
|
|||
|
<div class="dropdown-divider"></div>
|
|||
|
<a class="dropdown-item disabled"><b>Appendix A</b></a>
|
|||
|
<a class="dropdown-item" href="CLangOverview.html"> A.1. C Language Reintroduction</a>
|
|||
|
<a class="dropdown-item" href="Debugging.html"> A.2. Documentation and Debugging</a>
|
|||
|
<a class="dropdown-item" href="BasicTypes.html"> A.3. Basic Types and Pointers</a>
|
|||
|
<a class="dropdown-item" href="Arrays.html"> A.4. Arrays, Structs, Enums, and Type Definitions</a>
|
|||
|
<a class="dropdown-item" href="Functions.html"> A.5. Functions and Scope</a>
|
|||
|
<a class="dropdown-item" href="Pointers.html"> A.6. Pointers and Dynamic Allocation</a>
|
|||
|
<a class="dropdown-item" href="Strings.html"> A.7. Strings</a>
|
|||
|
<a class="dropdown-item" href="FunctionPointers.html"> A.8. Function Pointers</a>
|
|||
|
<a class="dropdown-item" href="Files.html"> A.9. Files</a>
|
|||
|
</div>
|
|||
|
</li>
|
|||
|
|
|||
|
|
|||
|
|
|||
|
</ul>
|
|||
|
</div>
|
|||
|
|
|||
|
<ul class="navbar-nav flex-row ml-md-auto d-none d-md-flex">
|
|||
|
<li class="nav-item"><a class="nav-link jmu-gold" href="https://w3.cs.jmu.edu/kirkpams/OpenCSF/Books/csf/source/ParallelDesign.rst"
|
|||
|
target="_blank" rel="nofollow">Show Source</a></li>
|
|||
|
|
|||
|
</ul>
|
|||
|
</nav>
|
|||
|
|
|||
|
|
|||
|
<div class="container center">
|
|||
|
«  <a id="prevmod" href="ParVConc.html">9.2. Parallelism vs. Concurrency</a>
|
|||
|
  ::  
|
|||
|
<a class="uplink" href="index.html">Contents</a>
|
|||
|
  ::  
|
|||
|
<a id="nextmod" href="Scaling.html">9.4. Limits of Parallelism and Scaling</a>  »
|
|||
|
|
|||
|
</div>
|
|||
|
<br />
|
|||
|
|
|||
|
|
|||
|
|
|||
|
<script type="text/javascript" src="_static/js/jquery-2.1.4.min.js"></script>
|
|||
|
<script type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.1/MathJax.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
|||
|
<script type="text/javascript" src="_static/js/jquery-1.11.4-ui.min.js"></script>
|
|||
|
<script type="text/javascript" src="_static/js/forge-0.7.0.min.js"></script>
|
|||
|
<script type="text/javascript" src="../../../JSAV/lib/jquery.transit.js"></script>
|
|||
|
<script type="text/javascript" src="../../../JSAV/lib/raphael.js"></script>
|
|||
|
<script type="text/javascript" src="../../../JSAV/build/JSAV-min.js"></script>
|
|||
|
<script type="text/javascript" src="_static/js/config.js"></script>
|
|||
|
<script type="text/javascript" src="../../../lib/odsaUtils-min.js"></script>
|
|||
|
<script type="text/javascript" src="../../../lib/odsaMOD-min.js"></script>
|
|||
|
<script type="text/javascript" src="_static/js/d3-4.13.0.min.js"></script>
|
|||
|
<script type="text/javascript" src="_static/js/d3-selection-multi.v1.min.js"></script>
|
|||
|
<script type="text/javascript" src="../../../lib/dataStructures.js"></script>
|
|||
|
|
|||
|
|
|||
|
|
|||
|
|
|||
|
|
|||
|
|
|||
|
<div class="container">
|
|||
|
|
|||
|
<script>ODSA.SETTINGS.DISP_MOD_COMP = true;ODSA.SETTINGS.MODULE_NAME = "ParallelDesign";ODSA.SETTINGS.MODULE_LONG_NAME = "Parallel Design Patterns";ODSA.SETTINGS.MODULE_CHAPTER = "Parallel and Distributed Systems"; ODSA.SETTINGS.BUILD_DATE = "2021-06-14 17:15:25"; ODSA.SETTINGS.BUILD_CMAP = false;JSAV_OPTIONS['lang']='en';JSAV_EXERCISE_OPTIONS['code']='java';</script><div class="section" id="parallel-design-patterns">
|
|||
|
<h1>9.3. Parallel Design Patterns<a class="headerlink" href="ParallelDesign.html#parallel-design-patterns" title="Permalink to this headline">¶</a></h1>
|
|||
|
<p>There are multiple levels of parallel design patterns that can be applied to a
|
|||
|
program. At the highest level, <em>algorithmic strategy patterns</em> are
|
|||
|
strategies for decomposing a problem in its most abstract form. Next,
|
|||
|
<em>implementation strategy patterns</em> are practical techniques for
|
|||
|
implementing parallel execution in the source code. At the lowest level,
|
|||
|
<em>parallel execution patterns</em> dictate how the software is run on specific
|
|||
|
parallel hardware architectures.</p>
|
|||
|
<div class="section" id="algorithmic-strategy-patterns">
|
|||
|
<h2>9.3.1. Algorithmic Strategy Patterns<a class="headerlink" href="ParallelDesign.html#algorithmic-strategy-patterns" title="Permalink to this headline">¶</a></h2>
|
|||
|
<p>The first step in designing parallel processing software is to identify
|
|||
|
opportunities for concurrency within your program. The two fundamental
|
|||
|
approaches for parallel algorithms are identifying possibilities for <a class="reference internal" href="Glossary.html#term-task-parallelism"><span class="xref std std-term">task
|
|||
|
parallelism</span></a> and <a class="reference internal" href="Glossary.html#term-data-parallelism"><span class="xref std std-term">data parallelism</span></a>. Task parallelism refers to
|
|||
|
decomposing the problem into multiple sub-tasks, all of which can be separated
|
|||
|
and run in parallel. Data parallelism, on the other hand, refers to performing
|
|||
|
the same operation on several different pieces of data concurrently. Task
|
|||
|
parallelism is sometimes referred to as <em>functional decomposition</em>, whereas data
|
|||
|
parallelism is also known as <em>domain decomposition</em>.</p>
|
|||
|
<p>A common example of task parallelism is input event handling: One task is
|
|||
|
responsible for detecting and processing keyboard presses, while another task is
|
|||
|
responsible for handling mouse clicks. <a class="reference external" href="ParallelDesign.html#cl9-1">Code Listing 9.1</a> illustrates
|
|||
|
an easy opportunity for data parallelism. Since each array element is modified
|
|||
|
independently of the rest of the array, it is possible to set every array
|
|||
|
element’s value at the same time. The previous examples are instances of
|
|||
|
<a class="reference internal" href="Glossary.html#term-embarrassingly-parallel"><span class="xref std std-term">embarrassingly parallel problems</span></a> <a class="footnote-reference" href="ParallelDesign.html#f47" id="id1">[1]</a>, which
|
|||
|
require little or no effort to parallelize, and they can easily be classified as
|
|||
|
task or data parallelism.</p>
|
|||
|
<div class="highlight-c border border-dark rounded-lg bg-light px-0 mb-3 notranslate" id="cl9-1"><table class="highlighttable"><tr><td class="linenos px-0 mx-0"><div class="linenodiv"><pre class="mb-0">1
|
|||
|
2
|
|||
|
3
|
|||
|
4
|
|||
|
5
|
|||
|
6
|
|||
|
7</pre></div></td><td class="code"><div class="highlight bg-light"><pre class="mb-0"><span></span><span class="cm">/* Code Listing 9.1:</span>
|
|||
|
<span class="cm"> An embarrassingly parallel loop, as each array element is initialized independently</span>
|
|||
|
<span class="cm"> of all other elements.</span>
|
|||
|
<span class="cm"> */</span>
|
|||
|
|
|||
|
<span class="k">for</span> <span class="p">(</span><span class="n">i</span> <span class="o">=</span> <span class="mi">0</span><span class="p">;</span> <span class="n">i</span> <span class="o"><</span> <span class="mi">1000000000</span><span class="p">;</span> <span class="n">i</span><span class="o">++</span><span class="p">)</span>
|
|||
|
<span class="n">array</span><span class="p">[</span><span class="n">i</span><span class="p">]</span> <span class="o">=</span> <span class="n">i</span> <span class="o">*</span> <span class="n">i</span><span class="p">;</span>
|
|||
|
</pre></div>
|
|||
|
</td></tr></table></div>
|
|||
|
<p>There are other ways to classify algorithms that exploit parallelism. One common
|
|||
|
approach is a <a class="reference internal" href="Glossary.html#term-recursive-splitting"><span class="xref std std-term">recursive splitting</span></a> or <a class="reference internal" href="Glossary.html#term-divide-and-conquer-algorithm"><span class="xref std std-term">divide-and-conquer</span></a>
|
|||
|
strategy. In a divide-and-conquer strategy, a complex task is broken down into
|
|||
|
concurrent sub-tasks. One example of this strategy is the quicksort algorithm:
|
|||
|
The array is first partitioned into two sub-arrays based on a pivot value; the
|
|||
|
sub-arrays can then be sorted recursively in parallel.</p>
|
|||
|
<p><em>Merge sort</em> is a common example of an algorithm that is both
|
|||
|
embarrassingly parallel and a divide-and-conquer approach. Consider the basic
|
|||
|
outline of the algorithm, as shown in <a class="reference external" href="ParallelDesign.html#cl9-2">Code Listing 9.2</a>. Merge sort
|
|||
|
begins by recursively splitting an array into two halves <a class="footnote-reference" href="ParallelDesign.html#f48" id="id2">[2]</a>. The left and
|
|||
|
right halves are sorted independently, then their resulting sorted versions are
|
|||
|
merged. This algorithm is considered embarrassingly parallel, because sorting
|
|||
|
the left and right halves in parallel involves naturally independent tasks. It is
|
|||
|
also considered divide-and-conquer because it takes the larger problem and
|
|||
|
breaks it down into smaller tasks that can be parallelized. Not all
|
|||
|
divide-and-conquer algorithms are embarrassingly parallel, and vice versa;
|
|||
|
however, there is a significant amount of overlap between these classifications.</p>
|
|||
|
<div class="highlight-c border border-dark rounded-lg bg-light px-0 mb-3 notranslate" id="cl9-2"><table class="highlighttable"><tr><td class="linenos px-0 mx-0"><div class="linenodiv"><pre class="mb-0"> 1
|
|||
|
2
|
|||
|
3
|
|||
|
4
|
|||
|
5
|
|||
|
6
|
|||
|
7
|
|||
|
8
|
|||
|
9
|
|||
|
10
|
|||
|
11
|
|||
|
12
|
|||
|
13
|
|||
|
14
|
|||
|
15</pre></div></td><td class="code"><div class="highlight bg-light"><pre class="mb-0"><span></span><span class="cm">/* Code Listing 9.2:</span>
|
|||
|
<span class="cm"> Merge sort is an embarrassingly parallel problem, given its natural</span>
|
|||
|
<span class="cm"> divide-and-conquer structure</span>
|
|||
|
<span class="cm"> */</span>
|
|||
|
|
|||
|
<span class="kt">void</span>
|
|||
|
<span class="nf">mergesort</span> <span class="p">(</span><span class="kt">int</span> <span class="o">*</span> <span class="n">values</span><span class="p">,</span> <span class="kt">int</span> <span class="n">start</span><span class="p">,</span> <span class="kt">int</span> <span class="n">end</span><span class="p">)</span>
|
|||
|
<span class="p">{</span>
|
|||
|
<span class="k">if</span> <span class="p">(</span><span class="n">start</span> <span class="o">>=</span> <span class="n">end</span><span class="p">)</span>
|
|||
|
<span class="k">return</span><span class="p">;</span>
|
|||
|
<span class="kt">int</span> <span class="n">mid</span> <span class="o">=</span> <span class="p">(</span><span class="n">start</span> <span class="o">+</span> <span class="n">end</span><span class="p">)</span> <span class="o">/</span> <span class="mi">2</span><span class="p">;</span>
|
|||
|
<span class="n">mergesort</span> <span class="p">(</span><span class="n">values</span><span class="p">,</span> <span class="n">start</span><span class="p">,</span> <span class="n">mid</span><span class="p">);</span> <span class="cm">/* sort the left half */</span>
|
|||
|
<span class="n">mergesort</span> <span class="p">(</span><span class="n">values</span><span class="p">,</span> <span class="n">mid</span> <span class="o">+</span> <span class="mi">1</span><span class="p">,</span> <span class="n">end</span><span class="p">);</span> <span class="cm">/* sort the right half */</span>
|
|||
|
<span class="n">merge</span> <span class="p">(</span><span class="n">values</span><span class="p">,</span> <span class="n">start</span><span class="p">,</span> <span class="n">end</span><span class="p">);</span>
|
|||
|
<span class="p">}</span>
|
|||
|
</pre></div>
|
|||
|
</td></tr></table></div>
|
|||
|
<p>Another common strategy is <a class="reference internal" href="Glossary.html#term-pipelining"><span class="xref std std-term">pipelining</span></a>. In pipelining, a complex task is
|
|||
|
broken down into a sequence of independent sub-tasks, typically referred to as
|
|||
|
stages. There are multiple real-world scenarios that help to demonstrate the key
|
|||
|
ideas that underlie pipelining. One example is to think about doing laundry. Once
|
|||
|
a load of clothes has finished washing, you can put them into the dryer. At the
|
|||
|
same time, you start another load of clothes in the washing machine. Another
|
|||
|
example is to consider the line at a cafeteria-style restaurant where you make
|
|||
|
multiple selections. The line is often structured so that you select (in order)
|
|||
|
a salad, an entrée, side dishes, a dessert, and a drink. At any time, there can
|
|||
|
be customers in every stage of the line; it is not necessary to wait for each
|
|||
|
customer to pass through all parts of the line before making your first selection.</p>
|
|||
|
<p>The canonical example of pipelining is the five-stage RISC processor
|
|||
|
architecture. Executing a single instruction involves passing through the fetch
|
|||
|
(IF), decode (ID), execute (EX), memory access (MEM), and write-back (WB)
|
|||
|
stages. The stages are designed so that five instructions can be executing
|
|||
|
simultaneously in a staggered pattern. From the software perspective,
|
|||
|
command-line programs are commonly linked together to run in parallel as a
|
|||
|
pipeline. Consider the following example:</p>
|
|||
|
<div class="highlight-none border border-dark rounded-lg bg-light px-2 mb-3 notranslate"><div class="highlight bg-light"><pre class="mb-0"><span></span>$ cat data.csv | cut -d',' -f1,2,3 | uniq
|
|||
|
</pre></div>
|
|||
|
</div>
|
|||
|
<p>In this scenario, a comma-separated value (CSV) file is read and printed to
|
|||
|
<code class="docutils literal notranslate"><span class="pre">STDOUT</span></code> by the <code class="docutils literal notranslate"><span class="pre">cat</span></code> program. As this is happening, the <code class="docutils literal notranslate"><span class="pre">cut</span></code> program
|
|||
|
parses these lines of data from its <code class="docutils literal notranslate"><span class="pre">STDIN</span></code> and prints the first three fields
|
|||
|
to its <code class="docutils literal notranslate"><span class="pre">STDOUT</span></code>. The <code class="docutils literal notranslate"><span class="pre">uniq</span></code> program eliminates any duplicate lines. These
|
|||
|
three processes can be run in parallel to a certain extent. The call to <code class="docutils literal notranslate"><span class="pre">cut</span></code>
|
|||
|
can begin processing some of the data before <code class="docutils literal notranslate"><span class="pre">cat</span></code> has managed to read the
|
|||
|
entire file. Similarly, <code class="docutils literal notranslate"><span class="pre">uniq</span></code> can start to eliminate lines from the first
|
|||
|
part of the file while the other two processes are still working. The key to
|
|||
|
making this successful is that <code class="docutils literal notranslate"><span class="pre">cut</span></code> and <code class="docutils literal notranslate"><span class="pre">uniq</span></code> continue to run as long as
|
|||
|
they are still receiving data from <code class="docutils literal notranslate"><span class="pre">STDIN</span></code>.</p>
|
|||
|
</div>
|
|||
|
<div class="section" id="implementation-strategy-patterns">
|
|||
|
<h2>9.3.2. Implementation Strategy Patterns<a class="headerlink" href="ParallelDesign.html#implementation-strategy-patterns" title="Permalink to this headline">¶</a></h2>
|
|||
|
<p>Once you have identified the overall algorithmic strategy for parallel
|
|||
|
execution, the next step is to identify techniques for implementing the
|
|||
|
algorithm in software. There are several well-established approaches for doing
|
|||
|
so. One of the most common is the <a class="reference internal" href="Glossary.html#term-fork-join-pattern"><span class="xref std std-term">fork/join pattern</span></a>, illustrated in
|
|||
|
<a href="ParallelDesign.html#forkjoin">Figure 9.3.1</a>. In this pattern, the program begins as a
|
|||
|
single main thread. Once a parallel task is encountered, additional threads are
|
|||
|
created and executed in parallel. All threads must complete and be destroyed
|
|||
|
before the main thread can continue the next portion of the code. This pattern
|
|||
|
is very common with data parallelism, as well as with <a class="reference internal" href="Glossary.html#term-loop-parallelism"><span class="xref std std-term">loop parallelism</span></a>,
|
|||
|
where the code contains loops that are computationally expensive but
|
|||
|
independent. <a class="reference external" href="ParallelDesign.html#cl9-1">Code Listing 9.1</a> was an example of loop parallelism.</p>
|
|||
|
<div class="figure mb-2 align-center" id="id9">
|
|||
|
<span id="forkjoin"></span><a class="reference internal image-reference" href="_images/CSF-Images.9.2.png"><img class="p-3 mb-2 align-center border border-dark rounded-lg" alt="Illustration of sequential tasks (above) and the corresponding fork/join parallel implementation (below). Image source: Wikipedia (recreated)" src="_images/CSF-Images.9.2.png" style="width: 80%;" /></a>
|
|||
|
<p class="caption align-center px-3"><span class="caption-text"> Figure 9.3.1: Illustration of sequential tasks (above) and the corresponding fork/join
|
|||
|
parallel implementation (below). Image source: Wikipedia (recreated)</span></p>
|
|||
|
</div>
|
|||
|
<p>Implementing the fork/join pattern in practice can be straightforward,
|
|||
|
particularly in the cases of embarrassingly parallel problems. The fork stage
|
|||
|
consists of setting up the arguments that each thread should receive. <a class="reference external" href="ParallelDesign.html#cl9-3">Code
|
|||
|
Listing 9.3</a>, for example, shows how to break the loop from <a class="reference external" href="ParallelDesign.html#cl9-1">Code
|
|||
|
Listing 9.1</a> into 10 threads that each process 1/10<sup>th</sup>
|
|||
|
of the array calculations (encapsulated in a <code class="docutils literal notranslate"><span class="pre">multiply()</span></code> function). The join
|
|||
|
stage would combine their results after calling <code class="docutils literal notranslate"><span class="pre">pthread_join()</span></code>.</p>
|
|||
|
<div class="highlight-c border border-dark rounded-lg bg-light px-0 mb-3 notranslate" id="cl9-3"><table class="highlighttable"><tr><td class="linenos px-0 mx-0"><div class="linenodiv"><pre class="mb-0"> 1
|
|||
|
2
|
|||
|
3
|
|||
|
4
|
|||
|
5
|
|||
|
6
|
|||
|
7
|
|||
|
8
|
|||
|
9
|
|||
|
10
|
|||
|
11</pre></div></td><td class="code"><div class="highlight bg-light"><pre class="mb-0"><span></span><span class="cm">/* Code Listing 9.3:</span>
|
|||
|
<span class="cm"> The fork stage of a fork/join pattern</span>
|
|||
|
<span class="cm"> */</span>
|
|||
|
|
|||
|
<span class="k">for</span> <span class="p">(</span><span class="n">i</span> <span class="o">=</span> <span class="mi">0</span><span class="p">;</span> <span class="n">i</span> <span class="o"><</span> <span class="mi">10</span><span class="p">;</span> <span class="n">i</span><span class="o">++</span><span class="p">)</span> <span class="cm">/* Assume we are creating 10 threads */</span>
|
|||
|
<span class="p">{</span>
|
|||
|
<span class="n">args</span><span class="p">[</span><span class="n">i</span><span class="p">].</span><span class="n">array</span> <span class="o">=</span> <span class="n">array</span><span class="p">;</span>
|
|||
|
<span class="n">args</span><span class="p">[</span><span class="n">i</span><span class="p">].</span><span class="n">start</span> <span class="o">=</span> <span class="n">i</span> <span class="o">*</span> <span class="mi">100000000</span><span class="p">;</span>
|
|||
|
<span class="n">assert</span> <span class="p">(</span><span class="n">pthread_create</span> <span class="p">(</span><span class="o">&</span><span class="n">threads</span><span class="p">[</span><span class="n">i</span><span class="p">],</span> <span class="nb">NULL</span><span class="p">,</span>
|
|||
|
<span class="n">multiply</span><span class="p">,</span> <span class="o">&</span><span class="n">args</span><span class="p">[</span><span class="n">i</span><span class="p">])</span> <span class="o">==</span> <span class="mi">0</span><span class="p">);</span>
|
|||
|
<span class="p">}</span>
|
|||
|
</pre></div>
|
|||
|
</td></tr></table></div>
|
|||
|
<p>The fork/join pattern is so common, particularly for loop parallelism, that many
|
|||
|
libraries provide simple mechanisms to automate the thread management for it
|
|||
|
when programming. <a class="reference external" href="ParallelDesign.html#cl9-4">Code Listing 9.4</a> shows the OpenMP version of
|
|||
|
parallelizing <a class="reference external" href="ParallelDesign.html#cl9-1">Code Listing 9.1</a>. When the compiler encounters this
|
|||
|
<code class="docutils literal notranslate"><span class="pre">pragma</span></code>, it will inject code that handles the thread creation and cleanup
|
|||
|
with no additional work by the programmer. This pragma makes implementing the
|
|||
|
fork/join pattern trivial in this and many cases.</p>
|
|||
|
<div class="highlight-c border border-dark rounded-lg bg-light px-0 mb-3 notranslate" id="cl9-4"><table class="highlighttable"><tr><td class="linenos px-0 mx-0"><div class="linenodiv"><pre class="mb-0">1
|
|||
|
2
|
|||
|
3
|
|||
|
4
|
|||
|
5
|
|||
|
6
|
|||
|
7</pre></div></td><td class="code"><div class="highlight bg-light"><pre class="mb-0"><span></span><span class="cm">/* Code Listing 9.4:</span>
|
|||
|
<span class="cm"> OpenMP works very well with embarrassingly parallel fork/join patterns</span>
|
|||
|
<span class="cm"> */</span>
|
|||
|
|
|||
|
<span class="cp">#pragma omp parallel for</span>
|
|||
|
<span class="k">for</span> <span class="p">(</span><span class="n">i</span> <span class="o">=</span> <span class="mi">0</span><span class="p">;</span> <span class="n">i</span> <span class="o"><</span> <span class="mi">1000000000</span><span class="p">;</span> <span class="n">i</span><span class="o">++</span><span class="p">)</span>
|
|||
|
<span class="n">array</span><span class="p">[</span><span class="n">i</span><span class="p">]</span> <span class="o">=</span> <span class="n">i</span> <span class="o">*</span> <span class="n">i</span><span class="p">;</span>
|
|||
|
</pre></div>
|
|||
|
</td></tr></table></div>
|
|||
|
<div class="figure mb-2 align-right" id="id10" style="width: 35%">
|
|||
|
<span id="mapreduce"></span><a class="reference internal image-reference" href="_images/CSF-Images.9.3.png"><img class="p-3 mb-2 align-center border border-dark rounded-lg" alt="Map/reduce shares a common structure with fork/join" src="_images/CSF-Images.9.3.png" style="width: 90%;" /></a>
|
|||
|
<p class="caption align-center px-3"><span class="caption-text"> Figure 9.3.2: Map/reduce shares a common structure with fork/join</span></p>
|
|||
|
</div>
|
|||
|
<p><a class="reference internal" href="Glossary.html#term-map-reduce-pattern"><span class="xref std std-term">Map/reduce</span></a>, shown in <a href="ParallelDesign.html#mapreduce">Figure 9.3.2</a>, is another
|
|||
|
strategy that is closely related to fork/join. As in fork/join, a collection of
|
|||
|
input data is processed in parallel by multiple threads. The results are then
|
|||
|
merged and collected as the threads are joined until a single answer is reached.
|
|||
|
Although they are structurally identical, the type of work being done reflects a
|
|||
|
somewhat different philosophy. The idea behind the map stage is based on a
|
|||
|
technique in <em>functional programming</em> languages. A single function is
|
|||
|
mapped to all inputs to yield new results; these functions are self-contained
|
|||
|
and free of side effects, such as producing output. Several map stages can be
|
|||
|
chained together to compose larger functions. The map and reduce stages are also
|
|||
|
more independent than standard fork/join; mapping can be done without a
|
|||
|
reduction, and vice versa. Map/reduce is a popular feature in cluster systems,
|
|||
|
such as the Apache Hadoop system.</p>
|
|||
|
<p>An implementation strategy common with task parallelism is the
|
|||
|
<a class="reference internal" href="Glossary.html#term-manager-worker"><span class="xref std std-term">manager/worker</span></a> pattern. In this scenario, independent tasks are
|
|||
|
distributed among several worker threads that communicate only with the single
|
|||
|
manager thread. The manager can monitor the distribution of the tasks to balance
|
|||
|
the workload evenly. To return to an earlier example, input event handlers are
|
|||
|
often implemented using the manager/worker pattern. In this case, the key press
|
|||
|
handler and the mouse event handler would each be implemented in separate
|
|||
|
workers; when either event occurs, the worker would inform the manager thread
|
|||
|
that would then process the data within the context of the program.</p>
|
|||
|
<p><a class="reference external" href="ParallelDesign.html#cl9-5">Code Listing 9.5</a> shows one way to structure a worker thread. The
|
|||
|
thread arguments contain a pointer to a piece of data to process
|
|||
|
(<code class="docutils literal notranslate"><span class="pre">args->data</span></code>) along with a lock (<code class="docutils literal notranslate"><span class="pre">args->lock</span></code>) and a condition variable
|
|||
|
(<code class="docutils literal notranslate"><span class="pre">args->data_received</span></code>). When the manager thread has a task to assign this
|
|||
|
particular thread, it would update the data pointer and send a signal with the
|
|||
|
condition variable. If each worker thread has its own data pointer, the manager
|
|||
|
can send data to a specific worker thread. On the other hand, if the data
|
|||
|
pointer is shared, this structure could still work; the main difference is that
|
|||
|
the thread would need to make a local copy of the data just before releasing the
|
|||
|
lock on line 17. Since condition variables also support broadcasting, the
|
|||
|
manager can send data to all of the worker threads with a single message. In
|
|||
|
addition, the thread arguments contain a boolean value <code class="docutils literal notranslate"><span class="pre">args->running</span></code>. The
|
|||
|
manager thread can stop all of the workers by setting this value to false and
|
|||
|
broadcasting on the condition variable (setting all <code class="docutils literal notranslate"><span class="pre">args->data</span></code> values to
|
|||
|
anything other than <code class="docutils literal notranslate"><span class="pre">NULL</span></code>).</p>
|
|||
|
<div class="highlight-c border border-dark rounded-lg bg-light px-0 mb-3 notranslate" id="cl9-5"><table class="highlighttable"><tr><td class="linenos px-0 mx-0"><div class="linenodiv"><pre class="mb-0"> 1
|
|||
|
2
|
|||
|
3
|
|||
|
4
|
|||
|
5
|
|||
|
6
|
|||
|
7
|
|||
|
8
|
|||
|
9
|
|||
|
10
|
|||
|
11
|
|||
|
12
|
|||
|
13
|
|||
|
14
|
|||
|
15
|
|||
|
16
|
|||
|
17
|
|||
|
18
|
|||
|
19
|
|||
|
20
|
|||
|
21
|
|||
|
22</pre></div></td><td class="code"><div class="highlight bg-light"><pre class="mb-0"><span></span><span class="cm">/* Code Listing 9.5:</span>
|
|||
|
<span class="cm"> A simple worker with condition variables</span>
|
|||
|
<span class="cm"> */</span>
|
|||
|
|
|||
|
<span class="kt">void</span> <span class="o">*</span>
|
|||
|
<span class="nf">worker</span> <span class="p">(</span><span class="kt">void</span> <span class="o">*</span> <span class="n">_args</span><span class="p">)</span>
|
|||
|
<span class="p">{</span>
|
|||
|
<span class="k">struct</span> <span class="n">args</span> <span class="o">*</span><span class="n">args</span> <span class="o">=</span> <span class="p">(</span><span class="k">struct</span> <span class="n">args</span> <span class="o">*</span><span class="p">)</span> <span class="n">_args</span><span class="p">;</span>
|
|||
|
<span class="k">while</span> <span class="p">(</span><span class="nb">true</span><span class="p">)</span>
|
|||
|
<span class="p">{</span>
|
|||
|
<span class="cm">/* Wait for the next available data */</span>
|
|||
|
<span class="n">pthread_mutex_lock</span> <span class="p">(</span><span class="n">args</span><span class="o">-></span><span class="n">lock</span><span class="p">);</span>
|
|||
|
<span class="k">while</span> <span class="p">(</span><span class="n">args</span><span class="o">-></span><span class="n">data</span> <span class="o">==</span> <span class="nb">NULL</span><span class="p">)</span>
|
|||
|
<span class="n">pthread_cond_wait</span> <span class="p">(</span><span class="n">args</span><span class="o">-></span><span class="n">data_received</span><span class="p">,</span> <span class="n">args</span><span class="o">-></span><span class="n">lock</span><span class="p">);</span>
|
|||
|
<span class="k">if</span> <span class="p">(</span><span class="o">!</span> <span class="n">args</span><span class="o">-></span><span class="n">running</span><span class="p">)</span>
|
|||
|
<span class="k">break</span><span class="p">;</span>
|
|||
|
<span class="n">pthread_mutex_unlock</span> <span class="p">(</span><span class="n">args</span><span class="o">-></span><span class="n">lock</span><span class="p">);</span>
|
|||
|
<span class="cm">/* Do something with the data here */</span>
|
|||
|
<span class="p">}</span>
|
|||
|
<span class="n">pthread_mutex_unlock</span> <span class="p">(</span><span class="n">args</span><span class="o">-></span><span class="n">lock</span><span class="p">);</span>
|
|||
|
<span class="n">pthread_exit</span> <span class="p">(</span><span class="nb">NULL</span><span class="p">);</span>
|
|||
|
<span class="p">}</span>
|
|||
|
</pre></div>
|
|||
|
</td></tr></table></div>
|
|||
|
</div>
|
|||
|
<div class="section" id="parallel-execution-patterns">
|
|||
|
<h2>9.3.3. Parallel Execution Patterns<a class="headerlink" href="ParallelDesign.html#parallel-execution-patterns" title="Permalink to this headline">¶</a></h2>
|
|||
|
<div class="figure mb-2 align-right" id="id11" style="width: 45%">
|
|||
|
<span id="threadpool"></span><a class="reference internal image-reference" href="_images/CSF-Images.9.4.png"><img class="p-3 mb-2 align-center border border-dark rounded-lg" alt="A thread pool retrieves tasks from the associated queue and returns completed results" src="_images/CSF-Images.9.4.png" style="width: 95%;" /></a>
|
|||
|
<p class="caption align-center px-3"><span class="caption-text"> Figure 9.3.3: A thread pool retrieves tasks from the associated queue and returns
|
|||
|
completed results</span></p>
|
|||
|
</div>
|
|||
|
<p>Once the implementation strategy has been established, the software designer
|
|||
|
needs to make decisions about how the parallel software will run given the
|
|||
|
underlying hardware support available. One technique is to create a
|
|||
|
<a class="reference internal" href="Glossary.html#term-thread-pool"><span class="xref std std-term">thread pool</span></a> with an associated <a class="reference internal" href="Glossary.html#term-task-queue"><span class="xref std std-term">task queue</span></a>. A thread pool is a
|
|||
|
fixed number of threads that are available for the program to use. As parallel
|
|||
|
tasks arrive, they are placed into a task queue. If a thread in the pool is
|
|||
|
available, it will remove a task from the queue and execute it.
|
|||
|
<a href="ParallelDesign.html#threadpool">Figure 9.3.3</a> shows the logical structure of this approach.</p>
|
|||
|
<p>At first glance, thread pools may look identical to the manager/worker
|
|||
|
implementation pattern described above. The difference is that the
|
|||
|
manager/worker pattern describes <em>what</em> is to be done, whereas the thread pool
|
|||
|
structure describes <em>how</em> it will be done. The manager/worker pattern is in
|
|||
|
contrast to the fork/join pattern. Manager/worker employs task parallelism, with
|
|||
|
different workers potentially performing different tasks; fork/join employs data
|
|||
|
parallelism, with identical threads performing the same task on different data.
|
|||
|
A thread pool can be used for both approaches.</p>
|
|||
|
<p>The main idea of a thread pool is to create all of the threads needed once at
|
|||
|
the beginning of the program, rather than when needed. <a class="reference external" href="ParallelDesign.html#cl9-3">Code Listing 9.3</a>, for instance, did not use a thread pool to parallelize the loop.
|
|||
|
This could be contrasted with <a class="reference external" href="ParallelDesign.html#cl9-6">Code Listing 9.6</a>, which assumes the
|
|||
|
presence of a thread pool. In this approach, the for-loop employs the
|
|||
|
producer-consumer <code class="docutils literal notranslate"><span class="pre">enqueue()</span></code> operation from Chapter 8 to place the starting
|
|||
|
values into the shared queue. The <code class="docutils literal notranslate"><span class="pre">space</span></code> and <code class="docutils literal notranslate"><span class="pre">items</span></code> semaphores help to
|
|||
|
ensure that the queue is modified safely. After enqueueing all of the data, the
|
|||
|
main thread waits at a barrier until the pool threads have reached the end of
|
|||
|
their calculations. The barrier prevents the main thread from moving past the
|
|||
|
fork/join structure until all of the pool threads have reached the same point.</p>
|
|||
|
<div class="highlight-c border border-dark rounded-lg bg-light px-0 mb-3 notranslate" id="cl9-6"><table class="highlighttable"><tr><td class="linenos px-0 mx-0"><div class="linenodiv"><pre class="mb-0"> 1
|
|||
|
2
|
|||
|
3
|
|||
|
4
|
|||
|
5
|
|||
|
6
|
|||
|
7
|
|||
|
8
|
|||
|
9
|
|||
|
10
|
|||
|
11
|
|||
|
12</pre></div></td><td class="code"><div class="highlight bg-light"><pre class="mb-0"><span></span><span class="cm">/* Code Listing 9.6:</span>
|
|||
|
<span class="cm"> Using a thread pool to parallelize Code Listing 9.1</span>
|
|||
|
<span class="cm"> */</span>
|
|||
|
|
|||
|
<span class="cm">/* Initialize barrier for this thread + 10 from the thread pool */</span>
|
|||
|
<span class="n">pthread_barrier_init</span> <span class="p">(</span><span class="o">&</span><span class="n">barrier</span><span class="p">,</span> <span class="nb">NULL</span><span class="p">,</span> <span class="mi">11</span><span class="p">);</span>
|
|||
|
|
|||
|
<span class="cm">/* Use the producer-consumer enqueue from Code Listing 8.15 */</span>
|
|||
|
<span class="k">for</span> <span class="p">(</span><span class="n">i</span> <span class="o">=</span> <span class="mi">0</span><span class="p">;</span> <span class="n">i</span> <span class="o"><</span> <span class="mi">10</span><span class="p">;</span> <span class="n">i</span><span class="o">++</span><span class="p">)</span>
|
|||
|
<span class="n">enqueue</span> <span class="p">(</span><span class="n">queue</span><span class="p">,</span> <span class="n">i</span> <span class="o">*</span> <span class="mi">100000000</span><span class="p">,</span> <span class="n">space</span><span class="p">,</span> <span class="n">items</span><span class="p">);</span>
|
|||
|
|
|||
|
<span class="n">pthread_barrier_wait</span> <span class="p">(</span><span class="o">&</span><span class="n">barrier</span><span class="p">);</span>
|
|||
|
</pre></div>
|
|||
|
</td></tr></table></div>
|
|||
|
<p><a class="reference external" href="ParallelDesign.html#cl9-7">Code Listing 9.7</a> shows the structure of a thread in the thread pool.
|
|||
|
(For simplicity and focus on the thread pool, we are assuming all semaphores,
|
|||
|
the queue, the lock, and the barrier are globally accessible.) This thread
|
|||
|
starts by retrieving a piece of data from the shared queue, using the
|
|||
|
producer-consumer <code class="docutils literal notranslate"><span class="pre">dequeue()</span></code> operation. Note that, since there are multiple
|
|||
|
threads in the pool, this thread needs the <code class="docutils literal notranslate"><span class="pre">dequeue()</span></code> operation that employs
|
|||
|
a lock. This version synchronizes the pool threads’ access to the variables that
|
|||
|
maintain the queue structure. After retrieving the starting value (which was
|
|||
|
passed through the queue as a pointer), the thread performs the desired work and
|
|||
|
waits at the barrier (indicating completion to the main thread).</p>
|
|||
|
<div class="highlight-c border border-dark rounded-lg bg-light px-0 mb-3 notranslate" id="cl9-7"><table class="highlighttable"><tr><td class="linenos px-0 mx-0"><div class="linenodiv"><pre class="mb-0"> 1
|
|||
|
2
|
|||
|
3
|
|||
|
4
|
|||
|
5
|
|||
|
6
|
|||
|
7
|
|||
|
8
|
|||
|
9
|
|||
|
10
|
|||
|
11
|
|||
|
12
|
|||
|
13
|
|||
|
14
|
|||
|
15</pre></div></td><td class="code"><div class="highlight bg-light"><pre class="mb-0"><span></span><span class="cm">/* Code Listing 9.7:</span>
|
|||
|
<span class="cm"> The threads in the pool for parallelizing Code Listing 9.1</span>
|
|||
|
<span class="cm"> */</span>
|
|||
|
|
|||
|
<span class="kt">void</span> <span class="o">*</span>
|
|||
|
<span class="nf">pool_thread</span> <span class="p">(</span><span class="k">struct</span> <span class="n">args</span> <span class="o">*</span><span class="n">args</span><span class="p">)</span>
|
|||
|
<span class="p">{</span>
|
|||
|
<span class="cm">/* ... Declarations and other work omitted for brevity ... */</span>
|
|||
|
<span class="cm">/* Use the dequeue from 8.18, given multiple consumers */</span>
|
|||
|
<span class="kt">int</span> <span class="n">starting</span> <span class="o">=</span> <span class="p">(</span><span class="kt">int</span><span class="p">)</span><span class="n">dequeue</span> <span class="p">(</span><span class="n">queue</span><span class="p">,</span> <span class="n">space</span><span class="p">,</span> <span class="n">items</span><span class="p">,</span> <span class="n">lock</span><span class="p">);</span>
|
|||
|
<span class="k">for</span> <span class="p">(</span><span class="n">i</span> <span class="o">=</span> <span class="n">starting</span><span class="p">;</span> <span class="n">i</span> <span class="o"><</span> <span class="n">starting</span> <span class="o">+</span> <span class="mi">100000000</span><span class="p">;</span> <span class="n">i</span><span class="o">++</span><span class="p">)</span>
|
|||
|
<span class="n">array</span><span class="p">[</span><span class="n">i</span><span class="p">]</span> <span class="o">=</span> <span class="n">i</span> <span class="o">*</span> <span class="n">i</span><span class="p">;</span>
|
|||
|
<span class="n">pthread_barrier_wait</span> <span class="p">(</span><span class="n">barrier</span><span class="p">);</span>
|
|||
|
<span class="cm">/* ... Additional work may happen later ... */</span>
|
|||
|
<span class="p">}</span>
|
|||
|
</pre></div>
|
|||
|
</td></tr></table></div>
|
|||
|
<p>One way to characterize the difference between the thread pool approach in <a class="reference external" href="ParallelDesign.html#cl9-6">Code
|
|||
|
Listing 9.6</a> with the basic fork/join style of <a class="reference external" href="ParallelDesign.html#cl9-3">Code Listing 9.3</a> is to distinguish them as either a <em>pull</em> or a <em>push</em>
|
|||
|
model. Thread pools are typically implemented using a pull model, where the pool
|
|||
|
threads manage themselves, retrieving data from the queue at their availability
|
|||
|
and discretion. In contrast, the original approach used a push model, with the
|
|||
|
main thread controlling which thread accomplished which task.</p>
|
|||
|
<p>The thread pool approach has several advantages. First, it minimizes the cost of
|
|||
|
creating new threads, as they are only created once when the program begins.
|
|||
|
Many naive multithreaded implementations fail to realize the benefits of
|
|||
|
parallelism because of the overhead cost of creating and managing the threads.
|
|||
|
Second, thread pools make the resource consumption more predictable. A
|
|||
|
simultaneous request to create a large number of threads could cause a spike in
|
|||
|
memory consumption that could cause a variety of problems in the system. Third,
|
|||
|
as just noted previously, thread pools allow the threads to be more
|
|||
|
self-managing based on local performance characteristics. If one thread is
|
|||
|
running on a core that is overloaded with other work, that thread can naturally
|
|||
|
commit less work to this task.</p>
|
|||
|
<p>The self-management of thread pools can also be a disadvantage, as well. If
|
|||
|
there is no coordination between the threads and they are running on different
|
|||
|
processor cores, the cache performance may suffer, as data may need to be
|
|||
|
shuffled around frequently. Similarly, if there is no logging of which core
|
|||
|
takes which task or data set, responding to hardware failures might be difficult
|
|||
|
and data could be lost. Finally, the shared queue must be managed, which could
|
|||
|
induce performance delays in the synchronization that could otherwise be avoided.</p>
|
|||
|
<div class="figure mb-2 align-right" id="id12" style="width: 55%">
|
|||
|
<span id="flynn"></span><a class="reference internal image-reference" href="_images/CSF-Images.9.5.png"><img class="p-3 mb-2 align-center border border-dark rounded-lg" alt="The four paradigms of Flynn's taxonomy, showing how single/multiple instructions and single/multiple data are linked to processing units. Image source: Wikipedia (recreated)" src="_images/CSF-Images.9.5.png" style="width: 95%;" /></a>
|
|||
|
<p class="caption align-center px-3"><span class="caption-text"> Figure 9.3.4: The four paradigms of Flynn’s taxonomy, showing how single/multiple
|
|||
|
instructions and single/multiple data are linked to processing units.
|
|||
|
Image source: Wikipedia (recreated)</span></p>
|
|||
|
</div>
|
|||
|
<p>Another factor that influences the execution of parallel systems is the
|
|||
|
capabilities of the hardware. <a class="reference internal" href="Glossary.html#term-flynn-s-taxonomy"><span class="xref std std-term">Flynn’s taxonomy</span></a> describes four paradigms
|
|||
|
of parallel hardware architectures in terms of the decompositions they support.
|
|||
|
<a href="ParallelDesign.html#flynn">Figure 9.3.4</a> illustrates the logical organization of each of
|
|||
|
the four paradigms. In each structure, a processing unit (“PU”) is provided with
|
|||
|
one or more parallel instructions (SI for “single instruction” and MI for
|
|||
|
“multiple instruction”) and performs the desired computation on a single input
|
|||
|
(SD) or multiple pieces of data (MD).</p>
|
|||
|
<p>Traditional uniprocessing software adheres to the <a class="reference internal" href="Glossary.html#term-sisd"><span class="xref std std-term">SISD</span></a> model, as a
|
|||
|
single processor executes a single instruction on a single piece of data at a
|
|||
|
time; as such, SISD does not support parallel execution but is included in the
|
|||
|
taxonomy for completeness. Of the parallel models, <a class="reference internal" href="Glossary.html#term-simd"><span class="xref std std-term">SIMD</span></a> is perhaps the
|
|||
|
most intuitive and the one with which most readers would be familiar. Modern
|
|||
|
CPUs provide SIMD support through various extensions to the ISA, such as the
|
|||
|
streaming SIMD extensions (SSE) or advanced vector extensions (AVX) for Intel
|
|||
|
processors, as well as the Neon or Helium instruction sets for ARM processors.
|
|||
|
<em>Graphics processing units (GPUs)</em> provide native support for SIMD
|
|||
|
operations, such as manipulating the rows of pixels in an image in parallel.
|
|||
|
Given this native support, GPUs are also widely used for applications that
|
|||
|
perform independent calculations in parallel. For instance, many scientific or
|
|||
|
security applications involve brute-force searches of a large set of data; GPU
|
|||
|
SIMD instructions can facilitate parallelizing the computations needed for these applications.</p>
|
|||
|
<p>The <a class="reference internal" href="Glossary.html#term-misd"><span class="xref std std-term">MISD</span></a> model is often confusing when first encountered, and many
|
|||
|
people do not immediately perceive its value as parallelism. One common use of
|
|||
|
MISD would be to provide fault tolerance in programs that require precision. The
|
|||
|
multiple instructions are all executed in parallel on the same input data; the
|
|||
|
results of the computations can then be evaluated to confirm that no errors
|
|||
|
occurred. Systolic array architectures, which are specialized systems for
|
|||
|
parallelizing advanced mathematical operations, can also be classified as MISD.
|
|||
|
For instance, specialized designs can optimize the parallel calculation of the
|
|||
|
multiplication and addition operations found in matrix multiplication. MISD
|
|||
|
hardware implementations are not common and are typically only found in custom
|
|||
|
hardware designs.</p>
|
|||
|
<p><a class="reference internal" href="Glossary.html#term-mimd"><span class="xref std std-term">MIMD</span></a> architectures allow the processors to work independently,
|
|||
|
performing different instructions on different pieces or sets of data at the
|
|||
|
same time. SPMD (single program, multiple data) is a common subset of MIMD in
|
|||
|
distributed computing; in SPMD, a single program with multiple instructions can
|
|||
|
be deployed independently to run in parallel. The key distinction between SPMD
|
|||
|
and SIMD is that SIMD instructions are synchronized. In a SIMD system, all
|
|||
|
processors are executing the same instruction at the same time according to the
|
|||
|
same clock cycle; SPMD architectures provide more autonomy, with each processor
|
|||
|
executing instructions independently of the rest of the system.</p>
|
|||
|
<p>Large-scale parallel architectures, particularly MIMD, typically rely on a
|
|||
|
memory architecture that complicates their software development. In traditional
|
|||
|
SISD computing models (e.g., personal computers and laptops), all memory
|
|||
|
accesses are essentially equal; accessing a global variable near address
|
|||
|
0x0804a000 takes the same amount of time as accessing a stack variable near
|
|||
|
0xbfff8000. In large-scale MIMD systems, such as those used for high-performance
|
|||
|
computing, that claim is not necessarily true. These large-scale systems use
|
|||
|
<a class="reference internal" href="Glossary.html#term-non-uniform-memory-access"><span class="xref std std-term">non-uniform memory access (NUMA)</span></a> designs. In NUMA, the memory hardware
|
|||
|
is distributed throughout the system. Some portions of memory are physically
|
|||
|
closer to a processing unit than others; consequently, accessing these closer
|
|||
|
portions of memory is faster than others.</p>
|
|||
|
<table class="docutils footnote" frame="void" id="f47" rules="none">
|
|||
|
<colgroup><col class="label" /><col /></colgroup>
|
|||
|
<tbody valign="top">
|
|||
|
<tr><td class="label"><a class="fn-backref" href="ParallelDesign.html#id1">[1]</a></td><td>While “embarrassingly parallel” is the dominant term for these types
|
|||
|
of problems, some researchers in the field dislike this term, as it has a
|
|||
|
negative connotation and can be interpreted as suggesting these problems are
|
|||
|
undesirable. Instead, they tend to use “naturally parallel” to suggest that
|
|||
|
parallelism naturally aligns with the problem.</td></tr>
|
|||
|
</tbody>
|
|||
|
</table>
|
|||
|
<table class="docutils footnote" frame="void" id="f48" rules="none">
|
|||
|
<colgroup><col class="label" /><col /></colgroup>
|
|||
|
<tbody valign="top">
|
|||
|
<tr><td class="label"><a class="fn-backref" href="ParallelDesign.html#id2">[2]</a></td><td>We are showing the most trivial form of merge sort here for
|
|||
|
illustration. In practice, merge sort would never be implemented this way, as
|
|||
|
the overhead of the recursive function calls becomes a significant burden.
|
|||
|
Instead, practical implementations include optimizations that switch to a more
|
|||
|
efficient iterative solution once the size of the recurrence becomes small.</td></tr>
|
|||
|
</tbody>
|
|||
|
</table>
|
|||
|
<div
|
|||
|
id="DesignSumm"
|
|||
|
class="embedContainer"
|
|||
|
data-exer-name="DesignSumm"
|
|||
|
data-long-name="Parallel design questions"
|
|||
|
data-short-name="DesignSumm"
|
|||
|
data-frame-src="../../../Exercises/ParallelDistributed/DesignSumm.html?selfLoggingEnabled=false&localMode=true&module=ParallelDesign&JXOP-debug=true&JOP-lang=en&JXOP-code=java"
|
|||
|
data-frame-width="950"
|
|||
|
data-frame-height="550"
|
|||
|
data-external="false"
|
|||
|
data-points="1.0"
|
|||
|
data-required="True"
|
|||
|
data-showhide="show"
|
|||
|
data-threshold="5"
|
|||
|
data-type="ka"
|
|||
|
data-exer-id="">
|
|||
|
|
|||
|
<div class="center">
|
|||
|
<div id="DesignSumm_iframe"></div>
|
|||
|
</div>
|
|||
|
</div>
|
|||
|
</div>
|
|||
|
</div>
|
|||
|
|
|||
|
|
|||
|
</div>
|
|||
|
|
|||
|
|
|||
|
|
|||
|
<div class="container">
|
|||
|
|
|||
|
<div class="mt-4 container center">
|
|||
|
«  <a id="prevmod1" href="ParVConc.html">9.2. Parallelism vs. Concurrency</a>
|
|||
|
  ::  
|
|||
|
<a class="uplink" href="index.html">Contents</a>
|
|||
|
  ::  
|
|||
|
<a id="nextmod1" href="Scaling.html">9.4. Limits of Parallelism and Scaling</a>  »
|
|||
|
</div>
|
|||
|
|
|||
|
|
|||
|
</div>
|
|||
|
|
|||
|
<br />
|
|||
|
|
|||
|
<div class="row jmu-dark-purple-bg">
|
|||
|
<div class="col-md-12">
|
|||
|
<center>
|
|||
|
<a id="contact_us" class="btn button-link-no-blue jmu-gold" rel="nofollow" href="mailto:webmaster@opencsf.org" role="button">Contact Us</a>
|
|||
|
<a id="license" class="btn button-link-no-blue jmu-gold" rel="nofollow" href="https://w3.cs.jmu.edu/kirkpams/OpenCSF/lib/license.html" target="_blank">License</a>
|
|||
|
</center>
|
|||
|
</div>
|
|||
|
</div>
|
|||
|
|
|||
|
|
|||
|
|
|||
|
|
|||
|
|
|||
|
|
|||
|
<script src="_static/js/popper.js-1.14.7-min.js" integrity="sha384-UO2eT0CpHqdSJQ6hJty5KVphtPhzWj9WO1clHTMGa3JDZwrnQq4sF86dIHNDz0W1" crossorigin="anonymous"></script>
|
|||
|
<script src="_static/js/bootstrap.min.js" integrity="sha384-JjSmVgyd0p3pXB1rRibZUAYoIIy6OrQ6VrjIEaFf/nJGzIxFDsf4x0xIM+B07jRM" crossorigin="anonymous"></script>
|
|||
|
</body>
|
|||
|
</html>
|