483 lines
35 KiB
HTML
483 lines
35 KiB
HTML
|
|
|||
|
<!DOCTYPE html>
|
|||
|
|
|||
|
|
|||
|
|
|||
|
|
|||
|
<html lang="en">
|
|||
|
<head>
|
|||
|
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
|
|||
|
|
|||
|
<title>9.6. Reliable Data Storage and Location — Computer Systems Fundamentals</title>
|
|||
|
|
|||
|
<link rel="stylesheet" href="_static/css/bootstrap.min.css" integrity="sha384-ggOyR0iXCbMQv3Xipma34MD+dH/1fQ784/j6cY/iJTQUOhcWr7x9JvoRxT2MZw1T" crossorigin="anonymous" />
|
|||
|
<link rel="stylesheet" href="_static/css/pygments.css" type="text/css" />
|
|||
|
<link rel="stylesheet" href="_static/css/normalize.css" type="text/css" />
|
|||
|
<link rel="stylesheet" href="../../../JSAV/css/JSAV.css" type="text/css" />
|
|||
|
<link rel="stylesheet" href="../../../lib/odsaMOD-min.css" type="text/css" />
|
|||
|
<link rel="stylesheet" href="_static/css/jquery-1.11.4-smoothness-ui.css" type="text/css" />
|
|||
|
<link rel="stylesheet" href="../../../lib/odsaStyle-min.css" type="text/css" />
|
|||
|
<link rel="stylesheet" href="_static/css/csf.css" type="text/css" />
|
|||
|
|
|||
|
<style>
|
|||
|
.underline { text-decoration: underline; }
|
|||
|
</style>
|
|||
|
|
|||
|
<script type="text/javascript">
|
|||
|
var DOCUMENTATION_OPTIONS = {
|
|||
|
URL_ROOT: './',
|
|||
|
VERSION: '0.4.1',
|
|||
|
COLLAPSE_INDEX: false,
|
|||
|
FILE_SUFFIX: '.html',
|
|||
|
HAS_SOURCE: true
|
|||
|
};
|
|||
|
</script>
|
|||
|
|
|||
|
<script type="text/x-mathjax-config">
|
|||
|
MathJax.Hub.Config({
|
|||
|
tex2jax: {
|
|||
|
inlineMath: [['$','$'], ['\\(','\\)']],
|
|||
|
displayMath: [ ['$$','$$'], ["\\[","\\]"] ],
|
|||
|
processEscapes: true
|
|||
|
},
|
|||
|
"HTML-CSS": {
|
|||
|
scale: "80"
|
|||
|
}
|
|||
|
});
|
|||
|
</script>
|
|||
|
<link rel="shortcut icon" href="_static/favicon.ico"/>
|
|||
|
<link rel="index" title="Index" href="genindex.html" />
|
|||
|
<link rel="search" title="Search" href="search.html" />
|
|||
|
<link rel="index" title="Computer Systems Fundamentals" href="index.html" />
|
|||
|
<link rel="next" title="9.7. Consensus in Distributed Systems" href="DistConsensus.html" />
|
|||
|
<link rel="prev" title="9.5. Timing in Distributed Environments" href="DistTiming.html" />
|
|||
|
|
|||
|
</head><body>
|
|||
|
|
|||
|
<nav class="navbar navbar-expand-md navbar-dark navbar-custom fixed-top">
|
|||
|
|
|||
|
<a class="navbar-brand py-0" href="index.html"><img src="_static/CSF-Logo-Square-Text.png" alt="OpenCSF Logo" height="40em" class="py-1 px-2 mb-0 align-center rounded-lg bg-white" /></a>
|
|||
|
<!-- Show a navbar toggler on mobile -->
|
|||
|
<button class="navbar-toggler" type="button" data-toggle="collapse" data-target="#defaultNavbars" aria-controls="defaultNavbars" aria-expanded="false" aria-label="Toggle navigation">
|
|||
|
<span class="navbar-toggler-icon"></span>
|
|||
|
</button>
|
|||
|
<div class="collapse navbar-collapse" id="defaultNavbars">
|
|||
|
<ul class="navbar-nav mr-auto">
|
|||
|
<li class="nav-item dropdown">
|
|||
|
<a class="nav-link dropdown-toggle jmu-gold rounded" href="DistDataStorage.html#" id="navbarDropdownChapters" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">Contents</a>
|
|||
|
<div class="dropdown-menu scrollable-menu" role="menu" aria-labelledby="navbarDropdownChapters">
|
|||
|
<a class="dropdown-item" tabindex="-1" href="DistDataStorage.html#"><b>Chapter 1</b></a>
|
|||
|
<a class="dropdown-item" href="IntroConcSysOverview.html"> 1.1. Introduction to Concurrent Systems</a>
|
|||
|
<a class="dropdown-item" href="SysAndModels.html"> 1.2. Systems and Models</a>
|
|||
|
<a class="dropdown-item" href="Themes.html"> 1.3. Themes and Guiding Principles</a>
|
|||
|
<a class="dropdown-item" href="Architectures.html"> 1.4. System Architectures</a>
|
|||
|
<a class="dropdown-item" href="StateModels.html"> 1.5. State Models in UML</a>
|
|||
|
<a class="dropdown-item" href="SequenceModels.html"> 1.6. Sequence Models in UML</a>
|
|||
|
<a class="dropdown-item" href="StateModelImplementation.html"> 1.7. Extended Example: State Model Implementation</a>
|
|||
|
<div class="dropdown-divider"></div>
|
|||
|
<a class="dropdown-item disabled"><b>Chapter 2</b></a>
|
|||
|
<a class="dropdown-item" href="ProcessesOverview.html"> 2.1. Processes and OS Basics</a>
|
|||
|
<a class="dropdown-item" href="Multiprogramming.html"> 2.2. Processes and Multiprogramming</a>
|
|||
|
<a class="dropdown-item" href="KernelMechanics.html"> 2.3. Kernel Mechanics</a>
|
|||
|
<a class="dropdown-item" href="Syscall.html"> 2.4. System Call Interface</a>
|
|||
|
<a class="dropdown-item" href="ProcessCycle.html"> 2.5. Process Life Cycle</a>
|
|||
|
<a class="dropdown-item" href="UnixFile.html"> 2.6. The UNIX File Abstraction</a>
|
|||
|
<a class="dropdown-item" href="EventsSignals.html"> 2.7. Events and Signals</a>
|
|||
|
<a class="dropdown-item" href="Extended2Processes.html"> 2.8. Extended Example: Listing Files with Processes</a>
|
|||
|
<div class="dropdown-divider"></div>
|
|||
|
<a class="dropdown-item disabled"><b>Chapter 3</b></a>
|
|||
|
<a class="dropdown-item" href="IPCOverview.html"> 3.1. Concurrency with IPC</a>
|
|||
|
<a class="dropdown-item" href="IPCModels.html"> 3.2. IPC Models</a>
|
|||
|
<a class="dropdown-item" href="Pipes.html"> 3.3. Pipes and FIFOs</a>
|
|||
|
<a class="dropdown-item" href="MMap.html"> 3.4. Shared Memory With Memory-mapped Files</a>
|
|||
|
<a class="dropdown-item" href="POSIXvSysV.html"> 3.5. POSIX vs. System V IPC</a>
|
|||
|
<a class="dropdown-item" href="MQueues.html"> 3.6. Message Passing With Message Queues</a>
|
|||
|
<a class="dropdown-item" href="ShMem.html"> 3.7. Shared Memory</a>
|
|||
|
<a class="dropdown-item" href="IPCSems.html"> 3.8. Semaphores</a>
|
|||
|
<a class="dropdown-item" href="Extended3Bash.html"> 3.9. Extended Example: Bash-lite: A Simple Command-line Shell</a>
|
|||
|
<div class="dropdown-divider"></div>
|
|||
|
<a class="dropdown-item disabled"><b>Chapter 4</b></a>
|
|||
|
<a class="dropdown-item" href="SocketsOverview.html"> 4.1. Networked Concurrency</a>
|
|||
|
<a class="dropdown-item" href="FiveLayer.html"> 4.2. The TCP/IP Internet Model</a>
|
|||
|
<a class="dropdown-item" href="NetApps.html"> 4.3. Network Applications and Protocols</a>
|
|||
|
<a class="dropdown-item" href="Sockets.html"> 4.4. The Socket Interface</a>
|
|||
|
<a class="dropdown-item" href="TCPSockets.html"> 4.5. TCP Socket Programming: HTTP</a>
|
|||
|
<a class="dropdown-item" href="UDPSockets.html"> 4.6. UDP Socket Programming: DNS</a>
|
|||
|
<a class="dropdown-item" href="AppBroadcast.html"> 4.7. Application-Layer Broadcasting: DHCP</a>
|
|||
|
<a class="dropdown-item" href="Extended4CGI.html"> 4.8. Extended Example: CGI Web Server</a>
|
|||
|
<div class="dropdown-divider"></div>
|
|||
|
<a class="dropdown-item disabled"><b>Chapter 5</b></a>
|
|||
|
<a class="dropdown-item" href="InternetOverview.html"> 5.1. The Internet and Connectivity</a>
|
|||
|
<a class="dropdown-item" href="AppLayer.html"> 5.2. Application Layer: Overlay Networks</a>
|
|||
|
<a class="dropdown-item" href="TransLayer.html"> 5.3. Transport Layer</a>
|
|||
|
<a class="dropdown-item" href="NetSec.html"> 5.4. Network Security Fundamentals</a>
|
|||
|
<a class="dropdown-item" href="NetLayer.html"> 5.5. Network Layer: IP</a>
|
|||
|
<a class="dropdown-item" href="LinkLayer.html"> 5.6. Link Layer</a>
|
|||
|
<a class="dropdown-item" href="Wireless.html"> 5.7. Wireless Connectivity: Wi-Fi, Bluetooth, and Zigbee</a>
|
|||
|
<a class="dropdown-item" href="Extended5DNS.html"> 5.8. Extended Example: DNS client</a>
|
|||
|
<div class="dropdown-divider"></div>
|
|||
|
<a class="dropdown-item disabled"><b>Chapter 6</b></a>
|
|||
|
<a class="dropdown-item" href="ThreadsOverview.html"> 6.1. Concurrency with Multithreading</a>
|
|||
|
<a class="dropdown-item" href="ProcVThreads.html"> 6.2. Processes vs. Threads</a>
|
|||
|
<a class="dropdown-item" href="RaceConditions.html"> 6.3. Race Conditions and Critical Sections</a>
|
|||
|
<a class="dropdown-item" href="POSIXThreads.html"> 6.4. POSIX Thread Library</a>
|
|||
|
<a class="dropdown-item" href="ThreadArgs.html"> 6.5. Thread Arguments and Return Values</a>
|
|||
|
<a class="dropdown-item" href="ImplicitThreads.html"> 6.6. Implicit Threading and Language-based Threads</a>
|
|||
|
<a class="dropdown-item" href="Extended6Input.html"> 6.7. Extended Example: Keyboard Input Listener</a>
|
|||
|
<a class="dropdown-item" href="Extended6Primes.html"> 6.8. Extended Example: Concurrent Prime Number Search</a>
|
|||
|
<div class="dropdown-divider"></div>
|
|||
|
<a class="dropdown-item disabled"><b>Chapter 7</b></a>
|
|||
|
<a class="dropdown-item" href="SynchOverview.html"> 7.1. Synchronization Primitives</a>
|
|||
|
<a class="dropdown-item" href="CritSect.html"> 7.2. Critical Sections and Peterson's Solution</a>
|
|||
|
<a class="dropdown-item" href="Locks.html"> 7.3. Locks</a>
|
|||
|
<a class="dropdown-item" href="Semaphores.html"> 7.4. Semaphores</a>
|
|||
|
<a class="dropdown-item" href="Barriers.html"> 7.5. Barriers</a>
|
|||
|
<a class="dropdown-item" href="Condvars.html"> 7.6. Condition Variables</a>
|
|||
|
<a class="dropdown-item" href="Deadlock.html"> 7.7. Deadlock</a>
|
|||
|
<a class="dropdown-item" href="Extended7Events.html"> 7.8. Extended Example: Event Log File</a>
|
|||
|
<div class="dropdown-divider"></div>
|
|||
|
<a class="dropdown-item disabled"><b>Chapter 8</b></a>
|
|||
|
<a class="dropdown-item" href="SynchProblemsOverview.html"> 8.1. Synchronization Patterns and Problems</a>
|
|||
|
<a class="dropdown-item" href="SynchDesign.html"> 8.2. Basic Synchronization Design Patterns</a>
|
|||
|
<a class="dropdown-item" href="ProdCons.html"> 8.3. Producer-Consumer Problem</a>
|
|||
|
<a class="dropdown-item" href="ReadWrite.html"> 8.4. Readers-Writers Problem</a>
|
|||
|
<a class="dropdown-item" href="DiningPhil.html"> 8.5. Dining Philosophers Problem and Deadlock</a>
|
|||
|
<a class="dropdown-item" href="CigSmokers.html"> 8.6. Cigarette Smokers Problem and the Limits of Semaphores and Locks</a>
|
|||
|
<a class="dropdown-item" href="Extended8ModExp.html"> 8.7. Extended Example: Parallel Modular Exponentiation</a>
|
|||
|
<div class="dropdown-divider"></div>
|
|||
|
<a class="dropdown-item disabled"><b>Chapter 9</b></a>
|
|||
|
<a class="dropdown-item" href="ParallelDistributedOverview.html"> 9.1. Parallel and Distributed Systems</a>
|
|||
|
<a class="dropdown-item" href="ParVConc.html"> 9.2. Parallelism vs. Concurrency</a>
|
|||
|
<a class="dropdown-item" href="ParallelDesign.html"> 9.3. Parallel Design Patterns</a>
|
|||
|
<a class="dropdown-item" href="Scaling.html"> 9.4. Limits of Parallelism and Scaling</a>
|
|||
|
<a class="dropdown-item" href="DistTiming.html"> 9.5. Timing in Distributed Environments</a>
|
|||
|
<a class="dropdown-item" href="DistDataStorage.html"> 9.6. Reliable Data Storage and Location</a>
|
|||
|
<a class="dropdown-item" href="DistConsensus.html"> 9.7. Consensus in Distributed Systems</a>
|
|||
|
<a class="dropdown-item" href="Extended9Blockchain.html"> 9.8. Extended Example: Blockchain Proof-of-Work</a>
|
|||
|
<div class="dropdown-divider"></div>
|
|||
|
<a class="dropdown-item disabled"><b>Appendix A</b></a>
|
|||
|
<a class="dropdown-item" href="CLangOverview.html"> A.1. C Language Reintroduction</a>
|
|||
|
<a class="dropdown-item" href="Debugging.html"> A.2. Documentation and Debugging</a>
|
|||
|
<a class="dropdown-item" href="BasicTypes.html"> A.3. Basic Types and Pointers</a>
|
|||
|
<a class="dropdown-item" href="Arrays.html"> A.4. Arrays, Structs, Enums, and Type Definitions</a>
|
|||
|
<a class="dropdown-item" href="Functions.html"> A.5. Functions and Scope</a>
|
|||
|
<a class="dropdown-item" href="Pointers.html"> A.6. Pointers and Dynamic Allocation</a>
|
|||
|
<a class="dropdown-item" href="Strings.html"> A.7. Strings</a>
|
|||
|
<a class="dropdown-item" href="FunctionPointers.html"> A.8. Function Pointers</a>
|
|||
|
<a class="dropdown-item" href="Files.html"> A.9. Files</a>
|
|||
|
</div>
|
|||
|
</li>
|
|||
|
|
|||
|
|
|||
|
|
|||
|
</ul>
|
|||
|
</div>
|
|||
|
|
|||
|
<ul class="navbar-nav flex-row ml-md-auto d-none d-md-flex">
|
|||
|
<li class="nav-item"><a class="nav-link jmu-gold" href="https://w3.cs.jmu.edu/kirkpams/OpenCSF/Books/csf/source/DistDataStorage.rst"
|
|||
|
target="_blank" rel="nofollow">Show Source</a></li>
|
|||
|
|
|||
|
</ul>
|
|||
|
</nav>
|
|||
|
|
|||
|
|
|||
|
<div class="container center">
|
|||
|
«  <a id="prevmod" href="DistTiming.html">9.5. Timing in Distributed Environments</a>
|
|||
|
  ::  
|
|||
|
<a class="uplink" href="index.html">Contents</a>
|
|||
|
  ::  
|
|||
|
<a id="nextmod" href="DistConsensus.html">9.7. Consensus in Distributed Systems</a>  »
|
|||
|
|
|||
|
</div>
|
|||
|
<br />
|
|||
|
|
|||
|
|
|||
|
|
|||
|
<script type="text/javascript" src="_static/js/jquery-2.1.4.min.js"></script>
|
|||
|
<script type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.1/MathJax.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
|||
|
<script type="text/javascript" src="_static/js/jquery-1.11.4-ui.min.js"></script>
|
|||
|
<script type="text/javascript" src="_static/js/forge-0.7.0.min.js"></script>
|
|||
|
<script type="text/javascript" src="../../../JSAV/lib/jquery.transit.js"></script>
|
|||
|
<script type="text/javascript" src="../../../JSAV/lib/raphael.js"></script>
|
|||
|
<script type="text/javascript" src="../../../JSAV/build/JSAV-min.js"></script>
|
|||
|
<script type="text/javascript" src="_static/js/config.js"></script>
|
|||
|
<script type="text/javascript" src="../../../lib/odsaUtils-min.js"></script>
|
|||
|
<script type="text/javascript" src="../../../lib/odsaMOD-min.js"></script>
|
|||
|
<script type="text/javascript" src="_static/js/d3-4.13.0.min.js"></script>
|
|||
|
<script type="text/javascript" src="_static/js/d3-selection-multi.v1.min.js"></script>
|
|||
|
<script type="text/javascript" src="../../../lib/dataStructures.js"></script>
|
|||
|
|
|||
|
|
|||
|
|
|||
|
|
|||
|
|
|||
|
|
|||
|
<div class="container">
|
|||
|
|
|||
|
<script>ODSA.SETTINGS.DISP_MOD_COMP = true;ODSA.SETTINGS.MODULE_NAME = "DistDataStorage";ODSA.SETTINGS.MODULE_LONG_NAME = "Reliable Data Storage and Location";ODSA.SETTINGS.MODULE_CHAPTER = "Parallel and Distributed Systems"; ODSA.SETTINGS.BUILD_DATE = "2021-06-01 15:31:51"; ODSA.SETTINGS.BUILD_CMAP = false;JSAV_OPTIONS['lang']='en';JSAV_EXERCISE_OPTIONS['code']='java';</script><div class="section" id="reliable-data-storage-and-location">
|
|||
|
<h1>9.6. Reliable Data Storage and Location<a class="headerlink" href="DistDataStorage.html#reliable-data-storage-and-location" title="Permalink to this headline">¶</a></h1>
|
|||
|
<p>Distributed systems are commonly used to store collections of data.
|
|||
|
Consequently, a key issue in distributed systems is knowing where to find the
|
|||
|
data in question. As an example, consider a traditional static web page, such as
|
|||
|
<code class="docutils literal notranslate"><span class="pre">http://www.abc.com/index.html</span></code>. The structure of this URL indicates that
|
|||
|
there is a file named index.html located in the root directory of a web server.
|
|||
|
Since the URL is an HTTP request, this web server is a process listening on port
|
|||
|
80 from a machine somewhere. The hostname <code class="docutils literal notranslate"><span class="pre">www.abc.com</span></code> would be translated into
|
|||
|
an IP address to determine the machine’s logical location within the Internet.
|
|||
|
When the HTTP request gets made, the client sends the request to a router and
|
|||
|
the request gets forwarded until it reaches this machine. In other words,
|
|||
|
completing this single HTTP request requires identifying the machine that stores
|
|||
|
the file, the logical location of the machine in the Internet, the physical
|
|||
|
location of the network card that connects the machine, and the specific process
|
|||
|
on that machine responsible for hosting that file. If any of those components
|
|||
|
fail, then the file cannot be served to the user’s browser.</p>
|
|||
|
<p>This scenario illustrates a common goal in distributed systems: how to locate
|
|||
|
and retrieve data objects reliably, <strong>even when components fail</strong>. The routing
|
|||
|
protocols of the Internet are designed around the assumption of failure. If a
|
|||
|
wire gets cut or a machine crashes, the routing protocols attempt to find an
|
|||
|
alternative path by updating their routing data structures. This design
|
|||
|
principle of reliable service with unreliable components can be extended to the
|
|||
|
application layer of the network stack, as well.</p>
|
|||
|
<p>One fundamental building block of reliably locating objects in a distributed
|
|||
|
system is <a class="reference internal" href="Glossary.html#term-343"><span class="xref std std-term">replication</span></a>. When an object is replicated, multiple servers store
|
|||
|
identical copies, so clients can retrieve the object from several locations.
|
|||
|
Replication avoids the problem of a <em>single point of failure</em> — one node failure is
|
|||
|
not sufficient to eliminate all service. Additionally, replication provides a
|
|||
|
means of load balancing. Since the same object is accessible from multiple
|
|||
|
servers within the system, clients can request copies from any of them rather
|
|||
|
than overloading servers that host the most popular objects.</p>
|
|||
|
<div class="section" id="google-file-system">
|
|||
|
<h2>9.6.1. Google File System<a class="headerlink" href="DistDataStorage.html#google-file-system" title="Permalink to this headline">¶</a></h2>
|
|||
|
<p>As an example of how replication provides reliable service for data storage,
|
|||
|
consider the <em>Google File System (GFS)</em>. <a href="DistDataStorage.html#google">Figure 9.6.1</a>
|
|||
|
shows the structure of the main components of GFS. In GFS, the assumption is
|
|||
|
that files are large (e.g., each file might be multiple terabytes in size), so
|
|||
|
they are broken up into <em>chunks</em>. The individual chunks with the file
|
|||
|
contents are stored on <em>chunkservers</em>. Each of these chunkservers
|
|||
|
contains its own traditional local file system; for example, assuming the
|
|||
|
machines are running Linux, then the chunks would be stored as local files
|
|||
|
within that chunkserver’s ext4 file system. In addition, a GFS master maintains
|
|||
|
a non-authoritative table that maps files to their locations.</p>
|
|||
|
<div class="figure mb-2 align-center" id="id3">
|
|||
|
<span id="google"></span><a class="reference internal image-reference" href="_images/CSF-Images.9.10.png"><img class="p-3 mb-2 align-center border border-dark rounded-lg" alt="The structure of the Google File System (GFS)" src="_images/CSF-Images.9.10.png" style="width: 70%;" /></a>
|
|||
|
<p class="caption align-center px-3"><span class="caption-text"> Figure 9.6.1: The structure of the Google File System (GFS)</span></p>
|
|||
|
</div>
|
|||
|
<p>For each given chunk, there is a designated <em>primary chunkserver</em> along with
|
|||
|
multiple <a class="reference internal" href="Glossary.html#term-343"><span class="xref std std-term">replicas</span></a>. The primary and replicas all store identical copies
|
|||
|
of the chunks, but the primary is designated as having a <em>lease</em> on the file and
|
|||
|
has exclusive access to modify it if needed. For instance, in <a href="DistDataStorage.html#google">Figure 9.6.1</a>,
|
|||
|
the first chunk of File 1 can be retrieved from either node
|
|||
|
<code class="docutils literal notranslate"><span class="pre">a3d2</span></code> (the left-most chunkserver) or node <code class="docutils literal notranslate"><span class="pre">c9c4</span></code> (the right-most).
|
|||
|
Ultimately, each chunkserver has full control over the chunks that it stores,
|
|||
|
and this information might differ from the records in the GFS master. To keep
|
|||
|
the GFS master’s records updated, the GFS master sends out periodic <em>HeartBeat</em>
|
|||
|
messages to the chunkservers to determine their current status and provide
|
|||
|
additional instructions.</p>
|
|||
|
</div>
|
|||
|
<div class="section" id="distributed-hash-tables">
|
|||
|
<h2>9.6.2. Distributed Hash Tables<a class="headerlink" href="DistDataStorage.html#distributed-hash-tables" title="Permalink to this headline">¶</a></h2>
|
|||
|
<p>GFS was designed to create a distributed file system for a single organization.
|
|||
|
Consequently, Google could build some assumptions into the design about the
|
|||
|
locations of nodes. When the GFS master in <a href="DistDataStorage.html#google">Figure 9.6.1</a>
|
|||
|
informed the client that node <code class="docutils literal notranslate"><span class="pre">a3d2</span></code> was the assigned primary for chunk 1 of
|
|||
|
file 1, the client knew which machine to contact, because the clients maintain
|
|||
|
information about the IP addresses of nodes in the system. In other distributed
|
|||
|
file systems—particularly those designed to be openly accessible—clients do not
|
|||
|
have this information.</p>
|
|||
|
<div class="figure mb-2 align-right" id="id4" style="width: 40%">
|
|||
|
<span id="chord"></span><a class="reference internal image-reference" href="_images/CSF-Images.9.11.png"><img class="p-3 mb-2 align-center border border-dark rounded-lg" alt="A Chord ring with up to 32 nodes. Black nodes are live (available); nodes with a red X are considered failed (absent or unavailable)" src="_images/CSF-Images.9.11.png" style="width: 90%;" /></a>
|
|||
|
<p class="caption align-center px-3"><span class="caption-text"> Figure 9.6.2: A Chord ring with up to 32 nodes. Black nodes are live (available); nodes
|
|||
|
with a red X are considered failed (absent or unavailable)</span></p>
|
|||
|
</div>
|
|||
|
<p>Instead of relying on the clients to have information about node locations,
|
|||
|
systems can create a <a class="reference internal" href="Glossary.html#term-distributed-hash-table"><span class="xref std std-term">distributed hash table (DHT)</span></a> that maps objects to
|
|||
|
machines that host them. <a class="reference internal" href="Glossary.html#term-chord"><span class="xref std std-term">Chord</span></a> was an early <a class="footnote-reference" href="DistDataStorage.html#f51" id="id1">[1]</a> and influential DHT.
|
|||
|
In Chord, all nodes were assigned unique identifiers and arranged in a logical
|
|||
|
ring structure. <a href="DistDataStorage.html#chord">Figure 9.6.2</a> shows the logical structure of a
|
|||
|
simplified Chord ring with support for 32 nodes. (Chord node identifiers are
|
|||
|
created by calculating the hash of the machine’s IP address; for a 256-bit hash
|
|||
|
value, the ring would have up to $2^{256}$ nodes.) Most of the nodes are <em>live</em>,
|
|||
|
meaning those machines are running and connected to the system; nodes marked
|
|||
|
with an X (e.g., 2, 8, and 14) are not connected.</p>
|
|||
|
<p><a href="DistDataStorage.html#chord">Figure 9.6.2</a> also shows a key data structure to define the
|
|||
|
nature of the Chord ring: the <em>finger table</em>. Every node contains a finger
|
|||
|
table, which contains information about other Chord ring indexes. Each entry in
|
|||
|
the finger table is based on the current node’s identifier plus a power of 2.
|
|||
|
The finger table shown here would be the table for node 0. Node 0 would contain
|
|||
|
information about the indexes 1, 2, 4, 8, and 16 (i.e., $0+2^0$, $0+2^1$,
|
|||
|
$0+2^2$, $0+2^3$, and $0+2^4$). Similarly, node 1 would contain information
|
|||
|
about 2, 3, 5, 9, and 17 ($1+2^0$, $1+2^1$, $1+2^2$, $1+2^3$, and $1+2^4$).
|
|||
|
However, given that some nodes are missing (such as 2 and 8), Chord nodes keep
|
|||
|
track of the <em>successor</em> of the target index, which is the closest value
|
|||
|
greater than or equal to the index. Since 1, 4, and 16 are present, these nodes
|
|||
|
are their own successors. Since 2 and 8 are missing, 3 is the successor of 2 and
|
|||
|
9 is the successor of 8. Within the finger table, the Chord nodes keep track of
|
|||
|
information about that successor, including its IP address.</p>
|
|||
|
<p>The logical ring structure of Chord makes it possible for items to be located
|
|||
|
very efficiently, even though nodes in the system have very little information
|
|||
|
about the structure. Assume that a user is running a client on node 6. This user
|
|||
|
tries to open the file <code class="docutils literal notranslate"><span class="pre">"chord/data/foo"</span></code>. As with the node identifiers,
|
|||
|
objects are mapped to keys using hashes, so the user would calculate the hash of
|
|||
|
the file name. If <code class="docutils literal notranslate"><span class="pre">"chord/data/foo"</span></code> hashes to the value 19, then the
|
|||
|
successor of 19 is deemed the location of that object.</p>
|
|||
|
<div class="figure mb-2 align-right" id="id5" style="width: 40%">
|
|||
|
<span id="chordsucc"></span><a class="reference internal image-reference" href="_images/CSF-Images.9.12.png"><img class="p-3 mb-2 align-center border border-dark rounded-lg" alt="Routing a Chord lookup message for key 19 from node 6 to node 21" src="_images/CSF-Images.9.12.png" style="width: 90%;" /></a>
|
|||
|
<p class="caption align-center px-3"><span class="caption-text"> Figure 9.6.3: Routing a Chord lookup message for key 19 from node 6 to node 21</span></p>
|
|||
|
</div>
|
|||
|
<p><a href="DistDataStorage.html#chordsucc">Figure 9.6.3</a> shows the routing of the request message
|
|||
|
through the Chord ring. Node 6 would have entries for the successors of 14
|
|||
|
($6+2^3$) and 22 ($6+2^4$). Since 22 is greater than the target key 19, node 6
|
|||
|
would try to contact the successor of 14. Since node 14 is absent from the ring,
|
|||
|
node 6 would actually contact node 16. Note that node 6 determines this itself,
|
|||
|
because this information is stored in its own finger table. When node 16
|
|||
|
receives the request for key 19, it would find the successor of 18, which is
|
|||
|
$16+2^1$. Since nodes 18, 19, and 20 are all missing, node 21 is the successor
|
|||
|
of 18, so this node is the location of the requested file.</p>
|
|||
|
<p>The routing structure of Chord is deceptively powerful. Although it seems
|
|||
|
complicated, the structure is straightforward to implement in code. <a class="reference external" href="DistDataStorage.html#cl9-10">Code
|
|||
|
Listing 9.10</a> illustrates the algorithm for this lookup procedure.
|
|||
|
The logic of the algorithm is to start with the last entry of the finger table
|
|||
|
(assuming 64 entries because of 64-bit integer types; supporting a 256-bit
|
|||
|
identifier would require larger integer types) and work backwards to find the
|
|||
|
first entry that precedes the key.</p>
|
|||
|
<div class="highlight-c border border-dark rounded-lg bg-light px-0 mb-3 notranslate" id="cl9-10"><table class="highlighttable"><tr><td class="linenos px-0 mx-0"><div class="linenodiv"><pre class="mb-0"> 1
|
|||
|
2
|
|||
|
3
|
|||
|
4
|
|||
|
5
|
|||
|
6
|
|||
|
7
|
|||
|
8
|
|||
|
9
|
|||
|
10
|
|||
|
11
|
|||
|
12
|
|||
|
13
|
|||
|
14
|
|||
|
15
|
|||
|
16
|
|||
|
17
|
|||
|
18
|
|||
|
19
|
|||
|
20</pre></div></td><td class="code"><div class="highlight bg-light"><pre class="mb-0"><span></span><span class="cm">/* Code Listing 9.10:</span>
|
|||
|
<span class="cm"> Algorithm for finding the next node in a Chord finger table</span>
|
|||
|
<span class="cm"> */</span>
|
|||
|
|
|||
|
<span class="cp">#define NUM_ENTRIES 63 </span><span class="cm">/* assume all keys are 64 bits */</span><span class="cp"></span>
|
|||
|
<span class="kt">uint64_t</span>
|
|||
|
<span class="nf">lookup</span> <span class="p">(</span><span class="kt">uint64_t</span> <span class="n">key</span><span class="p">)</span>
|
|||
|
<span class="p">{</span>
|
|||
|
<span class="k">for</span> <span class="p">(</span><span class="kt">int</span> <span class="n">index</span> <span class="o">=</span> <span class="n">NUM_ENTRIES</span><span class="p">;</span> <span class="n">index</span> <span class="o">></span> <span class="mi">0</span><span class="p">;</span> <span class="n">index</span><span class="o">--</span><span class="p">)</span>
|
|||
|
<span class="p">{</span>
|
|||
|
<span class="k">if</span> <span class="p">(</span><span class="n">node_id</span> <span class="o"><</span> <span class="n">finger_table</span><span class="p">[</span><span class="n">index</span><span class="p">])</span>
|
|||
|
<span class="p">{</span>
|
|||
|
<span class="k">if</span> <span class="p">(</span><span class="n">key</span> <span class="o">>=</span> <span class="n">finger_table</span> <span class="p">[</span><span class="n">index</span><span class="p">]</span> <span class="o">||</span> <span class="n">key</span> <span class="o"><</span> <span class="n">node_id</span><span class="p">)</span>
|
|||
|
<span class="k">return</span> <span class="n">finger_table</span> <span class="p">[</span><span class="n">index</span><span class="p">];</span>
|
|||
|
<span class="p">}</span>
|
|||
|
<span class="k">else</span> <span class="k">if</span> <span class="p">(</span><span class="n">key</span> <span class="o">>=</span> <span class="n">finger_table</span> <span class="p">[</span><span class="n">index</span><span class="p">]</span> <span class="o">&&</span> <span class="n">key</span> <span class="o"><</span> <span class="n">node_id</span><span class="p">)</span>
|
|||
|
<span class="k">return</span> <span class="n">finger_table</span> <span class="p">[</span><span class="n">index</span><span class="p">];</span>
|
|||
|
<span class="p">}</span>
|
|||
|
<span class="k">return</span> <span class="n">node_id</span><span class="p">;</span> <span class="cm">/* default response is current node */</span>
|
|||
|
<span class="p">}</span>
|
|||
|
</pre></div>
|
|||
|
</td></tr></table></div>
|
|||
|
<p>The logic in <a class="reference external" href="DistDataStorage.html#cl9-10">Code Listing 9.10</a> is complicated by the modular
|
|||
|
arithmetic imposed by the Chord ring structure. Essentially, the algorithm
|
|||
|
starts by looking at <code class="docutils literal notranslate"><span class="pre">node_id</span> <span class="pre">+</span> <span class="pre">2</span></code><sup>max</sup> and incrementally
|
|||
|
decreases the exponent. Each time, the algorithm checks that the key would occur
|
|||
|
<em>after</em> the jumped node but before the current node. For instance, considering
|
|||
|
the lookup in <a href="DistDataStorage.html#chordsucc">Figure 9.6.3</a>, the algorithm would look at
|
|||
|
$6+2^4$, to check if the key lies after 22 but before 6; if so, the algorithm
|
|||
|
would return 22 as the next node to jump to, otherwise it would look at $6+2^3$.
|
|||
|
If we consider a similar query, though, starting from node 18, the algorithm
|
|||
|
would first look at $18+2^4$, but this value must be mapped to the ring by
|
|||
|
applying mod 32. As such, the algorithm would need to check if the key comes
|
|||
|
after node 2 and before node 18. With modular arithmetic, determining whether a
|
|||
|
key value is <em>between</em> two nodes on the ring can be tricky to get right.</p>
|
|||
|
<p>The benefit of the Chord structure is that it places a very efficient
|
|||
|
upper bound of O(log n) on the number of messages to find an object in the
|
|||
|
system. To put that into practical figures, given a system with 1,000,000 nodes
|
|||
|
storing data, any item could be found by forwarding the request at most 20
|
|||
|
times. If the number of nodes doubles, the maximum number of request messages
|
|||
|
would only increase by one. Consequently, the network overhead with Chord is very light.</p>
|
|||
|
<p>To improve the system performance further, Chord also employs replication
|
|||
|
through caching. Whenever a node is involved in looking up an object’s location,
|
|||
|
that node receives a copy of the object, as well. Returning to <a href="DistDataStorage.html#chordsucc">Figure 9.6.3</a>,
|
|||
|
since node 16 was used as a step toward locating key 19 at
|
|||
|
node 21, node 16 also gets a copy. At that point, any request for the file
|
|||
|
<code class="docutils literal notranslate"><span class="pre">"chord/data/foo"</span></code> that goes through either node 6 or 16 will get a faster
|
|||
|
response. Since these nodes have a local replica of the file, there is no need
|
|||
|
to forward the request. Because of this caching technique and other forms of
|
|||
|
replication, Chord provides efficient times to find objects and high levels of
|
|||
|
availability (even when nodes fail).</p>
|
|||
|
<table class="docutils footnote" frame="void" id="f51" rules="none">
|
|||
|
<colgroup><col class="label" /><col /></colgroup>
|
|||
|
<tbody valign="top">
|
|||
|
<tr><td class="label"><a class="fn-backref" href="DistDataStorage.html#id1">[1]</a></td><td>During the 2001-2003 time frame, several DHTs were designed and
|
|||
|
proposed. Chord, along with Pastry, Tapestry, and CAN, was one of the four
|
|||
|
original systems created at this time. All of these systems contributed
|
|||
|
important ideas to this subfield of distributed systems.</td></tr>
|
|||
|
</tbody>
|
|||
|
</table>
|
|||
|
<div
|
|||
|
id="StorageSumm"
|
|||
|
class="embedContainer"
|
|||
|
data-exer-name="StorageSumm"
|
|||
|
data-long-name="Distributed storage questions"
|
|||
|
data-short-name="StorageSumm"
|
|||
|
data-frame-src="../../../Exercises/ParallelDistributed/StorageSumm.html?selfLoggingEnabled=false&localMode=true&module=DistDataStorage&JXOP-debug=true&JOP-lang=en&JXOP-code=java"
|
|||
|
data-frame-width="950"
|
|||
|
data-frame-height="550"
|
|||
|
data-external="false"
|
|||
|
data-points="1.0"
|
|||
|
data-required="True"
|
|||
|
data-showhide="show"
|
|||
|
data-threshold="3"
|
|||
|
data-type="ka"
|
|||
|
data-exer-id="">
|
|||
|
|
|||
|
<div class="center">
|
|||
|
<div id="StorageSumm_iframe"></div>
|
|||
|
</div>
|
|||
|
</div>
|
|||
|
</div>
|
|||
|
</div>
|
|||
|
|
|||
|
|
|||
|
</div>
|
|||
|
|
|||
|
|
|||
|
|
|||
|
<div class="container">
|
|||
|
|
|||
|
<div class="mt-4 container center">
|
|||
|
«  <a id="prevmod1" href="DistTiming.html">9.5. Timing in Distributed Environments</a>
|
|||
|
  ::  
|
|||
|
<a class="uplink" href="index.html">Contents</a>
|
|||
|
  ::  
|
|||
|
<a id="nextmod1" href="DistConsensus.html">9.7. Consensus in Distributed Systems</a>  »
|
|||
|
</div>
|
|||
|
|
|||
|
|
|||
|
</div>
|
|||
|
|
|||
|
<br />
|
|||
|
|
|||
|
<div class="row jmu-dark-purple-bg">
|
|||
|
<div class="col-md-12">
|
|||
|
<center>
|
|||
|
<a id="contact_us" class="btn button-link-no-blue jmu-gold" rel="nofollow" href="mailto:webmaster@opencsf.org" role="button">Contact Us</a>
|
|||
|
<a id="license" class="btn button-link-no-blue jmu-gold" rel="nofollow" href="https://w3.cs.jmu.edu/kirkpams/OpenCSF/lib/license.html" target="_blank">License</a>
|
|||
|
</center>
|
|||
|
</div>
|
|||
|
</div>
|
|||
|
|
|||
|
|
|||
|
|
|||
|
|
|||
|
|
|||
|
|
|||
|
<script src="_static/js/popper.js-1.14.7-min.js" integrity="sha384-UO2eT0CpHqdSJQ6hJty5KVphtPhzWj9WO1clHTMGa3JDZwrnQq4sF86dIHNDz0W1" crossorigin="anonymous"></script>
|
|||
|
<script src="_static/js/bootstrap.min.js" integrity="sha384-JjSmVgyd0p3pXB1rRibZUAYoIIy6OrQ6VrjIEaFf/nJGzIxFDsf4x0xIM+B07jRM" crossorigin="anonymous"></script>
|
|||
|
</body>
|
|||
|
</html>
|