.rm tm			\" disable the .tm request: later .tm trace lines become silent no-ops
.\" reset the input line counter; "-" stands for the (stdin) file name
.lf 1 -
.nr Rb 1		\" debug flag: guards the PG:/balancing/]< trace code below
.\" Month-name strings m1..m9, ma, mb, mc (Oct.=a, Nov.=b, Dec.=c),
.\" so a month number register can be mapped to \*(m<digit>.
.ds m1 Jan.
.ds m2 Feb.
.ds m3 Mar.
.ds m4 Apr.
.ds m5 May
.ds m6 June
.ds m7 July
.ds m8 Aug.
.ds m9 Sep.
.ds ma Oct.
.ds mb Nov.
.ds mc Dec.
.\"		Robbert's Dynamite Troff Macros
.\"
.\" Use at your own risk.  These will normally be used next to -ms.  It
.\" redefines LP, PP, IP, SH, NH, FS, KS, KF, KE, bp (!), refer macros,
.\" and page format.  Lines are aligned on vertical spacing for a perfect
.\" page mirror.  It attempts to remove widows and to balance the pages.
.\" Figure macros are available through .F1 <figure> .F2 <trailer> .F3.
.\" There's no extra spacing between paragraphs, so you can use .LP any-
.\" time to align on vertical spacing or to reset the formatting parameters
.\" (point size, ...).  .KW keyword specifies a keyword, .KW flushes them.
.\" Use my refb if you want this to work.  If you look through this file,
.\" you may find some handy definitions that you can use as well.  By the
.\" way, if there's no .TL, .NH begins a new chapter.
.\"		Good luck, brave person.
.\"
.\"
.\"	=====> Ds is like ds, but then accepts real arguments
.\"
.de Ds	\"	--- define string ---
.\" Like .ds, but usable with a macro-style quoted argument: the
.\" leading " before \\$2 keeps embedded blanks in the string value.
.\" $1 = string name, $2 = value.
.ds \\$1 "\\$2
..
.de As	\"	--- append to string ---
.\" Same as Ds but appends to an existing string ($1) instead of
.\" replacing it.
.as \\$1 "\\$2
..
.\"	=====> page formatting macros <=====
.\"
.de Al	\"	--- alignment macro ---
.\" Break, then space down so the next baseline falls on a multiple of
.\" the vertical spacing measured from the top-of-page mark (register
.\" T[), giving a perfect line-for-line page mirror.  Skipped when
.\" interparagraph spacing PD is nonzero (exact alignment impossible).
.br
.if !\\n(PD \{\
.   nr VV \\n(VS-(\\n(VS/11)	\" vertical spacing minus a little bit
.   sp \\n(VVu-((\\n(.du-\\n(T[u+\\n(VVu)%\\n(VSu)
.\}
..
.de T]	\"	--- bottom of page trap macro ---
.\" Sprung at the bottom of each column/page.  In two-column mode the
.\" first time through it hops to the top of the right column; the
.\" second time (or in one-column mode) it emits pending footnotes,
.\" sanity-checks page balancing, prints the running header or footer,
.\" and springs b[ (the renamed .bp) to start the next page.
.ev 1				\" switch environment to save line buffer
.ie \\n(C%%2 \{\
.   nr C% +1			\" increase column counter
.   po \\n(POu+\\n(LLu+1c	\" new page offset
.   sp |\\n(Tpu			\" to top of right column
.   ev
.\}
.el \{\
.   ch T]			\" remove trap immediately
.   if \\n(C% .nr C% +1		\" if counting columns, count columns
.   po \\n(POu			\" set page offset
.   ie e .nr Bl \\n(nl		\" save position of left page
.   el .if \\n(Rb&\\n(Tc&((\\n(nl-\\n(Bl>0.5v):(\\n(Bl-\\n(nl>0.5v)) \
.      tm WN:balancing problem (\\n(nl != \\n(Bl)
.\" NOTE(review): the ".\{" opener and ".   \}" closer below differ from
.\" the "\{\" / ".\}" spelling used elsewhere in this file -- confirm the
.\" target troff accepts this form; code left untouched.
.   if \\n(Fd .\{
.      sp |\\n(Plu-\\n(Fdu	\" to bottom of page
.      Fd			\" output footnotes
.      rm Fd			\" remove footnotes
.      nr Fd 0			\" clear footnote size
.   \}
.   nr Tl 0
.   if e .if \\n(nl+1v<=\\n(Pl .nr Tl 1	\" left page was shortened
.   if !'\\*(Pf'no' \{\
.      ie \\n(Tc \{\
.         sp |2.4c		\" some room at the top of the page
.         ie \\n(Pp  .tl ''\\s-1- \\n% -\\s+1''		\" paper header
.         el .ie o   .tl '\\*(S2'\\*(T2'\\f3\\n%\\fP'	\" right page header
.         el	     .tl '\\f3\\n%\\fP'\\*(T1'\\*(S1'	\" left page header
.      \}
.      el \{\
.         sp |\\n(.pu-2c	\" bottom of page
.         if !\\n(Pp .tl ''\\s-1- \\n% -\\s+1''		\" paper header
.      \}
.   \}
.   nr Tc 1			\" page number at top of page
.   ev				\" restore environment
'   b[				\" skip to next page, springing T[
.\}
..
.de E]	\"	--- end of input ---
.\" End-of-text macro (installed with .em below): close the last
.\" paragraph and flush any floating keeps still queued.
.P]				\" end of last paragraph
.nr Kf 1			\" flush floating keeps
.if \\n(Kr \c
.\" (the \c above suppresses the final break while keeps remain to be
.\"  released -- presumably so output continues onto further pages;
.\"  TODO(review): confirm against the Kr release loop)
..
.de Bt	\"	--- change bottom of page trap ---
.\" Move the bottom-of-page trap to position $1; if that position has
.\" already been passed, spring T] right now.
.nr Bt \\$1			\" calculate new page trap
.ie \\n(Bt<=\\n(nl .T]		\" if before current pos, spring now
.el .ch T] \\n(Btu		\" set new page trap
..
.nr T| 0			\" busy flag
.de T[	\"	--- top of page macro ---
.\" Installed as the position-0 trap: set up the new page -- bottom
.\" trap, page offset, leftover footnotes, page header space -- then
.\" record the start of the page body in Tp.
.if \\n(Rb .tm PG:\\n%
.nr Bt \\n(Pl-1v+1		\" bottom of page trap position
.wh \\n(Btu T]			\" set bottom of page trap
.po \\n(POu			\" page offset
.nr Fc 0 1			\" reset footnote count
.if \\n(Fe .Fa			\" append leftover footnote
.ev 1				\" switch environment to save line buffer
.nr T[ 2.4c+1v+0.7c		\" size of page header
.sp |\\n(T[u			\" end of header
.if \\n(Kr .Kr			\" release some floating keeps
.Al				\" align in case of figures
.ev				\" restore environment
.nr Tp \\n(.d			\" page start
..
.de 2C	\"	--- 2 column output ---
.\" Switch to two columns: halve the line length (minus a 1c gutter)
.\" and start the column counter C% that T] uses to alternate columns.
.P]
.nr C% 1 1			\" start column counter
.ll (\\n(LTu-1c)/2u		\" calculate line length
.nr LL \\n(.l			\" -ms compatibility
.Al				\" align
.nr Tp \\n(.d			\" new top of page
.P[
..
.de 1C	\"	--- back to 1 column output ---
.\" Undo 2C: restore full line length and page offset, stop counting.
.P]
.ll \\n(LTu			\" restore line length
.nr LL \\n(.l			\" -ms compatibility
.po \\n(POu			\" restore margin
.nr C% 0			\" stop column count
.P[
..
.\"
.\"	=====> paragraph macros <=====
.\"
.de P[	\"	--- begin paragraph ---
.\" Start diverting paragraph text into Pd (unless inside a keep),
.\" so P] can later decide whether the paragraph fits on the page.
.if !\\n(Ks .di Pd		\" divert
..
.de P]	\"	--- end paragraph ---
.\" Close the Pd diversion and flush it; if the paragraph almost fits,
.\" shorten the page by one line (widow/orphan avoidance).
.ce 0				\" break, turn off centering
.in 0				\" turn off indent
.if !\\n(Ks \{\
.   nr Pm \\n(.u		\" save fill mode
.   nf				\" stop filling
.   di
.   \" diversion ended.  If paragraph doesn't fit, do something special
.   \" if left page was decreased, decrease right page too, else if
.   \" paragraph doesn't fit for but one line, decrease page length
.   if \\n(.t+1v<\\n(dn .if \\n(Tl:(\\n(.t+2v>=\\n(dn) .Bt -1v
.   Pd				\" flush paragraph
.   if \\n(Pm .fi		\" restore fill mode, but don't break
.\}
..
.\"
.\"	=====> footnote macros <=====
.\"
.rm FS FE FJ FK			\" remove -ms footnote stuff
.de FS	\"	--- start footnote ---
.\" Begin collecting footnote text into the Fe diversion (environment 1
.\" keeps the partially-filled main-text line intact).
.ev 1				\" switch environments
.da Fe				\" divert footnote to Fe
.fi
..
.de FE	\"	--- end of footnote ---
.nf				\" break and stop filling
.da
.ev				\" restore environment
.\" If footnote doesn't fit, break here and now.  If it does, append it to
.\" the other macro and move end of page trap up.  If buffering already,
.\" continue buffering.
.nr Fe +\\n(dn			\" calculate new footnote size
.\" NOTE(review): the ".el" below pairs with the ".ie" nested under the
.\" ".if" on the next line -- this relies on troff's most-recent-.ie
.\" pairing; confirm on the target troff before restructuring.
.if \\n(Fe=\\n(dn .ie \\n(nl+\\n(.d+1v+\\n(Fd+\\n(Fe>=\\n(Bt .Bt \\n(nl+\\n(.d
.el .Fa				\" footnote still fits
..
.de Fa	\"	--- add footnote to buffer ---
.\" Move the footnote collected in Fe onto the end of Fd (the per-page
.\" footnote buffer), adding the separator rule before the first one,
.\" and pull the bottom-of-page trap up to make room.
.ev 1				\" switch environments again
.da Fd				\" add footnote to Fd
.if \\n+(Fc=1 \l'1i'		\" footnote separator on first footnote
.Fe				\" ditto
.br				\" ditto
.da
.ev				\" restore environment
.nr Fd +\\n(dn			\" calculate new footnote size
.Bt \\n(Pl-\\n(Fd-1v		\" calculate new page trap
.rm Fe				\" remove old footnote
.nr Fe 0			\" clear footnote size
..
.\"
.\"	=====> keep macros <=====
.\"
.\" Floating keeps are held in up to 10 diversions managed as a linked
.\" queue: n0..n9 are the "next" links, Kl heads the free list, and the
.\" queue of pending keeps runs from Ko (head, released first by Kr) to
.\" Ki (tail, where KE enqueues).  -1 means nil/empty.
.nr Kl 0			\" free list
.nr n0 1
.nr n1 2
.nr n2 3
.nr n3 4
.nr n4 5
.nr n5 6
.nr n6 7
.nr n7 8
.nr n8 9
.nr n9 (-1)			\" end of free list
.\" NOTE(review): the original comments on the next two lines had
.\" "in"/"out" swapped relative to how KE (enqueues at Ki) and Kr
.\" (dequeues from Ko) actually use them.
.nr Ko (-1)			\" queue head (next keep out)
.nr Ki (-1)			\" queue tail (last keep in)
.de Bp	\"	--- begin a new page ---
.\" If the current position is not exactly at the top of the page body
.\" (register T[ holds the header height, i.e. the position just below
.\" the page header), spring T] to move on to the next page.
.\" Fix: the original condition "(\\n(.d=(\\n(T[)" had an unbalanced
.\" extra "(" -- same value, now written with balanced parentheses
.\" (matching the "(\\n(.d=\\n(Tp)" form used elsewhere in this file).
.if !(\\n(.d=\\n(T[) .T]	\" if not top of page, go to bottom
..
.de Kg	\"	--- output keep \\$1 with height \\$2
.\" Flush the diversion named $1 in no-fill mode.  (Note: the height
.\" argument $2 is accepted for documentation but never referenced.)
.nr Pm \\n(.u			\" save fill mode
.nf				\" don't fill
.\\$1				\" output keep
.if \\n(Pm .fi			\" restore fill mode
.if \\n(.t<2v .Bp		\" if little room left, begin new page
..
.de KS	\"	--- begin static keep ---
.\" Divert following text into Ks; KE outputs it in one piece.
.P]				\" end paragraph
.nr Ks +1			\" mark keep
.di Ks				\" divert keep to Ks
.P[
..
.de KF	\"	--- begin floating keep --
.\" Like KS, but KE may defer the diverted text (queue it) if it does
.\" not fit on the current page.
.P]				\" end paragraph
.nr Ks +1			\" mark keep
.di Kf				\" divert keep to Kf
.P[
..
.de KE	\"	--- end keep --
.\" Close the keep diversion.  A static keep is output immediately
.\" (on a fresh page if needed).  A floating keep is output in place
.\" when nothing is queued and it fits; otherwise it is enqueued on the
.\" Ko..Ki list (slots taken from the Kl free list) for Kr to release.
.P]				\" break
.ie '\\n(.z'Ks' \{\
.   di
.   if \\n(dn>=\\n(.t .Bp	\" if it doesn't fit, begin a new page
.   Kg Ks \\n(dn		\" release static keep
.\}
.el \{\
.   di
.   ie (\\n(Ki<0)&(\\n(dn<\\n(.t) .Kg Kf \\n(dn
.   el \{\
.      if \\n(Kl<0 .Kr		\" free list exhausted, flush some entries
.      if \\n(Ki>=0 .nr n\\n(Ki (\\n(Kl)	\" if (Ki != NIL) n[Ki] = Kl
.      nr Ki (\\n(Kl)		\" Ki = Kl
.      nr Kl (\\n(n\\n(Kl)	\" Kl = n[Kl]
.      rn Kf d\\n(Ki		\" d[Ki] = Kf	diversion
.      nr h\\n(Ki (\\n(dn)	\" h[Ki] = dn	height
.      nr n\\n(Ki (-1)		\" n[Ki] = -1	(end of list)
.      if \\n(Ko<0 .nr Ko (\\n(Ki)	\" if (Ko < 0) Ko = Ki
.      nr Kr 1			\" entries to release
.   \}
.\}
.nr Ks -1
.P[				\" start a new paragraph
..
.de Kr	\"	--- release floating keep ---
.\" Dequeue the keep at the queue head Ko, output it, return its slot
.\" to the free list, and recurse while the next queued keep also fits
.\" on the page.  Kr is left set iff more keeps remain queued.
.in 0				\" no indentation
.nf				\" no filling
.nr Kr 0			\" don't release while releasing
.Kg d\\n(Ko \\n(h\\n(Ko		\" output it
.fi				\" restore filling
.in				\" restore indentation
.nr Kt \\n(Ko			\" Kt = Ko
.nr Ko (\\n(n\\n(Kt)		\" Ko = n[Kt]	remove from queue
.nr n\\n(Kt (\\n(Kl)		\" n[Kt] = Kl	put on free list
.nr Kl (\\n(Kt)			\" Kl = Kt
.nr Kr (\\n(Ko>=0)		\" Kr = (Ko >= 0)
.ie !\\n(Kr .nr Ki (-1)		\" if Ko < 0 then Ki = end of list
.el .if \\n(h\\n(Ko<\\n(.t .Kr	\" release another one
.if \\n(Kf .T]			\" if flushing, begin new page
..
.de KK	\"	--- flush floating keeps ---
.\" Force every queued floating keep out, springing new pages as
.\" needed (Kf makes Kr call T] after each release).
.nr Kf 1			\" flush floating keeps
.Bp				\" begin a new page
.nr Kf 0			\" don't flush anymore
..
.\"
.\"	=====> user macros <=====
.\"
.rn bp b[			\" rename begin page request
.de bp	\"	--- begin page for users ---
.\" User-visible .bp: end the current paragraph, run the bottom-of-page
.\" processing (T] eventually springs the real b[), restart a paragraph.
.P]				\" end paragraph
.T]				\" to bottom of page
.P[				\" begin new paragraph
..
.de B[	\"	--- begin block ---
.\" Start diverting a block into macro $1; $2/$3 name the number
.\" registers that B] will fill with the block's width and height.
.br
.ds Bw \\$2
.ds Bh \\$3
.di \\$1
.nf
..
.de B]	\"	--- end block ---
.\" Close the diversion begun by B[ and record its dimensions in the
.\" registers named by B['s arguments.
.fi
.di
.nr \\*(Bw \\n(dl
.nr \\*(Bh \\n(dn
..
.de B|	\"	--- position block ---
.\" Output diverted block $1 at offset ($2, $3) from the current
.\" position, then return to where we were.
.nf
.mk B|				\" remember vertical position
.nr Bw \\$2			\" copy x argument
.nr Bh \\$3			\" copy y argument
.in +\\n(Bwu			\" go to horizontal position
.sp \\n(Bhu			\" go to vertical position
.\\$1				\" output block
.in				\" return to horizontal position
.sp |\\n(B|u			\" return to vertical position
.fi
..
.de C[	\"	--- begin centered block ---
.\" Divert following text into Cd; C] outputs it horizontally centered.
.P]				\" end paragraph
.nr Ks +1			\" mark keep
.di Cd				\" divert to Cd
.P[
..
.de C]	\"	--- end centered block ---
.\" Close the Cd diversion and output it, indented so the widest line
.\" is centered.  (Kg's second argument is unused; \\n(dl -- the
.\" diversion width -- is passed here where other callers pass dn.)
.P]				\" break
.di
.if \\n(dl<\\n(.l .in (\\n(.lu-\\n(dlu)/2u	\" indent to center
.Kg Cd \\n(dl			\" get diverted text
.in 0				\" no indentation
.nr Ks -1			\" end of keep
.P[				\" begin normal paragraph
..
.de Q[	\"	--- begin quote ---
.\" Centered block at 3/4 line length (used for quotes and abstracts).
.C[				\" begin centered block
.nr Ql \\n(.l			\" save line length
.ll \\n(.lu*3u/4u		\" set line length to 3/4 of current ll
..
.de Q]	\"	--- end quote ---
.ll \\n(Qlu			\" restore line length
.C]				\" end centered block
..
.
.de SZ	\"	--- size change ---
.\" Set point size and vertical spacing to $1 together.
.br				\" first break
.ps \\$1			\" change point size
.vs \\$1			\" change vertical spacing
..
.de JR	\"	--- reset indentation ---
.\" Reset the RS/RE indentation stack: Jn = accumulated indent,
.\" Ji = nesting depth, J0..J5 = per-level indent sizes (default 5n).
.nr Jn 0			\" current indent
.nr Ji 0			\" index of indented paragraphs
.nr J0 5n			\" reset indent sizes
.nr J1 5n
.nr J2 5n
.nr J3 5n
.nr J4 5n
.nr J5 5n
..
.de RT	\"	--- reset fonts and such ---
.\" Restore the default typographic state from the -ms registers
.\" (PS/VS/LL), page length, the title environment, and font 1.
.ps \\n(PS			\" point size
.vs \\n(VS			\" vertical spacing
.ll \\n(LLu			\" line length
.nr Pl 27c+0.5v			\" length of page
.ll \\n(LLu			\" line length
.ev 1				\" parameters in environment 1 (title)
.ps 12				\" point size
.ll \\n(LLu			\" line length
.lt \\n(LTu			\" title length in environment 1
.ev
.ft 1				\" reset font
..
.de RS	\"	--- increase indent ---
.nr Jn +\\n(J\\n(Ji
.nr Ji +1
..
.de RE	\"	--- decrease indent ---
.nr Ji -1
.nr Jn -\\n(J\\n(Ji
..
.de JP	\"	--- begin unlabeled, indented paragraph ---
.\" Indented paragraph at the current RS/RE level: indent by
.\" Jn + J[Ji], with a hanging temporary indent and a tab stop so a
.\" label (supplied by IP/QP via \c) lines up with the text.
.P]				\" end paragraph
.if \\n(.t<1v .Bp		\" if not enough room, begin page
.if !(\\n(.d=\\n(Tp) .sp 0.3v 	\" if not top of page, skip some space
.nr Jj \\n(J\\n(Ji		\" increase in indent
.fi				\" start filling
.in \\n(Jnu+\\n(Jju		\" set new indent
.ta \\n(Jju			\" set tab
.ti -\\n(Jju			\" set temporary indent
.P[
..
.de IP	\"	--- begin labeled, indented paragraph ---
.\" $1 = label, $2 (optional) = indent in ens for this level.
.if \\n(.$>1 .nr J\\n(Ji \\$2n	\" set indent if specified in ens
.JP				\" do indented paragraph
.RT				\" restore -ms variables
\&\\$1	\c
..
.de QP	\"	--- begin quotation ---
.\" Like IP but also shortens the line length by the level's indent.
.if \\n(.$>1 .nr J\\n(Ji \\$2n	\" set indent if specified in ens
.ll -\\n(J\\n(Jiu		\" decrease line length
.JP				\" do indented paragraph
.RT				\" restore -ms variables
\&\\$1	\c
..
.de LP	\"	--- begin paragraph ---
.\" Flush-left paragraph; also the general "reset formatting and
.\" realign on the vertical grid" entry point (see file header).
.P]				\" end last paragraph
.Al				\" align
.sp \\n(PDu			\" interparagraph spacing
.JR				\" reset indentation
.RT				\" restore -ms variables
.fi				\" start filling
.P[				\" begin next
..
.de PP	\"	--- begin paragraph with temporary indent ---
.\" Like LP, plus a 5n first-line indent.
.P]				\" end last paragraph
.Al				\" align
.sp \\n(PDu			\" interparagraph spacing
.JR				\" reset indentation
.RT				\" restore -ms variables
.fi				\" start filling
.ti +5n				\" temporary indent
.P[				\" begin next
..
.de CH	\"	--- chapter heading ---
.\" Start a chapter on a fresh page: flush keeps, large centered type,
.\" page number moves to the bottom, figure counter restarts.
.P]				\" break, start chapter
.KK				\" flush floating keeps
.Bp				\" begin page
.nr Tc 0			\" page number at bottom of page
.nr Fi 1 1			\" current figure
.RT				\" restore -ms variables
.ps 18				\" set point size
.vs 24				\" set vertical spacing
.ce 1000			\" center all lines
.nr Hi 0			\" header index 0
.rm HS				\" remove header string
.rm Rc				\" do not count pages
.fi				\" filling
.P[				\" start a new paragraph
..
.de SH	\"	--- section heading ---
.\" Unnumbered bold section heading, kept with following text
.\" (begins a new page when fewer than 5 lines remain).
.P]				\" end last paragraph
.Al				\" align
.if \\n(.t<5v .Bp		\" if not enough room, begin new page
.if !(\\n(.d=\\n(Tp) .sp 	\" if not top of page, skip some space
.RT				\" restore -ms variables
.ft 3				\" bold font
.nr Hi 0			\" header index 0
.rm HS				\" remove header string
.fi				\" start filling
.P[				\" start a new paragraph
..
.de NH	\"	--- numbered section header ---
.\" Numbered heading at level $1 (default 1): bump H[level], zero the
.\" deeper counters, and build the dotted number string HS.  With no
.\" .TL seen (Pp = 0), a level-1 .NH begins a new chapter (see file
.\" header comment).
.ie \\n(.$=0 .nr Ha 1		\" if no argument, Ha = 1
.el .nr Ha \\$1			\" Ha is argument
.if \\n(Ha<1 .nr H1 0		\" reset subsection numbers
.if \\n(Ha<2 .nr H2 0
.if \\n(Ha<3 .nr H3 0
.if \\n(Ha<4 .nr H4 0
.if \\n(Ha<5 .nr H5 0
.if \\n(Ha=0 .nr Ha 1		\" .NH 0 is like .NH 1, but then resets
.nr H\\n(Ha +1			\" H[Ha]++
.ie (\\n(Pp=0)&(\\n(Ha=1) \{\
.   CH
.   if !\\n(Pp .Ds Fn "\\n(H1\\*(Fs1"	\" reset next figure string
.   ds HS \\n(H1
\\s+6\\*(HS\\s-6
.   sp 0.5
.\}
.el \{\
.   SH
.   ds HS \\n(H1
.   if \\n(Ha>1 .as HS .\\n(H2
.   if \\n(Ha>2 .as HS .\\n(H3
.   if \\n(Ha>3 .as HS .\\n(H4
.   if \\n(Ha>4 .as HS .\\n(H5
\\*(HS.
.\}
.ds H0 \\*(HS.
.nr Hi \\n(Ha			\" header index
..
.de TL	\"	--- title of paper ---
.\" Paper (as opposed to book) mode: Pp = 1 changes headers/footers and
.\" figure numbering; the title itself is set by the CH machinery.
.nr Pp 1			\" mark it's a paper
.CH
.Ds Fn "1"			\" next figure string
.ps -2
..
.de AU	\"	--- authors ---
.\" Author names follow: italic, normal size.
.sp
.ft 2
.ps \\n(PS
.vs \\n(VS
..
.de AI	\"	--- author's institution
.\" Institution lines follow: roman, normal size.
.sp
.ft 1
.ps \\n(PS
.vs \\n(VS
..
.de AB
.\" Begin abstract: roman text, optionally headed "ABSTRACT" (pass any
.\" argument to suppress the heading), set as a narrow quote block.
.AI
.if !\\n(.$ ABSTRACT
.sp
.ce 0
.Q[
..
.de AE
.\" End abstract.
.Q]
.sp
..
.de PS	\"	--- start picture ---
.\" $1 is height, $2 is width in units
..
.de PE	\"	--- end of picture ---
..
.de UX	\"	--- UNIX macro ---
.\" Print "UNIX" (with optional prefix $2 / suffix $1); on first use
.\" attach a dagger and the trademark footnote, then set U1 so later
.\" uses skip the footnote.
.ie \\n(U1 \\$2\s-1UNIX\s0\\$1
.el \{\
\\$2\s-1UNIX\\s0\\$1\(dg
.   FS
\(dg UNIX is a Registered Trademark of AT&T in the USA and other countries.
.   FE
.nr U1 1
.\}
..
.de IX	\"	--- add to index, update page headers ---
.\" Record heading $1 in the index (via the IO diversion, replayed by
.\" In) and refresh the running-header strings T1/T2/S1/S2 according to
.\" the current heading level Hi.
.LP				\" end header, define page headers
.if \\n(Hi=0 \{\
.   ds T1 \\$1
.   ds T2 \\$1
.   rm S1 S2			\" no chapter or section number
.\}
.if \\n(Hi=1 \{\
.   ds T1 \\$1
.   ds S1 \s-2CHAP.\& \\*(HS\s+2
.   ds T2 \\$1
.   ds S2 \\*(S1
.\}
.if \\n(Hi=2 \{\
.   ds T2 \\$1
.   ds S2 \s-2SEC.\& \\*(HS\s+2
.\}
.da IO				\" divert to index
\\!.In \\n(Hi "\\*(HS" "\\$1" \\n%
.da
..
.de In	\"	--- output index ---
.\" Format one index entry (written by IX into the IO diversion).
.\" $1 = heading level (0 = title, 1 = chapter, 2 = section),
.\" $2 = section-number string, $3 = heading text, $4 = page number.
.\" The number hangs to the left of the indented heading text, and the
.\" page number is set flush right via \h.
.P]				\" end of paragraph
.if !(\\n(.d=\\n(Tp) .ie \\$1<2 .sp 1.7
.el .if \\$1=2 .sp 0.3
.in 0
.ad l				\" adjust only left side
.ll -5n				\" decrease line length
.nr J0 0
.P[
.ie \\$1 \{\
.   nr In \\$1-1
.   nr J\\$1 \\n(J\\n(In+\\w'\\$2'+3.5n
.   in \\n(J\\$1u		\" set indent
.   ta \\w'\\$2'u+3.5n
.\" Fix: hang the number by the full label width (number + 3.5n gap),
.\" mirroring the .ta stop and the J\\$1 increment above.  The original
.\" read "-\\w'\\$2'u+3.5n", i.e. -(width - 3.5n), leaving the label
.\" 7n short of the parent level's margin.
.   ti -\\w'\\$2'u-3.5n
.   ie \\$1<2 \\s+3\\f3\\$2	\\$3\\f1\\s-3\&\c
.   el \\$2	\\$3\&\c
.\}
.el \\s+3\\f3\\$3\\f1\\s-3\&\c
.ll +5n				\" reset line length
.nr In \\n(.l-\w'\\$4'
\\\\h'|\\n(Inu'\\$4
.in 0				\" break, reset indent
.ad b				\" adjust both sides, end of diversion
..
.de IH	\"	--- index header ---
.\" Heading + index entry in one call: $1 = level (0 means an
.\" unnumbered chapter via CH), $2 = heading text.
.ie \\$1 .NH \\$1		\" start a new header
.el .CH				\" start a new, unindexed, chapter
\\$2
.IX "\\$2"			\" add header to index
..
.\" Fs separates chapter number and figure number in figure labels.
.ds Fs .
.de F1	\"	--- begin figure ---
.\" Start a figure as a floating keep; Fp holds the label of this
.\" figure, Fn the label of the next (papers count 1,2,...; books
.\" count chapter.figure).
.ds Fp \\*(Fn
.ie \\n(Pp .ds Fn \\n+(Fi
.el .ds Fn \\n(H1\\*(Fs\\n+(Fi
.KF				\" floating keep
.sp 0.5c
.C[				\" begin centered block
..
.de F2	\"	--- end of figure, begin label ---
.\" Close the figure body; the caption text follows, set smaller and
.\" narrower, introduced by "Fig. <number>."
.C]				\" end centered block
.sp 0.5c
.Q[
.fi
.ps -2
.vs -2
\\fBFig.\ \\*(Fp.\\fP
.ft 1
..
.de F3	\"	--- end of figure label ---
.\" Restore font/size/spacing and close the floating keep.
.br
.ft
.vs
.ps
.Q]
.sp 0.8				\" leave some room under the figure
.KE				\" end floating keep
..
.de KW	\"	--- keyword ---
.\" With an argument: report keyword $1 via .tm (or re-emit it
.\" transparently when inside a diversion).  Without arguments: flush.
.\" Note: ".rm tm" at the top of this file removes the tm request, so
.\" these reports are silent unless that line is dropped.
.ie \\n(.$ \{\
.   ie '\\n(.z'' .tm KW:\\$1
.   el \\!.KW "\\$1"
.\}
.el \{\
.   P]
.   tm KW
.   P[
.\}
..
.de Kx	\"	--- start list of keywords ---
.P]
.if !(\\n(.d=\\n(Tp) .sp	\" if not top of page, skip some space
.P[
..
.de Kw	\"	--- output keyword ---
.\" One keyword-index line: $1 = keyword, $2 = page list, with a 1c
.\" hanging indent for wrapped lines.
.LP
.in 1c
.ti -1c
\&\\$1 \\$2
..
.\" Numbered "Definition" blocks: Di counts them, Dp/Dn hold the
.\" current and next label (Dx is an optional chapter-style prefix).
.nr Di 1 1			\" current definition
.Ds Dn "\\*(Dx\\n(Di"
.de D[	\"	--- begin definition ---
.\" $1 = term being defined; prints "Definition <n>.  <term>" in bold.
.sp 0.5c
.Ds Dp "\\*(Dx\\n(Di"
.Ds Dn "\\*(Dx\\n+(Di"
\\fBDefinition\ \\*(Dp.\ \ \\$1\\fP\ \ \c
..
.de D]	\"	--- end of definition ---
.sp 0.3c
..
.\"
.\"	=====> refer macros <=====
.\"
.\" Strings used by refer-style citations: <./<, (text moved before the
.\" bracket) are disabled; >./>, supply the sentence punctuation after
.\" the closing bracket; [. and .] delimit the citation itself.
.rm <. <,
.Ds >. "."		\" reference ends with period
.Ds >, ","		\" reference ends with comma
.Ds [. " \\f1["		\" start of reference
.Ds .] "]\\fP"		\" end of reference
.de ]<	\"	--- references ---
.\" Report citation $1 via .tm so the companion refb tool can resolve
.\" it (see file header); re-emitted transparently inside diversions.
.\" Note: ".rm tm" at the top of this file silences these reports.
.if \\n(Rb \{\
.   ie \\n(.$ \{\
.      ie '\\n(.z'' .tm ]<:\\$1
.      el \\!.]< "\\$1"
.   \}
.   el \{\
.      P]
.      tm ]<
.      P[
.   \}
.\}
..
.de ]>
..
.de ]-	\"	--- remove garbage before next definition ---
.\" Clear all refer field strings so stale values from the previous
.\" reference never leak into the next one.
.rm [A [B [C [D [E [G [H [I [J [M [N [O [P [Q [R [S [T [V ]. ],
..
.de RR	\"	--- add comma + argument to reference ---
.\" Emit the pending separator, then $1; arm "." / ", " as the
.\" punctuation for whatever field comes next.
\\*(],\\$1\c
.ds ]. .
.ds ], , \&
..
.de Rc	\"	--- cited on pages ($1: all; $2: first; ...) ---
.\" $1 = full page list; extra arguments are the individual pages, so
.\" more than two arguments means the plural form.  (Original comment
.\" misspelled "cited" as "sited".)
.ie \\n(.$>2 Cited on pages \\$1.
.el Cited on page \\$1.
..
.de ][	\"	--- new reference ---
.\" Lay out one bibliography entry from the refer field strings
.\" ([F label, [A author, [T title, [J journal, ...), each field
.\" appended through RR with comma separation, kept on one page.
.KS				\" keep together
.JP [\\*([F]			\" start indented paragraph
.if !\\*([H .RR "\\*([H"
.if !\\*([A .RR "\\*([A"
.if !\\*([Q .RR "\\*([Q"
.if !\\*([T \{\
\\*(],\(l"\\*([T\c
.ds ]. .\(r"
.ds ], ,\(r" \&
.\}
.if !\\*([R .RR "\\*([R"
.if !\\*([M .RR "\\*([M"
.if !\\*([J .RR "\\f2\\*([J\\fP"
.if !\\*([V .RR "Vol.\& \\*([V"
.if !\\*([N .RR "No.\& \\*([N"
.\" (the .el below pairs with the .ie nested under the preceding .if:
.\"  plural "pp." when the page field is numeric-positive)
.if !\\*([P .ie \\n([P>0 .RR "pp.\& \\*([P"
.el .RR "p.\& \\*([P"
.if !\\*([B .RR "in \\f2\\*([B\\fP"
.if !\\*([E .RR "ed.\& \\*([E"
.if !\\*([S .RR "\\*([S"
.if !\\*([I .RR "\\*([I"
.if !\\*([C .RR "\\*([C"
.if !\\*([G .RR "Gov't.\& ordering no.\& \\*([G"
.if !\\*([D .RR "\\*([D"
\&\\*(].
.if !\\*([L .Rc "\\*([L" \\*([L
.if !\\*([O \&\\*([O
.KE
..
.\"
.\"	=====> accents <=====
.\"
.\" Accent strings, written after the letter they decorate: \*' acute,
.\" \*` grave, \*: umlaut, \*^ circumflex, \*~ tilde, \*C and \*v
.\" hacek, \*, cedilla.  Each backs up by the width of a typical
.\" letter ('e', 'a', 'u', 'c'), overstrikes the accent glyph with \z,
.\" and moves forward again.  \*- is an em-dash; \*(q[ / \*(]q are
.\" left/right double quotes.
.ds - \(em
.ds ' \h'\w'e'u-\w'\(aa'u/2+.06m'\z\(aa\h'-\w'e'u+\w'\(aa'u/2-.06m'
.ds ` \h'\w'e'u-\w'\(ga'u/2+.06m'\z\(ga\h'-\w'e'u+\w'\(ga'u/2-.06m'
.ds : \h'\w'u'u-\w'\(ad'u/2+.06m'\z\(ad\h'-\w'u'u+\w'\(ad'u/2-.06m'
.ds ^ \h'\w'a'u-\w'^'u/2+.06m'\z^\h'-\w'a'u+\w'^'u/2-.06m'
.ds ~ \h'\w'a'u-\w'~'u/2+.06m'\z~\h'-\w'a'u+\w'~'u/2-.06m'
.ds C \h'\w'e'u-\w'\(ah'u/2+.06m'\z\(ah\h'-\w'e'u+\w'\(ah'u/2-.06m'
.ds v \h'\w'e'u-\w'\(ah'u/2+.06m'\z\(ah\h'-\w'e'u+\w'\(ah'u/2-.06m'
.ds , \h'\w'c'u-\w'\(ac'u/2'\z\(ac\h'-\w'c'u+\w'\(ac'u/2'
.ds -- \*-
.ds q[ \(l"
.ds ]q \(r"
.\"
.\"	=====> user settable definitions <=====
.\"
.cs 5 20u			\" font 5, constant width
.nr PS 11			\" point size
.nr VS 13			\" vertical spacing
.nr LL 6.8i			\" line length
.nr FL 15c			\" footnote length (no effect currently)
.nr LT 15c			\" title length
.nr PO \n(.o			\" page offset
.nr PD 0			\" interparagraph spacing
.\"
.\"	=====> -ms init <=====
.\"
.nr FM 1			\" ms hack: remove page traps
.ch FO				\" remove bottom of page trap
.ch FX				\" remove footnote trap
.rm PT BT			\" remove other traps
.nr YE 1			\" causes break in .EQ
.\"
.\"	=====> initialization <=====
.\"
.RT				\" set these variables
.JR				\" reset indentation
.\" extra hyphenation exception words for this paper
.hw packet re-start trans-par-ent trans-par-ently trans-par-ency work-station trans-action time-stamp
.wh 0 T[			\" top of page macro
.em E]				\" end of text macro
.P[				\" begin paragraph
.\" per-document overrides: narrower offset, wider lines
.nr PO 2c
.nr LL 16.5c
.\" translate ~ to a blank (unpaired .tr maps the character to space)
.tr ~
.TL
.ps 16
\fBREPLICATION TECHNIQUES FOR SPEEDING UP
PARALLEL APPLICATIONS
ON DISTRIBUTED SYSTEMS\fR
.ps
.AU
Henri E. Bal *
M. Frans Kaashoek
Andrew S. Tanenbaum
.AI
Dept. of Mathematics and Computer Science,
Vrije Universiteit,
Amsterdam, The Netherlands
.AU
Jack Jansen
.AI
Centrum voor Wiskunde en Informatica
Amsterdam, The Netherlands
.sp 1.5
Email: kaashoek@cs.vu.nl
.FS
\s-2* This research was supported in part by the Netherlands organization
for scientific research (N.W.O.) under grant 125-30-10.\s+2
.FE
.nr PS 10
.nr VS 12
.sp 2.5
.AB
.LP
Traditional programming methods for loosely-coupled systems are based on 
message-passing.
More recently, methods have emerged based on ``virtually'' sharing data.
These methods simplify distributed programming, but are hard to implement 
efficiently, as loosely-coupled systems do not contain physical shared
memory.
We introduce a new model, \fIthe shared data-object model\fR,
that eases the implementation of parallel applications on loosely-coupled
systems, but can still be implemented efficiently.
.PP
In the shared data-object model, shared data are encapsulated in data objects,
which are variables of user-defined abstract data types. 
To speed up access to shared data, data objects are replicated.
This paper discusses the design choices involved in replicating
objects and their effect on performance.
We have implemented several options to determine which strategy is
most efficient.
.AE
.nr VS 14
.nr PS 12
.ps 12
.vs 14
.rm Rc
.LP
Index terms: Amoeba, distributed systems, multicast, Orca, 
parallel programming, replication, shared data-object model.
.NH 1
INTRODUCTION
.PP
Distributed systems are becoming increasingly popular for running
large-grain parallel applications.
These systems are easy to build and extend, and offer a
good price/performance ratio.
The issue of how to program parallel applications that use many loosely-coupled
machines is still open.
Traditional programming methods are based on some form of message-passing.
More recently, methods have emerged based on sharing data.
Since distributed systems lack shared memory, this sharing of data
is logical, not physical.
.PP
For many applications, support for shared data makes programming easier,
since it allows processes on different machines to share state information.
The main problem, however, is how to implement it efficiently
on memory-disjunct architectures.
In this paper we introduce a new model providing shared data
and we discuss efficient implementation techniques for this model,
based on \fIdata replication\fR.
.PP
Several systems exist that use replication for implementing shared data.
Probably the best known example is Kai Li's Shared Virtual Memory (SVM)\*(<.\*([.Li 1988\*(.]\*(>.
.]< 0
This system gives the user the illusion of a shared memory.
It stores multiple read-only copies of the same page on different processors.
Each processor having a copy can read the page  as if it were
in normal local memory.
Other systems providing replicated shared data are described in\*([.Carriero and Gelernter 1986; Bisiani and Forin 1987; Cheriton 1985; Stenstr\(:om et al. 1988; Fleisch and Popek 1989\*(.]\*(>.
.]< 1
.]< 2
.]< 3
.]< 4
.]< 5
.PP
The model studied in this paper is called the \fIshared data-object model\fR.
It is intended for implementing parallel applications on distributed systems.
The unit of replication in our model is not dictated by the system
(as in the SVM), but is determined by the programmer.
Shared data are encapsulated in \fIdata-objects\fR*, which are
variables of user-defined abstract data types.
.FS
\s-2* We will sometimes use the term \(l"object\(r" as a shorthand notation
for data-objects. Note, however, that unlike in most parallel object-based
systems, objects in our model are purely passive.\s+2
.FE
An abstract data type has two parts:
.RS
.nr LL -1c
.IP "\(bu"
A specification of the operations that can be applied to objects of this type.
.IP "\(bu"
The implementation, consisting of declarations for the
local variables of the object and code implementing the operations.
.nr LL +1c
.LP
.RE
.sp 0.5v
Instances (objects) of an abstract data type can be created dynamically, each
encapsulating the variables defined in the implementation part.
These objects can be shared among multiple processes, typically running on
different machines. Each process can apply operations to the object,
which are listed in the specification part of the abstract type.
In this way, the object becomes a communication channel between
the processes that share it.
.PP
The shared data-object model uses two important principles related to
operations on objects:
.RS
.nr LL -1c
.IP "1."
All operations on a given object are executed  \fIatomically\fR
(i.e., \fIindivisibly\fR).
To be more precise, the model guarantees \fIserializability\fR\*([.Eswaran et al. 1976\*(.]
.]< 6
of operation invocations:
if two operations are executed simultaneously,
then the result is as if one of them is executed before the other;
the order of invocation, however, is nondeterministic.
.IP "2."
All operations apply to \fIsingle\fR objects, so an operation invocation
can modify at most one object.
Making \fIsequences\fR of operations on different objects indivisible
is the responsibility of the programmer.
.nr LL +1c
.LP
.RE
.sp 0.5v
These two principles make the model easy to understand and efficient.
Moreover, in our experience, they provide sufficient support for
many parallel applications.
Distributed applications like banking and airline reservation
systems can profit from more support (e.g., atomic multi-object operations),
but such applications are not our major concern here.
Also, parallel applications on \fIclosely-coupled\fR (shared-memory) systems
can use a finer grain of parallelism (e.g., parallelism within objects),
but again these are not the type of applications we are interested in here.
These issues are addressed by other models, such as atomic transactions
and concurrent object-oriented programming and are not the topic of this
paper.
.PP
We have designed a new programming language called \fIOrca\fR,
based on this model.
Orca is intended for implementing distributed \fIuser\fR applications.
In particular, the language is intended for parallel, high-performance
applications.
Orca is a simple, procedural, type-secure language.
It supports abstract data types, processes, a variety of data structures,
modules, and generics.
The language, its implementation, and use are
described elsewhere\*(<.\*([.Bal and Tanenbaum 1988; Bal et al. 1990, 1989a, 1989b; Bal\*(.]\*(>.
.]< 7
.]< 8
.]< 9
.]< 10
.]< 11
.PP
In the rest of this paper we will study replication techniques
for the shared data-object model.
In Section 2, we will describe the space of possible design choices.
The most important issues are
(1) updating versus invalidation of copies,
(2) the protocols used for updating or invalidating copies,
and (3) the degree of replication.
As we will see, the best choice depends on the communication primitives
supported by the underlying distributed system.
We will study two important cases.
In Section 3, we will look at an implementation of the model using
point-to-point message passing.
In Section 4, we will discuss a second implementation, based on
reliable multicast messages.
The two implementations cover a broad spectrum of design choices.
In Section 5, we will measure the performance of the two implementations.
In Section 6, we will present our conclusions.
.NH 1
DESIGN SPACE
.PP
The technique of data replication in distributed systems is typically used
to increase the availability and reliability of the data
in the presence of processor failures and network partitions\*(<.\*([.Gifford 1979; Bernstein and Goodman 1981; Joseph and Birman 1987; Van Renesse and Tanenbaum 1988; Davidson et al. 1985\*(.]\*(>.
.]< 12
.]< 13
.]< 14
.]< 15
.]< 16
For example, if multiple copies of the same logical data are stored on different
processors, the data can still be accessed if some of the processors are 
down.
.PP
In contrast, we use replication primarily for speeding up access to shared data
and for decreasing the communication overhead involved in sharing data.
The general idea is to replicate an object on those processors
that frequently access it.
A copy may be accessed by all processes running on the same processor,
without sending any messages, as shown in Figure 1.
.F1
.nr PS 10
.nr VS 12
.ps 10
.vs 12
.PS 4.5i
M: box wid 4*boxwid ht 4*boxht
B1: box wid 1.25*boxwid "process-1" with .nw at M.nw+(0.3*boxwid, -0.3*boxwid)
B2: box wid 1.25*boxwid "process-n" with .sw at M.sw+(0.3*boxwid, 0.3*boxwid)
X: box ht 1.5*boxht "copy" "of" "X" with .e at M.e - (0.3*boxwid, 0)
arrow from B1.e to X.w+(0, 0.2*boxwid)
arrow from B2.e to X.w-(0, 0.2*boxwid)
"\fB.\fR" at 1/4 <B1.s, B2.n>
"\fB.\fR" at 2/4 <B1.s, B2.n>
"\fB.\fR" at 3/4 <B1.s, B2.n>
"CPU 1" at M.n + (0, 0.3*boxwid)
N: box wid 4*boxwid ht 4*boxht with .w at M.e+(boxwid, 0)
C1: box wid 1.25*boxwid "process-1" with .nw at N.nw+(0.3*boxwid, -0.3*boxwid)
C2: box wid 1.25*boxwid "process-n" with .sw at N.sw+(0.3*boxwid, 0.3*boxwid)
Y: box ht 1.5*boxht "copy" "of" "X" with .e at N.e - (0.3*boxwid, 0)
arrow from C1.e to Y.w+(0, 0.2*boxwid)
arrow from C2.e to Y.w-(0, 0.2*boxwid)
"\fB.\fR" at 1/4 <C1.s, C2.n>
"\fB.\fR" at 2/4 <C1.s, C2.n>
"\fB.\fR" at 3/4 <C1.s, C2.n>
"CPU 2" at N.n + (0, 0.3*boxwid)
line "n e t w o r k" below from M.sw-(0,boxht) to N.se-(0,boxht)
line from M.s to M.s-(0,boxht)
line from N.s to N.s-(0,boxht)
.PE
.nr PS 12
.nr VS 14
.ps 12
.vs 14
.F2
Replication of data-objects in a distributed system.
Each processor contains multiple processes running in pseudo-parallel.
These processes belong to a single job and run in a single address space,
so they can share copies of objects.
.F3
.PP
It is useful to distinguish between \fIread\fR operations and \fIwrite\fR
operations on replicated data:
a read operation does not modify the data,
while a write operation (potentially) does\*(<.\*([.Joseph and Birman 1987\*(.]\*(>.
.]< 14
For our model, we define a read operation as an operation that does not change
the internal data of the object it is applied to.
.PP
The primary goal of replicating shared data-objects
is to apply read operations to a local copy of the object, if available,
without doing any interprocess communication.
On a write operation, all copies of the object except the one just modified
must be invalidated or updated.
To deal with this problem, communication will be needed,
so write operations involve communication.
This is a departure from most of the replication techniques cited above,
which in general need interprocess communication
for every read and write operation.
.PP
The second goal of replication is to increase parallelism.
If an object is stored on only one processor, each operation
must be executed by that processor.
This processor may easily become a sequential bottleneck.
With replicated objects, on the other hand, all processors can simultaneously
read their own copies.
Since a read operation does not change its object, it can be executed
concurrently with other read operations without violating the
serializability principle.
.PP
The effectiveness of replication depends on many factors.
One important factor is the ratio of read and write operations
on objects, which is determined by the user application.
Another factor is the overhead in execution time
for reading or writing objects.
These costs are determined by the implementation of the model.
They depend on:
.RS
.nr LL -1c
.IP "\(bu"
The action undertaken after each write.
If each write operation \fIinvalidates\fR all copies, a subsequent read
operation will need to do communication.
If, on the other hand,
all copies are \fIupdated\fR, this disadvantage disappears, but
write operations will become more expensive.
.IP "\(bu"
The protocol used for invalidating or updating copies.
Many protocols exist (e.g., owner protocols, two-phase update protocols),
each with their own advantages and disadvantages.
.IP "\(bu"
The replication strategy.
If an object is replicated everywhere, each
read operation can be applied to a local copy,
which is much cheaper than doing the operation remotely.
On the other hand, writing an object that has many copies will
be more expensive than writing a non-replicated object.
.nr LL +1c
.LP
.RE
.sp 0.5v
In the following subsections we will study these design choices in
more detail.
.NH 2
Invalidation versus Updating of Copies
.PP
If a write operation is applied to a replicated object, 
its copies will no longer be up-to-date.
There are two different approaches for dealing with this problem.
The first scheme is to \fIinvalidate\fR all-but-one copies of the object.
The second scheme is to \fIupdate\fR all copies in a consistent way.
.PP
With invalidation (or \fIwrite-once\fR),
each object is initially stored on only one processor, say \fIP\fR.
If another processor wants to do a read operation on the object,
it fetches a copy of the object from \fIP\fR.
In this way, the object automatically gets replicated.
On a write operation, all the copies are thrown away, except the one being
modified.
.PP
The alternative scheme is to update (or \fIwrite-through\fR)
all copies of an object after each write operation.
A problem here is how to update all copies \fIin a consistent way\fR.
The shared data-object model guarantees that all operations on objects
are executed indivisibly. Hence, updating of all copies should
appear as one indivisible action.
On systems supporting only point-to-point communication, this is hard
to do.
In essence, a \fI2-phase\fR protocol is needed, as we will see.
If reliable indivisible multicast messages are available, updates
become much simpler, as we will discuss in Section 4.
.PP
There are several important differences between invalidation and
update schemes.
For one thing, keeping copies up-to-date is more complicated than
invalidating copies, so the update scheme may require more messages
to implement a write operation.
Also, update messages will be larger than invalidation messages.
An update message will either contain:
.RS
.nr LL -1c
.IP "1."
The new value of the object, or
.IP "2."
The write operation and its actual parameters.
.nr LL +1c
.LP
.RE
.sp 0.5v
In the latter case,
each processor applies the operation to its local copy of the object.
The first approach is most effective for small objects, which do not
contain large amounts of data.
If an object contains a large data structure
of which the operation modifies only a small part,
applying the operation to all copies will be more efficient.
Whichever scheme is chosen, the update message will always be larger
than an invalidation message, which merely
needs to specify the object to be invalidated.
.PP
On the other hand, the update scheme also has several advantages.
If an object is read after it has been written,
the invalidation scheme will have to fetch the current value of the
object from a remote processor.
With the update scheme, this value will still be stored locally, so
no messages need be sent at all.
.PP
In conclusion, which of the two schemes is most efficient depends on:
.RS
.nr LL -1c
.IP "1."
The costs of the update protocol.
.IP "2."
The size of the object.
.IP "3."
The size of the parameters of the write operation.
.IP "4."
Whether the write operation is followed by a read operation or
by another write operation.
.nr LL +1c
.LP
.RE
Kai Li argues that, for the Shared Virtual Memory system,
an update scheme is inappropriate.
In addition to being almost impossible to implement, it will cause a page 
fault on \fIevery\fR write instruction\*(<.\*([.Li and Hudak 1989\*(.]\*(>.
.]< 17
In our model, however, this disadvantage is far less severe.
Users can define write operations of any complexity on shared objects.
As replicas are updated after each operation\(emrather than each machine
instruction\(emupdating will be less expensive than in the SVM.
In addition, the SVM would require a whole page to be transmitted
after every write.
With our approach, shared objects frequently are much smaller than
a page; furthermore, large objects can usually be updated efficiently
by transmitting the operation and its parameters, instead of the new
value of the object.
.NH 2
Invalidation and Update Protocols
.PP
The protocol used for invalidating or updating copies of objects
must make sure that simultaneous operations on the same
object are executed indivisibly.
The simplest way to implement this is to serialize all write operations
(i.e., to execute them one at a time, in a mutually exclusive way).
This is the approach taken by all our implementations.
.PP
In an invalidation scheme, mutual exclusion is achieved by selecting one copy
of each object as the \fIprimary copy\fR.
In the simplest scheme, all write operations are directed to the processor 
containing the primary copy.
On receiving a write operation, the processor first invalidates all secondary
copies and then applies the operation to the primary copy.
When a processor executes a read operation, it locates the primary copy and
asks for the value of the object.
A more sophisticated scheme allows the primary copy to move from one processor 
to another.
Kai Li compares several of these schemes and analyzes their performance.
.PP
In an update scheme, mutual exclusion can be achieved in two ways.
One way is to appoint one copy of each object as \fIprimary copy\fR and
direct all write operations to the processor containing the primary copy.
This node will execute the write operations one by one and
propagate their effects to all other copies, called \fIsecondary copies\fR.
An alternative approach is to treat all copies
as equals and use a \fIdistributed protocol\fR that takes care of mutual
exclusion.
.PP
The first approach is conceptually the simplest.
Moreover, it allows one important optimization:
the primary copy can be migrated to the processor that
changes the object most frequently, making updates more efficient.
In particular, if only one processor changes the object, the overhead
of mutual exclusion can be eliminated by storing the
primary copy on that processor.
In general, however, the primary copy method requires one extra message,
from the process invoking the write operation to the primary-copy site.
.PP
With the second approach\(ema distributed protocol\(emthere is no
distinction between primary and secondary copies.
Instead, each processor can initiate a write operation on an object.
Clearly, some cooperation among the processors is needed to prevent
simultaneous write operations on the same object from
interfering with each other.
.NH 2
Replication Strategies
.PP
Replicating a shared data object is only useful if it
is read relatively often.
Thus, simply replicating all objects on all processors is unlikely to be 
efficient.
In general, we can distinguish between several \fIstrategies\fR for replication:
.sp
.RS
.nr LL -1c
.IP "~~No replication:" 18
Each object is stored on one specific processor.
.IP "~~Full replication:"
Each object is replicated on all processors.
.IP "~~Partial replication:"
Each object is replicated on some of the processors, based on
.RS
.IP "~~(a)" 5
compile-time information,
.IP "~~(b)"
run-time information, or
.IP "~~(c)"
a combination of both.
.RE
.nr LL +1c
.LP
.RE
.sp 0.5v
The first approach is used in most parallel object-based languages.
In this case, all operations on a given object are executed by
the same processor.
For many applications, this may easily lead to sequential bottlenecks
and high communication overhead.
.PP
The second approach indiscriminately replicates all shared objects on
all processors.
It will be most effective for architectures supporting fast reliable
multicast messages, since these will allow efficient updating 
or invalidation of all copies.
.PP
The third strategy selectively replicates objects, based on information
gathered by either the compiler, the run time system (RTS), or both.
With this approach, several scenarios are possible.
For example, the compiler may disable replication of objects that do not
have any read operations at all.
Also, if a processor does not contain any processes that share
a given object, it is unnecessary to store a copy of the object
on that processor.
.PP
The most advanced scheme based on partial replication is to let the
RTS decide dynamically where to replicate each object.
For example, the RTS may keep track of read and write
operations on an object issued by each processor, to determine which
processors frequently read the object.
If the read/write ratio exceeds a certain threshold, a replica
of the object is created dynamically on that processor.
This strategy is most suitable if communication is slow,
so the overhead of maintaining statistics is worthwhile.
.PP
Note that an invalidation scheme automatically leads to partial replication,
since the copies are thrown away after each write operation.
Still, there are several strategies for deciding when to re-install the copy.
The RTS may, for example, obtain a new copy the next time the object is read.
If the object is large, it may be better to replicate it only
after a certain number of successive read operations.
.NH 2
Discussion
.PP
We have discussed several design choices related to replication of
objects.
In general, it is hard to determine which ones will give the best
overall performance.
Furthermore, different types of distributed systems may require different
design decisions.
In particular, the communication primitives provided by the system
are very important.
.PP
In the next two sections we will examine two existing implementations
of the shared data-object model.
Each implementation basically is a run time system for Orca.
Both RTSs use the same hardware: a collection of
10 MC68020 CPUs connected by a 10 Mbit/sec Ethernet\(rg, but use different
communication primitives and consistency protocols.
.NH 1
AN IMPLEMENTATION USING POINT-TO-POINT COMMUNICATION
.PP
The first run time system we have implemented for Orca runs on top
of Amoeba\*(<.\*([.Mullender and Tanenbaum 1986\*(.]\*(>.
.]< 18
It uses point-to-point messages (RPC) for interprocess
communication.
Below, we will look at each of the three design issues discussed
in Section 2 and motivate our choices.
In Section 5.1 we will describe the performance of this system.
.NH 2
Invalidation versus Updating
.PP
The first issue is the choice between an invalidation or an update scheme.
With point-to-point messages it is expensive to update all copies of
an object in a consistent way.
Simultaneous write operations on the same object can
be serialized using a primary copy protocol, as described in Section 2.2.
A harder problem is how to achieve serializability
if a sequence of operations on \fIdifferent\fR objects is executed.
.PP
Suppose a program uses two objects, X and Y, that have their primary copies
on different processors.
If X and Y are written simultaneously, either all processors should
observe the change to X first or all processors should observe the
change to Y first.
Under no circumstances should these two events be mixed, since that
would violate serializability.
Because of this restriction, it does \fInot\fR suffice to implement
a write operation by sending it to the primary-copy site and having
this site forward it to the secondary-copy sites\*(<.\*([.Bal and Tanenbaum 1988\*(.]\*(>.
.]< 7
.PP
The problem can be solved using a more complicated and
expensive update protocol\*(<.\*([.Bal and Tanenbaum 1988\*(.]\*(>.
.]< 7
Therefore, updating copies will be expensive, so it is not clear whether
updating will be more efficient than invalidation.
We have decided to implement both options and to determine experimentally
which of the two is best.
.NH 2
The Protocols
.PP
In this section we will discuss the protocols for invalidating or
updating copies, using point-to-point messages.
.SH
The Invalidation Protocol
.LP
The invalidation protocol is quite simple.
A process that wants to do a write operation on a shared object sends
the operation and the parameters to the processor containing the
primary copy of the object.
This processor locks the object and
sends point-to-point \fIinvalidate\fR messages to
all processors containing a secondary copy.
If a secondary-copy site receives this message, it throws away its local
copy of the object and sends back an acknowledgement.
As soon as the primary-copy site has received all acknowledgements,
it updates and unlocks the primary copy.
This protocol requires two messages for each secondary copy; if the primary
copy is on a different processor, two messages are needed for updating the 
primary copy.
.PP
If a process \fIP\fR wants to do a read operation on an object of which it
no longer has a local copy, it sends a \fIfetch-object\fR message
to the primary-copy site.
If this processor has not yet received all acknowledgements, the primary
copy will be locked and \fIP\fR will temporarily be blocked.
When the object is unlocked, the primary-copy site sends a new copy to \fIP\fR.
.SH
The Update Protocol
.LP
Updating all copies of an object in a consistent way is more difficult
than invalidating them.
The real problem is to guarantee serializability, as discussed above.
We solve this problem using a \fI2-phase\fR primary copy update protocol.
The protocol updates copies by sending the operation and its
parameters to the secondary-copy sites.
For most programs, this is more efficient
than transmitting the new value of the object.
.PP
During the first phase, the primary copy of the object is locked and
a \fIlock-and-update\fR message is sent to all secondary-copy sites.
This message specifies an object, an operation to be applied to
the object, and the parameters of the operation.
When a site receives the \fIlock-and-update\fR, it locks the local copy
of the object and applies the operation to it.
Next, it sends an acknowledgement to the primary-copy site, while
still keeping its local copy locked.
In the mean time, the primary-copy site waits for all acknowledgements
and then sends an \fIunlock\fR message to all sites.
The \fIunlock\fR message causes all copies of the object to be unlocked.
.PP
The 2-phase update protocol guarantees that no process uses the new value of
an object while other processes are still using the old value.
The new value is not used until the second phase.
When the second phase begins, all copies contain the new value.
Simultaneous write-operations on the same object are serialized by locking
the primary copy.
The next write-operation may start before all secondary copies are unlocked.
New requests to \fIlock-and-update\fR a secondary copy are not serviced
until the \fIunlock\fR message generated by the previous write has
been handled.
.PP
Assuming no communication failures, this protocol requires two RPCs for each 
secondary copy.
In the first phase, one request to \fIlock-and-update\fR the object is
sent plus an acknowledgement for this request.
In the second phase, an \fIunlock\fR message and acknowledgement are sent.
Both in phase one and phase two a request and an acknowledgement have to be 
sent.
To update an object whose primary copy is located on a remote processor,
one extra RPC is needed.
.NH 2
Replication Strategy
.PP
With the above protocols, the costs
of invalidating or updating \fIN\fR copies of an object will grow
linearly with \fIN\fR.
As a result, it will be expensive to replicate all objects on
all processors.
Our implementation therefore uses a partial replication strategy,
based on run time statistics.
Although this incurs some overhead on operations, communication
costs can be reduced significantly.
As communication in distributed systems still is expensive
(on the order of milliseconds), this approach is attractive.
.PP
Initially the system contains one copy for each object: the primary copy.
If some processor frequently tries to read the primary, a secondary copy
will be created, so that future read operations can be applied to the
local copy without sending any messages.
Write operations are always directed to the primary copy. 
.PP
In the invalidation scheme the owner of the primary copy invalidates all
secondary copies before performing the write operation.
A subsequent read operation on the same object always has to go to 
the processor containing the primary copy.
So, the number of secondary copies of a given object is determined by
its read/write pattern.
.PP
In the update scheme,
the processor containing the primary copy of an object
keeps track of the number of remote
read and write operations issued by each processor.
The overhead of maintaining these statistics is negligible compared to
the total costs of remote operations.
As soon as the read/write ratio of a remote processor exceeds a certain
threshold, the RTS creates a copy of the object on that processor.
.PP
Each processor having a secondary copy keeps track of the ratio of
local read operations and (global) write operations.
If the overhead in updating the copy exceeds the time saved in doing
read operations locally, the RTS discards the local copy.
From then on, all operations on the object will be done remotely.
.PP
With the primary-copy update protocol, all write operations are
forwarded to the processor containing the primary copy of the object.
If the RTS discovers that an object is written frequently by a machine
different from the one containing the primary copy, the RTS may
decide to \fImigrate\fR the primary copy to that machine.
Again, statistics are used to determine the best location for an object.
If an object is migrated, precautions are taken for dealing with
machines that are unaware of the object's new location.
.NH 1
AN IMPLEMENTATION USING MULTICAST COMMUNICATION
.PP
The second RTS runs on top of the bare hardware and uses the multicast
capability of the Ethernet.
The RTS uses the indivisible reliable multicast protocol described in\*(<.\*([.Kaashoek et al. 1989b\*(.]\*(>.
.]< 19
This protocol is highly efficient and usually only requires two
packets (one point-to-point and one multicast) per reliable multicast.
Sending a short message to 10 processors, for example, takes 1.5 msec on
the hardware described above.
.PP
In a distributed system supporting only point-to-point messages,
serializability is difficult to achieve, because messages sent to
different destinations may arrive with arbitrary delays.
Some distributed systems (e.g., Ethernet-based systems) provide hardware
support for sending a single message to multiple destinations simultaneously.
More precisely, we are interested in systems supporting
\fIindivisible reliable multicasts\fR, which have the following properties:
.RS
.nr LL -1c
.IP "\(bu"
A message is sent reliably from one source to a set of destinations.
.IP "\(bu"
If two processors simultaneously multicast two messages
(say m\s-2\v'0.4m'1\v'-0.4m'\s+2 and m\s-2\v'0.4m'2\v'-0.4m'\s+2),
then either all destinations first receive m\s-2\v'0.4m'1\v'-0.4m'\s+2,
or they all receive m\s-2\v'0.4m'2\v'-0.4m'\s+2 first,
but not a mixture with some receiving m\s-2\v'0.4m'1\v'-0.4m'\s+2
first and others receiving m\s-2\v'0.4m'2\v'-0.4m'\s+2 first.
.nr LL +1c
.LP
.RE
.sp 0.5v
With this multicast facility, it becomes much easier to implement
a protocol for consistent updating of all copies of an object.
Basically, if a process wants to invoke a write operation on a shared
object, it multicasts the operation to all processors.
Since all processors receive all messages in the same order,
all operations on shared objects are executed in the same order everywhere.
.PP
We have implemented an indivisible reliable multicast protocol in software on
top of Ethernet\*(<.\*([.Kaashoek et al. 1989b\*(.]\*(>.
.]< 19
Programs need not worry about lost messages.
Recovery of communication failures is handled automatically and transparently
by the protocol.
Efficiency is obtained by optimizing the protocol for no communication
failures, as these rarely happen with the current state of microprocessor and
network technology.
.NH 2
Invalidation versus Updating
.PP
Reliable multicasting is useful for invalidation as well as updating.
In both cases, a single reliable multicast message is needed for a write 
operation.
If an object is written very frequently and hardly ever read,
the invalidation scheme will be more efficient, since fewer messages are needed
and invalidation messages are shorter than update messages.
.PP
In general, however, the update scheme will be more efficient.
Suppose, for example, that every processor reads a given object exactly once
after it has been written.
With \fIP\fR processors, the invalidation scheme requires a single (short)
reliable multicast message for invalidating the copies and 2\fIP\fR 
point-to-point messages for fetching the object (or doing the read operation
remotely).
As a reliable multicast usually costs two physical messages, in total there 
are 2\fIP\fR \(pl 2 messages.
In contrast, the update scheme requires only one reliable multicast message.
So, even in the case that each write is followed by a read operation, the
updating performs better than invalidation.
.PP
With the reliable multicast protocol we use, a multicast message
is hardly more expensive than an RPC\*(<.\*([.Kaashoek et al. 1989b\*(.]\*(>.
.]< 19
So, unless the read/write ratio of operations is close to zero,
the update scheme will have a better performance.
We have therefore only implemented the update scheme.
.NH 2
The Update Protocol
.PP
As in the Amoeba RTS, indivisibility of write operations is obtained
by executing them in a mutually exclusive way.
With indivisible multicast, mutual exclusion comes for free.
The communication primitive imposes a single system-wide global ordering
on all write operations.
Unlike the point-to-point scheme, there is no risk of different
processors updating their copies in an inconsistent way.
Also, there is no need to distinguish between primary and secondary
copies of an object.
.PP
The distributed update protocol we use works as follows.
Each processor maintains a
queue of messages that have arrived on the processor but that have
not yet been handled.
As all processors receive all messages in the same order, the queues on
all processors are basically the same, except that some processors may
be ahead of others in handling the messages at the head of the queue.
.PP
If a process wants to execute a write operation
on a shared object \fIX\fR, it multicasts an \fIupdate\fR message
to all processors (including its own processor) and then blocks.
The message contains the name of the object, the operation, and its parameters.
The update message will be appended to the tail of each queue.
.PP
Each processor handles incoming messages in its queue
in strict FIFO order.
A message may be handled as soon as it appears at the head of the queue.
To handle an \fIupdate\fR message,
the message is removed from the queue,
the local copy of \fIX\fR is locked,
the operation is applied to the local copy,
and finally the local copy is unlocked.
If the message was sent by a process on the same processor,
that process is made active again.
.PP
The protocol described above correctly implements the serializability
requirement.
The protocol guarantees that all processors observe
changes to shared objects \fIin the same order\fR.
Note that it does not provide a total (\fItemporal\fR)
ordering\*([.Lamport 1978\*(.]
.]< 20
among operations.
Suppose Processor P1 initiates a write operation on object \fIX\fR
and, a few microseconds later,  Processor P2 reads the value of \fIX\fR.
The \fIupdate\fR message for \fIX\fR sent by P1 need not have even reached
P2 yet, so P2 may still use the old value of \fIX\fR.
This scenario is in accordance with the semantics of our model, however,
which merely requires serializability of operations.
.NH 2
Replication Strategy
.PP
The multicast RTS replicates all objects on all processors.
In other words, it uses the full replication strategy.
This strategy was chosen, because it simplifies the
implementation.
The RTS does not have to keep track of which object is used by
which processor.
.PP
In some cases, full replication may be less efficient than partial
replication.
Suppose, for example, process P1 wants to send information to another
process P2 through an object shared between them.
As the object will be replicated everywhere, all processors in the
system will receive P1's update message, even though only P2 is really
interested in it.
.PP
The overhead of sending the message everywhere usually is
not dramatic, however.
With our reliable multicast protocol, the elapsed time for a multicast
message hardly depends on the number of destinations.
The main disadvantage of full replication then is the fact that
each processor will be interrupted once for each write operation.
With partial replication, this CPU overhead would be less.
.NH 1
PERFORMANCE
.PP
There are several ways to measure
the performances of the replication techniques.
The approach taken in\*([.Bal\*(.]
.]< 11
is to implement several user applications in Orca, execute them
on the different run time systems, and measure the speedups.
Applications we have looked at are matrix multiplication, the all-pairs
shortest paths problem, branch-and-bound, alpha-beta search, and
successive overrelaxation.
.PP
In this paper we will use an alternative approach by looking
at the basic times for reading, writing, updating, and invalidating
shared objects.
In this way we can determine under which circumstances a technique
is most effective.
.PP
To determine the performance improvements due to replication, we have performed
two experiments. 
In the first experiment, we have measured the costs of incrementing a replicated
4 byte integer object as a function of the number of replicas.
In the second experiment we measured the cost for updating an entire 1K 
array object.
These two types of objects occur frequently in application programs\*(<.\*([.Bal et al. 1990\*(.]\*(>.
.]< 8
.NH 2
Performance of the Amoeba Run Time System
.PP
Figure \*(Fn shows the basic execution times for the run time system
that uses Amoeba RPC.
The figure shows the costs for invalidating \fIN\fR copies of an object
and for updating 4-byte and 1Kb objects.
Invalidating a copy involves sending a short message containing an object
identifier, so the invalidation costs do not depend on the size of the object.
.F1
.nr PS 10
.nr VS 12
.ps 10
.vs 12
.PS
.lf 1634
.lf 1 /usr/lib/grap.defines
.lf 1633 -
.lf 1 updateint.grap
.lf 1641 -
.lf 1 update1k.grap
.lf 1643 -
.lf 1 invalidate.grap
.lf 1645 -
Graph: [
	# gg 1 .. 10, 0 .. 55
define xy_gg @ 	(($1)-(.37))*.292398, (($2)-(-3.85))*0.0478469 @
define x_gg @ 	(($1)-(.37))*.292398 @
define y_gg @ 	(($1)-(-3.85))*0.0478469 @
	frameht = 3
	framewid = 3
Frame:	box ht frameht wid framewid with .sw at 0,0 invis
	line from Frame.nw to Frame.ne invis
	line from Frame.sw to Frame.se 
	line from Frame.sw to Frame.nw 
	line from Frame.se to Frame.ne invis
	textht = .166667
	textwid = .733333
Label:	box invis wid 0 ht 2*textht "Time   " "(in msec)  " wid textwid with .e at Frame.w - (0.2,0)
	textht = .166667
Label:	box invis wid 0 ht 1*textht "Number of copies" with .n at Frame.s - (0,2 * textht)
	ticklen = .1
Ticks_gg:	line  left ticklen from (0,y_gg(0))
	"0 " rjust at last line.end
	line  left ticklen from (0,y_gg(10))
	"10 " rjust at last line.end
	line  left ticklen from (0,y_gg(20))
	"20 " rjust at last line.end
	line  left ticklen from (0,y_gg(30))
	"30 " rjust at last line.end
	line  left ticklen from (0,y_gg(40))
	"40 " rjust at last line.end
	line  left ticklen from (0,y_gg(50))
	"50 " rjust at last line.end
	ticklen = .1
Ticks_gg:	line  down ticklen from (x_gg(1),0)
	box invis "1" ht .25 wid 0 with .n at last line.end
	line  down ticklen from (x_gg(2),0)
	box invis "2" ht .25 wid 0 with .n at last line.end
	line  down ticklen from (x_gg(3),0)
	box invis "3" ht .25 wid 0 with .n at last line.end
	line  down ticklen from (x_gg(4),0)
	box invis "4" ht .25 wid 0 with .n at last line.end
	line  down ticklen from (x_gg(5),0)
	box invis "5" ht .25 wid 0 with .n at last line.end
	line  down ticklen from (x_gg(6),0)
	box invis "6" ht .25 wid 0 with .n at last line.end
	line  down ticklen from (x_gg(7),0)
	box invis "7" ht .25 wid 0 with .n at last line.end
	line  down ticklen from (x_gg(8),0)
	box invis "8" ht .25 wid 0 with .n at last line.end
	line  down ticklen from (x_gg(9),0)
	box invis "9" ht .25 wid 0 with .n at last line.end
	line  down ticklen from (x_gg(10),0)
	box invis "10" ht .25 wid 0 with .n at last line.end
Lgg: xy_gg(1,.2)
"\s-3\(sq\s+3" at xy_gg(1,.2)
line  from Lgg to xy_gg(2,5.5); Lgg: Here
"\s-3\(sq\s+3" at xy_gg(2,5.5)
line  from Lgg to xy_gg(3,8.1); Lgg: Here
"\s-3\(sq\s+3" at xy_gg(3,8.1)
line  from Lgg to xy_gg(4,11.3); Lgg: Here
"\s-3\(sq\s+3" at xy_gg(4,11.3)
line  from Lgg to xy_gg(5,14.3); Lgg: Here
"\s-3\(sq\s+3" at xy_gg(5,14.3)
line  from Lgg to xy_gg(6,17.5); Lgg: Here
"\s-3\(sq\s+3" at xy_gg(6,17.5)
line  from Lgg to xy_gg(7,20.6); Lgg: Here
"\s-3\(sq\s+3" at xy_gg(7,20.6)
line  from Lgg to xy_gg(8,23.5); Lgg: Here
"\s-3\(sq\s+3" at xy_gg(8,23.5)
line  from Lgg to xy_gg(9,26.8); Lgg: Here
"\s-3\(sq\s+3" at xy_gg(9,26.8)
line  from Lgg to xy_gg(10,30.7); Lgg: Here
"\s-3\(sq\s+3" at xy_gg(10,30.7)
Lgg: xy_gg(1,.17)
"\s-3\(*D\s+3" at xy_gg(1,.17)
line  from Lgg to xy_gg(2,11.2); Lgg: Here
"\s-3\(*D\s+3" at xy_gg(2,11.2)
line  from Lgg to xy_gg(3,14.2); Lgg: Here
"\s-3\(*D\s+3" at xy_gg(3,14.2)
line  from Lgg to xy_gg(4,17.2); Lgg: Here
"\s-3\(*D\s+3" at xy_gg(4,17.2)
line  from Lgg to xy_gg(5,20.1); Lgg: Here
"\s-3\(*D\s+3" at xy_gg(5,20.1)
line  from Lgg to xy_gg(6,23.4); Lgg: Here
"\s-3\(*D\s+3" at xy_gg(6,23.4)
line  from Lgg to xy_gg(7,26.8); Lgg: Here
"\s-3\(*D\s+3" at xy_gg(7,26.8)
line  from Lgg to xy_gg(8,30.6); Lgg: Here
"\s-3\(*D\s+3" at xy_gg(8,30.6)
line  from Lgg to xy_gg(9,34.6); Lgg: Here
"\s-3\(*D\s+3" at xy_gg(9,34.6)
line  from Lgg to xy_gg(10,37.9); Lgg: Here
"\s-3\(*D\s+3" at xy_gg(10,37.9)
Lgg: xy_gg(1,.3)
"\s-3\(mu\s+3" at xy_gg(1,.3)
line  from Lgg to xy_gg(2,2.8); Lgg: Here
"\s-3\(mu\s+3" at xy_gg(2,2.8)
line  from Lgg to xy_gg(3,4); Lgg: Here
"\s-3\(mu\s+3" at xy_gg(3,4)
line  from Lgg to xy_gg(4,5.7); Lgg: Here
"\s-3\(mu\s+3" at xy_gg(4,5.7)
line  from Lgg to xy_gg(5,7.2); Lgg: Here
"\s-3\(mu\s+3" at xy_gg(5,7.2)
line  from Lgg to xy_gg(6,8.9); Lgg: Here
"\s-3\(mu\s+3" at xy_gg(6,8.9)
line  from Lgg to xy_gg(7,10.5); Lgg: Here
"\s-3\(mu\s+3" at xy_gg(7,10.5)
line  from Lgg to xy_gg(8,12.1); Lgg: Here
"\s-3\(mu\s+3" at xy_gg(8,12.1)
line  from Lgg to xy_gg(9,13.7); Lgg: Here
"\s-3\(mu\s+3" at xy_gg(9,13.7)
line  from Lgg to xy_gg(10,15.7); Lgg: Here
"\s-3\(mu\s+3" at xy_gg(10,15.7)
Lgg: xy_gg(1,45)
"\s-3\(sq\s+3" at xy_gg(1,45)
line  from Lgg to xy_gg(2,45); Lgg: Here
"\s-3\(sq\s+3" at xy_gg(2,45)
line  from Lgg to xy_gg(3,45); Lgg: Here
"\s-3\(sq\s+3" at xy_gg(3,45)
box invis wid 0 ht 1*textht "\s-2Time for 4-byte update\s+2" ljust at xy_gg(3.5,45)
Lgg: xy_gg(1,50)
"\s-3\(*D\s+3" at xy_gg(1,50)
line  from Lgg to xy_gg(2,50); Lgg: Here
"\s-3\(*D\s+3" at xy_gg(2,50)
line  from Lgg to xy_gg(3,50); Lgg: Here
"\s-3\(*D\s+3" at xy_gg(3,50)
box invis wid 0 ht 1*textht "\s-2Time for 1Kb update\s+2" ljust at xy_gg(3.5,50)
Lgg: xy_gg(1,55)
"\s-3\(mu\s+3" at xy_gg(1,55)
line  from Lgg to xy_gg(2,55); Lgg: Here
"\s-3\(mu\s+3" at xy_gg(2,55)
line  from Lgg to xy_gg(3,55); Lgg: Here
"\s-3\(mu\s+3" at xy_gg(3,55)
box invis wid 0 ht 1*textht "\s-2Time for invalidation\s+2" ljust at xy_gg(3.5,55)

] 
.PE
.lf 1662
.nr PS 12
.nr VS 14
.ps 12
.vs 14
.F2
Time for updating replicated objects using invalidation and 2-phase update 
protocol.
.F3
.PP
We have also measured the costs for doing a read operation on a remote
object.
For a 4-byte object, these costs are 2.7 msec; for a 1Kb object, the
costs are 5.5 msec.
.PP
As expected, the costs to update or invalidate copies after a write
operation grow linearly with the number of copies.
Therefore, selective replication is worthwhile.
.PP
For a small (4-byte) object, updating 10 copies costs 30.7 msec.
Invalidating 10 copies takes 15.7 msec; in addition, re-installing copies
costs 2.7 msec per copy (i.e., the costs of a remote read operation).
If the object is read by 6 or more processors immediately after it has
been written, updating will outperform invalidation, since
15.7 + 6*2.7 > 30.7.
(We assume that re-installations are done sequentially; in reality,
there may be some overlap, so the re-installation costs may be less.
However, this effect is rather small, since the
processor containing the primary copy will become a sequential bottleneck
if a large number of processors re-install the secondary copy.)
.PP
In contrast, if a small object is written twice without being read,
the invalidation scheme is more efficient. In this case, the update costs
are 2*30.7 = 61.4 msec. Invalidating and re-installing all 10 copies
takes 15.7 + 10*2.7 = 42.7 msec.
.PP
For large (1 Kb) objects, updating 10 copies takes 37.9 msec.
If 5 processors re-install the object after a write operation,
the invalidation scheme costs 15.7 + 5*5.5 = 43.2 msec, which is slower
than updating. If fewer than 5 processors read the object, invalidation
is more efficient.
If a large object is written two or more times successively, invalidation
is also better than updating.
.PP
A case that occurs frequently in user programs is a large object
that is written through an operation with only a few bytes of parameters
(e.g., a 1Kb array of which only 1 element is changed).
In this case, updating will often be more effective.
For example, updating 10 such copies will take about 30.7 msec, while 
invalidating and re-installing the entire array will cost 15.7 + 10*5.5
= 70.7 msec.
Even if the object is changed twice before being read,
updating is more efficient.
.PP
We can also compare the partial replication scheme with a 
scheme that does not replicate objects.
If, for example, a given 4-byte object is not replicated,
each (remote) read operation will take 2.7 msec.
For an object that is read relatively frequently, partial replication
clearly pays off.
.NH 2
Performance of the Multicast Run Time System
.PP
The cost for updating replicated objects using the distributed update
protocol described in Section 4 are depicted in Figure \*(Fn. 
As can be seen, the costs are almost independent of the number of replicas. 
This is what we should expect, since in our multicast protocol sending a
reliable multicast message costs only two physical messages,
independent of the number
of receivers.
The only overhead is sending one \fIstate message\fR after a processor
has received a certain number of messages.
(This state message is only required if a processor does not multicast messages
itself; if it does multicast messages, the state message is piggybacked.)
.LP
.F1
.nr PS 10
.nr VS 12
.ps 10
.vs 12
.PS
.lf 1742
.lf 1 mc_bench1.grap
.lf 1749 -
.lf 1 mc_bench2.grap
.lf 1751 -
Graph: [
	# gg 1 .. 10, 0 .. 16
define xy_gg @ 	(($1)-(.37))*.292398, (($2)-(-1.12))*.164474 @
define x_gg @ 	(($1)-(.37))*.292398 @
define y_gg @ 	(($1)-(-1.12))*.164474 @
	frameht = 3
	framewid = 3
Frame:	box ht frameht wid framewid with .sw at 0,0 invis
	line from Frame.nw to Frame.ne invis
	line from Frame.sw to Frame.se 
	line from Frame.sw to Frame.nw 
	line from Frame.se to Frame.ne invis
	textht = .166667
	textwid = .6
Label:	box invis wid 0 ht 2*textht "Time" "(in msec)" wid textwid with .e at Frame.w - (0.2,0)
	textht = .166667
Label:	box invis wid 0 ht 1*textht "Number of copies" with .n at Frame.s - (0,2 * textht)
	ticklen = .1
Ticks_gg:	line  left ticklen from (0,y_gg(0))
	"0 " rjust at last line.end
	line  left ticklen from (0,y_gg(2))
	"2 " rjust at last line.end
	line  left ticklen from (0,y_gg(4))
	"4 " rjust at last line.end
	line  left ticklen from (0,y_gg(6))
	"6 " rjust at last line.end
	line  left ticklen from (0,y_gg(8))
	"8 " rjust at last line.end
	line  left ticklen from (0,y_gg(10))
	"10 " rjust at last line.end
	line  left ticklen from (0,y_gg(12))
	"12 " rjust at last line.end
	line  left ticklen from (0,y_gg(14))
	"14 " rjust at last line.end
	line  left ticklen from (0,y_gg(16))
	"16 " rjust at last line.end
	ticklen = .1
Ticks_gg:	line  down ticklen from (x_gg(1),0)
	box invis "1" ht .25 wid 0 with .n at last line.end
	line  down ticklen from (x_gg(2),0)
	box invis "2" ht .25 wid 0 with .n at last line.end
	line  down ticklen from (x_gg(3),0)
	box invis "3" ht .25 wid 0 with .n at last line.end
	line  down ticklen from (x_gg(4),0)
	box invis "4" ht .25 wid 0 with .n at last line.end
	line  down ticklen from (x_gg(5),0)
	box invis "5" ht .25 wid 0 with .n at last line.end
	line  down ticklen from (x_gg(6),0)
	box invis "6" ht .25 wid 0 with .n at last line.end
	line  down ticklen from (x_gg(7),0)
	box invis "7" ht .25 wid 0 with .n at last line.end
	line  down ticklen from (x_gg(8),0)
	box invis "8" ht .25 wid 0 with .n at last line.end
	line  down ticklen from (x_gg(9),0)
	box invis "9" ht .25 wid 0 with .n at last line.end
	line  down ticklen from (x_gg(10),0)
	box invis "10" ht .25 wid 0 with .n at last line.end
Lgg: xy_gg(1,.994)
"\s-3\(sq\s+3" at xy_gg(1,.994)
line  from Lgg to xy_gg(2,1.023); Lgg: Here
"\s-3\(sq\s+3" at xy_gg(2,1.023)
line  from Lgg to xy_gg(3,1.059); Lgg: Here
"\s-3\(sq\s+3" at xy_gg(3,1.059)
line  from Lgg to xy_gg(4,1.094); Lgg: Here
"\s-3\(sq\s+3" at xy_gg(4,1.094)
line  from Lgg to xy_gg(5,1.129); Lgg: Here
"\s-3\(sq\s+3" at xy_gg(5,1.129)
line  from Lgg to xy_gg(6,1.162); Lgg: Here
"\s-3\(sq\s+3" at xy_gg(6,1.162)
line  from Lgg to xy_gg(7,1.199); Lgg: Here
"\s-3\(sq\s+3" at xy_gg(7,1.199)
line  from Lgg to xy_gg(8,1.221); Lgg: Here
"\s-3\(sq\s+3" at xy_gg(8,1.221)
line  from Lgg to xy_gg(9,1.269); Lgg: Here
"\s-3\(sq\s+3" at xy_gg(9,1.269)
line  from Lgg to xy_gg(10,1.304); Lgg: Here
"\s-3\(sq\s+3" at xy_gg(10,1.304)
Lgg: xy_gg(1,12.68)
"\s-3\(*D\s+3" at xy_gg(1,12.68)
line  from Lgg to xy_gg(2,12.71); Lgg: Here
"\s-3\(*D\s+3" at xy_gg(2,12.71)
line  from Lgg to xy_gg(3,12.75); Lgg: Here
"\s-3\(*D\s+3" at xy_gg(3,12.75)
line  from Lgg to xy_gg(4,12.79); Lgg: Here
"\s-3\(*D\s+3" at xy_gg(4,12.79)
line  from Lgg to xy_gg(5,12.81); Lgg: Here
"\s-3\(*D\s+3" at xy_gg(5,12.81)
line  from Lgg to xy_gg(6,12.86); Lgg: Here
"\s-3\(*D\s+3" at xy_gg(6,12.86)
line  from Lgg to xy_gg(7,12.9); Lgg: Here
"\s-3\(*D\s+3" at xy_gg(7,12.9)
line  from Lgg to xy_gg(8,12.93); Lgg: Here
"\s-3\(*D\s+3" at xy_gg(8,12.93)
line  from Lgg to xy_gg(9,12.96); Lgg: Here
"\s-3\(*D\s+3" at xy_gg(9,12.96)
line  from Lgg to xy_gg(10,13); Lgg: Here
"\s-3\(*D\s+3" at xy_gg(10,13)
Lgg: xy_gg(1,14.5)
"\s-3\(sq\s+3" at xy_gg(1,14.5)
line  from Lgg to xy_gg(2,14.5); Lgg: Here
"\s-3\(sq\s+3" at xy_gg(2,14.5)
line  from Lgg to xy_gg(3,14.5); Lgg: Here
"\s-3\(sq\s+3" at xy_gg(3,14.5)
box invis wid 0 ht 1*textht "\s-2Time for 4-byte operation\s+2" ljust at xy_gg(3.5,14.5)
Lgg: xy_gg(1,15.5)
"\s-3\(*D\s+3" at xy_gg(1,15.5)
line  from Lgg to xy_gg(2,15.5); Lgg: Here
"\s-3\(*D\s+3" at xy_gg(2,15.5)
line  from Lgg to xy_gg(3,15.5); Lgg: Here
"\s-3\(*D\s+3" at xy_gg(3,15.5)
box invis wid 0 ht 1*textht "\s-2Time for 1Kb operation\s+2" ljust at xy_gg(3.5,15.5)

] 
.PE
.lf 1763
.nr PS 12
.nr VS 14
.ps 12
.vs 14
.F2
Time for updating replicated objects using the distributed update protocol.
.F3
.PP
For small objects, the update costs are lower than the costs for remote
read and write operations (see Section 5.1), because there is no operating
system overhead.
Consequently, for small objects, replication usually reduces
the communication costs.
One exception is an object with a low read/write ratio.
In this case, the overhead of updating the replicas after each write
will negate the gains of replication.
The second exception is an object that is hardly ever accessed by remote
processors.
In both cases, it would be better not to replicate the object at all.
In conclusion, a good strategy for small objects would be to either replicate
a given object everywhere or not at all.
.PP
For larger objects, our multicast protocol is slightly less efficient,
since it uses two large network packets for updating copies.
So, the update costs are higher than the costs
for reading or writing remote objects.
With 10 CPUs, for example, updating a 1Kb object costs 13 msec; reading
a remote 1 Kb object using Amoeba RPC costs approximately 5.5 msec
(including marshalling overhead).
Again, it depends on the read/write ratio whether replication is
advantageous.
If, for example, 3 of the 10 CPUs read the object immediately after it has been
changed, replication is advantageous, since 3*5.5 > 13.
Replication has the additional advantage of allowing more parallelism,
since read operations on multiple local copies can be done simultaneously.
.NH 1
CONCLUSIONS
.PP
The model discussed in this paper allows programmers to define operations
of arbitrary complexity on shared data-objects.
In a loosely-coupled system, the model is implemented
by replicating objects in the local memories of the processors.
We have studied several protocols for keeping all these copies consistent
and we have looked at replication strategies.
.PP
We have described two implementations of the model.
One implementation replicates objects everywhere and
updates copies through a fast multicast protocol.
The other implementation uses only
point-to-point messages. In this case, partial replication
and migration may be useful.
.PP
Which protocol or strategy for replication is most efficient depends on
many factors, such as the costs of the update protocols, the size of
the object and the parameters of the operations, and the read/write
pattern of the application.
In the future we intend to do a more detailed analysis of our protocols
and strategies, using a large set of user applications.
Also, we will look at the differences and similarities between protocols
for replication and coherence protocols for CPU caches\*(<,\*([.Eggers and Katz 1989; Owicki and Agarwal 1989\*(.]\*(>,
.]< 21
.]< 22
file caches\*(<,\*([.Noe et al. 1985; Morris et al. 1986; Ousterhout et al. 1988\*(.]\*(>,
.]< 23
.]< 24
.]< 25
and distributed database systems\*(<.\*([.Bernstein and Goodman 1981\*(.]\*(>.
.]< 13
Based on this analysis, we will try to improve our implementations.
.PP
Our model has several advantages over other models based on
logically shared data, such as Shared Virtual Memory and Linda's Tuple Space\*(<.\*([.Ahuja et al. 1986\*(.]\*(>.
.]< 26
Our model provides a higher level of abstraction and, in many cases,
is more efficient.
.PP
The SVM, for example, can only invalidate but not update copies of data.
Also, the SVM will perform very poorly if processes on many
different processors repeatedly write on the same page.
This situation arises if multiple processors write the same variable, or if
they write different variables placed on the same page.
.PP
Linda's Tuple Space provides only a fixed number of low-level operations
on shared data (tuples)\*(<.\*([.Kaashoek et al. 1989a\*(.]\*(>.
.]< 27
Logical operations on shared data structures frequently
consist of several low-level operations, each of which can require
physical communication.
In our model, the programmer can define a single high-level operation
that does the job at less communication costs.
.NH 1
REFERENCES
.in 0.3i
.nr [W \w'10'
.LP
.]<
.ds [F Ahuja et al. 1986
.]-
.ds [T Linda and Friends
.ds [A \*([(A\*()]huja, S.
.as [A ", \*([(C\*()]arriero, N.
.as [A ", and \*([(G\*()]elernter, D.
.ds [J IEEE Computer
.ds [V 19
.ds [N 8
.ds [P 26-34
.nr [P 1
.nr [T 0
.nr [A 0
.nr [O 0
.ds [D \*(m8 1986
.ds [L 17
.][ 1 journal-article
.ds [F Bal
.]-
.ds [T Programming Distributed Systems
.ds [A \*([(B\*()]al, H. E.
.ds [I Silicon Press
.ds [C Summit, NJ
.nr [T 0
.nr [A 0
.nr [O 0
.ds [D 1990 (a minor revision of the author's Ph.D. thesis ``The Shared Data-Object Model as a Paradigm for Programming Distributed Systems'')
.ds [L 3, 14
.][ 2 book
.ds [F Bal et al. 1989a
.]-
.ds [T A Distributed Implementation of the Shared Data-object Model
.ds [A \*([(B\*()]al, H. E.
.as [A ", \*([(K\*()]aashoek, M. F.
.as [A ", and \*([(T\*()]anenbaum, A. S.
.ds [J First USENIX/SERC Workshop on Experiences with Building Distributed and Multiprocessor Systems
.ds [C Ft. Lauderdale, FL.
.ds [P 1-19
.nr [P 1
.nr [T 0
.nr [A 0
.nr [O 0
.ds [D \*(ma 1989a
.ds [L 3
.][ 1 journal-article
.ds [F Bal et al. 1990
.]-
.ds [T Experience with Distributed Programming in Orca
.ds [A \*([(B\*()]al, H. E.
.as [A ", \*([(K\*()]aashoek, M. F.
.as [A ", and \*([(T\*()]anenbaum, A. S.
.ds [J Proc. IEEE CS 1990 Int. Conf. on Computer Languages
.ds [C New Orleans, LA
.nr [T 0
.nr [A 0
.nr [O 0
.ds [D \*(m3 1990
.ds [L 3, 14
.][ 1 journal-article
.ds [F Bal et al. 1989b
.]-
.ds [T Programming Languages for Distributed Computing Systems
.ds [A \*([(B\*()]al, H. E.
.as [A ", \*([(S\*()]teiner, J. G.
.as [A ", and \*([(T\*()]anenbaum, A. S.
.ds [J ACM Computing Surveys
.ds [V 21
.ds [N 3
.ds [P 261-322
.nr [P 1
.nr [T 0
.nr [A 0
.nr [O 0
.ds [D \*(m9 1989b
.ds [L 3
.][ 1 journal-article
.ds [F Bal and Tanenbaum 1988
.]-
.ds [T Distributed Programming with Shared Data
.ds [A \*([(B\*()]al, H. E.
.as [A " and \*([(T\*()]anenbaum, A. S.
.ds [J Proc. IEEE CS 1988 Int. Conf. on Computer Languages
.ds [C Miami, Fl.
.ds [P 82-91
.nr [P 1
.nr [T 0
.nr [A 0
.nr [O 0
.ds [D \*(ma 1988
.ds [L 3, 9
.][ 1 journal-article
.ds [F Bernstein and Goodman 1981
.]-
.ds [T Concurrency Control in Distributed Database Systems
.ds [A \*([(B\*()]ernstein, P. A.
.as [A " and \*([(G\*()]oodman, N.
.ds [J ACM Computing Surveys
.ds [V 13
.ds [N 2
.ds [P 185-221
.nr [P 1
.nr [T 0
.nr [A 0
.nr [O 0
.ds [D \*(m6 1981
.ds [L 4, 17
.][ 1 journal-article
.ds [F Bisiani and Forin 1987
.]-
.ds [T Architectural Support for Multilanguage Parallel Programming on
.as [T " Heterogenous Systems
.ds [A \*([(B\*()]isiani, R.
.as [A " and \*([(F\*()]orin, A.
.ds [J Proc. 2nd Int. Conf. on Architectural Support for Programming Languages and Operating Systems
.ds [C Palo Alto, Calif.
.ds [P 21-30
.nr [P 1
.nr [T 0
.nr [A 0
.nr [O 0
.ds [D \*(ma 1987
.ds [L 2
.][ 1 journal-article
.ds [F Carriero and Gelernter 1986
.]-
.ds [T The S/Net's Linda Kernel
.ds [A \*([(C\*()]arriero, N.
.as [A " and \*([(G\*()]elernter, D.
.ds [J ACM Trans. Comp. Syst.
.ds [V 4
.ds [N 2
.ds [P 110-129
.nr [P 1
.nr [T 0
.nr [A 0
.nr [O 0
.ds [D \*(m5 1986
.ds [L 2
.][ 1 journal-article
.ds [F Cheriton 1985
.]-
.ds [T Preliminary Thoughts on Problem-oriented Shared Memory:
.as [T " A Decentralized Approach to Distributed Systems
.ds [A \*([(C\*()]heriton, D. R.
.ds [J ACM Operating Systems Review
.ds [V 19
.ds [N 4
.ds [P 26-33
.nr [P 1
.nr [T 0
.nr [A 0
.nr [O 0
.ds [D \*(ma 1985
.ds [L 2
.][ 1 journal-article
.ds [F Davidson et al. 1985
.]-
.ds [T Consistency in Partitioned Networks
.ds [A \*([(D\*()]avidson, S. B.
.as [A ", \*([(G\*()]arcia-Molina, H.
.as [A ", and \*([(S\*()]keen, D.
.ds [J ACM Computing Surveys
.ds [V 17
.ds [N 3
.ds [P 341-370
.nr [P 1
.nr [T 0
.nr [A 0
.nr [O 0
.ds [D \*(m9 1985
.ds [L 4
.][ 1 journal-article
.ds [F Eggers and Katz 1989
.]-
.ds [T A Characterization of Sharing in Parallel Programs and Its Application to Coherency Protocol Evaluation
.ds [A \*([(E\*()]ggers, S. J.
.as [A " and \*([(K\*()]atz, R. H.
.ds [J 15th Int. Symp. on Computer Architecture
.ds [C Israel
.ds [P 373-382
.nr [P 1
.nr [T 0
.nr [A 0
.nr [O 0
.ds [D \*(m6 1989
.ds [L 17
.][ 1 journal-article
.ds [F Eswaran et al. 1976
.]-
.ds [T The Notions of Consistency and Predicate Locks in a Database System
.ds [A \*([(E\*()]swaran, K. P.
.as [A ", \*([(G\*()]ray, J. N.
.as [A ", \*([(L\*()]orie, R. A.
.as [A ", and \*([(T\*()]raiger, I. L.
.ds [J Commun. ACM
.ds [V 19
.ds [N 11
.ds [P 624-633
.nr [P 1
.nr [T 0
.nr [A 0
.nr [O 0
.ds [D \*(mb 1976
.ds [L 3
.][ 1 journal-article
.ds [F Fleisch and Popek 1989
.]-
.ds [T Mirage: A Coherent Distributed Shared Memory Design
.ds [A \*([(F\*()]leisch, B.
.as [A " and \*([(P\*()]opek, G.
.ds [J Proc. 12th Symp. Operating System Principles
.ds [P 211-223
.nr [P 1
.ds [I ACM SIGOPS
.ds [C Litchfield, AZ
.nr [T 0
.nr [A 0
.nr [O 0
.ds [D \*(mc 1989
.ds [L 2
.][ 1 journal-article
.ds [F Gifford 1979
.]-
.ds [T Weighted Voting for Replicated Data
.ds [A \*([(G\*()]ifford, D. K.
.ds [J Proc. 7th Symp. Operating Systems Principles
.ds [I ACM SIGOPS
.ds [C Pacific Grove, CA
.ds [P 150-162
.nr [P 1
.nr [T 0
.nr [A 0
.nr [O 0
.ds [D \*(mc 1979
.ds [L 4
.][ 1 journal-article
.ds [F Joseph and Birman 1987
.]-
.ds [T Low Cost Management of Replicated Data in Fault-Tolerant Distributed Systems
.ds [A \*([(J\*()]oseph, T. A.
.as [A " and \*([(B\*()]irman, K. P.
.ds [J ACM Trans. Comp. Syst.
.ds [V 4
.ds [N 1
.nr [T 0
.nr [A 0
.nr [O 0
.ds [D \*(m2 1987
.ds [L 4
.][ 1 journal-article
.ds [F Kaashoek et al. 1989a
.]-
.ds [T Experience with the Distributed Data Structure Paradigm in Linda
.ds [A \*([(K\*()]aashoek, M. F.
.as [A ", \*([(B\*()]al, H. E.
.as [A ", and \*([(T\*()]anenbaum, A. S.
.ds [J First USENIX/SERC Workshop on Experiences with Building Distributed and Multiprocessor Systems
.ds [C Ft. Lauderdale, FL.
.ds [P 175-191
.nr [P 1
.nr [T 0
.nr [A 0
.nr [O 0
.ds [D \*(ma 1989a
.ds [L 18
.][ 1 journal-article
.ds [F Kaashoek et al. 1989b
.]-
.ds [T An Efficient Reliable Broadcast Protocol
.ds [A \*([(K\*()]aashoek, M. F.
.as [A ", \*([(T\*()]anenbaum, A. S.
.as [A ", \*([(F\*()]lynn~Hummel, S.
.as [A ", and \*([(B\*()]al, H. E.
.ds [J ACM Operating Systems Review
.ds [V 23
.ds [N 4
.ds [P 5-20
.nr [P 1
.nr [T 0
.nr [A 0
.nr [O 0
.ds [D \*(ma 1989b
.ds [L 11, 12
.][ 1 journal-article
.ds [F Lamport 1978
.]-
.ds [T Time, Clocks, and the Ordering of Events in a Distributed System
.ds [A \*([(L\*()]amport, L.
.ds [J Commun. ACM
.ds [V 21
.ds [N 7
.ds [P 558-565
.nr [P 1
.nr [T 0
.nr [A 0
.nr [O 0
.ds [D \*(m7 1978
.ds [L 13
.][ 1 journal-article
.ds [F Li 1988
.]-
.ds [T IVY: A Shared Virtual Memory System for Parallel Computing
.ds [A \*([(L\*()]i, K.
.ds [J Proc. 1988 Int. Conf. Parallel Processing (Vol. II)
.ds [C St. Charles, Ill.
.ds [P 94-101
.nr [P 1
.nr [T 0
.nr [A 0
.nr [O 0
.ds [D \*(m8 1988
.ds [L 2
.][ 1 journal-article
.ds [F Li and Hudak 1989
.]-
.ds [T Memory Coherence in Shared Virtual Memory Systems
.ds [A \*([(L\*()]i, K.
.as [A " and \*([(H\*()]udak, P.
.ds [J ACM Trans. Comp. Syst.
.ds [V 7
.ds [N 4
.nr [T 0
.nr [A 0
.nr [O 0
.ds [D \*(mb 1989
.ds [L 6
.][ 1 journal-article
.ds [F Morris et al. 1986
.]-
.ds [T Andrew: A Distributed Personal Computing Environment
.ds [A \*([(M\*()]orris, J. H.
.as [A ", \*([(S\*()]atyanarayan, M.
.as [A ", \*([(C\*()]onner, M. H.
.as [A ", \*([(H\*()]oward, J. H.
.as [A ", \*([(R\*()]osenthal, D. S. H.
.as [A ", and \*([(S\*()]mith, F. D.
.ds [J Commun. ACM
.ds [V 29
.ds [N 3
.ds [P 184-201
.nr [P 1
.nr [T 0
.nr [A 0
.nr [O 0
.ds [D \*(m3 1986
.ds [L 17
.][ 1 journal-article
.ds [F Mullender and Tanenbaum 1986
.]-
.ds [T The Design of a Capability-Based Distributed Operating System
.ds [A \*([(M\*()]ullender, S. J.
.as [A " and \*([(T\*()]anenbaum, A. S.
.ds [J The Computer Journal
.ds [V 29
.ds [N 4
.ds [P 289-300
.nr [P 1
.nr [T 0
.nr [A 0
.nr [O 0
.ds [D \*(m3 1986
.ds [L 9
.][ 1 journal-article
.ds [F Noe et al. 1985
.]-
.ds [T Replication in Distributed Systems: The Eden Experience
.ds [A \*([(N\*()]oe, J. D.
.as [A ", \*([(P\*()]roudfoot, A. B.
.as [A ", and \*([(P\*()]u, C.
.ds [R TR-85-08-06
.ds [I Dept. of Computer Science, University of Washington
.ds [C Seattle
.nr [T 0
.nr [A 0
.nr [O 0
.ds [D \*(m9 1985
.ds [L 17
.][ 4 tech-report
.ds [F Ousterhout et al. 1988
.]-
.ds [T The Sprite Network Operating System
.ds [A \*([(O\*()]usterhout, J. K.
.as [A ", \*([(C\*()]herenson, A. R.
.as [A ", \*([(D\*()]ouglis, F.
.as [A ", \*([(N\*()]elson, M. N.
.as [A ", and \*([(W\*()]elch, B. B.
.ds [J IEEE Computer
.ds [P 23-37
.nr [P 1
.ds [V 21
.ds [N 2
.nr [T 0
.nr [A 0
.nr [O 0
.ds [D \*(m2 1988
.ds [L 17
.][ 1 journal-article
.ds [F Owicki and Agarwal 1989
.]-
.ds [T Evaluating the Performance of Software Cache Coherence
.ds [A \*([(O\*()]wicki, S.
.as [A " and \*([(A\*()]garwal, A.
.ds [J Proc. 3rd Int. Conf. on Architectural Support for Programming Languages and Operating Systems
.ds [P 230-242
.nr [P 1
.ds [C Boston, MA
.nr [T 0
.nr [A 0
.nr [O 0
.ds [D \*(m4 1989
.ds [L 17
.][ 1 journal-article
.ds [F Stenstr\(:om et al. 1988
.]-
.ds [T Shared Data Structures in a Distributed System - Performance
.as [T " Evaluation and Practical Considerations
.ds [A \*([(S\*()]tenstr\(:om, P.
.as [A ", \*([(V\*()]rsalovic, D.
.as [A ", and \*([(S\*()]egall, Z.
.ds [J Proc. Int. Sem. on Performance of Distributed and Parallel Systems
.ds [C Kyoto, Japan
.ds [P 15-29
.nr [P 1
.nr [T 0
.nr [A 0
.nr [O 0
.ds [D \*(mc 1988
.ds [L 2
.][ 1 journal-article
.ds [F Van Renesse and Tanenbaum 1988
.]-
.ds [T Voting with Ghosts
.ds [A \*([(V\*()]an Renesse, R.
.as [A " and \*([(T\*()]anenbaum, A. S.
.ds [J Proc. of the 8th Int. Conf. on Distributed Computing Systems
.ds [C San Jose, CA
.ds [P 456-462
.nr [P 1
.nr [T 0
.nr [A 0
.nr [O 0
.ds [D \*(m6 1988
.ds [L 4
.][ 1 journal-article
.nr [W \w'10'
.]>
.nr [W \w'10'
