.\" NOTE(review): removing the 'tm' request silences every .tm diagnostic
.\" used later in this file (PG:, WN:, KW:, ]<:) -- presumably intentional;
.\" delete this line to re-enable the diagnostics.
.rm tm
.lf 1 -
.nr Rb 1			\" Rb: enable refer/debug processing (tested below)
.\" m1..mc: month-name strings indexed 1-9,a-c -- presumably for date
.\" formatting by some caller; TODO confirm where they are interpolated.
.ds m1 Jan.
.ds m2 Feb.
.ds m3 Mar.
.ds m4 Apr.
.ds m5 May
.ds m6 June
.ds m7 July
.ds m8 Aug.
.ds m9 Sep.
.ds ma Oct.
.ds mb Nov.
.ds mc Dec.
.\"		Robbert's Dynamite Troff Macros
.\"
.\" Use at your own risk.  These will normally be used next to -ms.  It
.\" redefines LP, PP, IP, SH, NH, FS, KS, KF, KE, bp (!), refer macros,
.\" and page format.  Lines are aligned on vertical spacing for a perfect
.\" page mirror.  It attempts to remove widows and to balance the pages.
.\" Figure macros are available through .F1 <figure> .F2 <trailer> .F3.
.\" There's no extra spacing between paragraphs, so you can use .LP any-
.\" time to align on vertical spacing or to reset the formatting parameters
.\" (point size, ...).  .KW keyword specifies a keyword, .KW flushes them.
.\" Use my refb if you want this to work.  If you look through this file,
.\" you may find some handy definitions that you can use as well.  By the
.\" way, if there's no .TL, .NH begins a new chapter.
.\"		Good luck, brave person.
.\"
.\"
.\"	=====> Ds is like ds, but then accepts real arguments
.\"
.de Ds	\"	--- define string ---
.\" $1 = string name, $2 = value; the leading quote lets $2 keep
.\" leading blanks, and argument expansion happens at call time.
.ds \\$1 "\\$2
..
.de As	\"	--- append to string ---
.\" Same as Ds but appends $2 to string $1.
.as \\$1 "\\$2
..
.\"	=====> page formatting macros <=====
.\"
.de Al	\"	--- alignment macro ---
.\" Break, then space down so the next baseline falls on a multiple of
.\" the vertical spacing measured from the top-of-text position T[,
.\" producing the "perfect page mirror" promised in the header comment.
.\" Skipped when PD (interparagraph distance) is nonzero, since extra
.\" paragraph space already breaks the alignment grid.
.br
.if !\\n(PD \{\
.   nr VV \\n(VS-(\\n(VS/11)	\" vertical spacing minus a little bit
.   sp \\n(VVu-((\\n(.du-\\n(T[u+\\n(VVu)%\\n(VSu)
.\}
..
.de T]	\"	--- bottom of page trap macro ---
.\" Sprung by the trap planted in T[.  In two-column mode (C% odd) it
.\" only moves to the second column; otherwise it emits footnotes, the
.\" running header/footer, and starts a new page via the renamed bp (b[).
.ev 1				\" switch environment to save line buffer
.ie \\n(C%%2 \{\
.   nr C% +1			\" increase column counter
.   po \\n(POu+\\n(LLu+1c	\" new page offset
.   sp |\\n(Tpu			\" to top of right column
.   ev
.\}
.el \{\
.   ch T]			\" remove trap immediately
.   if \\n(C% .nr C% +1		\" if counting columns, count columns
.   po \\n(POu			\" set page offset
.   ie e .nr Bl \\n(nl		\" save position of left page
.   el .if \\n(Rb&\\n(Tc&((\\n(nl-\\n(Bl>0.5v):(\\n(Bl-\\n(nl>0.5v)) \
.      tm WN:balancing problem (\\n(nl != \\n(Bl)
.\" NOTE(review): ".\{" below differs from the "\{\" style used everywhere
.\" else in this file; it appears to rely on \{ being recognized at the
.\" start of the conditional body -- confirm with the target troff.
.   if \\n(Fd .\{
.      sp |\\n(Plu-\\n(Fdu	\" to bottom of page
.      Fd			\" output footnotes
.      rm Fd			\" remove footnotes
.      nr Fd 0			\" clear footnote size
.   \}
.   nr Tl 0
.   if e .if \\n(nl+1v<=\\n(Pl .nr Tl 1	\" left page was shortened
.   if !'\\*(Pf'no' \{\
.      ie \\n(Tc \{\
.         sp |2.4c		\" some room at the top of the page
.         ie \\n(Pp  .tl ''\\s-1- \\n% -\\s+1''		\" paper header
.         el .ie o   .tl '\\*(S2'\\*(T2'\\f3\\n%\\fP'	\" right page header
.         el	     .tl '\\f3\\n%\\fP'\\*(T1'\\*(S1'	\" left page header
.      \}
.      el \{\
.         sp |\\n(.pu-2c	\" bottom of page
.         if !\\n(Pp .tl ''\\s-1- \\n% -\\s+1''		\" paper header
.      \}
.   \}
.   nr Tc 1			\" page number at top of page
.   ev				\" restore environment
'   b[				\" skip to next page, springing T[
.\}
..
.de E]	\"	--- end of input ---
.\" Installed via .em: flushes the last paragraph and any floating
.\" keeps still queued when input runs out.
.P]				\" end of last paragraph
.nr Kf 1			\" flush floating keeps
.if \\n(Kr \c
..
.de Bt	\"	--- change bottom of page trap ---
.\" $1 = new trap position; spring the trap immediately if the new
.\" position is already behind the current output position.
.nr Bt \\$1			\" calculate new page trap
.ie \\n(Bt<=\\n(nl .T]		\" if before current pos, spring now
.el .ch T] \\n(Btu		\" set new page trap
..
.nr T| 0			\" busy flag
.de T[	\"	--- top of page macro ---
.\" Planted at vertical position 0 by the init code below: sets the
.\" bottom trap, emits leftover footnotes, releases floating keeps,
.\" and records the text start position in Tp for column balancing.
.if \\n(Rb .tm PG:\\n%
.nr Bt \\n(Pl-1v+1		\" bottom of page trap position
.wh \\n(Btu T]			\" set bottom of page trap
.po \\n(POu			\" page offset
.nr Fc 0 1			\" reset footnote count
.if \\n(Fe .Fa			\" append leftover footnote
.ev 1				\" switch environment to save line buffer
.nr T[ 2.4c+1v+0.7c		\" size of page header
.sp |\\n(T[u			\" end of header
.if \\n(Kr .Kr			\" release some floating keeps
.Al				\" align in case of figures
.ev				\" restore environment
.nr Tp \\n(.d			\" page start
..
.de 2C	\"	--- 2 column output ---
.\" Switch to two-column mode: C% counts columns, line length becomes
.\" half the title length minus a 1c gutter.
.P]
.nr C% 1 1			\" start column counter
.ll (\\n(LTu-1c)/2u		\" calculate line length
.nr LL \\n(.l			\" -ms compatibility
.Al				\" align
.nr Tp \\n(.d			\" new top of page
.P[
..
.de 1C	\"	--- back to 1 column output ---
.\" Undo 2C: full line length, left margin, column counting off.
.P]
.ll \\n(LTu			\" restore line length
.nr LL \\n(.l			\" -ms compatibility
.po \\n(POu			\" restore margin
.nr C% 0			\" stop column count
.P[
..
.\"
.\"	=====> paragraph macros <=====
.\"
.de P[	\"	--- begin paragraph ---
.\" Outside keeps, each paragraph is diverted into Pd so P] can measure
.\" it and decide whether it fits on the current page before output.
.if !\\n(Ks .di Pd		\" divert
..
.de P]	\"	--- end paragraph ---
.ce 0				\" break, turn off centering
.in 0				\" turn off indent
.if !\\n(Ks \{\
.   nr Pm \\n(.u		\" save fill mode
.   nf				\" stop filling
.   di
.   \" diversion ended.  If paragraph doesn't fit, do something special
.   \" if left page was decreased, decrease right page too, else if
.   \" paragraph doesn't fit for but one line, decrease page length
.   if \\n(.t+1v<\\n(dn .if \\n(Tl:(\\n(.t+2v>=\\n(dn) .Bt -1v
.   Pd				\" flush paragraph
.   if \\n(Pm .fi		\" restore fill mode, but don't break
.\}
..
.\"
.\"	=====> footnote macros <=====
.\"
.rm FS FE FJ FK			\" remove -ms footnote stuff
.de FS	\"	--- start footnote ---
.\" Footnote text is collected in diversion Fe (environment 1) until FE.
.ev 1				\" switch environments
.da Fe				\" divert footnote to Fe
.fi
..
.de FE	\"	--- end of footnote ---
.nf				\" break and stop filling
.da
.ev				\" restore environment
.\" If footnote doesn't fit, break here and now.  If it does, append it to
.\" the other macro and move end of page trap up.  If buffering already,
.\" continue buffering.
.nr Fe +\\n(dn			\" calculate new footnote size
.if \\n(Fe=\\n(dn .ie \\n(nl+\\n(.d+1v+\\n(Fd+\\n(Fe>=\\n(Bt .Bt \\n(nl+\\n(.d
.el .Fa				\" footnote still fits
..
.de Fa	\"	--- add footnote to buffer ---
.\" Move the pending footnote (Fe) into the page buffer (Fd), emit a
.\" separator rule before the first footnote, and pull the bottom trap
.\" up to make room.
.ev 1				\" switch environments again
.da Fd				\" add footnote to Fd
.if \\n+(Fc=1 \l'1i'		\" footnote separator on first footnote
.Fe				\" ditto
.br				\" ditto
.da
.ev				\" restore environment
.nr Fd +\\n(dn			\" calculate new footnote size
.Bt \\n(Pl-\\n(Fd-1v		\" calculate new page trap
.rm Fe				\" remove old footnote
.nr Fe 0			\" clear footnote size
..
.\"
.\"	=====> keep macros <=====
.\"
.\" Floating keeps are queued in a linked list built from number
.\" registers: n0..n9 are the "next" links, d<i> the diverted text,
.\" h<i> its height.  Kl = head of free list, Ki/Ko = queue in/out,
.\" -1 plays the role of NIL.
.nr Kl 0			\" free list
.nr n0 1
.nr n1 2
.nr n2 3
.nr n3 4
.nr n4 5
.nr n5 6
.nr n6 7
.nr n7 8
.nr n8 9
.nr n9 (-1)			\" end of free list
.nr Ko (-1)			\" queue in
.nr Ki (-1)			\" queue out
.de Bp	\"	--- begin a new page ---
.\" Only springs the bottom trap when not already at the top of a page.
.if !(\\n(.d=(\\n(T[) .T]	\" if not top of page, go to bottom
..
.de Kg	\"	--- output keep \\$1 with height \\$2
.nr Pm \\n(.u			\" save fill mode
.nf				\" don't fill
.\\$1				\" output keep
.if \\n(Pm .fi			\" restore fill mode
.if \\n(.t<2v .Bp		\" if little room left, begin new page
..
.de KS	\"	--- begin static keep ---
.P]				\" end paragraph
.nr Ks +1			\" mark keep
.di Ks				\" divert keep to Ks
.P[
..
.de KF	\"	--- begin floating keep ---
.P]				\" end paragraph
.nr Ks +1			\" mark keep
.di Kf				\" divert keep to Kf
.P[
..
.de KE	\"	--- end keep ---
.\" Static keeps (diversion Ks) are output at once, starting a new page
.\" if needed.  Floating keeps are output immediately when the queue is
.\" empty and the text fits; otherwise they are enqueued for Kr.
.P]				\" break
.ie '\\n(.z'Ks' \{\
.   di
.   if \\n(dn>=\\n(.t .Bp	\" if it doesn't fit, begin a new page
.   Kg Ks \\n(dn		\" release static keep
.\}
.el \{\
.   di
.   ie (\\n(Ki<0)&(\\n(dn<\\n(.t) .Kg Kf \\n(dn
.   el \{\
.      if \\n(Kl<0 .Kr		\" free list exhausted, flush some entries
.      if \\n(Ki>=0 .nr n\\n(Ki (\\n(Kl)	\" if (Ki != NIL) n[Ki] = Kl
.      nr Ki (\\n(Kl)		\" Ki = Kl
.      nr Kl (\\n(n\\n(Kl)	\" Kl = n[Kl]
.      rn Kf d\\n(Ki		\" d[Ki] = Kf	diversion
.      nr h\\n(Ki (\\n(dn)	\" h[Ki] = dn	height
.      nr n\\n(Ki (-1)		\" n[Ki] = -1	(end of list)
.      if \\n(Ko<0 .nr Ko (\\n(Ki)	\" if (Ko < 0) Ko = Ki
.      nr Kr 1			\" entries to release
.   \}
.\}
.nr Ks -1
.P[				\" start a new paragraph
..
.de Kr	\"	--- release floating keep ---
.\" Dequeue one keep from Ko, output it, return its slot to the free
.\" list, and recurse while the next keep still fits on the page.
.in 0				\" no indentation
.nf				\" no filling
.nr Kr 0			\" don't release while releasing
.Kg d\\n(Ko \\n(h\\n(Ko		\" output it
.fi				\" restore filling
.in				\" restore indentation
.nr Kt \\n(Ko			\" Kt = Ko
.nr Ko (\\n(n\\n(Kt)		\" Ko = n[Kt]	remove from queue
.nr n\\n(Kt (\\n(Kl)		\" n[Kt] = Kl	put on free list
.nr Kl (\\n(Kt)			\" Kl = Kt
.nr Kr (\\n(Ko>=0)		\" Kr = (Ko >= 0)
.ie !\\n(Kr .nr Ki (-1)		\" if Ko < 0 then Ki = end of list
.el .if \\n(h\\n(Ko<\\n(.t .Kr	\" release another one
.if \\n(Kf .T]			\" if flushing, begin new page
..
.de KK	\"	--- flush floating keeps ---
.nr Kf 1			\" flush floating keeps
.Bp				\" begin a new page
.nr Kf 0			\" don't flush anymore
..
.\"
.\"	=====> user macros <=====
.\"
.rn bp b[			\" rename begin page request
.de bp	\"	--- begin page for users ---
.\" User-visible .bp: ends the paragraph so the pending diversion is
.\" flushed before the real page break (b[) runs via T].
.P]				\" end paragraph
.T]				\" to bottom of page
.P[				\" begin new paragraph
..
.de B[	\"	--- begin block ---
.\" $1 = diversion name, $2/$3 = names of the registers that B] will
.\" fill with the block's width and height.
.br
.ds Bw \\$2
.ds Bh \\$3
.di \\$1
.nf
..
.de B]	\"	--- end block ---
.fi
.di
.nr \\*(Bw \\n(dl		\" store diversion width in caller's register
.nr \\*(Bh \\n(dn		\" store diversion height in caller's register
..
.de B|	\"	--- position block ---
.\" Output diversion $1 at horizontal offset $2 / vertical offset $3,
.\" then return to the saved position.
.nf
.mk B|				\" remember vertical position
.nr Bw \\$2			\" copy x argument
.nr Bh \\$3			\" copy y argument
.in +\\n(Bwu			\" go to horizontal position
.sp \\n(Bhu			\" go to vertical position
.\\$1				\" output block
.in				\" return to horizontal position
.sp |\\n(B|u			\" return to vertical position
.fi
..
.de C[	\"	--- begin centered block ---
.P]				\" end paragraph
.nr Ks +1			\" mark keep
.di Cd				\" divert to Cd
.P[
..
.de C]	\"	--- end centered block ---
.\" Indent by half the leftover width so the widest line is centered.
.P]				\" break
.di
.if \\n(dl<\\n(.l .in (\\n(.lu-\\n(dlu)/2u	\" indent to center
.Kg Cd \\n(dl			\" get diverted text
.in 0				\" no indentation
.nr Ks -1			\" end of keep
.P[				\" begin normal paragraph
..
.de Q[	\"	--- begin quote ---
.C[				\" begin centered block
.nr Ql \\n(.l			\" save line length
.ll \\n(.lu*3u/4u		\" set line length to 3/4 of current ll
..
.de Q]	\"	--- end quote ---
.ll \\n(Qlu			\" restore line length
.C]				\" end centered block
..
.
.de SZ	\"	--- size change ---
.\" $1 is used for both point size and vertical spacing.
.br				\" first break
.ps \\$1			\" change point size
.vs \\$1			\" change vertical spacing
..
.de JR	\"	--- reset indentation ---
.\" Resets the RS/RE indent stack: Jn = current total indent,
.\" Ji = stack depth, J0..J5 = per-level indent sizes.
.nr Jn 0			\" current indent
.nr Ji 0			\" index of indented paragraphs
.nr J0 5n			\" reset indent sizes
.nr J1 5n
.nr J2 5n
.nr J3 5n
.nr J4 5n
.nr J5 5n
..
.de RT	\"	--- reset fonts and such ---
.ps \\n(PS			\" point size
.vs \\n(VS			\" vertical spacing
.ll \\n(LLu			\" line length
.nr Pl 27c+0.5v			\" length of page
.\" NOTE(review): the .ll below repeats the one above -- harmless but
.\" redundant as far as this file shows.
.ll \\n(LLu			\" line length
.ev 1				\" parameters in environment 1 (title)
.ps 12				\" point size
.ll \\n(LLu			\" line length
.lt \\n(LTu			\" title length in environment 1
.ev
.ft 1				\" reset font
..
.de RS	\"	--- increase indent ---
.nr Jn +\\n(J\\n(Ji
.nr Ji +1
..
.de RE	\"	--- decrease indent ---
.nr Ji -1
.nr Jn -\\n(J\\n(Ji
..
.de JP	\"	--- begin unlabeled, indented paragraph ---
.\" Indents one level deeper than the current RS/RE stack; the label
.\" (if any) hangs in the extra Jj space via the negative temp indent.
.P]				\" end paragraph
.if \\n(.t<1v .Bp		\" if not enough room, begin page
.if !(\\n(.d=\\n(Tp) .sp 0.3v 	\" if not top of page, skip some space
.nr Jj \\n(J\\n(Ji		\" increase in indent
.fi				\" start filling
.in \\n(Jnu+\\n(Jju		\" set new indent
.ta \\n(Jju			\" set tab
.ti -\\n(Jju			\" set temporary indent
.P[
..
.de IP	\"	--- begin labeled, indented paragraph ---
.\" $1 = label, optional $2 = indent in ens for the current level.
.if \\n(.$>1 .nr J\\n(Ji \\$2n	\" set indent if specified in ens
.JP				\" do indented paragraph
.RT				\" restore -ms variables
\&\\$1	\c
..
.de QP	\"	--- begin quotation ---
.\" Like IP but also shrinks the line length by the level's indent.
.if \\n(.$>1 .nr J\\n(Ji \\$2n	\" set indent if specified in ens
.ll -\\n(J\\n(Jiu		\" decrease line length
.JP				\" do indented paragraph
.RT				\" restore -ms variables
\&\\$1	\c
..
.de LP	\"	--- begin paragraph ---
.P]				\" end last paragraph
.Al				\" align
.sp \\n(PDu			\" interparagraph spacing
.JR				\" reset indentation
.RT				\" restore -ms variables
.fi				\" start filling
.P[				\" begin next
..
.de PP	\"	--- begin paragraph with temporary indent ---
.P]				\" end last paragraph
.Al				\" align
.sp \\n(PDu			\" interparagraph spacing
.JR				\" reset indentation
.RT				\" restore -ms variables
.fi				\" start filling
.ti +5n				\" temporary indent
.P[				\" begin next
..
.de CH	\"	--- chapter heading ---
.\" Starts a fresh page with a large centered heading; also resets the
.\" figure counter and the page-header strings.
.P]				\" break, start chapter
.KK				\" flush floating keeps
.Bp				\" begin page
.nr Tc 0			\" page number at bottom of page
.nr Fi 1 1			\" current figure
.RT				\" restore -ms variables
.ps 18				\" set point size
.vs 24				\" set vertical spacing
.ce 1000			\" center all lines
.nr Hi 0			\" header index 0
.rm HS				\" remove header string
.rm Rc				\" do not count pages
.fi				\" filling
.P[				\" start a new paragraph
..
.de SH	\"	--- section heading ---
.P]				\" end last paragraph
.Al				\" align
.if \\n(.t<5v .Bp		\" if not enough room, begin new page
.if !(\\n(.d=\\n(Tp) .sp 	\" if not top of page, skip some space
.RT				\" restore -ms variables
.ft 3				\" bold font
.nr Hi 0			\" header index 0
.rm HS				\" remove header string
.fi				\" start filling
.P[				\" start a new paragraph
..
.de NH	\"	--- numbered section header ---
.\" $1 = heading level (default 1); H1..H5 hold the section counters
.\" and HS is built as the dotted section number.  With no .TL seen
.\" (Pp = 0), a level-1 NH opens a new chapter via CH.
.ie \\n(.$=0 .nr Ha 1		\" if no argument, Ha = 1
.el .nr Ha \\$1			\" Ha is argument
.if \\n(Ha<1 .nr H1 0		\" reset subsection numbers
.if \\n(Ha<2 .nr H2 0
.if \\n(Ha<3 .nr H3 0
.if \\n(Ha<4 .nr H4 0
.if \\n(Ha<5 .nr H5 0
.if \\n(Ha=0 .nr Ha 1		\" .NH 0 is like .NH 1, but then resets
.nr H\\n(Ha +1			\" H[Ha]++
.ie (\\n(Pp=0)&(\\n(Ha=1) \{\
.   CH
.   if !\\n(Pp .Ds Fn "\\n(H1\\*(Fs1"	\" reset next figure string
.   ds HS \\n(H1
\\s+6\\*(HS\\s-6
.   sp 0.5
.\}
.el \{\
.   SH
.   ds HS \\n(H1
.   if \\n(Ha>1 .as HS .\\n(H2
.   if \\n(Ha>2 .as HS .\\n(H3
.   if \\n(Ha>3 .as HS .\\n(H4
.   if \\n(Ha>4 .as HS .\\n(H5
\\*(HS.
.\}
.ds H0 \\*(HS.
.nr Hi \\n(Ha			\" header index
..
.de TL	\"	--- title of paper ---
.\" Setting Pp = 1 switches the package to "paper" mode (affects NH,
.\" page headers, and figure numbering).
.nr Pp 1			\" mark it's a paper
.CH
.Ds Fn "1"			\" next figure string
.ps -2
..
.de AU	\"	--- authors ---
.sp
.ft 2				\" italic for author names
.ps \\n(PS
.vs \\n(VS
..
.de AI	\"	--- author's institution ---
.sp
.ft 1
.ps \\n(PS
.vs \\n(VS
..
.de AB	\"	--- begin abstract ---
.\" Optional argument suppresses the ABSTRACT heading.
.AI
.if !\\n(.$ ABSTRACT
.sp
.ce 0
.Q[
..
.de AE	\"	--- end abstract ---
.Q]
.sp
..
.de PS	\"	--- start picture ---
.\" $1 is height, $2 is width in units
..
.de PE	\"	--- end of picture ---
..
.de UX	\"	--- UNIX macro ---
.\" Prints "UNIX" ($2 prefix, $1 suffix); the first use also attaches
.\" a dagger footnote with the trademark notice (U1 marks it done).
.ie \\n(U1 \\$2\s-1UNIX\s0\\$1
.el \{\
\\$2\s-1UNIX\\s0\\$1\(dg
.   FS
\(dg UNIX is a Registered Trademark of AT&T in the USA and other countries.
.   FE
.nr U1 1
.\}
..
.de IX	\"	--- add to index, update page headers ---
.\" $1 = index/header text.  Depending on the current heading level Hi,
.\" updates the running-title strings T1/T2 (and chapter/section tags
.\" S1/S2), then appends an .In call to the index diversion IO.
.LP				\" end header, define page headers
.if \\n(Hi=0 \{\
.   ds T1 \\$1
.   ds T2 \\$1
.   rm S1 S2			\" no chapter or section number
.\}
.if \\n(Hi=1 \{\
.   ds T1 \\$1
.   ds S1 \s-2CHAP.\& \\*(HS\s+2
.   ds T2 \\$1
.   ds S2 \\*(S1
.\}
.if \\n(Hi=2 \{\
.   ds T2 \\$1
.   ds S2 \s-2SEC.\& \\*(HS\s+2
.\}
.da IO				\" divert to index
\\!.In \\n(Hi "\\*(HS" "\\$1" \\n%
.da
..
.de In	\"	--- output index ---
.\" Replayed from the IO diversion: $1 = level, $2 = section number,
.\" $3 = text, $4 = page number (right-aligned with \h'|...').
.P]				\" end of paragraph
.if !(\\n(.d=\\n(Tp) .ie \\$1<2 .sp 1.7
.el .if \\$1=2 .sp 0.3
.in 0
.ad l				\" adjust only left side
.ll -5n				\" decrease line length
.nr J0 0
.P[
.ie \\$1 \{\
.   nr In \\$1-1
.   nr J\\$1 \\n(J\\n(In+\\w'\\$2'+3.5n
.   in \\n(J\\$1u		\" set indent
.   ta \\w'\\$2'u+3.5n
.   ti -\\w'\\$2'u+3.5n
.   ie \\$1<2 \\s+3\\f3\\$2	\\$3\\f1\\s-3\&\c
.   el \\$2	\\$3\&\c
.\}
.el \\s+3\\f3\\$3\\f1\\s-3\&\c
.ll +5n				\" reset line length
.nr In \\n(.l-\w'\\$4'
\\\\h'|\\n(Inu'\\$4
.in 0				\" break, reset indent
.ad b				\" adjust both sides, end of diversion
..
.de IH	\"	--- index header ---
.\" $1 = heading level (0 means unnumbered chapter), $2 = heading text.
.ie \\$1 .NH \\$1		\" start a new header
.el .CH				\" start a new, unindexed, chapter
\\$2
.IX "\\$2"			\" add header to index
..
.ds Fs .			\" separator between chapter and figure number
.de F1	\"	--- begin figure ---
.\" Figures float (KF) and are centered; Fp = this figure's label,
.\" Fn = the next one's.  In paper mode figures number 1,2,...;
.\" otherwise they are prefixed by the chapter number.
.ds Fp \\*(Fn
.ie \\n(Pp .ds Fn \\n+(Fi
.el .ds Fn \\n(H1\\*(Fs\\n+(Fi
.KF				\" floating keep
.sp 0.5c
.C[				\" begin centered block
..
.de F2	\"	--- end of figure, begin label ---
.C]				\" end centered block
.sp 0.5c
.Q[
.fi
.ps -2
.vs -2
\\fBFig.\ \\*(Fp.\\fP
.ft 1
..
.de F3	\"	--- end of figure label ---
.br
.ft
.vs
.ps
.Q]
.sp 0.8				\" leave some room under the figure
.KE				\" end floating keep
..
.de KW	\"	--- keyword ---
.\" With an argument: report the keyword via .tm (or re-emit it
.\" transparently from inside a diversion).  Without: flush point.
.ie \\n(.$ \{\
.   ie '\\n(.z'' .tm KW:\\$1
.   el \\!.KW "\\$1"
.\}
.el \{\
.   P]
.   tm KW
.   P[
.\}
..
.de Kx	\"	--- start list of keywords ---
.P]
.if !(\\n(.d=\\n(Tp) .sp	\" if not top of page, skip some space
.P[
..
.de Kw	\"	--- output keyword ---
.\" $1 = keyword, $2 = page list; hanging indent of 1c.
.LP
.in 1c
.ti -1c
\&\\$1 \\$2
..
.nr Di 1 1			\" current definition
.Ds Dn "\\*(Dx\\n(Di"
.de D[	\"	--- begin definition ---
.\" Dp = this definition's number, Dn = the next one's; Dx is a
.\" caller-supplied prefix (e.g. chapter number) -- TODO confirm.
.sp 0.5c
.Ds Dp "\\*(Dx\\n(Di"
.Ds Dn "\\*(Dx\\n+(Di"
\\fBDefinition\ \\*(Dp.\ \ \\$1\\fP\ \ \c
..
.de D]	\"	--- end of definition ---
.sp 0.3c
..
.\"
.\"	=====> refer macros <=====
.\"
.rm <. <,			\" drop -ms pre-reference strings
.Ds >. "."		\" reference ends with period
.Ds >, ","		\" reference ends with comma
.Ds [. " \\f1["		\" start of reference
.Ds .] "]\\fP"		\" end of reference
.de ]<	\"	--- references ---
.\" With Rb set, behaves like KW: reports reference numbers via .tm
.\" (or transparently from a diversion) for the external refb tool.
.if \\n(Rb \{\
.   ie \\n(.$ \{\
.      ie '\\n(.z'' .tm ]<:\\$1
.      el \\!.]< "\\$1"
.   \}
.   el \{\
.      P]
.      tm ]<
.      P[
.   \}
.\}
..
.de ]>
..
.de ]-	\"	--- remove garbage before next definition ---
.rm [A [B [C [D [E [G [H [I [J [M [N [O [P [Q [R [S [T [V ]. ],
..
.de RR	\"	--- add comma + argument to reference ---
.\" Emits the pending separator string ], then $1; resets the
.\" end-of-reference (].) and separator (],) strings for the next field.
\\*(],\\$1\c
.ds ]. .
.ds ], , \&
..
.de Rc	\"	--- cited on pages ($1: all; $2: first; ...) ---
.ie \\n(.$>2 Cited on pages \\$1.
.el Cited on page \\$1.
..
.de ][	\"	--- new reference ---
.\" Formats one refer record: the [X strings hold the record's fields
.\" (A author, T title, J journal, ...), emitted in citation order.
.KS				\" keep together
.JP [\\*([F]			\" start indented paragraph
.if !\\*([H .RR "\\*([H"
.if !\\*([A .RR "\\*([A"
.if !\\*([Q .RR "\\*([Q"
.if !\\*([T \{\
\\*(],\(l"\\*([T\c
.ds ]. .\(r"
.ds ], ,\(r" \&
.\}
.if !\\*([R .RR "\\*([R"
.if !\\*([M .RR "\\*([M"
.if !\\*([J .RR "\\f2\\*([J\\fP"
.if !\\*([V .RR "Vol.\& \\*([V"
.if !\\*([N .RR "No.\& \\*([N"
.if !\\*([P .ie \\n([P>0 .RR "pp.\& \\*([P"
.el .RR "p.\& \\*([P"
.if !\\*([B .RR "in \\f2\\*([B\\fP"
.if !\\*([E .RR "ed.\& \\*([E"
.if !\\*([S .RR "\\*([S"
.if !\\*([I .RR "\\*([I"
.if !\\*([C .RR "\\*([C"
.if !\\*([G .RR "Gov't.\& ordering no.\& \\*([G"
.if !\\*([D .RR "\\*([D"
\&\\*(].
.if !\\*([L .Rc "\\*([L" \\*([L
.if !\\*([O \&\\*([O
.KE
..
.\"
.\"	=====> accents <=====
.\"
.\" Accent strings: each centers a diacritic over the preceding letter
.\" using width arithmetic (\w) and zero-motion (\z).  The widths are
.\" computed against a representative letter (e, u, a, c).
.ds - \(em
.ds ' \h'\w'e'u-\w'\(aa'u/2+.06m'\z\(aa\h'-\w'e'u+\w'\(aa'u/2-.06m'
.ds ` \h'\w'e'u-\w'\(ga'u/2+.06m'\z\(ga\h'-\w'e'u+\w'\(ga'u/2-.06m'
.ds : \h'\w'u'u-\w'\(ad'u/2+.06m'\z\(ad\h'-\w'u'u+\w'\(ad'u/2-.06m'
.ds ^ \h'\w'a'u-\w'^'u/2+.06m'\z^\h'-\w'a'u+\w'^'u/2-.06m'
.ds ~ \h'\w'a'u-\w'~'u/2+.06m'\z~\h'-\w'a'u+\w'~'u/2-.06m'
.ds C \h'\w'e'u-\w'\(ah'u/2+.06m'\z\(ah\h'-\w'e'u+\w'\(ah'u/2-.06m'
.ds v \h'\w'e'u-\w'\(ah'u/2+.06m'\z\(ah\h'-\w'e'u+\w'\(ah'u/2-.06m'
.ds , \h'\w'c'u-\w'\(ac'u/2'\z\(ac\h'-\w'c'u+\w'\(ac'u/2'
.ds -- \*-
.ds q[ \(l"
.ds ]q \(r"
.\"
.\"	=====> user settable definitions <=====
.\"
.cs 5 20u			\" font 5, constant width
.nr PS 12			\" point size
.nr VS 14			\" vertical spacing
.nr LL 6.5i			\" line length
.nr FL 15c			\" footnote length (no effect currently)
.nr LT 15c			\" title length
.nr PO \n(.o			\" page offset
.nr PD 0			\" interparagraph spacing
.\"
.\"	=====> -ms init <=====
.\"
.nr FM 1			\" ms hack: remove page traps
.ch FO				\" remove bottom of page trap
.ch FX				\" remove footnote trap
.rm PT BT			\" remove other traps
.nr YE 1			\" causes break in .EQ
.\"
.\"	=====> initialization <=====
.\"
.RT				\" set these variables
.JR				\" reset indentation
.hw packet re-start trans-par-ent trans-par-ently trans-par-ency work-station trans-action time-stamp
.wh 0 T[			\" top of page macro
.em E]				\" end of text macro
.P[				\" begin paragraph
.tr ~				\" ~ translated to unpaddable space in input text
.TL
A DISTRIBUTED IMPLEMENTATION OF THE
.br
SHARED DATA-OBJECT MODEL
.br
.AU
Henri E. Bal *
M. Frans Kaashoek
Andrew S. Tanenbaum
.AI
Dept. of Mathematics and Computer Science
Vrije Universiteit
Amsterdam, The Netherlands
Email: bal@cs.vu.nl
.nr PS 10
.nr VS 12
.AB
.PP
The shared data-object model is designed to ease the implementation
of parallel applications on loosely coupled distributed systems.
Unlike most other models for distributed programming (e.g., RPC),
the shared data-object model allows processes on different machines
to share data.
Such data are encapsulated in data-objects, which are instances
of user-defined abstract data types.
The shared data-object model forms the basis of a new language
for distributed programming, \fIOrca\fR, which gives linguistic
support for parallelism and data-objects.
A distributed implementation of the shared data-object model should take care
of the physical distribution of objects among the local memories
of the processors.
In particular, an implementation may replicate objects in order to
decrease access times to objects and increase parallelism.
.PP
The intent of this paper is to show that, for several applications,
the proposed model is both easy to use and efficient.
We first give a brief description of the shared data-object
model and Orca.
Next, we describe one of several existing implementations of Orca.
This implementation replicates all objects on all processors and
updates replicas through a reliable broadcast protocol.
We describe all three layers of this implementation: the Orca compiler,
the Orca run time system, and the reliable broadcast protocol.
Finally, we report on our experiences in using this implementation.
We describe three parallel applications written in Orca
and give performance measurements for them.
We also compare these figures with those of a nondistributed (shared-memory)
implementation of Orca.
The measurements show that significant speedups can be obtained for
all three applications.
.AE
.FS
* This research was supported in part by the
Netherlands organization for scientific research (N.W.O.) under grant 125-30-10.
.FE
.EQ
delim @@
.EN
.nr VS 14
.nr PS 12
.NH 1
INTRODUCTION
.PP
As communication in loosely coupled distributed computing systems is
getting faster, such systems become more and more attractive for running
parallel applications.
In the Amoeba system\*(<,\*([.Mullender and Tanenbaum 1986\*(.]\*(>,
.]< 0
for example, the cost of sending a short message
between Sun workstations over an Ethernet is 1.4 milliseconds\*(<.\*([.Van Renesse et al. 1989\*(.]\*(>.
.]< 1
Although this is still slower than communication in most
multicomputers (e.g., Hypercubes and transputer grids),
it is fast enough for many coarse-grained parallel applications.
In return, distributed systems are easy to build from off-the-shelf
components, by interconnecting multiple workstations or microprocessors through
a local area network (LAN).
In addition, such systems can easily be expanded to far larger
numbers of processors than shared-memory multiprocessors.
.PP
In our research, we are studying the implementation of parallel applications
on distributed systems.
We started out by implementing several coarse-grained parallel applications
on top of the Amoeba system,
using Remote Procedure Calls (RPC)\*([.Birrell and Nelson 1984\*(.]
.]< 2
for interprocess communication\*(<.\*([.Bal et al. 1987\*(.]\*(>.
.]< 3
RPC is widely used in the distributed systems community for
implementing distributed servers (e.g., file servers)\*(<.\*([.Tanenbaum and Van Renesse 1985\*(.]\*(>.
.]< 4
For parallel programming, however, RPC has several disadvantages\*(<.\*([.Tanenbaum and Van Renesse 1988\*(.]\*(>.
.]< 5
RPC is a synchronous (blocking) communication
primitive, so a separate mechanism is needed for obtaining parallelism.
Of more significance, the programming model of RPC is based on
message passing, which is conceptually input/output.
This makes efficient sharing of data among processes very hard.
.PP
The RPC model does not provide (logically) shared data, since
processes on different machines run in separate address spaces.
Data that are shared among multiple processes have to be encapsulated
by a server process and can only be accessed indirectly through a remote
call to this server.
Parallel applications, however, often need a finer level of
sharing, with a much lower overhead.
.PP
As an example of such a parallel application, consider
parallel branch-and-bound algorithms.
Such algorithms store the current best solution (the bound)
in a global variable accessed by all processors.
This is not to say the algorithms actually need physical shared memory;
as the bound is updated only once in a while,
parallel branch-and-bound algorithms can be implemented efficiently
on distributed systems.
In our experience, however, implementing the algorithms efficiently
using RPC is complicated.
.PP
In this paper, we will look at an alternative model for distributed
programming that supports logically shared data.
This model, the \fIshared data-object model\fR\*(<,\*([.Bal and Tanenbaum 1988\*(.]\*(>,
.]< 6
allows processes to share data without requiring physical shared memory.
Also, we have designed a new programming language, \fIOrca\fR\*(<,\*([.Bal and Tanenbaum 1988; Bal et al. 1989\*(.]\*(>,
.]< 6
.]< 7
based on this model.
The intent of this paper is to show that, for several applications,
the model is both easy to use and efficient.
We do so by describing an implementation of Orca on a loosely coupled
system and reporting on our experiences in using this implementation
for several small-scale but realistic applications.
.PP
The issue of providing logically shared data in an
environment without shared memory has been addressed by several
other languages and operating systems.
Linda's Tuple Space\*(<,\*([.Ahuja et al. 1986\*(.]\*(>,
.]< 8
for example,
is a global, content-addressable shared memory,
which has been implemented on various types of parallel systems.
For many applications this model is much easier to use than RPC.
The operations defined on Tuple Space provide a low level
of abstraction, however, which we feel is a disadvantage for distributed
programming\*(<.\*([.Kaashoek et al.\*(.]\*(>.
.]< 9
Other interesting proposals include parallel object-oriented languages
(e.g., Emerald\*([.Jul et al. 1988\*(.]
.]< 10
), which provide a uniform address space for objects,
and Kai Li's shared virtual memory\*(<,\*([.Li 1988\*(.]\*(>,
.]< 11
which simulates physical shared memory.
(These and other systems are surveyed in\*([.Bal and Tanenbaum 1988\*(.]
.]< 6
).
Also, several researchers have looked at distributed applications that
can be implemented with logically shared data.
Example applications are: speech recognition\*(<,\*([.Bisiani and Forin 1987\*(.]\*(>,
.]< 12
linear-equation solving, three-dimensional partial differential equations\*(<,\*([.Li 1988\*(.]\*(>,
.]< 11
and global scheduling and replicated files\*(<.\*([.Cheriton 1985\*(.]\*(>.
.]< 13
.PP
The rest of the paper is structured as follows.
In Section 2, we will give a brief description of the shared data-object model
and Orca.
In Section 3, we will discuss one implementation of the model, based on
reliable broadcast. We will also describe how to implement
this broadcast primitive on top of LANs that only support
unreliable broadcast.
In Section 4, we will report on our experiences in using this implementation
of Orca. We will give performance measurements for several applications.
Also, we will compare these performance figures with those of a nondistributed
(shared-memory) implementation of Orca.
Finally, in Section 5 we present our conclusions.
.NH 1
THE SHARED DATA-OBJECT MODEL
.PP
The most important issue addressed by our model is how data structures
can be shared among distributed processes in an efficient way.
In languages for multiprocessors, shared data structures are stored in
the shared memory and accessed in basically the same way as local variables,
namely through simple load and store instructions.
If a process is going to change part of a shared data structure
and it does not want other processes to interfere, it locks that part.
All these operations (loads, stores, locks) on shared data structures involve
very little overhead, because access
to shared memory is hardly more expensive than access to local memory.
.PP
In a distributed system, on the other hand,
the time needed to access data very much
depends on the location of the data.
Accessing data on remote processors may be orders of magnitude more
expensive than accessing local data.
It is therefore infeasible to apply the multiprocessor model of programming
to distributed systems.
The operations used in this model are far too low-level
and will have tremendous overhead on distributed systems.
.PP
The starting-point in our model is to access shared data structures through
higher level operations.
Instead of using low-level instructions for reading, writing, and locking
shared data, we propose to let programmers define composite operations
for manipulating shared data structures.
Shared data structures in our model are encapsulated in
so-called \fIdata-objects\fR\s-2\v'-0.4m'1\v'0.4m'\s+2
that are manipulated through a set of user-defined operations.
.FS
\s-2\v'-0.4m'1\v'0.4m'\s+2
We will sometimes use the term \(l"object\(r" as a shorthand notation.
Note, however, that this term is used in many other languages and systems,
with various different meanings.
.FE
Data-objects are best thought of as instances (variables)
of \fIabstract data types\fR.
The programmer specifies an abstract data type by defining
operations that can be applied to instances (data-objects) of that type.
The actual data contained in the object and the executable code
for the operations are hidden in the implementation of the abstract data type.
.PP
Although data-objects logically are shared among processes,
their implementation does not need physical shared memory.
In the worst case, an operation on a remote object
can be implemented with a remote procedure call.
The general idea, however, is for the implementation to take care of the
physical distribution of data-objects among processors.
As we will see in Section 3, one way to achieve this goal
is to replicate shared data-objects.
By replicating objects, access control to shared objects is decentralized,
which decreases access costs and increases parallelism.
This is a major difference with, say, monitors\*(<,\*([.Hoare 1974\*(.]\*(>,
.]< 14
which centralize control to shared data.
.PP
In the following sections, we will elaborate the basic idea
by looking at the issue of synchronization.
Two types of synchronization can be distinguished:\*([.Andrews and Schneider 1983\*(.]
.]< 15
mutual exclusion synchronization prevents multiple simultaneous
writes (or reads and writes) to the same data from interfering with each other;
condition synchronization allows processes to wait for a certain
condition to become true.
We discuss both types of synchronization in turn, in Sections 2.1 and 2.2.
Finally, in Section 2.3 we describe a language based on this model.
.NH 2
Mutual exclusion synchronization
.PP
Shared-variable languages usually provide some kind of \fIlocking\fR
construct for mutual exclusion synchronization.
In a distributed environment, however, such locking primitives
are too low-level and have a high overhead.
In our model, mutual exclusion is done implicitly, by executing
all operations on objects \fIindivisibly\fR.
Conceptually, each operation locks the entire object it is applied
to and releases the lock only when it is finished.
To be more precise, the model guarantees \fIserializability\fR\*([.Eswaran et al. 1976\*(.]
.]< 16
of operation invocations:
if two operations are applied simultaneously to the same data-object,
then the result is as if one of them is executed before the other;
the order of invocation, however, is nondeterministic.
.PP
An implementation of the model
need not actually execute all operations one by one.
To increase the degree of parallelism, it may execute multiple operations
on the same object simultaneously, as long as the effect
is the same as for serialized execution.
For example, operations that only read (but do not change) the
data stored in an object can easily be executed in parallel.
.PP
As operations are indivisible,
mutual exclusion synchronization to shared data-objects is taken care
of automatically.
As a simple example, consider an object encapsulating an integer variable,
as specified in Figure 1.
.F1
.nr VS 12
.nr PS 10
.vs 12
.ps 10
.ta 2.1i
.nf
\fBobject specification\fI IntObject;
~~~\fBoperation\fI Value(): integer;	# return current value
~~~\fBoperation\fI Assign(val: integer);	# assign new value
~~~\fBoperation\fI Add(val: integer);	# add val to current value
~~~\fBoperation\fI Min(val: integer);	# set value to minimum of current value and val
\fBend\fI;
.fi
.ft R
.nr VS 14
.nr PS 12
.vs 14
.ps 12
.F2
Specification part of an object type IntObject.
.F3
.PP
Suppose two processes P\s-2\v'0.4m'1\v'-0.4m'\s+2 and P\s-2\v'0.4m'2\v'-0.4m'\s+2 
share an object \fIX\fR of this type.
If they simultaneously try to apply
the \fIAssign\fR operation to \fIX\fR, the resulting
value will either be that of P\s-2\v'0.4m'1\v'-0.4m'\s+2's or
P\s-2\v'0.4m'2\v'-0.4m'\s+2's invocation,
but the value will never be some strange mixture of the bits.
Similarly, if P\s-2\v'0.4m'1\v'-0.4m'\s+2 and P\s-2\v'0.4m'2\v'-0.4m'\s+2
simultaneously increment the value of \fIX\fR by invoking the operation
.DS
\fIX$Add(1);\fR
.DE
the value will always be
incremented twice, because the operations are serialized.
.PP
On the other hand, \fIsequences\fR of operations are not executed indivisibly.
For example, the sequence
.nr VS 13
.DS
.ft I
tmp := X$Value();	#~get value of object X
X$Assign(tmp+1);	#~increment value and store result back in X
.ft R
.nr VS 14
.DE
.LP
is not an indivisible action.
If two processes execute this sequence simultaneously, the value of \fIX\fR
may be incremented once or twice.
This rule for defining which actions are indivisible and which are not
is both easy to understand and flexible:
single operations are indivisible; sequences of operations are not.
Orca does not provide mutual exclusion at a granularity lower
than the object level.
.PP
Our model does not support indivisible operations on multiple objects.
Operations on multiple objects would require a distributed locking protocol,
which is complicated to implement efficiently.
Instead, we prefer to keep our basic model as simple as possible
and implement more complicated actions on top of it.
Operations in our model therefore apply to single objects and are
always executed indivisibly.
However, the model is sufficiently powerful to allow users to
construct locks for multi-operation sequences on different objects,
so arbitrary actions can be performed indivisibly.
.NH 2
Condition synchronization
.PP
Condition synchronization allows processes to wait (\fIblock\fR)
until a certain condition becomes true.
The simplest form of condition synchronization
is repeated testing (\fIbusy waiting\fR) of a shared variable, until
it has a certain value.
Since busy waiting wastes computing cycles, most parallel languages use
a separate condition synchronization mechanism, such as a semaphore,
eventcount, or condition variable\*(<.\*([.Andrews and Schneider 1983\*(.]\*(>.
.]< 15
.PP
In the shared data-object model, condition synchronization is integrated
with operation invocations by allowing operations to block.
Processes synchronize implicitly through operations on shared objects.
A blocking operation consists of one or more guarded commands:
.nr VS 13
.DS
\fBoperation\fI op(formal-parameters): ResultType;
~~~local declarations
\fBbegin\fI
~~~~~\fBguard\fI condition\s-2\v'0.4m'1\v'-0.4m'\s+2 \fBdo\fI statements\s-2\v'0.4m'1\v'-0.4m'\s+2 \fBod\fI;
~~~~~\fBguard\fI condition\s-2\v'0.4m'2\v'-0.4m'\s+2 \fBdo\fI statements\s-2\v'0.4m'2\v'-0.4m'\s+2 \fBod\fI;
~~~~~...
~~~~~\fBguard\fI condition\s-2\v'0.4m'n\v'-0.4m'\s+2 \fBdo\fI statements\s-2\v'0.4m'n\v'-0.4m'\s+2 \fBod\fI;
\fBend\fI;
.ft R
.nr VS 14
.DE
The conditions must be side-effect free boolean expressions.
The operation initially blocks (suspends) until at least one of
the conditions (guards) evaluates to \(l"true.\(r"
Next, one true guard is selected nondeterministically,
and its sequence of statements is executed.
.NH 2
Orca
.PP
We have used the shared data-object model for designing a new language
called \fIOrca\fR for distributed application programming.
Unlike the majority of other languages for parallel or distributed programming,
Orca is not an extension to an existing sequential language.
Instead, its sequential and distributed constructs have been designed
together, in such a way that they integrate well.
.PP
Orca is a procedural, strongly typed language.
Its statements and expressions are fairly conventional and comparable
to those of Modula-2.
The data structuring facilities of Orca, however, are substantially different
from those used in Modula-2.
Orca supports records, unions, dynamic arrays, sets, bags, general graphs,
and generic types.
Pointers have intentionally been omitted to provide type-security.
.PP
Parallelism in Orca is based on explicit creation of sequential processes.
Processes are conceptually similar to procedures, except
that procedure invocations are serial and
process invocations are parallel.
.PP
Processes communicate through shared data-objects, which are instances of
\fIabstract data types\fR.
An abstract data type definition consists of two parts: a \fIspecification\fR
part and an \fIimplementation\fR part.
The specification part defines the operations applicable
to objects of the given type.
(An example of a specification part was given in Figure 1.)
The implementation part contains the data of objects of this type, the code to
initialize the data of new instances of the type, and the code implementing the
operations.
.PP
Objects are created by declaring variables of an abstract data type.
The declaration does not specify whether the object will be shared.
When an object is created, the run time system allocates memory for
the local variables of the object and executes the
initialization code.
.PP
Objects declared local to a process may be shared with other (child) processes
by passing them as shared parameters when the children are created.
For example, if a process \fIchild\fR is declared as
.DS
\fBprocess\fI child(Id: integer; X: \fBshared\fI IntObject);
.ft R
.DE
a new child process can be created as follows
.nr VS 13
.DS
\fBfork\fI child(12, X);
~~~#~create a new child process, passing the constant 12 as
~~~#~value parameter and the object X as shared parameter.
.ft R
.nr VS 14
.DE
The children can pass shared objects to \fItheir\fR children, and so on.
In this way, the objects get distributed among some of the descendants of
the process that created them.
If any of these processes performs an operation on the object,
they all observe the same effect,
as if the object were in shared memory, protected by a lock variable.
.PP
In summary, Orca allows processes to share data encapsulated in objects,
which are instances of abstract data types.
Sharing of objects is only possible between a parent and its descendants,
which is sufficient for the applications Orca intends to support.
Each process sharing an object may apply operations to the object,
as defined by the object's abstract data type.
The effects of operation invocations are observed by all processes sharing
the object.
Simultaneous invocations of operations on the same object are conceptually
serialized.
Condition synchronization is expressed through operations that block.
.NH 1
A DISTRIBUTED IMPLEMENTATION OF ORCA
.PP
Although Orca is a language for programming distributed systems,
its communication model is based on shared data.
The implementation of the language therefore should hide the physical
distribution of the hardware and simulate shared data in an efficient way.
We have designed several different models for implementing the language\*(<.\*([.Bal and Tanenbaum 1988\*(.]\*(>.
.]< 6
The implementation described in this paper is based on
\fIreplication\fR and \fIreliable broadcasting\fR.
.PP
Replication of data is used in several fault-tolerant systems (e.g., ISIS\*([.Joseph and Birman 1987\*(.]
.]< 17
)
to increase the availability of data in the presence of processor failures.
Orca, in contrast, is not intended for fault-tolerant applications.
In our implementation, replication is used to decrease the access costs
to shared data.
.PP
Very briefly stated, each processor keeps a local copy of each shared
data-object.
This copy can be accessed by all processes running on that processor
(see Figure 2).
Operations that do not change the object (called \fIread\fR operations)
use this copy directly, without any messages being sent.
Operations that do change the object (called \fIwrite\fR operations)
broadcast the new values (or the operations) to all the other
processors, so they are updated simultaneously.
.F1
.nr PS 10
.nr VS 12
.ps 10
.vs 12
.PS 4.25i
M: box wid 4*boxwid ht 4*boxht
B1: box wid 1.25*boxwid "process-1" with .nw at M.nw+(0.3*boxwid, -0.3*boxwid)
B2: box wid 1.25*boxwid "process-n" with .sw at M.sw+(0.3*boxwid, 0.3*boxwid)
X: box ht 1.5*boxht "copy" "of" "X" with .e at M.e - (0.3*boxwid, 0)
arrow from B1.e to X.w+(0, 0.2*boxwid)
arrow from B2.e to X.w-(0, 0.2*boxwid)
"CPU 1" at M.n + (0, 0.3*boxwid)
N: box wid 4*boxwid ht 4*boxht with .w at M.e+(boxwid, 0)
C1: box wid 1.25*boxwid "process-1" with .nw at N.nw+(0.3*boxwid, -0.3*boxwid)
C2: box wid 1.25*boxwid "process-n" with .sw at N.sw+(0.3*boxwid, 0.3*boxwid)
Y: box ht 1.5*boxht "copy" "of" "X" with .e at N.e - (0.3*boxwid, 0)
arrow from C1.e to Y.w+(0, 0.2*boxwid)
arrow from C2.e to Y.w-(0, 0.2*boxwid)
"CPU 2" at N.n + (0, 0.3*boxwid)
line "n e t w o r k" below from M.sw-(0,boxht) to N.se-(0,boxht)
line from M.s to M.s-(0,boxht)
line from N.s to N.s-(0,boxht)
.PE
.nr PS 12
.nr VS 14
.ps 12
.vs 14
.F2
Replication of data-objects in a distributed system
.F3
.PP
The implementation is best thought of as a three layer
software system, as shown below:
.KS
.TS
center, tab(%), box;
c.
compiled application programs
_
run time system
_
reliable broadcasting
.TE
.KE
The top layer is concerned with applications, which are written in Orca
and compiled to machine code by the Orca compiler.
The executable code contains calls to the Orca run time system,
for example for creating and manipulating processes and objects.
.PP
The middle layer is the run time system (RTS).
It implements the primitives called by the upper layer.
For example, if an application performs an operation on a shared data-object,
it is up to the RTS to ensure that the system behaves as if the object
were placed in shared memory.
To achieve this, the RTS of each processor maintains copies of shared objects,
which are updated using reliable broadcasting.
.PP
The bottom layer is concerned with implementing the reliable broadcast,
so that the RTS does not have to worry about what happens if a broadcast
message is lost.
As far as the RTS is concerned, broadcast is error free.
It is the job of the bottom layer to make it work.
.PP
Below, we will describe the protocols and algorithms in each layer.
This section is structured top down: we first discuss the applications
layer, then the RTS layer, and finally the reliable broadcast layer.
.NH 2
Top layer: Orca application programs
.PP
Application programs are translated by the Orca compiler into
executable code for the target system.\s-2\v'-0.4m'2\v'0.4m'\s+2
.FS
\s-2\v'-0.4m'2\v'0.4m'\s+2
We assume the target system does not contain multiple types of CPUs.
Although a heterogeneous implementation of Orca is conceivable, we
do not address this issue here.
.FE
Most of the compiler is based on conventional compiler technology.
In fact, our compiler has been built using the \fIAmsterdam
Compiler Kit\fR (ACK), which is a toolkit
for implementing portable compilers\*(<.\*([.Tanenbaum et al. 1983\*(.]\*(>.
.]< 18
Up until now ACK has mainly been used for sequential languages like C and Pascal
and for uniprocessor implementations of parallel (or pseudo-parallel)
languages like Modula-2, occam, and Ada\u\s-2\(rg\s0\d.
As it turns out, ACK is useful for distributed languages like Orca as well.
.PP
The code produced by the compiler contains calls to RTS routines
that manage processes, shared data-objects, and complex data structures
(e.g., dynamic arrays, sets, and graphs).
In this paper, we will only discuss how operation invocations are compiled.
.PP
As described above, it is very important to distinguish between \fIread\fR and
\fIwrite\fR operations on objects.
The compiler therefore analyzes the implementation code of each operation
and checks whether the operation modifies the object to which
it is applied.\s-2\v'-0.4m'3\v'0.4m'\s+2
.FS
\s-2\v'-0.4m'3\v'0.4m'\s+2
The actual implementation is somewhat more complicated, since an operation
may have multiple guards (alternatives), some of which may be read-only.
.FE
It stores this information in an \fIoperation descriptor\fR.
This descriptor also specifies the sizes and modes (input or output)
of the parameters of the operation.
.PP
If an Orca program applies an operation on a given object, the compiler
generates a call to the RTS primitive \fIINVOKE\fR.
This routine is called as follows:
.DS
.ft I
INVOKE(object, operation-descriptor, parameters ...);
.ft R
.DE
The first argument identifies the object to which the operation is applied.
The second argument is the operation descriptor.
The remaining arguments of \fIINVOKE\fR are the parameters of the operation.
The implementation of this primitive is discussed below.
.NH 2
Middle layer: The Orca run time system
.PP
The middle layer implements the Orca run time system.
As mentioned above, its primary job is to manage shared data-objects.
In particular, it implements the \fIINVOKE\fR primitive described above.
For efficiency, the RTS replicates objects so it can apply operations
to local copies of objects whenever possible.
.PP
There are many different design choices to be made related to replication.
The most important ones are:
.IP "~~~\fBReplication strategy\fR:"
.br
The RTS may either replicate all objects on all processors
(\fIfull replication\fR) or it may try to replicate objects only on those
processors that frequently read the object (\fIpartial replication\fR).
In the latter case, the RTS may use compile-time information as well
as run-time statistics for deciding where to store replicas of objects.
.IP "~~~\fBUpdating of replicas\fR:"
.br
After a write operation, the replicas of an object should either be
invalidated or updated.
Updating can either be implemented
by sending the new value of the object to the other processors
or by applying the operation itself to each copy.
.IP "~~~\fBMutual exclusion synchronization\fR:"
.br
Write operations on a given object can be synchronized in at least
two different ways.
One way is to appoint some copy of the object as \fIprimary copy\fR and
direct all write operations to this primary copy.
An alternative way is to treat all copies as equals and
use a \fIdistributed update protocol\fR that takes care of mutual exclusion.
.LP
Each of these alternatives has its own advantages and disadvantages,
as discussed in\*(<.\*([.Bal and Tanenbaum 1988\*(.]\*(>.
.]< 6
The RTS described in this paper uses full replication of objects,
updates replicas by applying write operations to all replicas, and implements
mutual exclusion through a distributed update protocol.
(We have also implemented a second RTS, which uses partial replication
based on run-time statistics and which updates copies through a
primary-copy update protocol. In addition, we have implemented
a third RTS on a true shared-memory multiprocessor, for comparison purposes.)
.PP
We have chosen to use an update scheme rather than an invalidation scheme
for two reasons.
First, in many applications objects contain large amounts
of data (e.g., a 100K bitvector).
Invalidating a copy of such an object is wasteful, since the next
time the object is replicated its entire value must be transmitted.
Second, in many cases updating a copy will take just as much CPU time
and network bandwidth as sending invalidation messages.
.PP
The presence of multiple copies of the same logical data introduces the
so-called \fIinconsistency problem\fR.
If the data are modified, all copies are modified too.
If this updating is not done as one indivisible action, different processors
temporarily have different values for the same logical data.
(The inconsistency problem appears in many other areas where data
are replicated, for example replicated file servers and CPU caches.)
.PP
The semantics of the shared data-object model define that simultaneous
operations on the same object must conceptually be serialized.
The exact order in which they are to be executed is not defined, however.
If, for example, a read operation and a write operation are applied
to the same object simultaneously, the read operation may either observe
the value before or after the write, but not an intermediate value.
However, all processes having access to the object must see the events
happen in the same order.
.PP
The RTS described here solves the inconsistency problem by using a
distributed update protocol that guarantees that all processes observe
changes to shared objects \fIin the same order\fR.
One way to achieve this would be to lock all copies of an object prior
to changing the object.
Unfortunately, distributed locking is quite expensive and complicated.
.PP
Our update protocol does not use locking.
The key to avoid locking is the use of an \fIindivisible, reliable
broadcast\fR primitive, which has the following properties:
.IP "~~~\(bu"
Each message is sent reliably from one source to all destinations.
.IP "~~~\(bu"
If two processors simultaneously broadcast two messages
(say m\s-2\v'0.4m'1\v'-0.4m'\s+2 and m\s-2\v'0.4m'2\v'-0.4m'\s+2),
then either all destinations first receive m\s-2\v'0.4m'1\v'-0.4m'\s+2,
or they all receive m\s-2\v'0.4m'2\v'-0.4m'\s+2 first.
Mixed forms (some get m\s-2\v'0.4m'1\v'-0.4m'\s+2 first, some
get m\s-2\v'0.4m'2\v'-0.4m'\s+2 first) are excluded by the software
protocols.
.LP
This primitive is implemented by the bottom layer of our system,
as will be described in Section~3.3.
Here, we simply assume the indivisible, reliable broadcast exists.
.PP
The RTS uses an \fIobject-manager\fR for each processor.
The object-manager is a light-weight process (thread) that
takes care of updating the local copies of all objects
stored on its processor.
We assume the object-manager and user processes on the same processor
can share part of their address space.
Objects (and replicas) are stored in this shared address space.
User processes can \fIread\fR local copies directly, without
intervention by object-managers.
Write operations on shared objects, on the other hand,
are marshalled and then broadcast to all object-managers in the system.
A user process that broadcasts a write operation suspends until
the message has been handled by its local object-manager.
This is illustrated in Figure 3.
.F1
.nr VS 12
.nr PS 10
.vs 12
.ps 10
.ta 0.3i 0.6i 2.6i
.nf
.ft I
INVOKE(obj, op, parameters)
	\fBif\fI op.ReadOnly \fBthen\fI	# check if it's a read operation
		set read-lock on local copy of obj;
		\fBcall\fI op.code(obj, parameters);	# do operation locally
		unlock local copy of obj
	\fBelse\fI
		\fBbroadcast\fI \fIGlobalOperation\fI(obj, op, parameters) \fBto\fI all managers;
		block current process;
	\fBfi\fI;
.fi
.ft R
.nr VS 14
.nr PS 12
.vs 14
.ps 12
.F2
Implementation of the \fIINVOKE\fR run time system primitive.
This routine is called by user processes.
.F3
.PP
Each object-manager maintains a queue of messages that have arrived
but that have not yet been handled.
As all processors receive all messages in the same order, the queues of
all managers are basically the same, except that some managers may
be ahead of others in handling the messages at the head of the queue.
.PP
The object-manager of each processor handles the messages of its queue
in strict FIFO order.
A message may be handled as soon as it appears at the head of the queue.
To handle a message \fIGlobalOperation(obj, op, parameters)\fR
the message is removed from the queue, unmarshalled,
the local copy of the object is locked,
the operation is applied to the local copy,
and finally the copy is unlocked.
If the message was sent by a process on the same processor,
the manager unblocks that process (see Figure 4).
.F1
.nr VS 12
.nr PS 10
.vs 12
.ps 10
.ta 0.3i 0.6i 0.9i 1.2i 1.5i 1.8i 2.1i 2.4i 2.7i
.nf
.ft I
\fBreceive\fR \fIGlobalOperation\fI(obj, op, parameters) \fBfrom\fI W \fR\(->\fI
	set write-lock on local copy of obj;
	\fBcall\fI op.code(obj, parameters);~~~# apply operation to local copy
	unlock local copy of obj
	\fBif\fI W is a local process \fBthen\fI
		unblock(W);
	\fBfi\fI;
.fi
.ft R
.nr VS 14
.nr PS 12
.vs 14
.ps 12
.F2
The code to be executed by the object-managers for handling
\fIGlobalOperation\fR messages.
.F3
.PP
Write operations are executed by all object-managers in the same order.
If a read operation is executed concurrently with a write operation,
the read may either be executed before or after the write, but not during it.
Note that this is in agreement with the serialization principle described
above.
.NH 2
Bottom layer: Reliable broadcast
.PP
In this section we describe the protocol that allows a group of nodes
on an unreliable broadcast network to broadcast messages reliably.
The protocol guarantees that all of the receivers in the group receive
all broadcast messages and that all receivers accept the
messages in the same order.
The main purpose of this section is to show that a protocol with the
required semantics is feasible; for a detailed description we refer the reader
to\*(<.\*([.Kaashoek et al.\*(.]\*(>.
.]< 19
.PP
With current microprocessors and LANs, lost or damaged packets and processor
crashes occur very infrequently.
Nevertheless, the probability of an error is not zero, so errors must be dealt
with.
For this reason our approach to achieving reliable broadcast is to make
the normal case highly efficient,
even at the expense of making error-recovery more complex, since
error recovery will not be done very often.
.PP
The basic reliable broadcast protocol works as follows.
When the RTS wants to broadcast a message, 
.I M ,
it hands the message to its kernel.
The kernel then encapsulates
.I M
in an ordinary point-to-point message and sends it to a special kernel
called the 
.I sequencer .
The sequencer's node contains the same hardware and kernel as all the others.
The only difference is that a flag in the kernel tells it to process messages 
differently.
If the sequencer should crash, the protocol provides for the election of a new 
sequencer on a different node.
.PP
The sequencer determines the ordering of all broadcast messages by assigning a
\fIsequence number\fR to each message.
When the sequencer receives the point-to-point message containing
.I M ,
it allocates the next sequence number,
.I s
and broadcasts a packet containing
.I M 
and
.I s .
Thus all broadcasts are issued from the same node, by the sequencer.
Assuming that no packets are lost, it is easy to see that if two RTSs
simultaneously want to broadcast, one of them will reach the
sequencer first and its message will be broadcast to all the other nodes first.
Only when that broadcast has been completed will the other broadcast be started.
The sequencer provides a global ordering in time.
In this way, we can easily guarantee the atomicity of broadcasting.
.PP
Although most modern networks are highly reliable, they are not perfect, so
the protocol must deal with errors.
Suppose some node misses a broadcast packet, either due to a communication 
failure or lack of buffer space when the packet arrived.
When the following broadcast packet eventually arrives, the kernel will
immediately notice a gap in the sequence numbers.
It was expecting
.I s
next, and it got @s ~+~ 1@, so it knows it has missed one.
.PP
The kernel then sends a special point-to-point message to the sequencer asking
it for copies of the missing message (or messages, if several have been 
missed).
To be able to reply to such requests, the sequencer stores old 
broadcast messages in its
.I history
.I buffer .
The missing messages are sent point-to-point to the process requesting them.
.PP
As a practical matter, the sequencer has a finite amount of space in its
history buffer, so it cannot store broadcast messages forever.
However, if it could somehow discover that all machines have received
broadcasts up to and including
.I k ,
it could then purge the first 
.I k
broadcast messages from the history buffer.
.PP
The protocol has several ways of letting the sequencer discover this
information.
For one thing, each point-to-point message to the sequencer (e.g., a
broadcast request), contains, in a header field, the sequence number of the 
last broadcast received by the sender of the message.
In this way, the sequencer can maintain a table, indexed by node number,
showing that node
.I i
has received all broadcast messages 0 up to @T sub i@, and perhaps more.
At any moment, the sequencer can compute the lowest value in this table, and
safely discard all broadcast messages up to and including that value.
For example, if the values of this table are 8, 7, 9, 8, 6, and 8, the
sequencer knows that everyone has received broadcasts 0 through 6, so they
can be deleted from the history buffer.
.PP
If a node does not need to do any broadcasting for a while, the sequencer
will not have an up-to-date idea of which broadcasts that node has received.
To provide this information, nodes that have been quiet for a certain
interval, \(*Dt, can just send the sequencer a special packet acknowledging all
received broadcasts.
.PP
If, despite all precautions, the sequencer gets out of history space,
it enters a synchronization phase to empty its history buffer.
The synchronization phase consists of a two-phase commit protocol,
during which all nodes are brought up-to-date.
In practice, the synchronization phase is hardly ever entered.
.PP
In short, to do a broadcast, an application process sends the data to the
sequencer, which gives it a sequence number and broadcasts it.
There are no separate acknowledgement packets, but all messages to the
sequencer carry piggybacked acknowledgements.
When a node receives an out of sequence broadcast, it buffers the broadcast
temporarily, and asks the sequencer for the missing broadcasts. 
Since broadcasts are expected to be common\(emmany per second\(emthe only
effect that a missed broadcast has is causing some application process to get
behind by a few tens of milliseconds once in a while, hardly a serious
problem.
.PP
In philosophy, the protocol resembles the one described by Chang
and Maxemchuk\*(<,21\*(>,
but differs in some major aspects. 
Messages can be delivered to the user as soon as one (special) node has 
acknowledged the message.
In addition, fewer control messages are needed in the normal 
case (no lost messages).
Our protocol therefore is highly efficient, since, during normal operation,
only two packets are needed (assuming that a message fits in a single packet),
one point-to-point packet from the sender to the sequencer and one broadcast
packet from the sequencer to everyone.
A comparison between our protocol and other well-known protocols
(e.g., those of
Birman and Joseph\*(<,\*([.Birman and Joseph 1987\*(.]\*(>,
.]< 21
Garcia-Molina and Spauster\*(<,\*([.Garcia-Molina and Spauster 1989\*(.]\*(>,
.]< 22
and several others)
is given in\*(<.\*([.Kaashoek et al.\*(.]\*(>.
.]< 19
.sp 1
.NH 1
EXPERIENCE WITH THE ORCA IMPLEMENTATION
.PP
We have built a prototype implementation of the shared data-object model,
using the layered approach described in the previous section.
The prototype runs on the bare hardware, rather than on top of an operating
system.
In effect, it is a new kind of operating system
designed specifically for parallel applications.
It uses the Amoeba protocols\*([.Mullender and Tanenbaum 1986\*(.]
.]< 0
to communicate with our local UNIX\u\s-2\(rg\s0\d
and Amoeba systems.
.PP
The prototype runs on two different systems.
One implementation runs on a multiprocessor with 10 16-MHz MC68020 CPUs.
The system contains 8Mb of shared memory, which is accessible through a VME bus.
This implementation uses the shared memory to simulate unreliable broadcast
messages.
The reliability of the network (i.e., the percentage of broadcast messages
delivered at a destination) is an adjustable parameter of the system.
In this way, we are able to test our protocol with different degrees of
reliability.
The second implementation runs on a distributed system, containing 10 16-MHz
MC68020 CPUs connected to each other through a 10 Mbit/s Ethernet\*(<.\*([.Metcalfe and Boggs 1976\*(.]\*(>.
.]< 23
This implementation uses Ethernet multicast communication
to broadcast a message to a group of processors.
All processors are on one Ethernet and are connected to the network by Lance
chip interfaces.
.PP
The performance of the broadcast protocol on the Ethernet system is
described in\*(<.\*([.Kaashoek et al.\*(.]\*(>.
.]< 19
The time needed for multicasting a short message reliably to two processors is
1.3 msec.
With 10 receivers, a multicast takes 1.5 msec.
The time also depends on the number of senders that are active
simultaneously.
If, for example, 7 processors are simultaneously sending a message
to 10 processors, the average time per multicast is 4.6 msec.
This high performance is due to the fact that our protocol is optimized
for the common case (i.e., no lost messages).
During the experiments described below, the number of lost messages
was found to be zero.
.PP
We have used the Ethernet implementation for developing several
parallel applications written in Orca.
Some of these are small, but others are larger.
The largest application we currently have is a parallel chess program,
consisting of about 2500 lines of code. Smaller applications include matrix
multiplication, prime number generation, sorting, and successive
overrelaxation.
In this section we give preliminary performance measurements of
three sample programs running on the Ethernet implementation.
.PP
An implementation of Orca designed for a shared-memory multiprocessor
would be simpler and, in general, faster than a distributed implementation,
since it could put shared objects in the shared memory.
Systems with physical shared memory, however, are much harder to build than
memory-disjoint systems, especially if a large number of processors
(e.g., thousands) is required.
To build highly parallel shared-memory systems, a switching network is
required, which may be very costly\*(<.\*([.Almasi and Gottlieb 1989\*(.]\*(>.
.]< 24
It is interesting to compare the performance of our model
on distributed and shared-memory systems,
and see how much performance is lost by using simpler
and less expensive hardware.
For this purpose, we also wrote a shared-memory implementation of Orca.
This implementation runs on the VME-based multiprocessor described above.
.PP
Below, we will compare the performances of the distributed and nondistributed
implementations.
Both implementations use exactly the same processor boards.
The distributed implementation uses the Ethernet
for point-to-point and broadcast communication.
The nondistributed implementation
uses the shared memory for storing shared objects.
.NH 2
Parallel branch-and-bound
.PP
The first application we will discuss is parallel branch-and-bound.
As a representative example, consider the traveling salesman problem (TSP).
A salesman is given an initial city in which to start, and a list
of cities to visit.
Each city must be visited once and only once.
The objective is to find the shortest path that visits all the cities.
.PP
The algorithm we have implemented in Orca uses
one \fImanager\fR process to generate
initial paths for the salesman, starting at the initial city but visiting
only part of the other cities.
A number of \fIworker\fR processes further expand these initial paths,
using the \(l"nearest-city-first\(r" heuristic.
A worker systematically generates all paths starting with a given
initial path and checks if they are better than the current shortest
full path.
The length of the current best path is stored in a data-object
of type \fIIntObject\fR (see Figure 1).
This object is shared among all worker processes.
The manager and worker processes communicate through a shared
queue data structure, as shown in Figure 5.
.F1
.sp 1
.PS 4.25i
right
X: box "Manager"
arrow
J: box wid 1.5*boxwid
arrow
Wm: box "Worker"
arrow <->
M: box wid 0.75*boxwid ht 0.75*boxht
Wh:box "Worker" with .s at Wm.n+(0, 0.5*boxht)
Wl:box "Worker" with .n at Wm.s-(0, 0.5*boxht)
arrow from J.e to Wh.w
arrow from J.e to Wl.w
arrow <-> from Wh.e to 1/2 <M.w, M.nw>
arrow <-> from Wl.e to 1/2 <M.w, M.sw>
box "job" wid 0.35*boxwid ht 0.75*boxht with .w at J.w+(0.1*boxwid,0)
box "job" same with .e at J.e-(0.1*boxwid,0)
box "..." invis same at J
box invis "JobQueue" ht 0.5*boxht with .n at J.s
box invis "Minimum" wid 0.75*boxwid ht 0.5*boxht with .n at M.s
box wid 0.9*boxwid ht 0.9*boxht at X
box same at Wm
box same at Wh
box same at Wl
.PE
.sp 1
.F2
Structure of the Orca implementation of TSP.
The Manager and Workers are processes.
The JobQueue is a data-object shared among all these processes.
Minimum is a data-object of type IntObject; it is read and written
by all workers.
.sp 1
.F3
.PP
Every time a worker finds a shorter full path, it updates this variable,
using the (indivisible) operation \fIMin\fR.
On the other hand, if a worker ever finds a partial path that is longer than
the current best path, it is pointless to continue, so the path being
investigated is abandoned.
.PP
It should be clear that reading of the current best path length will
be done very often, but since this is a local operation, there is
no communication overhead.
Updating the best path happens much less often, but still only requires
two broadcast messages (one update message and one acknowledgement).
.PP
Although updates of the best path happen infrequently, it is very important
to broadcast any improvements immediately.
If a worker uses an old (i.e., inferior) value of the best path, it
will investigate paths that could have been pruned if the new value
had been known.
In other words, the worker will search more nodes than necessary.
This \fIsearch overhead\fR may easily become a dominating factor
and cause a severe performance degradation.
.PP
In the RPC model, it is very difficult to let processes share data
that are always kept up-to-date.
A halfway solution is to let each worker maintain its own local
minimum and update this local variable whenever the worker gets
a new job.
This approach still suffers from a significant search overhead,
however\*(<.\*([.Bal et al. 1987\*(.]\*(>.
.]< 3
With the shared data-object model, on the other hand, sharing data is easy.
.PP
The performance of the traveling salesman program (for a randomly
generated graph with 12 cities) on the shared-memory
and distributed implementations of Orca is given in Figure 6.
.F1
.nr PS 10
.nr VS 12
.ps 10
.vs 12
.PS
.lf 1840
.lf 1 /usr/lib/grap.defines
.lf 1839 -
.lf 1 sm_tsp.grap
.lf 1846 -
.lf 1 mc_tsp.grap
.lf 1848 -
Graph: [
	# gg 1 .. 10, 35.1 .. 350
define xy_gg @ 	(($1)-(.37))*.292398, (($2)-(13.057))*0.00835687 @
define x_gg @ 	(($1)-(.37))*.292398 @
define y_gg @ 	(($1)-(13.057))*0.00835687 @
	frameht = 3
	framewid = 3
Frame:	box ht frameht wid framewid with .sw at 0,0 invis
	line from Frame.nw to Frame.ne invis
	line from Frame.sw to Frame.se 
	line from Frame.sw to Frame.nw 
	line from Frame.se to Frame.ne invis
	textht = .166667
	textwid = .8
Label:	box invis wid 0 ht 2*textht "Time" "(in seconds)" wid textwid with .e at Frame.w - (0.2,0) + (-.2,0)
	textht = .166667
Label:	box invis wid 0 ht 1*textht "number of processors" with .n at Frame.s - (0,2 * textht)
	ticklen = .1
Ticks_gg:	line  left ticklen from (0,y_gg(50))
	"50 " rjust at last line.end
	line  left ticklen from (0,y_gg(100))
	"100 " rjust at last line.end
	line  left ticklen from (0,y_gg(150))
	"150 " rjust at last line.end
	line  left ticklen from (0,y_gg(200))
	"200 " rjust at last line.end
	line  left ticklen from (0,y_gg(250))
	"250 " rjust at last line.end
	line  left ticklen from (0,y_gg(300))
	"300 " rjust at last line.end
	line  left ticklen from (0,y_gg(350))
	"350 " rjust at last line.end
	ticklen = .1
Ticks_gg:	line  down ticklen from (x_gg(1),0)
	box invis "1" ht .25 wid 0 with .n at last line.end
	line  down ticklen from (x_gg(2),0)
	box invis "2" ht .25 wid 0 with .n at last line.end
	line  down ticklen from (x_gg(3),0)
	box invis "3" ht .25 wid 0 with .n at last line.end
	line  down ticklen from (x_gg(4),0)
	box invis "4" ht .25 wid 0 with .n at last line.end
	line  down ticklen from (x_gg(5),0)
	box invis "5" ht .25 wid 0 with .n at last line.end
	line  down ticklen from (x_gg(6),0)
	box invis "6" ht .25 wid 0 with .n at last line.end
	line  down ticklen from (x_gg(7),0)
	box invis "7" ht .25 wid 0 with .n at last line.end
	line  down ticklen from (x_gg(8),0)
	box invis "8" ht .25 wid 0 with .n at last line.end
	line  down ticklen from (x_gg(9),0)
	box invis "9" ht .25 wid 0 with .n at last line.end
	line  down ticklen from (x_gg(10),0)
	box invis "10" ht .25 wid 0 with .n at last line.end
Lgg: xy_gg(1,319.6)
"\s-3\(sq\s+3" at xy_gg(1,319.6)
line  from Lgg to xy_gg(2,161); Lgg: Here
"\s-3\(sq\s+3" at xy_gg(2,161)
line  from Lgg to xy_gg(3,109.3); Lgg: Here
"\s-3\(sq\s+3" at xy_gg(3,109.3)
line  from Lgg to xy_gg(4,83.6); Lgg: Here
"\s-3\(sq\s+3" at xy_gg(4,83.6)
line  from Lgg to xy_gg(5,68.3); Lgg: Here
"\s-3\(sq\s+3" at xy_gg(5,68.3)
line  from Lgg to xy_gg(6,59.3); Lgg: Here
"\s-3\(sq\s+3" at xy_gg(6,59.3)
line  from Lgg to xy_gg(7,54); Lgg: Here
"\s-3\(sq\s+3" at xy_gg(7,54)
line  from Lgg to xy_gg(8,50); Lgg: Here
"\s-3\(sq\s+3" at xy_gg(8,50)
line  from Lgg to xy_gg(9,49); Lgg: Here
"\s-3\(sq\s+3" at xy_gg(9,49)
line  from Lgg to xy_gg(10,47.6); Lgg: Here
"\s-3\(sq\s+3" at xy_gg(10,47.6)
Lgg: xy_gg(1,334.2)
"\s-3\(*D\s+3" at xy_gg(1,334.2)
line dashed from Lgg to xy_gg(2,168.2); Lgg: Here
"\s-3\(*D\s+3" at xy_gg(2,168.2)
line dashed from Lgg to xy_gg(3,112.7); Lgg: Here
"\s-3\(*D\s+3" at xy_gg(3,112.7)
line dashed from Lgg to xy_gg(4,85.3); Lgg: Here
"\s-3\(*D\s+3" at xy_gg(4,85.3)
line dashed from Lgg to xy_gg(5,68); Lgg: Here
"\s-3\(*D\s+3" at xy_gg(5,68)
line dashed from Lgg to xy_gg(6,56.7); Lgg: Here
"\s-3\(*D\s+3" at xy_gg(6,56.7)
line dashed from Lgg to xy_gg(7,48.9); Lgg: Here
"\s-3\(*D\s+3" at xy_gg(7,48.9)
line dashed from Lgg to xy_gg(8,43.1); Lgg: Here
"\s-3\(*D\s+3" at xy_gg(8,43.1)
line dashed from Lgg to xy_gg(9,38.9); Lgg: Here
"\s-3\(*D\s+3" at xy_gg(9,38.9)
line dashed from Lgg to xy_gg(10,35.1); Lgg: Here
"\s-3\(*D\s+3" at xy_gg(10,35.1)
Lgg: xy_gg(5,330)
"\s-3\(sq\s+3" at xy_gg(5,330)
line  from Lgg to xy_gg(6,330); Lgg: Here
"\s-3\(sq\s+3" at xy_gg(6,330)
line  from Lgg to xy_gg(7,330); Lgg: Here
"\s-3\(sq\s+3" at xy_gg(7,330)
box invis wid 0 ht 1*textht "\s-2Shared-memory RTS\s+2" ljust at xy_gg(7.5,330)
Lgg: xy_gg(5,300)
"\s-3\(*D\s+3" at xy_gg(5,300)
line dashed from Lgg to xy_gg(6,300); Lgg: Here
"\s-3\(*D\s+3" at xy_gg(6,300)
line dashed from Lgg to xy_gg(7,300); Lgg: Here
"\s-3\(*D\s+3" at xy_gg(7,300)
box invis wid 0 ht 1*textht "\s-2Distributed RTS\s+2" ljust at xy_gg(7.5,300)

] 
.PE
.lf 1861
.nr PS 12
.nr VS 14
.ps 12
.vs 14
.F2
Measured execution times for the distributed
and shared-memory implementations of the Traveling Salesman Problem.
.F3
.PP
With fewer than 5 processors, the shared-memory implementation is
slightly faster.
This performance difference is caused by the relatively high
computational overhead of
operations in our prototype distributed implementation.
With 6 or more processors, however, the distributed system is faster.
(Note that Figure 6 shows the performance for one specific TSP graph;
for other randomly generated graphs we have observed similar behavior.)
.PP
Although surprising at first sight, this behavior is easy to explain.
In the distributed RTS, each processor will have its own local copy
of the shared object \fIMinimum\fR.
Thus, all processors can simultaneously read their copies.
In the shared-memory RTS, on the other hand,
the object is put in the shared memory and protected by locks,
so it becomes a sequential bottleneck.
.PP
In our prototype implementation of the RTS, the situation is
particularly bad, because:
.IP "1."
Operations are implemented inefficiently and thus are expensive.
The \fIValue\fR operation, which is used to read the
current value of \fIMinimum\fR, takes about 40 \fS\(*m\fRsec.
.IP "2."
Exclusive locks\(emrather than readers/writer locking\(emare used.
.IP "3."
The hardware we use allows only one processor
at a time to access the shared memory.
.LP
As the \fIValue\fR operation is executed very frequently, it
will often have to wait for the lock to be free.
Undoubtedly, the contention problem would be less severe in a
well-tuned shared-memory implementation on more advanced hardware.
Still, it is not clear whether the problem can be eliminated
entirely in this way, without using local copies of objects.
.PP
The distributed implementation achieves almost perfect speedup.
With 10 CPUs it is 9.52 times faster than with 1 CPU.
The shared-memory implementation achieves a speedup of only 6.71.
For comparison, the RPC-based implementation of TSP described in\*([.Bal et al. 1987\*(.]
.]< 3
achieves a speedup of only 6.29 for the same input graph, using the same
hardware.
The lower speedup of the RPC implementation is caused by its high
search overhead.
.NH 2
Parallel alpha-beta search
.PP
Alpha-beta search is an efficient method for searching game trees for
two-person, zero-sum games (e.g., chess).
A node in such a game tree corresponds to a position in the game.
Each node has one branch for every possible move in that position.
A value associated with the node indicates how good that position is
for the player who is about to move.
At even levels of the tree, this value is the \fImaximum\fR of the
values of its children; at odd levels it is the \fIminimum\fR, as
the search algorithm assumes each player will choose the move that is
least profitable for his or her opponent.
The alpha-beta algorithm finds the best move in the current position,
searching only part of the tree.
It prunes moves that cannot lead to optimal positions.
.sp 0.5
.PP
We have implemented a parallel version of alpha-beta in Orca, using
essentially the same algorithm as in\*(<.\*([.Bal et al. 1987\*(.]\*(>.
.]< 3
Like the TSP program, the alpha-beta program consists of one manager
process and a number of worker processes, one for each processor.
The manager builds the top part of the search tree, up to a
certain depth. This part of the tree is stored in a data-object shared
among the manager and workers.
Each worker repeatedly takes a leaf node of the top part of the tree
and analyzes the corresponding board position, using the normal (sequential)
alpha-beta algorithm.
After the evaluation has been finished, it uses the resulting value
to update the alphas and betas of nodes in the (shared) top part of the tree.
.sp 0.5
.PP
The performance of the parallel alpha-beta program for a
randomly generated search tree of depth 6 and fan-out 38 is shown in Figure 7.
The speedup obtained (6.4 with 10 CPUs) is less than for branch-and-bound.
This is not surprising, since alpha-beta search is hard to
parallelize efficiently\*(<.\*([.Bal and Van Renesse 1986\*(.]\*(>.
.]< 25
However, the performance differences between the distributed and nondistributed
implementations of Orca are very small.
.F1
.nr PS 10
.nr VS 12
.ps 10
.vs 12
.PS
.lf 1962
.lf 1 sm_ab.grap
.lf 1968 -
.lf 1 mc_ab.grap
.lf 1970 -
Graph: [
	# gg 1 .. 10, 200 .. 2200
define xy_gg @ 	(($1)-(.37))*.292398, (($2)-(60))*0.00131579 @
define x_gg @ 	(($1)-(.37))*.292398 @
define y_gg @ 	(($1)-(60))*0.00131579 @
	frameht = 3
	framewid = 3
Frame:	box ht frameht wid framewid with .sw at 0,0 invis
	line from Frame.nw to Frame.ne invis
	line from Frame.sw to Frame.se 
	line from Frame.sw to Frame.nw 
	line from Frame.se to Frame.ne invis
	textht = .166667
	textwid = .8
Label:	box invis wid 0 ht 2*textht "Time" "(in seconds)" wid textwid with .e at Frame.w - (0.2,0) + (-.2,0)
	textht = .166667
Label:	box invis wid 0 ht 1*textht "number of processors" with .n at Frame.s - (0,2 * textht)
	ticklen = .1
Ticks_gg:	line  left ticklen from (0,y_gg(200))
	"200 " rjust at last line.end
	line  left ticklen from (0,y_gg(400))
	"400 " rjust at last line.end
	line  left ticklen from (0,y_gg(600))
	"600 " rjust at last line.end
	line  left ticklen from (0,y_gg(800))
	"800 " rjust at last line.end
	line  left ticklen from (0,y_gg(1000))
	"1000 " rjust at last line.end
	line  left ticklen from (0,y_gg(1200))
	"1200 " rjust at last line.end
	line  left ticklen from (0,y_gg(1400))
	"1400 " rjust at last line.end
	line  left ticklen from (0,y_gg(1600))
	"1600 " rjust at last line.end
	line  left ticklen from (0,y_gg(1800))
	"1800 " rjust at last line.end
	line  left ticklen from (0,y_gg(2000))
	"2000 " rjust at last line.end
	line  left ticklen from (0,y_gg(2200))
	"2200 " rjust at last line.end
	ticklen = .1
Ticks_gg:	line  down ticklen from (x_gg(1),0)
	box invis "1" ht .25 wid 0 with .n at last line.end
	line  down ticklen from (x_gg(2),0)
	box invis "2" ht .25 wid 0 with .n at last line.end
	line  down ticklen from (x_gg(3),0)
	box invis "3" ht .25 wid 0 with .n at last line.end
	line  down ticklen from (x_gg(4),0)
	box invis "4" ht .25 wid 0 with .n at last line.end
	line  down ticklen from (x_gg(5),0)
	box invis "5" ht .25 wid 0 with .n at last line.end
	line  down ticklen from (x_gg(6),0)
	box invis "6" ht .25 wid 0 with .n at last line.end
	line  down ticklen from (x_gg(7),0)
	box invis "7" ht .25 wid 0 with .n at last line.end
	line  down ticklen from (x_gg(8),0)
	box invis "8" ht .25 wid 0 with .n at last line.end
	line  down ticklen from (x_gg(9),0)
	box invis "9" ht .25 wid 0 with .n at last line.end
	line  down ticklen from (x_gg(10),0)
	box invis "10" ht .25 wid 0 with .n at last line.end
Lgg: xy_gg(1,2058.6)
"\s-3\(sq\s+3" at xy_gg(1,2058.6)
line  from Lgg to xy_gg(2,1087.3); Lgg: Here
"\s-3\(sq\s+3" at xy_gg(2,1087.3)
line  from Lgg to xy_gg(3,804.3); Lgg: Here
"\s-3\(sq\s+3" at xy_gg(3,804.3)
line  from Lgg to xy_gg(4,612.3); Lgg: Here
"\s-3\(sq\s+3" at xy_gg(4,612.3)
line  from Lgg to xy_gg(5,522.3); Lgg: Here
"\s-3\(sq\s+3" at xy_gg(5,522.3)
line  from Lgg to xy_gg(6,469); Lgg: Here
"\s-3\(sq\s+3" at xy_gg(6,469)
line  from Lgg to xy_gg(7,404.3); Lgg: Here
"\s-3\(sq\s+3" at xy_gg(7,404.3)
line  from Lgg to xy_gg(8,369.3); Lgg: Here
"\s-3\(sq\s+3" at xy_gg(8,369.3)
line  from Lgg to xy_gg(9,344.6); Lgg: Here
"\s-3\(sq\s+3" at xy_gg(9,344.6)
line  from Lgg to xy_gg(10,321.6); Lgg: Here
"\s-3\(sq\s+3" at xy_gg(10,321.6)
Lgg: xy_gg(1,2052.9)
"\s-3\(*D\s+3" at xy_gg(1,2052.9)
line dashed from Lgg to xy_gg(2,1084.2); Lgg: Here
"\s-3\(*D\s+3" at xy_gg(2,1084.2)
line dashed from Lgg to xy_gg(3,802.4); Lgg: Here
"\s-3\(*D\s+3" at xy_gg(3,802.4)
line dashed from Lgg to xy_gg(4,610.9); Lgg: Here
"\s-3\(*D\s+3" at xy_gg(4,610.9)
line dashed from Lgg to xy_gg(5,521.5); Lgg: Here
"\s-3\(*D\s+3" at xy_gg(5,521.5)
line dashed from Lgg to xy_gg(6,468.4); Lgg: Here
"\s-3\(*D\s+3" at xy_gg(6,468.4)
line dashed from Lgg to xy_gg(7,403.4); Lgg: Here
"\s-3\(*D\s+3" at xy_gg(7,403.4)
line dashed from Lgg to xy_gg(8,369); Lgg: Here
"\s-3\(*D\s+3" at xy_gg(8,369)
line dashed from Lgg to xy_gg(9,344.8); Lgg: Here
"\s-3\(*D\s+3" at xy_gg(9,344.8)
line dashed from Lgg to xy_gg(10,321.4); Lgg: Here
"\s-3\(*D\s+3" at xy_gg(10,321.4)
Lgg: xy_gg(5,2000)
"\s-3\(sq\s+3" at xy_gg(5,2000)
line  from Lgg to xy_gg(6,2000); Lgg: Here
"\s-3\(sq\s+3" at xy_gg(6,2000)
line  from Lgg to xy_gg(7,2000); Lgg: Here
"\s-3\(sq\s+3" at xy_gg(7,2000)
box invis wid 0 ht 1*textht "\s-2Shared-memory RTS\s+2" ljust at xy_gg(7.5,2000)
Lgg: xy_gg(5,1800)
"\s-3\(*D\s+3" at xy_gg(5,1800)
line dashed from Lgg to xy_gg(6,1800); Lgg: Here
"\s-3\(*D\s+3" at xy_gg(6,1800)
line dashed from Lgg to xy_gg(7,1800); Lgg: Here
"\s-3\(*D\s+3" at xy_gg(7,1800)
box invis wid 0 ht 1*textht "\s-2Distributed RTS\s+2" ljust at xy_gg(7.5,1800)

] 
.PE
.lf 1982
.nr PS 12
.nr VS 14
.ps 12
.vs 14
.F2
Measured execution times for the distributed
and shared-memory implementations of Alpha-Beta search.
.F3
.NH 2
Parallel all-pairs shortest paths problem
.PP
The third and last application we describe here is the All-pairs
Shortest Paths problem.
In this problem it is desired to find the length of the shortest path
from any node \fIi\fR to any other node \fIj\fR in a given graph.
The parallel algorithm we use is similar to the one given in\*(<,\*([.Jenq and Sahni 1987\*(.]\*(>,
.]< 26
which is a parallel version of Floyd's algorithm.
The distances between the nodes are represented in a matrix.
Each processor computes part of the result matrix.
The algorithm requires a nontrivial amount of communication and synchronization
among the processors.
.PP
The performance of the program (for a graph with 200 nodes)
on our two implementations is given in Figure 8.
The shared-memory implementation is slightly more efficient.
The performance difference is caused by the high communication overhead
of the algorithm.
The parallel algorithm performs 200 iterations; after each iteration, an
array of 200 integers is sent from one processor to all other processors.
In spite of this high communication overhead, the distributed
implementation still has a good performance.
With 10 CPUs, it achieves a speedup of 9.17 (as opposed to 9.48
for the shared-memory system).
One of the main reasons for this good performance
is the use of broadcast messages for transferring
the array to all processors.
.F1
.nr PS 10
.nr VS 12
.ps 10
.vs 12
.PS
.lf 2025
.lf 1 sm_asp.grap
.lf 2031 -
.lf 1 mc_asp.grap
.lf 2033 -
Graph: [
	# gg 1 .. 10, 42 .. 450
define xy_gg @ 	(($1)-(.37))*.292398, (($2)-(13.44))*0.00644995 @
define x_gg @ 	(($1)-(.37))*.292398 @
define y_gg @ 	(($1)-(13.44))*0.00644995 @
	frameht = 3
	framewid = 3
Frame:	box ht frameht wid framewid with .sw at 0,0 invis
	line from Frame.nw to Frame.ne invis
	line from Frame.sw to Frame.se 
	line from Frame.sw to Frame.nw 
	line from Frame.se to Frame.ne invis
	textht = .166667
	textwid = .8
Label:	box invis wid 0 ht 2*textht "Time" "(in seconds)" wid textwid with .e at Frame.w - (0.2,0) + (-.2,0)
	textht = .166667
Label:	box invis wid 0 ht 1*textht "number of processors" with .n at Frame.s - (0,2 * textht)
	ticklen = .1
Ticks_gg:	line  left ticklen from (0,y_gg(50))
	"50 " rjust at last line.end
	line  left ticklen from (0,y_gg(100))
	"100 " rjust at last line.end
	line  left ticklen from (0,y_gg(150))
	"150 " rjust at last line.end
	line  left ticklen from (0,y_gg(200))
	"200 " rjust at last line.end
	line  left ticklen from (0,y_gg(250))
	"250 " rjust at last line.end
	line  left ticklen from (0,y_gg(300))
	"300 " rjust at last line.end
	line  left ticklen from (0,y_gg(350))
	"350 " rjust at last line.end
	line  left ticklen from (0,y_gg(400))
	"400 " rjust at last line.end
	line  left ticklen from (0,y_gg(450))
	"450 " rjust at last line.end
	ticklen = .1
Ticks_gg:	line  down ticklen from (x_gg(1),0)
	box invis "1" ht .25 wid 0 with .n at last line.end
	line  down ticklen from (x_gg(2),0)
	box invis "2" ht .25 wid 0 with .n at last line.end
	line  down ticklen from (x_gg(3),0)
	box invis "3" ht .25 wid 0 with .n at last line.end
	line  down ticklen from (x_gg(4),0)
	box invis "4" ht .25 wid 0 with .n at last line.end
	line  down ticklen from (x_gg(5),0)
	box invis "5" ht .25 wid 0 with .n at last line.end
	line  down ticklen from (x_gg(6),0)
	box invis "6" ht .25 wid 0 with .n at last line.end
	line  down ticklen from (x_gg(7),0)
	box invis "7" ht .25 wid 0 with .n at last line.end
	line  down ticklen from (x_gg(8),0)
	box invis "8" ht .25 wid 0 with .n at last line.end
	line  down ticklen from (x_gg(9),0)
	box invis "9" ht .25 wid 0 with .n at last line.end
	line  down ticklen from (x_gg(10),0)
	box invis "10" ht .25 wid 0 with .n at last line.end
Lgg: xy_gg(1,398)
"\s-3\(sq\s+3" at xy_gg(1,398)
line  from Lgg to xy_gg(2,200.6); Lgg: Here
"\s-3\(sq\s+3" at xy_gg(2,200.6)
line  from Lgg to xy_gg(3,134.3); Lgg: Here
"\s-3\(sq\s+3" at xy_gg(3,134.3)
line  from Lgg to xy_gg(4,101); Lgg: Here
"\s-3\(sq\s+3" at xy_gg(4,101)
line  from Lgg to xy_gg(5,80.3); Lgg: Here
"\s-3\(sq\s+3" at xy_gg(5,80.3)
line  from Lgg to xy_gg(6,69.3); Lgg: Here
"\s-3\(sq\s+3" at xy_gg(6,69.3)
line  from Lgg to xy_gg(7,60); Lgg: Here
"\s-3\(sq\s+3" at xy_gg(7,60)
line  from Lgg to xy_gg(8,51); Lgg: Here
"\s-3\(sq\s+3" at xy_gg(8,51)
line  from Lgg to xy_gg(9,47.3); Lgg: Here
"\s-3\(sq\s+3" at xy_gg(9,47.3)
line  from Lgg to xy_gg(10,42); Lgg: Here
"\s-3\(sq\s+3" at xy_gg(10,42)
Lgg: xy_gg(1,432.1)
"\s-3\(*D\s+3" at xy_gg(1,432.1)
line dashed from Lgg to xy_gg(2,218.9); Lgg: Here
"\s-3\(*D\s+3" at xy_gg(2,218.9)
line dashed from Lgg to xy_gg(3,148); Lgg: Here
"\s-3\(*D\s+3" at xy_gg(3,148)
line dashed from Lgg to xy_gg(4,111.4); Lgg: Here
"\s-3\(*D\s+3" at xy_gg(4,111.4)
line dashed from Lgg to xy_gg(5,90); Lgg: Here
"\s-3\(*D\s+3" at xy_gg(5,90)
line dashed from Lgg to xy_gg(6,77.1); Lgg: Here
"\s-3\(*D\s+3" at xy_gg(6,77.1)
line dashed from Lgg to xy_gg(7,66.3); Lgg: Here
"\s-3\(*D\s+3" at xy_gg(7,66.3)
line dashed from Lgg to xy_gg(8,58.2); Lgg: Here
"\s-3\(*D\s+3" at xy_gg(8,58.2)
line dashed from Lgg to xy_gg(9,53.6); Lgg: Here
"\s-3\(*D\s+3" at xy_gg(9,53.6)
line dashed from Lgg to xy_gg(10,47.1); Lgg: Here
"\s-3\(*D\s+3" at xy_gg(10,47.1)
Lgg: xy_gg(5,400)
"\s-3\(sq\s+3" at xy_gg(5,400)
line  from Lgg to xy_gg(6,400); Lgg: Here
"\s-3\(sq\s+3" at xy_gg(6,400)
line  from Lgg to xy_gg(7,400); Lgg: Here
"\s-3\(sq\s+3" at xy_gg(7,400)
box invis wid 0 ht 1*textht "\s-2Shared-memory RTS\s+2" ljust at xy_gg(7.5,400)
Lgg: xy_gg(5,360)
"\s-3\(*D\s+3" at xy_gg(5,360)
line dashed from Lgg to xy_gg(6,360); Lgg: Here
"\s-3\(*D\s+3" at xy_gg(6,360)
line dashed from Lgg to xy_gg(7,360); Lgg: Here
"\s-3\(*D\s+3" at xy_gg(7,360)
box invis wid 0 ht 1*textht "\s-2Distributed RTS\s+2" ljust at xy_gg(7.5,360)

] 
.PE
.lf 2045
.nr PS 12
.nr VS 14
.ps 12
.vs 14
.F2
Measured execution times for the distributed
and shared-memory implementations
of the All-pairs Shortest Paths problem.
.F3
.NH 1
CONCLUSION
.PP
We have described a new model and programming language for implementing
parallel applications on distributed systems.
In contrast with most other models for distributed programming
(e.g., the RPC model), our model allows processes on different machines
to share data.
The implementation of the model takes care of the physical distribution
of shared data among processors.
In particular, the implementation replicates shared data, so each process
can directly read the local copy on its own processor.
.PP
The main purpose of this paper was to show that, for several applications,
our model is both easy to use and efficient.
We have studied one distributed implementation of our language and
measured the performance of three applications.
Our model is best suited for moderate-grained parallel applications in which
processes share data that are read frequently and modified infrequently.
As a good example, the TSP program of Section 4.1 uses a global variable
that is read very frequently and is changed only a few times.
This program shows an excellent performance.
In the two other applications (Alpha-Beta search and the All-pairs
Shortest Paths problem), the shared data are changed more frequently.
Still, the performances of these applications are high, because we
use an efficient mechanism for updating replicas, based on broadcasting
rather than point-to-point messages.
.SH
ACKNOWLEDGEMENTS
.LP
We would like to thank Wim van Leersum for implementing the Orca compiler
and Erik Baalbergen, Arnold Geels, and the anonymous referees
for giving useful comments on the paper.
.NH 1
REFERENCES
.in 0.3i
.nr [W \w'10'
.LP
.]<
.ds [F Ahuja et al. 1986
.]-
.ds [T Linda and Friends
.ds [A \*([(A\*()]huja, S.
.as [A ", \*([(C\*()]arriero, N.
.as [A ", and \*([(G\*()]elernter, D.
.ds [J IEEE Computer
.ds [V 19
.ds [N 8
.ds [P 26-34
.nr [P 1
.nr [T 0
.nr [A 0
.nr [O 0
.ds [D \*(m8 1986
.][ 1 journal-article
.ds [F Almasi and Gottlieb 1989
.]-
.ds [T Highly Parallel Computing
.ds [A \*([(A\*()]lmasi, G. S.
.as [A " and \*([(G\*()]ottlieb, A.
.ds [I The Benjamin/Cummings Publishing Company
.nr [T 0
.nr [A 0
.nr [O 0
.ds [D 1989
.][ 2 book
.ds [F Andrews and Schneider 1983
.]-
.ds [T Concepts and Notations for Concurrent Programming
.ds [A \*([(A\*()]ndrews, G. R.
.as [A " and \*([(S\*()]chneider, F. B.
.ds [J ACM Computing Surveys
.ds [V 15
.ds [N 1
.ds [P 3-43
.nr [P 1
.nr [T 0
.nr [A 0
.nr [O 0
.ds [D \*(m3 1983
.][ 1 journal-article
.ds [F Bal et al. 1989
.]-
.ds [T Programming Languages for Distributed Computing Systems
.ds [A \*([(B\*()]al, H. E.
.as [A ", \*([(S\*()]teiner, J. G.
.as [A ", and \*([(T\*()]anenbaum, A. S.
.ds [J ACM Computing Surveys
.ds [V 21
.ds [N 3
.nr [T 0
.nr [A 0
.nr [O 0
.ds [D \*(m9 1989
.][ 1 journal-article
.ds [F Bal and Tanenbaum 1988
.]-
.ds [T Distributed Programming with Shared Data
.ds [A \*([(B\*()]al, H. E.
.as [A " and \*([(T\*()]anenbaum, A. S.
.ds [J Proc. IEEE CS 1988 Int. Conf. on Computer Languages
.ds [C Miami, Fl.
.ds [P 82-91
.nr [P 1
.nr [T 0
.nr [A 0
.nr [O 0
.ds [D \*(ma 1988
.][ 1 journal-article
.ds [F Bal and Van Renesse 1986
.]-
.ds [T A Summary of Parallel Alpha-Beta Search Results
.ds [A \*([(B\*()]al, H. E.
.as [A " and \*([(V\*()]an Renesse, R.
.ds [J ICCA Journal
.ds [V 9
.ds [N 3
.ds [P 146-149
.nr [P 1
.nr [T 0
.nr [A 0
.nr [O 0
.ds [D \*(m9 1986
.][ 1 journal-article
.ds [F Bal et al. 1987
.]-
.ds [T Implementing Distributed Algorithms Using Remote Procedure Calls
.ds [A \*([(B\*()]al, H. E.
.as [A ", \*([(V\*()]an Renesse, R.
.as [A ", and \*([(T\*()]anenbaum, A. S.
.ds [J Proc. AFIPS Nat. Computer Conf.
.ds [I AFIPS Press
.ds [V 56
.ds [C Chicago, Ill.
.ds [P 499-506
.nr [P 1
.nr [T 0
.nr [A 0
.nr [O 0
.ds [D \*(m6 1987
.][ 1 journal-article
.ds [F Birman and Joseph 1987
.]-
.ds [A \*([(B\*()]irman, K. P.
.as [A " and \*([(J\*()]oseph, T. A.
.ds [T Reliable Communication in the Presence of Failures
.ds [J ACM Trans. Comp. Syst.
.ds [V 5
.ds [N 1
.ds [P 47-76
.nr [P 1
.nr [T 0
.nr [A 0
.nr [O 0
.ds [D \*(m2 1987
.][ 1 journal-article
.ds [F Birrell and Nelson 1984
.]-
.ds [T Implementing Remote Procedure Calls
.ds [A \*([(B\*()]irrell, A. D.
.as [A " and \*([(N\*()]elson, B. J.
.ds [J ACM Trans. Comp. Syst.
.ds [V 2
.ds [N 1
.ds [P 39-59
.nr [P 1
.nr [T 0
.nr [A 0
.nr [O 0
.ds [D \*(m2 1984
.][ 1 journal-article
.ds [F Bisiani and Forin 1987
.]-
.ds [T Architectural Support for Multilanguage Parallel Programming on
.as [T " Heterogeneous Systems
.ds [A \*([(B\*()]isiani, R.
.as [A " and \*([(F\*()]orin, A.
.ds [J Proc. 2nd Int. Conf. on Architectural Support for Programming Languages and Operating Systems
.ds [C Palo Alto, Calif.
.ds [P 21-30
.nr [P 1
.nr [T 0
.nr [A 0
.nr [O 0
.ds [D \*(ma 1987
.][ 1 journal-article
.ds [F Chang and Maxemchuk 1984
.]-
.ds [T Reliable Broadcast Protocols
.ds [A \*([(C\*()]hang, J.
.as [A " and \*([(M\*()]axemchuk, N. F.
.ds [J ACM Trans. Comp. Syst.
.ds [V 2
.ds [N 3
.ds [P 251-273
.nr [P 1
.nr [T 0
.nr [A 0
.nr [O 0
.ds [D \*(m8 1984
.][ 1 journal-article
.ds [F Cheriton 1985
.]-
.ds [T Preliminary Thoughts on Problem-oriented Shared Memory:
.as [T " A Decentralized Approach to Distributed Systems
.ds [A \*([(C\*()]heriton, D. R.
.ds [J ACM Operating Systems Review
.ds [V 19
.ds [N 4
.ds [P 26-33
.nr [P 1
.nr [T 0
.nr [A 0
.nr [O 0
.ds [D \*(ma 1985
.][ 1 journal-article
.ds [F Eswaran et al. 1976
.]-
.ds [T The Notions of Consistency and Predicate Locks in a Database System
.ds [A \*([(E\*()]swaran, K. P.
.as [A ", \*([(G\*()]ray, J. N.
.as [A ", \*([(L\*()]orie, R. A.
.as [A ", and \*([(T\*()]raiger, I. L.
.ds [J Commun. ACM
.ds [V 19
.ds [N 11
.ds [P 624-633
.nr [P 1
.nr [T 0
.nr [A 0
.nr [O 0
.ds [D \*(mb 1976
.][ 1 journal-article
.ds [F Garcia-Molina and Spauster 1989
.]-
.ds [T Message Ordering in a Multicast Environment
.ds [A \*([(G\*()]arcia-Molina, H.
.as [A " and \*([(S\*()]pauster, A.
.ds [J Proc. 9th Int. Conf. on Distr. Comp. Syst.
.ds [P 354-361
.nr [P 1
.ds [C Newport Beach, CA
.nr [T 0
.nr [A 0
.nr [O 0
.ds [D \*(m6 1989
.][ 1 journal-article
.ds [F Hoare 1974
.]-
.ds [T Monitors: An Operating System Structuring Concept
.ds [A \*([(H\*()]oare, C. A. R.
.ds [J Commun. ACM
.ds [V 17
.ds [N 10
.ds [P 549-557
.nr [P 1
.nr [T 0
.nr [A 0
.nr [O 0
.ds [D \*(ma 1974
.][ 1 journal-article
.ds [F Jenq and Sahni 1987
.]-
.ds [T All Pairs Shortest Paths on a Hypercube Multiprocessor
.ds [A \*([(J\*()]enq, J.-F.
.as [A " and \*([(S\*()]ahni, S.
.ds [J Proc. of the 1987 Int. Conf. on Parallel Processing
.ds [C St. Charles, Ill.
.ds [P 713-716
.nr [P 1
.nr [T 0
.nr [A 0
.nr [O 0
.ds [D \*(m8 1987
.][ 1 journal-article
.ds [F Joseph and Birman 1987
.]-
.ds [T Low Cost Management of Replicated Data in Fault-Tolerant Distributed Systems
.ds [A \*([(J\*()]oseph, T. A.
.as [A " and \*([(B\*()]irman, K. P.
.ds [J ACM Trans. Comp. Syst.
.ds [V 4
.ds [N 1
.nr [T 0
.nr [A 0
.nr [O 0
.ds [D \*(m2 1987
.][ 1 journal-article
.ds [F Jul et al. 1988
.]-
.ds [T Fine-Grained Mobility in the Emerald System
.ds [A \*([(J\*()]ul, E.
.as [A ", \*([(L\*()]evy, H.
.as [A ", \*([(H\*()]utchinson, N.
.as [A ", and \*([(B\*()]lack, A.
.ds [J ACM Trans. Comp. Syst.
.ds [V 6
.ds [N 1
.ds [P 109-133
.nr [P 1
.nr [T 0
.nr [A 0
.nr [O 0
.ds [D \*(m2 1988
.][ 1 journal-article
.ds [F Kaashoek et al.
.]-
.ds [T Experience with the Distributed Data Structure Paradigm in Linda
.ds [A \*([(K\*()]aashoek, M. F.
.as [A ", \*([(B\*()]al, H. E.
.as [A ", and \*([(T\*()]anenbaum, A. S.
.ds [J Workshop on Experiences with Building Distributed and Multiprocessor Systems
.ds [C Ft. Lauderdale, FL.
.nr [T 0
.nr [A 0
.nr [O 0
.ds [D \*(ma 1989a
.][ 1 journal-article
.ds [F Kaashoek et al.
.]-
.ds [T An Efficient Reliable Broadcast Protocol
.ds [A \*([(K\*()]aashoek, M. F.
.as [A ", \*([(T\*()]anenbaum, A. S.
.as [A ", \*([(F\*()]lynn~Hummel, S.
.as [A ", and \*([(B\*()]al, H. E.
.ds [I Vrije Universiteit
.ds [C Amsterdam, The Netherlands
.ds [R Report IR-195
.nr [T 0
.nr [A 0
.nr [O 0
.ds [D \*(m7 1989b
.][ 4 tech-report
.ds [F Li 1988
.]-
.ds [T IVY: A Shared Virtual Memory System for Parallel Computing
.ds [A \*([(L\*()]i, K.
.ds [J Proc. 1988 Int. Conf. Parallel Processing (Vol. II)
.ds [C St. Charles, Ill.
.ds [P 94-101
.nr [P 1
.nr [T 0
.nr [A 0
.nr [O 0
.ds [D \*(m8 1988
.][ 1 journal-article
.ds [F Metcalfe and Boggs 1976
.]-
.ds [T Ethernet: Distributed Packet Switching for Local Computer Networks
.ds [A \*([(M\*()]etcalfe, R. M.
.as [A " and \*([(B\*()]oggs, D. R.
.ds [J Commun. ACM
.ds [V 19
.ds [N 7
.ds [P 395-404
.nr [P 1
.nr [T 0
.nr [A 0
.nr [O 0
.ds [D \*(m7 1976
.][ 1 journal-article
.ds [F Mullender and Tanenbaum 1986
.]-
.ds [T The Design of a Capability-Based Distributed Operating System
.ds [A \*([(M\*()]ullender, S. J.
.as [A " and \*([(T\*()]anenbaum, A. S.
.ds [J The Computer Journal
.ds [V 29
.ds [N 4
.ds [P 289-300
.nr [P 1
.nr [T 0
.nr [A 0
.nr [O 0
.ds [D \*(m3 1986
.][ 1 journal-article
.ds [F Tanenbaum and Van Renesse 1985
.]-
.ds [T Distributed Operating Systems
.ds [A \*([(T\*()]anenbaum, A. S.
.as [A " and \*([(V\*()]an Renesse, R.
.ds [J ACM Computing Surveys
.ds [V 17
.ds [N 4
.ds [P 419-470
.nr [P 1
.nr [T 0
.nr [A 0
.nr [O 0
.ds [D \*(mc 1985
.][ 1 journal-article
.ds [F Tanenbaum and Van Renesse 1988
.]-
.ds [T A Critique of the Remote Procedure Call Paradigm
.ds [A \*([(T\*()]anenbaum, A. S.
.as [A " and \*([(V\*()]an Renesse, R.
.ds [J Proc. of the EUTECO 88 Conf.
.ds [E R. Speth
.ds [P 775-783
.nr [P 1
.ds [I North-Holland
.ds [C Vienna, Austria
.nr [T 0
.nr [A 0
.nr [O 0
.ds [D \*(m4 1988
.][ 1 journal-article
.ds [F Tanenbaum et al. 1983
.]-
.ds [T A Practical Toolkit for Making Portable Compilers
.ds [A \*([(T\*()]anenbaum, A. S.
.as [A ", \*([(V\*()]an Staveren, H.
.as [A ", \*([(K\*()]eizer, E. G.
.as [A ", and \*([(S\*()]tevenson, J. W.
.ds [J Commun. ACM
.ds [V 26
.ds [N 9
.ds [P 654-660
.nr [P 1
.nr [T 0
.nr [A 0
.nr [O 0
.ds [D \*(m9 1983
.][ 1 journal-article
.ds [F Van Renesse et al. 1989
.]-
.ds [T The Performance of the Amoeba Distributed Operating System
.ds [A \*([(V\*()]an Renesse, R.
.as [A ", \*([(V\*()]an Staveren, J. M.
.as [A ", and \*([(T\*()]anenbaum, A. S.
.ds [J Software\(emPractice and Experience
.ds [V 19
.ds [N 3
.ds [P 223-234
.nr [P 1
.nr [T 0
.nr [A 0
.nr [O 0
.ds [D \*(m3 1989
.][ 1 journal-article
.nr [W \w'10'
.]>
.nr [W \w'10'
