;md4.star -- calculates MD4 signature from input string.  MD4 is
;described in R. Rivest, "The MD4 Message-Digest Algorithm",
;available as IETF RFC 1320.  This implementation calculates the
;MD4 checksum of an HP 48 character string or hex string object,
;and returns the checksum as a string of 16 characters or 32
;nibbles (128 bits).  The data is returned as the first 32
;nibbles of a user-supplied character string or hex string of 170
;nibbles.  If the input string is of odd length (in nibbles),
;then the final nibble #x is padded so that its byte is written
;#x8 (hexadecimal), because RFC 1320 treats bytes in
;little-endian order but the bits in bytes in big-endian order.

;Copyright 1995 by Steve VanDevender.  Distributed under the
;terms of the GNU General Public License, version 1 or later.  In
;other words, you are welcome to redistribute this code, verbatim
;or with your own modifications, if you distribute this source
;file with this copyright notice intact.

SAVPTR		= #0679b	;save RPL interpreter registers
GETPTRLOOP	= #05143	;restore RPL interpreter registers, continue RPL

;offsets of items within working data block
m0	=   0	;64-byte work block for final stage of summation
m1	=   8   ;holds remaining data bytes and trailing pad bytes
m2	=  16	;m0 must be 0, because we use the m0-m15 offsets to index into
m3	=  24	;  the string data when more than 64 bytes are in the string
m4	=  32	;m0-m3 are also used to return the final MD4 sum
m5	=  40
m6	=  48
m7	=  56
m8	=  64
m9	=  72
m10	=  80
m11	=  88
m12	=  96
m13	= 104
m14	= 112
m15	= 120
lastr0	= 128	;save area for MD4 state words
lastr1	= 136
lastr2	= 144
lastr3	= 152
length	= 160	;length in bytes of data string
remain	= 165	;remaining bytes to process in data string
worksz	= 170

done	= 0	;set when ready to exit
partpad	= 1	;set when pad spans block boundary

;calculates logical operation F of MD4
;F(x, y, z) = (x & y) | (~x & z)
;leaves result in a.wp
;destroys a.w, b.wp, c.w

macro	op_f	x, y, z

	move.w	$y, a
	move.wp	a, b	;b = y
	move.w	$x, a	;a = x
	move.w	$z, c	;c = z
	and.wp	a, b	;b = x & y
	not.wp	a	;a = ~x
	and.wp	c, a	;a = ~x & z
	or.wp	b, a	;a = (x & y) | (~x & z)

endmacro

;calculates logical operation G of MD4
;G(x, y, z) = (x & y) | (x & z) | (y & z)
;leaves result in a.wp
;destroys a.w, b.wp, c.w

macro	op_g	x, y, z

	move.w	$x, a	;a = x
	move.w	$y, c	;c = y
	and.wp	a, c	;c = x & y
	move.wp	c, b	;b = x & y
	move.w	$z, c	;c = z
	and.wp	c, a	;a = x & z
	or.wp	a, b	;b = (x & y) | (x & z)
	move.w	$y, a	;a = y
	and.wp	c, a	;a = y & z
	or.wp	b, a	;a = (x & y) | (x & z) | (y & z)

endmacro

;convenience macro to xor c with a
;leaves result in a.wp
;destroys b.wp, c.wp
;uses an old PDP-8 programming trick --
;a ^ b is equivalent to (a + b) - 2 * (a & b)
;since the HP 48 lacks an XOR instruction, and
;the AND and OR instructions are longer than the
;ADD and SUB instructions, this is both faster and
;shorter than doing XOR with AND and OR and NOT.

macro	xorca

	move.wp	c, b	;b = c
	and.wp	a, b	;b = a & c
	add.wp	c, a	;a = a + c
	sub.wp	b, a	;a = a - a & c
	sub.wp	b, a	;a = a - a & c

endmacro

;calculates logical operation H of MD4
;H(x, y, z) = x ^ y ^ z
;leaves result in a.wp
;destroys a.w, b.wp, c.w

macro	op_h	x, y, z

	move.w	$x, a	;a = x
	move.w	$y, c	;c = y
	xorca		;a = x ^ y
	move.w	$z, c	;c = z
	xorca		;a = (x ^ y) ^ z

endmacro

;rotate c[0..7] left by four bits (one nibble)
;destroys p, c[8..15]
macro	rc4

	sln.w	c	;sln.w is shorter, and faster on GX
	move	c.8, p	;rotate c.8 ...
	move	p, c.0	;into c.0

endmacro

;rotate c.wp left by one bit
;affects carry

	rc1l=1

macro	rc1

	add.wp	c, c	;double c
	brcc	rc1_$rc1l ;skip if no carry out
	inc.a	c	;put carry in low bit of c
rc1_$rc1l:
	rc1l=rc1l+1

endmacro

;additional convenience macros for sequences of instructions
;to rotate c[0..7] by desired number of bits
;rlN = rotate left by n
;all rlN macros assume c.[0-7] is value to be rotated
;all rlN macros destroy c.[8-15], leave 7 in p

macro	rl3

	rc1
	rc1
	rc1

endmacro

macro	rl4

	rc4
	move	7, p	;restore p

endmacro

macro	rl5

	rl4
	rc1

endmacro

macro	rl7

	rl4
	rc1
	rc1
	rc1

endmacro

macro	rl8

	rc4
	rl4

endmacro

macro	rl9

	rl8
	rc1

endmacro

macro	rl11

	rl8
	rc1
	rc1
	rc1

endmacro

macro	rl12

	rc4
	rc4
	rl4

endmacro

macro	rl13

	rl12
	rc1

endmacro

macro	rl15

	rl12
	rc1
	rc1
	rc1

endmacro

macro	rl16

	rc4
	rc4
	rc4
	rl4

endmacro

macro	rl19

	rl16
	rc1
	rc1
	rc1

endmacro

;compute round 1 in the MD4 sum
;a = (a + F(b, c, d) + M[m]) rotate s
;assumes d.a = pointer to 64-byte sum block, aa, bb, cc, dd are all t-registers (r0-r3)
;destroys a.w, b.w, c.w, d0
;places result in c

macro	round1	aa, bb, cc, dd, m, s

	op_f	$bb, $cc, $dd	; a = F(bb, cc, dd)
	move.w	$aa, c
	add.wp	c, a		; + aa
	clr.a	c
	move	0, p
	move.p2	$m, c
	move	7, p
	add.a	d, c
	move.a	c, d0
	move.wp	@d0, c
	add.wp	a, c		; + M[m]
	$s			; rotate left by s
	move.w	c, $aa		; to aa

endmacro

;compute round 2 in the MD4 sum
;a = (a + G(b, c, d) + M[m] + #5a827999) rotate s
;assumes d.a = pointer to 64-byte sum block, aa, bb, cc, dd are all t-registers (r0-r3)
;destroys a.w, b.w, c.w, d0
;places result in c

macro	round2	aa, bb, cc, dd, m, s

	op_g	$bb, $cc, $dd	; a = G(bb, cc, dd)
	move.w	$aa, c
	add.wp	c, a		; + aa
	clr.a	c
	move	0, p
	move.p2	$m, c
	move	7, p
	add.a	d, c
	move.a	c, d0
	move.wp	@d0, c
	add.wp	c, a		; + [m]
	move	0, p
	move.p8	#5a827999, c
	move	7, p
	add.wp	a, c		; + #5a827999
	$s			; rotate left by s
	move.w	c, $aa		; to aa

endmacro

;compute round 3 in the MD4 sum
;a = (a + H(b, c, d) + M[m] + #6ed9eba1) rotate s
;assumes d.a = pointer to 64-byte sum block, aa, bb, cc, dd are all t-registers (r0-r3)
;destroys a.w, b.w, c.w, d0
;places result in a

macro	round3	aa, bb, cc, dd, m, s

	op_h	$bb, $cc, $dd	; a = G(bb, cc, dd)
	move.w	$aa, c
	add.wp	c, a		; + aa
	clr.a	c
	move	0, p
	move.p2	$m, c
	move	7, p
	add.a	d, c
	move.a	c, d0
	move.wp	@d0, c
	add.wp	c, a		; + [m]
	move	0, p
	move.p8	#6ed9eba1, c
	move	7, p
	add.wp	a, c		; + #6ed9eba1
	$s			; rotate left by s
	move.w	c, $aa		; to aa

endmacro

	header	`E'

;code assumes a 170-nibble string of working data in level 1,
;and the string to sum in level 2.

	code

	call.a	SAVPTR		;save RPL interpreter registers
	move.x	st, c		;get status flags
	push.a	c		;save them

	clrb	done, st	;clear done flag
	clrb	partpad, st	;clear partial pad flag
	move.a	@d1, a		;get pointer to work string from top of stack
	add.a	10, a		;a has pointer to work area

	clr.a	c
	move.p2	length, c
	add.a	a, c
	move.a	c, d0		;d0 points to work[remain]

	add	5, d1		;up one element on RPL stack
	move.a	@d1, c		;get pointer to sum string
	move.a	c, d1		;put in d1
	add	5, d1		;move to length field of string
	move.a	@d1, c		;get string length
	sub.a	5, c		;subtract 5-nibble offset
	move.a	c, @d0		;save in work[length]
	add	5, d0
	move.a	c, @d0		;and in work[remain]
	add	5, d1		;move to data area of sum string
	swap.a	c, d1
	move.a	c, d		;put sum string data pointer in d
	move.a	a, d1		;place saved copy of work ptr in d1

	move.p8	#67452301, c	;initialize starting values of MD4 state
	move.w	c, r0
	move.p8	#efcdab89, c
	move.w	c, r1
	move.p8	#98badcfe, c
	move.w	c, r2
	move.p8	#10325476, c
	move.w	c, r3

;while more than 128 nibbles remain in the string, sum 128-nibble
;blocks out of the string.
loop:
	clr.a	c
	move.p2	remain, c
	swap.a	a, d1
	move.a	a, d1
	add.a	a, c
	move	c, d0
	move.a	@d0, a		;get work[remain] in a
	clr.a	c
	move.p2	128, c
	brgt.a	c, a, pad	;if less than 128 nibbles remain, pad last block

	sub.a	c, a		;decrement remain by 128
	move.a	a, @d0		;and store
	jump.3	process

;once there are less than 128 remaining nibbles, figure
;out how to pad them.  We copy the nibbles to the buffer in the work
;area, append padding, and sum from it.
pad:
	brbs	partpad, st, pad2 ;if partpad set, then we need to finish padding 

	swap.a	c, d1		;get pointer to work area
	move.a	c, d1
	swap.a	c, d		;swap with pointer to string data
	move.a	c, d0		;put string data ptr in d0

	brz.a	a, appendpad	;if no nibbles are left, no copying needed
	move.a	a, b		;get number of nibbles left
	srn.a	b		;convert to number of words

copyremaining:
	move.w	@d0, c		;copy words to work area
	move.w	c, @d1
	add	16, d0
	add	16, d1	
	dec.a	b
	brcc	copyremaining

appendpad:
	move.a	d, c		;get saved pointer to work area
	move.a	c, d1		;put back in d1
	add.a	a, c		;add number of remaining nibbles onto work pointer
	move.a	c, d0		;put in d0
	clr.a	c
	move.p2	128, c		;c.a = 128, conveniently equal to pad byte
	move.a	a, b
	subn.a	c, b		;b = nibbles left to pad = 128 - remain

	brbc	0, a, evenpad	;if a is even (bit 0 = 0), pad even number of nibs
	sub	1, d0		;back up to last nibble of data
	inc.a	b		;effectively one more nibble to pad
	move.b	@d0, c		;get last nibble
	sln.a	c		;shift it left (write as high nibble of byte)
	move.p1	#8, c		;put 8 in low nibble (pad bits in low nibble of byte)

evenpad:
	move.b	c, @d0		;append pad byte to data block
	add	2, d0
	dec.a	b		;two less nibbles to pad
	dec.a	b
	brz.a	b, checkforlast	;if no more nibbles to pad, skip zero fill
	srn.a	b		;convert nibble count to word count
	clr.w	c		;now fill with zeros

fillpad:
	move.w	c, @d0		;this may overrun the 64-byte buffer by up to
	add	16, d0		;16 nibs, but there are 32 nibs beyond it that
	dec.a	b		;will not care if they get overwritten at this time
	brcc	fillpad

checkforlast:
	move.p2	112, c		;112 or more nibbles remaining?
	brle.a	c, a, pad1	;if so, handle partial pad

;the last block has less than 112 nibbles in it, and has already
;been padded, so we just have to append the total length and sum
;it.
	call.3	putlength
	jump.3	process

pad1:
;the last block has 56 or more bytes in it, so we have to sum it,
;then sum another block containing 0s and the total length before
;we finish.  We set the partpad flag to indicate that one more
;iteration will be required after this.
	setb	partpad, st	;set partpad flag
	jump.3	process

;we've already summed a block containing the last bytes
;of data and the beginning of the pad information.  Fill the
;data block with 0s, append the length, and sum to finish.
pad2:
	swap.a	c, d1		;get pointer to work area
	move.a	c, d1
	move.a	c, d		;save in d
	move	c, d0
	clr.w	c
	move	16-7, p		;do 7 times
endpad:
	move.w	c, @d0
	add	16, d0
	inc	p
	brcc	endpad		;carry is set when p = 0
	call.3	putlength
	jump.3	process

;putlength stores the string length (in bits) in the last 64 bits
;of the work buffer.  It also sets the done flag, since the
;length is always in the last block summed.
;destroys a.a, b.w, c.w, d0
putlength:
	clr.a	c		;get string length in a
	move.p2	length, c
	swap.a	a, d1
	move.a	a, d1
	add.a	a, c
	move.a	c, d0
	move.a	@d0, a
	clr.w	b
	move.a	a, b		;copy string length to b
	add.w	b, b		;convert to bit count (multiply by 4)
	add.w	b, b
	clr.a	c
	move.p2	m14, c		;point to buffer location of length
	swap.a	a, d1
	move.a	a, d1
	add.a	a, c
	move.a	c, d0
	move.w	b, c
	move.w	c, @d0		;store length
	setb	done, st	;set done flag
	ret

process:
	clr.a	c
	move.p2	lastr0, c
	swap.a	a, d1
	move.a	a, d1
	add.a	a, c
	move.a	c, d0	;put pointer to lastr0 in d0
	move.w	r0, a	;save current value of r0
	move.8	a, @d0
	add	8, d0
	move.w	r1, a	; ... r1
	move.8	a, @d0
	add	8, d0
	move.w	r2, a   ; ... r2
	move.8	a, @d0
	add	8, d0
	move.w	r3, a	; ... r3
	move.8	a, @d0
	move	7, p

	round1	r0, r1, r2, r3, m0,  rl3
	round1	r3, r0, r1, r2, m1,  rl7
	round1	r2, r3, r0, r1, m2,  rl11
	round1	r1, r2, r3, r0, m3,  rl19
	round1	r0, r1, r2, r3, m4,  rl3
	round1	r3, r0, r1, r2, m5,  rl7
	round1	r2, r3, r0, r1, m6,  rl11
	round1	r1, r2, r3, r0, m7,  rl19
	round1	r0, r1, r2, r3, m8,  rl3
	round1	r3, r0, r1, r2, m9,  rl7
	round1	r2, r3, r0, r1, m10, rl11
	round1	r1, r2, r3, r0, m11, rl19
	round1	r0, r1, r2, r3, m12, rl3
	round1	r3, r0, r1, r2, m13, rl7
	round1	r2, r3, r0, r1, m14, rl11
	round1	r1, r2, r3, r0, m15, rl19

	round2	r0, r1, r2, r3, m0,  rl3
	round2	r3, r0, r1, r2, m4,  rl5
	round2	r2, r3, r0, r1, m8,  rl9
	round2	r1, r2, r3, r0, m12, rl13
	round2	r0, r1, r2, r3, m1,  rl3
	round2	r3, r0, r1, r2, m5,  rl5
	round2	r2, r3, r0, r1, m9,  rl9
	round2	r1, r2, r3, r0, m13, rl13
	round2	r0, r1, r2, r3, m2,  rl3
	round2	r3, r0, r1, r2, m6,  rl5
	round2	r2, r3, r0, r1, m10, rl9
	round2	r1, r2, r3, r0, m14, rl13
	round2	r0, r1, r2, r3, m3,  rl3
	round2	r3, r0, r1, r2, m7,  rl5
	round2	r2, r3, r0, r1, m11, rl9
	round2	r1, r2, r3, r0, m15, rl13

	round3	r0, r1, r2, r3, m0,  rl3
	round3	r3, r0, r1, r2, m8,  rl9
	round3	r2, r3, r0, r1, m4,  rl11
	round3	r1, r2, r3, r0, m12, rl15
	round3	r0, r1, r2, r3, m2,  rl3
	round3	r3, r0, r1, r2, m10, rl9
	round3	r2, r3, r0, r1, m6,  rl11
	round3	r1, r2, r3, r0, m14, rl15
	round3	r0, r1, r2, r3, m1,  rl3
	round3	r3, r0, r1, r2, m9,  rl9
	round3	r2, r3, r0, r1, m5,  rl11
	round3	r1, r2, r3, r0, m13, rl15
	round3	r0, r1, r2, r3, m3,  rl3
	round3	r3, r0, r1, r2, m11, rl9
	round3	r2, r3, r0, r1, m7,  rl11
	round3	r1, r2, r3, r0, m15, rl15

	move	0, p
	clr.a	c
	move.p2	128, c
	add.a	c, d	;increment d by 128

	move.p2	lastr0, c
	swap.a	a, d1
	move.a	a, d1
	add.a	a, c
	move.a	c, d0	;put pointer to lastr0 in d0

	move	7, p
	move.8	@d0, a
	add	8, d0
	move.w	r0, c
	add.wp	a, c
	move.w	c, r0	;r0 = r0 + lastr0
	move.8	@d0, a
	add	8, d0
	move.w	r1, c
	add.wp	a, c
	move.w	c, r1	;r1 = r1 + lastr1
	move.8	@d0, a
	add	8, d0
	move.w	r2, c
	add.wp	a, c
	move.w	c, r2	;r2 = r2 + lastr2
	move.8	@d0, a
	move.w	r3, c
	add.wp	a, c
	move.w	c, r3	;r3 = r3 + lastr3

	move	0, p
	brbs	done, st, exit ;exit if done was set
	jump.4	loop

exit:
	move.w	r0, c	;store MD4 state in string
	move.8	c, @d1	;use d1 because it already points where we want
	add	8, d1
	move.w	r1, c
	move.8	c, @d1
	add	8, d1
	move.w	r2, c
	move.8	c, @d1
	add	8, d1
	move.w	r3, c
	move.8	c, @d1

	pop.a	c
	move.x	c, st		;restore status flags
	call.a	GETPTRLOOP	;restore RPL interpreter registers, return to RPL

	endcode

