poly1305-internal.asm 4.67 KB
Newer Older
Niels Möller's avatar
Niels Möller committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229
C nettle, low-level cryptographics library
C 
C Copyright (C) 2013 Niels Möller
C
C The nettle library is free software; you can redistribute it and/or modify
C it under the terms of the GNU Lesser General Public License as published by
C the Free Software Foundation; either version 2.1 of the License, or (at your
C option) any later version.
C 
C The nettle library is distributed in the hope that it will be useful, but
C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
C or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
C License for more details.
C 
C You should have received a copy of the GNU Lesser General Public License
C along with the nettle library; see the file COPYING.LIB.  If not, write to
C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
C MA 02111-1301, USA.

	.file "poly1305-internal.asm"

C Symbolic register names shared by the routines in this file.
C They are used mainly by poly1305_block; the digest function reuses
C T0, T1 and T2 as scratch.

C Pointer to the struct poly1305_ctx being updated.
define(<CTX>, <%rdi>)

C Accumulator limbs and partial products.
define(<H0>, <%r9>)
define(<H1>, <%r10>)
define(<H2>, <%r11>)

C Temporaries. T0 and T1 alias the argument registers %rcx and %rsi.
define(<T0>, <%rcx>)
define(<T1>, <%rsi>)
define(<T2>, <%r8>)
	
	C poly1305_set_key(struct poly1305_ctx *ctx, const uint8_t key[16])
	C
	C Reads the key as two 64-bit words, masks (clamps) them and stores
	C the result as r0 and r1. Also precomputes s1 = 5 * (r1 / 4), which
	C the block function reads as P1305_S1, and clears the accumulator
	C h0, h1, h2 and the block index.
	.text
	C Registers:
	C  %rdi: ctx
	C  %rsi: key
	C  %r8: mask
	C  %rax: scratch
	ALIGN(16)
PROLOGUE(nettle_poly1305_set_key)
	W64_ENTRY(2,0)
	C Mask for the low key word.
	mov	$0x0ffffffc0fffffff, %r8
	mov	(%rsi), %rax
	and	%r8, %rax
	C The mask for the high key word additionally clears the low two
	C bits, which makes the division by 4 below exact.
	and	$-4, %r8
	C Store r0 through its named offset, like every other context field
	C and like the reads of r0 in the block function.
	mov	%rax, P1305_R0 (CTX)
	mov	8(%rsi), %rax
	and	%r8, %rax
	mov	%rax, P1305_R1 (CTX)
	C s1 = 5 * (r1 / 4)
	shr	$2, %rax
	imul	$5, %rax
	mov	%rax, P1305_S1 (CTX)
	C Clear the state. h2 and the index are stored as 32-bit values.
	xor	XREG(%rax), XREG(%rax)
	mov	%rax, P1305_H0 (CTX)
	mov	%rax, P1305_H1 (CTX)
	mov	XREG(%rax), P1305_H2 (CTX)
	mov	XREG(%rax), P1305_INDEX (CTX)
	
	W64_EXIT(2,0)
	ret

EPILOGUE(nettle_poly1305_set_key)

C 64-bit multiplication mod 2^130 - 5, with B = 2^64
C
C (x_0 + B x_1 + B^2 x_2) * (r_0 + B r_1) =
C     1         B         B^2       B^3
C   x_0 r_0
C             x_0 r_1
C             x_1 r_0
C                       x_1 r_1
C                       x_2 r_0
C                                 x_2 r_1
C Since 2^130 = 5 (mod 2^130 - 5), and r_1 is a multiple of 4 (the key
C setup clears its low two bits), we have
C   r_1 B^2 = (r_1/4) 2^130 = 5/4 r_1
C and
C   r_1 B^3 = 5/4 B r_1.
C So we get
C
C  x_0 r_0 + x_1 (5/4 r_1) + B (x_0 r_1 + x_1 r_0 + x_2 5/4 r_1 + B x_2 r_0)

	C poly1305_block (struct poly1305_ctx *ctx, const uint8_t m[16])
	C
	C Absorbs one 16-byte block: adds the block, plus a 1 in the third
	C limb, to the accumulator h, multiplies by r, folds the bits from
	C 2^130 and up back in using 2^130 = 5, and stores h in the context.
	C The stored h is only partially reduced; the digest function does
	C the final reduction.
	C
	C Registers on entry to the public function:
	C  %rdi: ctx
	C  %rsi: m
	C Clobbers %rax, %rdx, T0, T1, T2, H0, H1, H2 and the flags.
	
PROLOGUE(nettle_poly1305_block)
	C Load the two message words, and set the third limb to 1.
	mov	(%rsi), T0
	mov	8(%rsi), T1
	mov	$1,	T2
C FIXME: Support windows ABI. This entry point has no W64_ENTRY and
C W64_EXIT, so it always takes its arguments from %rdi and %rsi.
	C Internal entry point, also called from the digest function.
	C Registers:
	C Inputs:  CTX, T0, T1, T2, the three limbs to add to h before
	C          multiplying, least significant first.
	C Outputs: H0, H1, H2, stored into the context.

C_NAME(poly1305_block):
	C x = h + input limbs, with carry propagation. x_0, x_1, x_2 are
	C left in T0, T1, T2.
	add	P1305_H0 (CTX), T0
	adc	P1305_H1 (CTX), T1
	adc	P1305_H2 (CTX), XREG(T2)
	C H1:H0 = r_0 x_0
	mov	P1305_R0 (CTX), %rax
	mul	T0
	mov	%rax, H0
	mov	%rdx, H1
	C %rdx:%rax = (5/4 r_1) x_1, and keep a copy of 5/4 r_1 in H2
	mov	P1305_S1 (CTX), %rax	C 5/4 r1
	mov	%rax, H2
	mul	T1
	C H2 = (5/4 r_1) x_2 and T2 = r_0 x_2. imul leaves %rax and %rdx
	C alone, so the product from the mul above is still intact.
	imul	T2, H2
	imul	P1305_R0 (CTX), T2
	C H1:H0 += (5/4 r_1) x_1
	add	%rax, H0
	adc	%rdx, H1
	C T2:H2 += r_0 x_1
	mov	P1305_R0 (CTX), %rax
	mul	T1
	add	%rax, H2
	adc	%rdx, T2
	C T2:H2 += r_1 x_0
	mov	P1305_R1 (CTX), %rax
	mul	T0
	add	%rax, H2
	adc	%rdx, T2
	C Now the product is H1:H0 + B T2:H2. Bits 2 and up of T2 have
	C weight 2^130 or more; multiply them by 5 and add at weight 1.
	mov	T2, %rax
	shr	$2, %rax
	imul	$5, %rax
	C Keep only the low two bits of T2 as the top limb.
	and	$3, XREG(T2)
	C Sum the columns: H0 gets the folded bits, H1 gets H2, and the
	C final carry goes into T2.
	add	%rax, H0
	adc	H2, H1
	adc	$0, XREG(T2)
	C Store the new h; the top limb is stored as a 32-bit value.
	mov	H0, P1305_H0 (CTX)
	mov	H1, P1305_H1 (CTX)
	mov	XREG(T2), P1305_H2 (CTX)
	ret
EPILOGUE(nettle_poly1305_block)

	C poly1305_digest (struct poly1305_ctx *ctx,
	C		   size_t length, uint8_t *digest,
	C		   const uint8_t *s)
	C
	C Finishes the computation: if the context holds a partial block
	C (index is non-zero), a 1 byte is appended, the bytes after it are
	C treated as zero, and the block is absorbed. Then h is reduced mod
	C 2^130 - 5, the 128-bit value at s is added mod 2^128, and the
	C result is stored at digest. At most 16 bytes are stored; a length
	C of zero stores nothing. Finally h and the index are cleared.
	C
	C Registers:
	C   %rdi: ctx
	C   %rsi: length
	C   %rdx: digest
	C   %rcx: s
	
PROLOGUE(nettle_poly1305_digest)
	W64_ENTRY(4, 0)
	mov	P1305_INDEX (CTX), XREG(%rax)
	C The argument registers are reused below: %rsi is T1, %rcx is T0,
	C and %rdx is overwritten by mul. Keep the arguments on the stack.
	push	%rsi
	push	%rdx
	push	%rcx
	test	XREG(%rax), XREG(%rax)
	jz	.Lfinal

	C Pad with a 1 byte.
	C FIXME: Or in, without storing in memory.
	inc	XREG(%rax)	C Also clears high half
	movb	$1, P1305_BLOCK-1 (CTX, %rax)
	
	C From here on %rax is the number of valid bytes, including the
	C pad byte. Build T1 = 2^(8 * (%rax mod 8)) - 1, a mask selecting
	C the valid bytes of the word that is only partially filled.
	mov	XREG(%rax), XREG(%rcx)
	mov	$1, T1
	and	$7, XREG(%rcx)	
	shl	$3, XREG(%rcx)
	shl	LREG(%rcx), T1
	dec	T1
	C The shift count in %rcx is no longer needed, so T0 can be loaded.
	mov	P1305_BLOCK (CTX), T0
	C The pad byte is already part of the block, so the third limb is 0.
	xor	T2, T2
	cmp	$8, XREG(%rax)
	jc	.Lfinal_lt8
	C At least 8 valid bytes: the low word is used as is, and the
	C mask applies to the high word.
	C If %rax == 16, we get T1 == 0, but we need T1 = -1 instead.
	C The cmp sets carry exactly when %rax is less than 16, so the
	C adc leaves T1 unchanged in that case, and subtracts one otherwise.
	cmp	$16, XREG(%rax)
	adc	$-1, T1
	and	P1305_BLOCK+8 (CTX), T1
	jmp	.Lfinal_block

.Lfinal_lt8:
	C Fewer than 8 valid bytes: mask the low word, high word is zero.
	and	T1, T0
	xor	T1, T1
.Lfinal_block:

	C Use C_NAME, to match the way the label is defined.
	call	C_NAME(poly1305_block)

.Lfinal:

	C Fold the bits of h2 above the low two back in, multiplied by 5.
	mov	P1305_H0 (CTX), H0
	mov	P1305_H1 (CTX), H1
	mov	P1305_H2 (CTX), XREG(H2)
	mov	XREG(H2), XREG(%rax)
	shr	$2, XREG(%rax)
	and	$3, H2
	imul	$5, XREG(%rax)
	add	%rax, H0
	adc	$0, H1
	adc	$0, XREG(H2)

	C Add 5, use result if >= 2^130
	mov	$5, T0
	xor	T1, T1
	add	H0, T0
	adc	H1, T1
	adc	$0, XREG(H2)
	C The top limb is 4 or more exactly when h + 5 reached 2^130. Only
	C the low 128 bits are needed from here on.
	cmp	$4, XREG(H2)
	cmovnc	T0, H0
	cmovnc	T1, H1

	C Restore s, digest and length.
	pop	%rcx
	pop	%rdx
	pop	%rsi

	C Add s; the carry out of the high word is dropped.
	add	(%rcx), H0
	adc	8(%rcx), H1

	C Store, taking length into account
	C With length zero there is nothing to store. Without this check,
	C the byte loop below would decrement the length past zero and
	C write far beyond the end of the digest area.
	test	%rsi, %rsi
	jz	.Ldigest_done
	cmp	$8, %rsi
	jc	.Ldigest_lt8
	mov	H0, (%rdx)
	C mov leaves the flags alone, so this still tests length == 8.
	jz	.Ldigest_done
	cmp	$16, %rsi
	jc	.Ldigest_lt16
	mov	H1, 8(%rdx)
	jmp	.Ldigest_done
.Ldigest_lt16:
	C 9 to 15 bytes: the low word is stored already, continue with
	C the remaining bytes taken from the high word.
	mov	H1, H0
	add	$8, %rdx
	sub	$8, %rsi
.Ldigest_lt8:
	C Store the remaining 1 to 7 bytes, least significant byte first.
	movb	LREG(H0), (%rdx)
	shr	$8, H0
	inc	%rdx
	dec	%rsi
	jnz	.Ldigest_lt8
.Ldigest_done:
	C Clear h and the index, so that the context can be reused.
	xor	XREG(%rax), XREG(%rax)
	mov	%rax, P1305_H0 (CTX)
	mov	%rax, P1305_H1 (CTX)
	mov	XREG(%rax), P1305_H2 (CTX)
	mov	XREG(%rax), P1305_INDEX (CTX)
	W64_EXIT(4, 0)
	ret