From 081483815557570dee6009ae46d66ed43a42c407 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Niels=20M=C3=B6ller?= <nisse@lysator.liu.se>
Date: Tue, 16 Apr 2013 15:38:15 +0200
Subject: [PATCH] Switch to non-logarithmic ALIGN macro.
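
The ALIGN argument used to be logarithmic, with eval(1 << $1)
producing a byte count where needed. Since < and > are the m4 quote
characters, that required a changequote dance to keep the quoting
balanced, and the obvious alternatives are not portable: the <<
operator is not supported by Solaris m4, and ** is not supported by
OpenBSD m4.

ALIGN now takes the alignment in bytes. Where the assembler's .align
is logarithmic (ALIGN_LOG), the new m4_log2 macro, similar to the one
in gmp, converts back. An argument that is not a power of two (or
exceeds 512) expands to "not-a-power-of-two", which the assembler
rejects. All callers are updated, e.g. ALIGN(4) becomes ALIGN(16).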

---
 ChangeLog                          |  4 ++++
 asm.m4                             | 17 +++++++++--------
 x86/aes-decrypt-internal.asm       |  5 ++---
 x86/aes-encrypt-internal.asm       |  5 ++---
 x86/arcfour-crypt.asm              |  4 ++--
 x86/camellia-crypt-internal.asm    |  3 +--
 x86/md5-compress.asm               |  2 +-
 x86/sha1-compress.asm              |  2 +-
 x86_64/aes-decrypt-internal.asm    |  4 ++--
 x86_64/aes-encrypt-internal.asm    |  4 ++--
 x86_64/camellia-crypt-internal.asm |  3 +--
 x86_64/ecc-192-modp.asm            |  2 +-
 x86_64/memxor.asm                  | 11 +++++------
 x86_64/salsa20-core-internal.asm   |  4 ++--
 x86_64/salsa20-crypt.asm           |  4 ++--
 x86_64/serpent-decrypt.asm         |  6 +++---
 x86_64/serpent-encrypt.asm         |  6 +++---
 x86_64/sha1-compress.asm           |  2 +-
 x86_64/sha256-compress.asm         |  4 ++--
 x86_64/sha3-permute.asm            |  6 +++---
 x86_64/sha512-compress.asm         |  4 ++--
 x86_64/umac-nh-n.asm               |  2 +-
 x86_64/umac-nh.asm                 |  2 +-
 23 files changed, 53 insertions(+), 53 deletions(-)
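
Notes (this comment area is ignored when the patch is applied): a
minimal standalone sketch of the new macro, runnable with GNU m4.
The file name check-log2.m4 is invented for illustration.

  dnl check-log2.m4 -- run as: m4 check-log2.m4
  dnl Use the same quote characters as nettle's asm.m4.
  changequote(<,>)dnl
  define(<m4_log2>, <m4_log2_internal($1,1,0)>)dnl
  define(<m4_log2_internal>,
  <ifelse($3, 10, <not-a-power-of-two>,
  $1, $2, $3,
  <m4_log2_internal($1, eval(2*$2), eval(1 + $3))>)>)dnl
  m4_log2(16) m4_log2(512) m4_log2(12)

This prints "4 9 not-a-power-of-two": ALIGN(16) therefore emits
.align 4 when ALIGN_LOG is set and .align 16 otherwise, and an
alignment that is not a power of two is caught at assembly time.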

diff --git a/ChangeLog b/ChangeLog
index 20726f1a..3ee05f6f 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,9 @@
 2013-04-16  Niels Möller  <nisse@lysator.liu.se>
 
+	* asm.m4 (m4_log2): New macro, similar to the one in gmp.
+	(ALIGN): Changed to take the alignment in bytes. Updated all
+	callers; ALIGN is currently used only in x86 and x86_64 files.
+
 	* umac.h (umac32_ctx, umac64_ctx, umac96_ctx, umac128_ctx): Make
 	block count an uint64_t. Reorder some elements to put short values
 	together.
diff --git a/asm.m4 b/asm.m4
index f24442da..200b1361 100644
--- a/asm.m4
+++ b/asm.m4
@@ -26,16 +26,17 @@ define(<EPILOGUE>,
 <ifelse(ELF_STYLE,yes,
 <.size C_NAME($1), . - C_NAME($1)>,<>)>)
 
-dnl Argument to ALIGN is always logarithmic
-dnl FIXME: the << operator is not supported by Solaris m4,
-dnl and ** is not supported by OpenBSD m4.
-dnl We should switch to non-logarithmic ALIGN instead.
+define(<m4_log2>, <m4_log2_internal($1,1,0)>)
+define(<m4_log2_internal>,
+<ifelse($3, 10, <not-a-power-of-two>,
+$1, $2, $3,
+<m4_log2_internal($1, eval(2*$2), eval(1 + $3))>)>)
+
+dnl The argument to ALIGN is always in bytes; it is converted to a
+dnl logarithmic .align if necessary.
 
-dnl Need changequote to be able to use the << operator.
 define(<ALIGN>,
-<changequote([,])dnl
-.align ifelse(ALIGN_LOG,yes,$1,eval(1 << $1))dnl >> balance
-changequote(<,>)dnl
+<.align ifelse(ALIGN_LOG,yes,<m4_log2($1)>,$1)
 >)
 
 dnl Struct defining macros
diff --git a/x86/aes-decrypt-internal.asm b/x86/aes-decrypt-internal.asm
index c19853ff..64e59283 100644
--- a/x86/aes-decrypt-internal.asm
+++ b/x86/aes-decrypt-internal.asm
@@ -1,4 +1,3 @@
-C -*- mode: asm; asm-comment-char: ?C; -*-  
 C nettle, low-level cryptographics library
 C 
 C Copyright (C) 2001, 2002, 2005 Rafael R. Sevilla, Niels Möller
@@ -61,7 +60,7 @@ C %edi is a temporary, often used as an accumulator.
 	C	       unsigned length, uint8_t *dst,
 	C	       uint8_t *src)
 	.text
-	ALIGN(4)
+	ALIGN(16)
 PROLOGUE(_nettle_aes_decrypt)
 	C save all registers that need to be saved
 	pushl	%ebx		C  20(%esp)
@@ -94,7 +93,7 @@ PROLOGUE(_nettle_aes_decrypt)
 
 	addl	$16,KEY		C  point to next key
 	movl	KEY,FRAME_KEY
-	ALIGN(4)
+	ALIGN(16)
 .Lround_loop:
 	AES_ROUND(T, SA,SD,SC,SB, TMP, KEY)
 	movl	TMP, TA
diff --git a/x86/aes-encrypt-internal.asm b/x86/aes-encrypt-internal.asm
index fc7d2c40..9fe32fc5 100644
--- a/x86/aes-encrypt-internal.asm
+++ b/x86/aes-encrypt-internal.asm
@@ -1,4 +1,3 @@
-C -*- mode: asm; asm-comment-char: ?C; -*-  
 C nettle, low-level cryptographics library
 C 
 C Copyright (C) 2001, 2002, 2005 Rafael R. Sevilla, Niels Möller
@@ -61,7 +60,7 @@ C %edi is a temporary, often used as an accumulator.
 	C	       unsigned length, uint8_t *dst,
 	C	       uint8_t *src)
 	.text
-	ALIGN(4)
+	ALIGN(16)
 PROLOGUE(_nettle_aes_encrypt)
 	C save all registers that need to be saved
 	pushl	%ebx		C  20(%esp)
@@ -94,7 +93,7 @@ PROLOGUE(_nettle_aes_encrypt)
 
 	addl	$16,KEY		C  point to next key
 	movl	KEY,FRAME_KEY
-	ALIGN(4)
+	ALIGN(16)
 .Lround_loop:
 	AES_ROUND(T, SA,SB,SC,SD, TMP, KEY)
 	movl	TMP, TA
diff --git a/x86/arcfour-crypt.asm b/x86/arcfour-crypt.asm
index 842ae4a5..89ee7c9e 100644
--- a/x86/arcfour-crypt.asm
+++ b/x86/arcfour-crypt.asm
@@ -23,7 +23,7 @@ C MA 02111-1301, USA.
 	C               unsigned length, uint8_t *dst,
 	C               const uint8_t *src)
 	.text
-	ALIGN(4)
+	ALIGN(16)
 PROLOGUE(nettle_arcfour_crypt)
 	C save all registers that need to be saved
 	pushl	%ebx		C  12(%esp)
@@ -63,7 +63,7 @@ C Register usage:
 	sarl	$1, %edx
 	jc	.Lloop_odd
 	
-	ALIGN(4)
+	ALIGN(16)
 .Lloop:
 	movb	(%ebp, %eax), %cl	C  si.
 	addb    %cl, %bl
diff --git a/x86/camellia-crypt-internal.asm b/x86/camellia-crypt-internal.asm
index e8d892d3..7766220e 100644
--- a/x86/camellia-crypt-internal.asm
+++ b/x86/camellia-crypt-internal.asm
@@ -1,4 +1,3 @@
-C -*- mode: asm; asm-comment-char: ?C; -*-  
 C nettle, low-level cryptographics library
 C 
 C Copyright (C) 2010, Niels Möller
@@ -142,7 +141,7 @@ define(<FLINV>, <
 	C	          unsigned length, uint8_t *dst,
 	C	          uint8_t *src)
 	.text
-	ALIGN(4)
+	ALIGN(16)
 PROLOGUE(_nettle_camellia_crypt)
 	C save all registers that need to be saved
 	pushl	%ebx		C  32(%esp)
diff --git a/x86/md5-compress.asm b/x86/md5-compress.asm
index 1bdada79..ac0cd900 100644
--- a/x86/md5-compress.asm
+++ b/x86/md5-compress.asm
@@ -68,7 +68,7 @@ define(<ROUND>,<
 	C _nettle_md5_compress(uint32_t *state, uint8_t *data)
 	
 	.text
-	ALIGN(4)
+	ALIGN(16)
 PROLOGUE(_nettle_md5_compress)
 	C save all registers that need to be saved
 
diff --git a/x86/sha1-compress.asm b/x86/sha1-compress.asm
index afb8d8c5..777615dc 100644
--- a/x86/sha1-compress.asm
+++ b/x86/sha1-compress.asm
@@ -160,7 +160,7 @@ PROLOGUE(_nettle_sha1_compress)
 
 	C Loop-mixed to 520 cycles (for the complete function call) on
 	C AMD K7.
-ALIGN(5)
+ALIGN(32)
 	mov	88(%esp), T2
 	mov	OFFSET(2)(T2), %ecx
 	mov	OFFSET(0)(T2), %eax
diff --git a/x86_64/aes-decrypt-internal.asm b/x86_64/aes-decrypt-internal.asm
index de97de32..0d4f2f92 100644
--- a/x86_64/aes-decrypt-internal.asm
+++ b/x86_64/aes-decrypt-internal.asm
@@ -53,7 +53,7 @@ define(<TMP>,<%rbp>)
 	C	       unsigned length, uint8_t *dst,
 	C	       uint8_t *src)
 	.text
-	ALIGN(4)
+	ALIGN(16)
 PROLOGUE(_nettle_aes_decrypt)
 	W64_ENTRY(5, 0)
 	test	PARAM_LENGTH, PARAM_LENGTH
@@ -81,7 +81,7 @@ PROLOGUE(_nettle_aes_decrypt)
 	subl	$1, COUNT
 
 	add	$16,KEY		C  point to next key
-	ALIGN(4)
+	ALIGN(16)
 .Lround_loop:
 	AES_ROUND(TABLE, SA,SD,SC,SB, TA, TMP)
 	AES_ROUND(TABLE, SB,SA,SD,SC, TB, TMP)
diff --git a/x86_64/aes-encrypt-internal.asm b/x86_64/aes-encrypt-internal.asm
index fbfcdb66..4ae0ec85 100644
--- a/x86_64/aes-encrypt-internal.asm
+++ b/x86_64/aes-encrypt-internal.asm
@@ -53,7 +53,7 @@ define(<TMP>,<%rbp>)
 	C	       unsigned length, uint8_t *dst,
 	C	       uint8_t *src)
 	.text
-	ALIGN(4)
+	ALIGN(16)
 PROLOGUE(_nettle_aes_encrypt)
 	W64_ENTRY(5, 0)
 	test	PARAM_LENGTH, PARAM_LENGTH
@@ -81,7 +81,7 @@ PROLOGUE(_nettle_aes_encrypt)
 	subl	$1, COUNT
 
 	add	$16,KEY		C  point to next key
-	ALIGN(4)
+	ALIGN(16)
 .Lround_loop:
 	AES_ROUND(TABLE, SA,SB,SC,SD, TA, TMP)
 	AES_ROUND(TABLE, SB,SC,SD,SA, TB, TMP)
diff --git a/x86_64/camellia-crypt-internal.asm b/x86_64/camellia-crypt-internal.asm
index 8fc2fc1b..e44a3dec 100644
--- a/x86_64/camellia-crypt-internal.asm
+++ b/x86_64/camellia-crypt-internal.asm
@@ -1,4 +1,3 @@
-C -*- mode: asm; asm-comment-char: ?C; -*-  
 C nettle, low-level cryptographics library
 C 
 C Copyright (C) 2010, Niels Möller
@@ -122,7 +121,7 @@ C	xorl	XREG(TMP), XREG($1)
 	C	          unsigned length, uint8_t *dst,
 	C	          uint8_t *src)
 	.text
-	ALIGN(4)
+	ALIGN(16)
 PROLOGUE(_nettle_camellia_crypt)
 
 	W64_ENTRY(5, 0)
diff --git a/x86_64/ecc-192-modp.asm b/x86_64/ecc-192-modp.asm
index 5812070b..f3fe4958 100644
--- a/x86_64/ecc-192-modp.asm
+++ b/x86_64/ecc-192-modp.asm
@@ -30,7 +30,7 @@ define(<C2>, <%r11>)
 
 	C ecc_192_modp (const struct ecc_curve *ecc, mp_limb_t *rp)
 	.text
-	ALIGN(4)
+	ALIGN(16)
 PROLOGUE(nettle_ecc_192_modp)
 	W64_ENTRY(2, 0)
 	mov	16(RP), T2
diff --git a/x86_64/memxor.asm b/x86_64/memxor.asm
index c2ccc920..b22a4721 100644
--- a/x86_64/memxor.asm
+++ b/x86_64/memxor.asm
@@ -1,4 +1,3 @@
-C -*- mode: asm; asm-comment-char: ?C; -*-  
 C nettle, low-level cryptographics library
 C 
 C Copyright (C) 2010, Niels Möller
@@ -37,7 +36,7 @@ define(<USE_SSE2>, <no>)
 
 	C memxor(uint8_t *dst, const uint8_t *src, size_t n)
 	C 	          %rdi               %rsi      %rdx
-	ALIGN(4)
+	ALIGN(16)
 
 PROLOGUE(memxor)
 	W64_ENTRY(3, 0)
@@ -48,7 +47,7 @@ EPILOGUE(memxor)
 
 	C memxor3(uint8_t *dst, const uint8_t *a, const uint8_t *b, size_t n)
 	C 	          %rdi              %rsi              %rdx      %rcx
-	ALIGN(4)
+	ALIGN(16)
 	
 PROLOGUE(memxor3)
 	W64_ENTRY(4, 0)
@@ -124,7 +123,7 @@ ifelse(USE_SSE2, yes, <
 	jz	.Ldone
 	jmp 	.Lshift_next
 
-	ALIGN(4)
+	ALIGN(16)
 
 .Lshift_loop:
 	mov	8(AP, N), S0
@@ -177,7 +176,7 @@ C 	jz	.Ldone
 	
 	jmp	.Lword_next
 
-	ALIGN(4)
+	ALIGN(16)
 
 .Lword_loop:
 	mov	8(AP, N), TMP
@@ -234,7 +233,7 @@ ifelse(USE_SSE2, yes, <
 	mov	TMP, (DST, N)
 	jmp	.Lsse2_next
 
-	ALIGN(4)
+	ALIGN(16)
 .Lsse2_loop:
 	movdqu	(AP, N), %xmm0
 	movdqu	(BP, N), %xmm1
diff --git a/x86_64/salsa20-core-internal.asm b/x86_64/salsa20-core-internal.asm
index 81ca2cc8..0e0cdf6a 100644
--- a/x86_64/salsa20-core-internal.asm
+++ b/x86_64/salsa20-core-internal.asm
@@ -34,7 +34,7 @@ include_src(<x86_64/salsa20.m4>)
 
 	C _salsa20_core(uint32_t *dst, const uint32_t *src, unsigned rounds)
 	.text
-	ALIGN(4)
+	ALIGN(16)
 PROLOGUE(_nettle_salsa20_core)
 	W64_ENTRY(3, 9)	
 
@@ -58,7 +58,7 @@ PROLOGUE(_nettle_salsa20_core)
 
 	shrl	$1, XREG(COUNT)
 
-	ALIGN(4)
+	ALIGN(16)
 .Loop:
 	QROUND(X0, X1, X2, X3)
 	pshufd	$0x93, X1, X1	C	11 00 01 10 (least sign. left)
diff --git a/x86_64/salsa20-crypt.asm b/x86_64/salsa20-crypt.asm
index 5d119804..25b7e497 100644
--- a/x86_64/salsa20-crypt.asm
+++ b/x86_64/salsa20-crypt.asm
@@ -50,7 +50,7 @@ C registers.
 	C salsa20_crypt(struct salsa20_ctx *ctx, unsigned length,
 	C		uint8_t *dst, const uint8_t *src)
 	.text
-	ALIGN(4)
+	ALIGN(16)
 PROLOGUE(nettle_salsa20_crypt)
 	W64_ENTRY(4, 9)	
 
@@ -92,7 +92,7 @@ PROLOGUE(nettle_salsa20_crypt)
 	SWAP(X0, X2, M0011)	
 
 	movl	$10, XREG(COUNT)
-	ALIGN(4)
+	ALIGN(16)
 .Loop:
 	QROUND(X0, X1, X2, X3)
 	C For the row operations, we first rotate the rows, to get
diff --git a/x86_64/serpent-decrypt.asm b/x86_64/serpent-decrypt.asm
index 02a857ce..d6bacb5d 100644
--- a/x86_64/serpent-decrypt.asm
+++ b/x86_64/serpent-decrypt.asm
@@ -522,7 +522,7 @@ define(<WLTI>, <
 	C	          unsigned length, uint8_t *dst,
 	C	          const uint8_t *src)
 	.text
-	ALIGN(4)
+	ALIGN(16)
 PROLOGUE(nettle_serpent_decrypt)
         C save all registers that need to be saved
 	W64_ENTRY(4, 13)
@@ -557,7 +557,7 @@ PROLOGUE(nettle_serpent_decrypt)
 
 	jmp	.Lwround_start
 
-	ALIGN(4)
+	ALIGN(16)
 
 .Lwround_loop:
 	WLTI(X0,X1,X2,X3)
@@ -624,7 +624,7 @@ PROLOGUE(nettle_serpent_decrypt)
 	mov	$384, CNT
 	jmp	.Lround_start
 
-	ALIGN(4)
+	ALIGN(16)
 .Lround_loop:
 	LTI(x0,x1,x2,x3)
 .Lround_start:
diff --git a/x86_64/serpent-encrypt.asm b/x86_64/serpent-encrypt.asm
index 5362bad1..613ef41e 100644
--- a/x86_64/serpent-encrypt.asm
+++ b/x86_64/serpent-encrypt.asm
@@ -549,7 +549,7 @@ define(<WLT>, <
 	C	          unsigned length, uint8_t *dst,
 	C	          const uint8_t *src)
 	.text
-	ALIGN(4)
+	ALIGN(16)
 PROLOGUE(nettle_serpent_encrypt)
         C save all registers that need to be saved
 	W64_ENTRY(4, 13)
@@ -583,7 +583,7 @@ PROLOGUE(nettle_serpent_encrypt)
 	mov	$-512, CNT
 	jmp	.Lwround_start
 
-	ALIGN(4)
+	ALIGN(16)
 .Lwround_loop:
 	WLT(X0,X1,X2,X3)
 .Lwround_start:
@@ -653,7 +653,7 @@ C parallell.
 	mov	$-512, CNT
 	jmp	.Lround_start
 	
-	ALIGN(4)
+	ALIGN(16)
 .Lround_loop:
 	LT(x0,x1,x2,x3)
 .Lround_start:
diff --git a/x86_64/sha1-compress.asm b/x86_64/sha1-compress.asm
index ffa28d0b..5155683c 100644
--- a/x86_64/sha1-compress.asm
+++ b/x86_64/sha1-compress.asm
@@ -123,7 +123,7 @@ C adding, and then rotating back.
 	C _nettle_sha1_compress(uint32_t *state, uint8_t *input)
 	
 	.text
-	ALIGN(4)
+	ALIGN(16)
 PROLOGUE(_nettle_sha1_compress)
 	C save all registers that need to be saved
 	W64_ENTRY(2, 0)
diff --git a/x86_64/sha256-compress.asm b/x86_64/sha256-compress.asm
index 59f922e7..6bfb7a78 100644
--- a/x86_64/sha256-compress.asm
+++ b/x86_64/sha256-compress.asm
@@ -114,7 +114,7 @@ define(<NOEXPN>, <
 	C _nettle_sha256_compress(uint32_t *state, const uint8_t *input, const uint32_t *k)
 
 	.text
-	ALIGN(4)
+	ALIGN(16)
 
 PROLOGUE(_nettle_sha256_compress)
 	W64_ENTRY(3, 0)
@@ -137,7 +137,7 @@ PROLOGUE(_nettle_sha256_compress)
 	movl	24(STATE), SG
 	movl	28(STATE), SH
 	xor	COUNT, COUNT
-	ALIGN(4)
+	ALIGN(16)
 
 .Loop1:
 	NOEXPN(0) ROUND(SA,SB,SC,SD,SE,SF,SG,SH,0)
diff --git a/x86_64/sha3-permute.asm b/x86_64/sha3-permute.asm
index 64c4cfc4..7f9a6b79 100644
--- a/x86_64/sha3-permute.asm
+++ b/x86_64/sha3-permute.asm
@@ -94,7 +94,7 @@ define(<ROTL64>, <
 	
 	C sha3_permute(struct sha3_state *ctx)
 	.text
-	ALIGN(4)
+	ALIGN(16)
 PROLOGUE(nettle_sha3_permute)
 	W64_ENTRY(1, 16)
 	push	%rbp
@@ -139,7 +139,7 @@ PROLOGUE(nettle_sha3_permute)
 	pxor	A2122, C12
 	pxor	A2324, C34
 	
-	ALIGN(4)
+	ALIGN(16)
 .Loop:
 	C The theta step. Combine parity bits, then xor to state.
 	C D0 = C4 ^ (C1 <<< 1)
@@ -483,7 +483,7 @@ PROLOGUE(nettle_sha3_permute)
 
 EPILOGUE(nettle_sha3_permute)
 
-ALIGN(4)
+ALIGN(16)
 .rc:	C In reverse order
 	.quad	0x8000000080008008
 	.quad	0x0000000080000001
diff --git a/x86_64/sha512-compress.asm b/x86_64/sha512-compress.asm
index d54ebda3..21df82a2 100644
--- a/x86_64/sha512-compress.asm
+++ b/x86_64/sha512-compress.asm
@@ -114,7 +114,7 @@ define(<NOEXPN>, <
 	C _nettle_sha512_compress(uint64_t *state, const uint8_t *input, const uint64_t *k)
 
 	.text
-	ALIGN(4)
+	ALIGN(16)
 
 PROLOGUE(_nettle_sha512_compress)
 	W64_ENTRY(3, 0)
@@ -137,7 +137,7 @@ PROLOGUE(_nettle_sha512_compress)
 	mov	48(STATE), SG
 	mov	56(STATE), SH
 	xor	COUNT, COUNT
-	ALIGN(4)
+	ALIGN(16)
 
 .Loop1:
 	NOEXPN(0) ROUND(SA,SB,SC,SD,SE,SF,SG,SH,0)
diff --git a/x86_64/umac-nh-n.asm b/x86_64/umac-nh-n.asm
index 06e74269..bcb99487 100644
--- a/x86_64/umac-nh-n.asm
+++ b/x86_64/umac-nh-n.asm
@@ -49,7 +49,7 @@ C aligned.
 	C umac_nh_n(uint64_t *out, unsigned n, const uint32_t *key,
 	C	    unsigned length, const uint8_t *msg)
 	.text
-	ALIGN(4)
+	ALIGN(16)
 PROLOGUE(_nettle_umac_nh_n)
 	W64_ENTRY(5, 14)
 	pxor	XY0, XY0
diff --git a/x86_64/umac-nh.asm b/x86_64/umac-nh.asm
index f9230cd8..8e88df6a 100644
--- a/x86_64/umac-nh.asm
+++ b/x86_64/umac-nh.asm
@@ -36,7 +36,7 @@ C aligned.
 	
 	C umac_nh(const uint32_t *key, unsigned length, const uint8_t *msg)
 	.text
-	ALIGN(4)
+	ALIGN(16)
 PROLOGUE(_nettle_umac_nh)
 	W64_ENTRY(3, 7)
 	pxor	XY, XY
-- 
GitLab